├── .gitignore
├── LICENSE
├── README.md
├── UPDATE.md
├── generator.png
├── generator
    ├── README.md
    ├── f5
    │   └── generator_ad.yml
    ├── h3c
    │   ├── generator-demo.yml
    │   ├── generator.yml
    │   ├── switch
    │   │   └── generator_h3c_sw.yml
    │   └── wireless
    │   │   ├── README.md
    │   │   ├── generator-demo.yml
    │   │   ├── generator_h3c_wireless.yml
    │   │   ├── grafana.json
    │   │   ├── h3c-ac.yml
    │   │   └── prometheus.yml
    ├── hillstone
    │   └── firewall
    │   │   └── generator_hillstone_firewall.yml
    ├── huawei
    │   ├── generator.yml
    │   ├── generator_v3.yml
    │   ├── switch
    │   │   ├── README.md
    │   │   ├── generator-demo.yml
    │   │   ├── generator_huawei_switch.yml
    │   │   ├── grafana.json
    │   │   ├── network-switch.yml
    │   │   └── prometheus.yml
    │   └── wireless
    │   │   ├── README.md
    │   │   ├── generator-demo.yml
    │   │   ├── grafana.json
    │   │   ├── huawei-ac.yml
    │   │   └── prometheus.yml
    ├── ruijie
    │   └── wireless
    │   │   ├── README.md
    │   │   ├── generator-demo.yml
    │   │   ├── generator-ruijie.yml
    │   │   ├── grafana-ruijie.json
    │   │   ├── prometheus.yml
    │   │   └── ruijie-ac.yml
    ├── sangfor
    │   ├── ac
    │   │   ├── README.md
    │   │   ├── generator.yml
    │   │   └── sangfor-ac.txt
    │   ├── ad
    │   │   ├── README.md
    │   │   ├── generator.yml
    │   │   ├── grafana.json
    │   │   └── prometheus.yml
    │   └── af
    │   │   └── README.md
    ├── synology
    │   ├── README.md
    │   ├── generator
    │   │   └── generator_synology_nas.yml
    │   ├── grafana
    │   │   ├── Synology NAS Details Dashboard for Prometheus.json
    │   │   └── Synology NAS Overview Dashboard for Prometheus.json
    │   ├── img
    │   │   ├── 1.jpg
    │   │   ├── image-1.png
    │   │   ├── image.png
    │   │   └── qrcode.jpg
    │   └── snmp
    │   │   └── snmp_synology_nas.yml
    └── test
    │   ├── generator_demo.yaml
    │   ├── generator_demo.yml
    │   ├── generator_haikang_monitor.yml
    │   ├── generator_huawei_switch.yml
    │   └── snmp.yml
├── prometheus
    └── rules
    │   ├── prod
    │       ├── blackbox.yml
    │       ├── idrac-status.yml
    │       ├── node-exporter.yml
    │       ├── sangfor-ad-status.yml
    │       ├── switch-status.yml
    │       └── windows-status.yml
    │   └── vm
    │       ├── alerts-health.yml
    │       ├── alerts-vmagent.yml
    │       ├── alerts-vmalert.yml
    │       ├── alerts-vmauth.yml
    │       └── alerts.yml
└── victoriametrics
    ├── README.md
    ├── binary
        ├── PrometheusAlert
        │   ├── README.md
        │   └── install-prometheusalert.sh
        ├── alertmanager
        │   ├── README.md
        │   └── install-alertmanager.sh
        ├── blackbox_exporter
        │   ├── README.md
        │   └── install-blackbox.sh
        ├── categraf
        │   ├── README.md
        │   ├── install-categraf-cgo.sh
        │   ├── install-categraf.sh
        │   └── update-config.sh
        ├── grafana
        │   └── README.md
        ├── network_exporter
        │   ├── README.md
        │   └── install-network.sh
        ├── node_exporter
        │   ├── README.md
        │   └── install-node.sh
        ├── prometheus
        │   ├── README.md
        │   └── install-promsingle.sh
        ├── victoriametrics
        │   ├── README.md
        │   ├── install-vmsingle.sh
        │   └── vmsingle.conf
        ├── vmagent
        │   ├── README.md
        │   ├── install-vmagent.sh
        │   └── vmagent.conf
        ├── vmalert
        │   ├── README.md
        │   ├── install-vmalert.sh
        │   └── vmalert.conf
        ├── vmauth
        │   ├── README.md
        │   ├── install-vmauth.sh
        │   ├── vmauth.conf
        │   └── vmauth.service
        ├── vminsert
        │   ├── README.md
        │   ├── install-vminsert.sh
        │   ├── vminsert.conf
        │   └── vminsert.service
        ├── vmselect
        │   ├── README.md
        │   ├── install-vmselect.sh
        │   ├── vmselect.conf
        │   └── vmselect.service
        └── vmstorage
        │   ├── README.md
        │   ├── install-vmstorage.sh
        │   ├── vmstorage.config
        │   └── vmstorage.service
    ├── deploy-cluster
        ├── alertmanager
        │   └── alertmanager.yml
        ├── dashboards
        │   ├── victoriametrics-cluster.json
        │   ├── vmagent.json
        │   └── vmalert.json
        ├── docker-compose.yml
        ├── provisioning
        │   ├── dashboards
        │   │   └── dashboard.yml
        │   └── datasources
        │   │   └── prometheus-datasource
        │   │       └── prometheus-datasource.yml
        ├── vmagent
        │   └── prometheus-cluster.yml
        ├── vmalert
        │   ├── alerts-cluster.yml
        │   ├── alerts-health.yml
        │   ├── alerts-vmagent.yml
        │   └── alerts-vmalert.yml
        └── vmauth
        │   └── auth-cluster.yml
    ├── deploy-n9e
        ├── compose.yml
        ├── dashboards
        │   └── victoriametrics.json
        ├── initsql
        │   ├── a-n9e.sql
        │   └── c-init.sql
        ├── mysql
        │   └── my.cnf
        ├── nightingale
        │   ├── config.toml
        │   ├── metrics.yml
        │   └── script
        │   │   ├── notify.py
        │   │   ├── notify_feishu.py
        │   │   └── rule_converter.py
        └── provisioning
        │   ├── dashboards
        │       └── dashboard.yml
        │   └── datasources
        │       └── prometheus-datasource
        │           └── prometheus-datasource.yml
    ├── deploy
        ├── docker-prometheus
        │   ├── README.md
        │   ├── alertmanager
        │   │   └── alertmanager.yml
        │   ├── docker-compose.yml
        │   ├── grafana
        │   │   └── provisioning
        │   │   │   ├── dashboards
        │   │   │       └── dashboard.yml
        │   │   │   └── datasources
        │   │   │       └── prometheus-datasource
        │   │   │           └── prometheus-datasource.yml
        │   └── prometheus
        │   │   ├── alert.yml
        │   │   └── prometheus.yml
        └── victoriametrics
        │   ├── README.md
        │   ├── alert
        │       ├── alerts-health.yml
        │       ├── alerts-vmagent.yml
        │       ├── alerts-vmalert.yml
        │       └── alerts.yml
        │   ├── alertmanager
        │       └── alertmanager.yml
        │   ├── dashboards
        │       ├── victoriametrics.json
        │       ├── vmagent.json
        │       └── vmalert.json
        │   ├── docker-compose.yml
        │   ├── provisioning
        │       ├── dashboards
        │       │   └── dashboard.yml
        │       └── datasources
        │       │   ├── prometheus-datasource
        │       │       └── prometheus-datasource.yml
        │       │   └── victoriametrics-datasource
        │       │       └── victoriametrics-datasource.yml
        │   ├── scrape
        │       └── prometheus.yml
        │   └── single-victoriametrics.yml
    └── promxy
        ├── alert
            ├── alerts-health.yml
            ├── alerts-vmagent.yml
            ├── alerts-vmalert.yml
            └── alerts.yml
        ├── alertmanager
            └── alertmanager.yml
        ├── cmd
            └── promxy
            │   └── config.yaml
        ├── dashboards
            ├── victoriametrics.json
            ├── vmagent.json
            └── vmalert.json
        ├── docker-compose.yaml
        ├── install-promxy.sh
        ├── provisioning
            ├── dashboards
            │   └── dashboard.yml
            └── datasources
            │   └── prometheus-datasource
            │       └── prometheus-datasource.yml
        └── scrape
            └── prometheus.yml


/.gitignore:
--------------------------------------------------------------------------------
1 | # 忽略所有 .DS_Store 文件
2 | .DS_Store 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 yanghua
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ### 关于网络设备的监控和告警（基于Prometheus + SNMP Exporter + Grafana）
  2 | 
  3 | - 适配品牌类型：华为无线、华三无线、锐捷无线；欢迎有相关品牌的无线产品资源的联系我
  4 | - 目前还更新适配：深信服AD，华为交换机，华三交换机，锐捷交换机等
  5 | - [snmp_exporter](https://github.com/prometheus/snmp_exporter)版本：0.25.0
  6 | 
  7 | #### 更新日志
  8 | 
  9 | 版本更新日志：
 10 | 
 11 | [更新日志](UPDATE.md)
 12 | 
 13 | #### 目录介绍
 14 | 
 15 | - 顶级目录下面是各品牌的英文名称，如：h3c、huawei、ruijie等
 16 | - 品牌名称下就是mibs文件夹，放置了相关品牌的mib库文件
 17 | - 品牌目录下的info.txt是说明信息，generator.yml文件是已经适配对应品牌并测试好的常规SNMP导出配置生成器，里面的指标都是常用无线数据指标：AC的CPU使用率、内存使用率、温度、启动时间等，AP的内存使用率、CPU使用率、温度、状态、上线时间、承载用户数、型号、名称、IP、MAC等指标数据，详情指标直接到generator.yml中查看。
 18 | 
 19 | > 如果generator.yml文件中的指标不满足你的监控需求，可自定义编写，添加自定义指标，满足自身监控需求，也可以反馈issue中，如果我觉得合适会添加适配。
 20 | 
 21 | #### 使用配置
 22 | 
 23 | ##### 前提
 24 | 
 25 | - Prometheus搭建好，这里我不提供搭建教程，如有需要可到我知乎和微信公众号查看：网络小斐。
 26 | - AC配置好SNMP Agent，推荐使用v2c版本，如果对安全需求很大可开启v3版本。
 27 | - 准备好一台单独的Linux服务器，系统推荐CentOS 7.9，用来单独部署SNMP Exporter。
 28 | 
 29 | ##### 搭建
 30 | 
 31 | Linux首先需要部署git，当然你也可以直接从github下载源码包，上传到服务器中，这里默认用git拉snmp_exporter源码包到服务器本地。
 32 | 
 33 | ```bash
 34 | Ubuntu下载依赖包：
 35 | sudo apt-get install unzip build-essential libsnmp-dev
 36 | 
 37 | CentOS下载依赖包：
 38 | sudo yum install gcc gcc-g++ make net-snmp net-snmp-utils net-snmp-libs net-snmp-devel
 39 | ```
 40 | 
 41 | 这里用CentOS 7.9作为演示：
 42 | 
 43 | ```bash
 44 | # 下载git
 45 | sudo yum install -y git curl wget
 46 | # curl 更新
 47 | yum -y install epel-release 
 48 | wget http://mirror.city-fan.org/ftp/contrib/yum-repo/rhel7/x86_64/city-fan.org-release-3-9.rhel7.noarch.rpm
 49 | rpm -ivh city-fan.org-release-3-9.rhel7.noarch.rpm
 50 | 
 51 | vim /etc/yum.repos.d/city-fan.org.repo
 52 | 
 53 | # 把enabled=0修改为enabled=1
 54 | [city-fan.org]
 55 | name=city-fan.org repository for Red Hat Enterprise Linux (and clones) $releasever ($basearch)
 56 | #baseurl=http://mirror.city-fan.org/ftp/contrib/yum-repo/rhel$releasever/$basearch
 57 | mirrorlist=http://mirror.city-fan.org/ftp/contrib/yum-repo/mirrorlist-rhel$releasever
 58 | enabled=1
 59 | gpgcheck=1
 60 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-city-fan.org file:///etc/pki/rpm-gpg/RPM-GPG-KEY-city-fan.org-rhel-7
 61 | 
 62 | 
 63 | yum update curl --enablerepo=city-fan.org -y
 64 | curl --version
 65 | 
 66 | # 安装golang 1.20.x https://golang.google.cn/dl/
 67 | wget https://golang.google.cn/dl/go1.20.8.linux-amd64.tar.gz
 68 | # 解压安装
 69 | tar -zxvf go1.20.8.linux-amd64.tar.gz -C /usr/local
 70 | # 将go添加到环境变量
 71 | vim /etc/profile
 72 | 
 73 | if [ -n "${BASH_VERSION-}" ] ; then
 74 |         if [ -f /etc/bashrc ] ; then
 75 |                 # Bash login shells run only /etc/profile
 76 |                 # Bash non-login shells run only /etc/bashrc
 77 |                 # Check for double sourcing is done in /etc/bashrc.
 78 |                 . /etc/bashrc
 79 |        fi
 80 | fi
 81 | #go 环境变量
 82 | export GO111MODULE=on
 83 | export GOPROXY=https://goproxy.cn,direct
 84 | export GOROOT=/usr/local/go
 85 | export GOPATH=$HOME/go
 86 | export PATH=$PATH:$GOROOT/bin:$GOPATH/bin
 87 | 
 88 | # 应用环境变量
 89 | source /etc/profile
 90 | 
 91 | # 拉取snmp_exporter
 92 | git clone https://github.com/prometheus/snmp_exporter.git
 93 | # 进入目录snmp_exporter
 94 | cd snmp_exporter/
 95 | # 构建snmp_exporter二进制可执行文件
 96 | go build
 97 | # 查看生成的二进制可执行文件
 98 | ls -lsh  snmp_exporter
 99 | 
100 | # 进入生成器目录构建二进制可执行文件
101 | cd snmp_exporter/generator/
102 | # 国内网络下载mib公共库报错 忽略即可 make: *** [mibs/apc-powernet-mib] 错误 22
103 | make generator mibs
104 | 
105 | # mibs文件夹中放入对应品牌的无线设备mib库文件即可
106 | # 把对应的generator.yml文件放入 ../snmp_exporter/generator/ 目录下
107 | export MIBDIRS=/root/snmp_exporter/generator/mibs
108 | ./generator --fail-on-parse-errors generate
109 | 
110 | mv snmp.yml ../
111 | 
112 | # 重启snmp_exporter
113 | systemctl restart snmp_exporter
114 | ```
115 | ##### ./generator generate 案例
116 | 
117 | ![generate](generator.png)
118 | 
119 | ##### Prometheus.yml如何添加Job
120 | 
121 | 查看目录中prometheus.yml文件中配置案例
122 | 
123 | grafana.json只是根据案例中的指标写出的json模版，适配每个环境下的监控需要做一定的修改。
124 | 
125 | grafana模版针对AP上的在线终端数，AP的CPU利用率和内存利用率做了排序，前20优先显示在Grafana中。


--------------------------------------------------------------------------------
/UPDATE.md:
--------------------------------------------------------------------------------
1 | #### 记录版本更新日志
2 | 
3 | - 2023年9月19日 17:00 第一版发布华为和华三无线监控
4 | - 2023年9月19日 20:00 更新grafana查询语句，修改AP的终端数、CPU、内存等展示排序前20 解决多AP(30以上)下展示混乱问题
5 | - 2023年9月20日 15:30 更新深信服AD相关版本的监控
6 | - 2023年9月28日 11:00 更新第一版锐捷无线相关的 generator.yml 和 prometheus.yml 的配置信息
7 | - 2023年11月16日 14:00 更新第二版锐捷无线相关的 generator.yml 和 grafana.json 的配置信息


--------------------------------------------------------------------------------
/generator.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/robotneo/networkdevice-monitor/97baface3f3c458e687fb43f4b537b2236185453/generator.png


--------------------------------------------------------------------------------
/generator/README.md:
--------------------------------------------------------------------------------
1 | # 关于物理机器和网络设备利用SNMP协议获取设备信息，通过snmp_exporter生成器生成snmp.yaml采集OID配置文件。
2 | 
3 | ## 模块说明
4 | 
5 | generator.yml 文件是针对 generator 目录下所有设备信息采集的配置文件，统一生成 SNMP 设备的 snmp.yml 指标采集文件。


--------------------------------------------------------------------------------
/generator/f5/generator_ad.yml:
--------------------------------------------------------------------------------
 1 | auths:
 2 |   f5_auth:
 3 |     version: 2
 4 |     community: public
 5 | 
 6 | modules:
 7 |   # F5负载均衡常规指标
 8 |   f5_common:
 9 |     walk:
10 |       - 1.3.6.1.4.1.3375.2.1.6.7            # sysSystemUptimeInSec 运行时间
11 |       - 1.3.6.1.2.1.1.5                     # sysName 主机名
12 |       - 1.3.6.1.2.1.1.1                     # sysDescr 系统信息
13 |       - 1.3.6.1.4.1.3375.2.1.4.2            # sysProductVersion 系统版本
14 |       - 1.3.6.1.4.1.3375.2.1.4.4            # sysProductEdition 系统补丁版本
15 |       - 1.3.6.1.4.1.3375.2.1.3.2.1.2.1.2    # sysChassisFanStatus 风扇状态
16 |       - 1.3.6.1.4.1.3375.2.1.3.2.1.2.1.3    # sysChassisFanSpeed 风扇速度
17 |       - 1.3.6.1.4.1.3375.2.1.3.2.2.2.1.2    # sysChassisPowerSupplyStatus 电源状态
18 |       - 1.3.6.1.4.1.3375.2.1.3.2.3.2.1.2    # sysChassisTempTemperature 设备温度传感器
19 |     
20 |     max_repetitions: 25
21 |     retries: 3
22 |     timeout: 5s
23 | 
24 |     lookups:
25 |       # 风扇下标 sysChassisFanIndex
26 |       # 电源下标 sysChassisPowerSupplyIndex
27 |       # 温度传感器下标 sysChassisTempIndex
28 |       - source_indexes: [sysChassisFanIndex]
29 |         lookup: sysChassisFanSpeed
30 |     
31 |     overrides:
32 |       sysChassisFanSpeed:
33 |         ignore: false
34 | 
35 |   # F5负载均衡接口信息
36 |   f5_interface:
37 |     walk:
38 |       - 1.3.6.1.4.1.3375.2.1.2.4.4.3.1.1    # sysInterfaceStatName 接口名称
39 |       - 1.3.6.1.4.1.3375.2.1.2.4.4.3.1.2    # sysInterfaceStatPktsIn 接口下行包总数
40 |       - 1.3.6.1.4.1.3375.2.1.2.4.4.3.1.3    # sysInterfaceStatBytesIn 接口下行包总字节
41 |       - 1.3.6.1.4.1.3375.2.1.2.4.4.3.1.4    # sysInterfaceStatPktsOut 接口上行包总数
42 |       - 1.3.6.1.4.1.3375.2.1.2.4.4.3.1.5    # sysInterfaceStatBytesOut 接口上行包总字节
43 |       - 1.3.6.1.4.1.3375.2.1.2.4.4.3.1.8    # sysInterfaceStatErrorsIn 接口下行错包
44 |       - 1.3.6.1.4.1.3375.2.1.2.4.4.3.1.9    # sysInterfaceStatErrorsOut 接口上行错包
45 |       - 1.3.6.1.4.1.3375.2.1.2.4.4.3.1.10   # sysInterfaceStatDropsIn 接口下行丢包
46 |       - 1.3.6.1.4.1.3375.2.1.2.4.4.3.1.11   # sysInterfaceStatDropsOut 接口上行丢包
47 |       - 1.3.6.1.4.1.3375.2.1.2.4.4.3.1.15   # sysInterfaceStatPauseActive 接口流控制帧状态
48 | 
49 |     max_repetitions: 25
50 |     retries: 3
51 |     timeout: 5s
52 |   
53 |     lookups:
54 |       # 接口下标 sysInterfaceStatName
55 |       - source_indexes: [sysInterfaceStatName]
56 |         lookup: sysInterfaceStatPauseActive
57 |     
58 |     overrides:
59 |       sysInterfaceStatPauseActive:
60 |         ignore: true
61 |     
62 |   # F5负载均衡应用信息
63 |   f5_app:
64 |     walk:
65 |       - 1.3.6.1.4.1.3375.2.2.5.1.2.1.23     # ltmPoolMemberCnt  指定池成员总数
66 |       - 1.3.6.1.4.1.3375.2.2.5.1.2.1.8      # ltmPoolActiveMemberCnt 指定池活跃成员总数
67 |       - 1.3.6.1.4.1.3375.2.2.5.2.3.1.31     # ltmPoolStatCurSessions 指定池的当前会话数量
68 |       - 1.3.6.1.4.1.3375.2.2.10.1.2.1.9     # ltmVirtualServEnabled 虚拟服务器状态
69 |       - 1.3.6.1.4.1.3375.2.2.10.2.3.1.12    # ltmVirtualServStatClientCurConns 客户端到当前虚拟服务器连接数
70 |       - 1.3.6.1.4.1.3375.2.2.10.2.3.1.33    # ltmVirtualServStatVsUsageRatio5m 虚拟服务器5分钟使用率
71 | 
72 |     max_repetitions: 25
73 |     retries: 3
74 |     timeout: 5s
75 |   
76 |     lookups:
77 |     
78 |     overrides:


--------------------------------------------------------------------------------
/generator/h3c/generator-demo.yml:
--------------------------------------------------------------------------------
  1 | auths:
  2 |   # 认证模块名
  3 |   h3c_auth:
  4 |     # SNMP版本使用v2c版本
  5 |     version: 2
  6 |     # SNMP协议v2c版本设置团体名为public
  7 |     community: public
  8 | 
  9 | modules:
 10 |   # 华三公共指标模块名称 核心层/汇聚层/接入层
 11 |   h3c_common:
 12 |     walk:
 13 |       # 交换机基础信息
 14 |       - 1.3.6.1.2.1.1.1                       # sysDescr - 设备描述
 15 |       - 1.3.6.1.2.1.1.5                       # sysName - 系统名称
 16 |       - 1.3.6.1.2.1.1.3                       # sysUpTime - 设备上电时间
 17 |       # 实体CPU和内存信息
 18 |       - 1.3.6.1.4.1.25506.2.6.1.1.1.1.6       # hh3cEntityExtCpuUsage - 实体 CPU 实时利用率
 19 |       - 1.3.6.1.4.1.25506.2.6.1.1.1.1.8       # hh3cEntityExtMemUsage - 实体内存实时利用率百分比
 20 |       # 实体风扇和电源状态信息
 21 |       - 1.3.6.1.2.1.47.1.1.1.1.5              # entPhysicalClass - 实体类型
 22 |       - 1.3.6.1.2.1.47.1.1.1.1.7              # entPhysicalName - 实体名称
 23 |       # prometheus 通过合并查询实现
 24 |       - 1.3.6.1.4.1.25506.2.6.1.1.1.1.19      # hh3cEntityExtErrorStatus - 实体错误状态
 25 |       # 实体传感器温度信息
 26 |       - 1.3.6.1.4.1.25506.2.6.1.1.1.1.12      # hh3cEntityExtTemperature - 实体温度
 27 |       # 存储介质信息
 28 |       - 1.3.6.1.4.1.25506.2.5.1.1.4.1.1.4     # hh3cFlhPartSpace - 存储设备分区容量 单位byte
 29 |       - 1.3.6.1.4.1.25506.2.5.1.1.4.1.1.5     # hh3cFlhPartSpaceFree - 存储介质分区大小
 30 |       - 1.3.6.1.4.1.25506.2.5.1.1.4.1.1.10    # hh3cFlhPartName - 存储设备分区名称
 31 | 
 32 |     max_repetitions: 20
 33 |     retries: 3
 34 |     timeout: 5s
 35 | 
 36 |     lookups:
 37 |       # hh3cEntityExtPhysicalIndex = entPhysicalIndex
 38 |       - source_indexes: [entPhysicalIndex]
 39 |         lookup: entPhysicalClass
 40 |       - source_indexes: [entPhysicalIndex]
 41 |         lookup: entPhysicalName
 42 | 
 43 |     overrides:
 44 |       entPhysicalClass:
 45 |         ignore: true
 46 |   
 47 |   # 华三交换机堆叠模块
 48 |   h3c_stack:  # 核心层/汇聚层
 49 |     walk:
 50 |       # 堆叠信息
 51 |       - 1.3.6.1.4.1.25506.2.91.1.7            # hh3cStackTopology - 堆叠系统的拓扑类型
 52 |       - 1.3.6.1.4.1.25506.2.91.1.2            # hh3cStackMemberNum - 本IRF系统目前包含的堆叠设备数量
 53 |       
 54 |     max_repetitions: 10
 55 |     retries: 3
 56 |     timeout: 5s
 57 | 
 58 |   # 华三交换机接口信息模块
 59 |   h3c_interface:  # 核心层/汇聚层/接入层
 60 |     walk:
 61 |       # 接口信息 - 索引 ifIndex
 62 |       - 1.3.6.1.2.1.2.2.1.2                   # ifDescr - 接口描述
 63 |       - 1.3.6.1.2.1.31.1.1.1.18               # ifAlias - 接口别名
 64 |       - 1.3.6.1.2.1.31.1.1.1.1                # ifName - 接口名字
 65 |       - 1.3.6.1.2.1.2.2.1.6                   # ifPhysAddress - 接口物理地址
 66 |       - 1.3.6.1.2.1.2.2.1.7                   # ifAdminStatus - 接口默认状态
 67 |       - 1.3.6.1.2.1.2.2.1.8                   # ifOperStatus - 接口运行状态
 68 |       - 1.3.6.1.2.1.2.2.1.13                  # ifInDiscards - 入方向丢包统计
 69 |       - 1.3.6.1.2.1.2.2.1.14                  # ifInErrors - 入方向错包统计
 70 |       - 1.3.6.1.2.1.2.2.1.19                  # ifOutDiscards - 出方向丢包统计
 71 |       - 1.3.6.1.2.1.2.2.1.20                  # ifOutErrors - 出方向错包统计
 72 |       - 1.3.6.1.2.1.31.1.1.1.6                # ifHCInOctets - 入方向报文统计
 73 |       - 1.3.6.1.2.1.31.1.1.1.10               # ifHCOutOctets - 出方向报文统计
 74 |       - 1.3.6.1.2.1.31.1.1.1.15               # ifHighSpeed - 接口当前带宽
 75 | 
 76 |       # 光模块信息 - 索引 ifIndex
 77 |       - 1.3.6.1.4.1.25506.2.70.1.1.1.9        # hh3cTransceiverCurTXPower 光模块当前的发送光功率 单位为百分之一dBM
 78 |       - 1.3.6.1.4.1.25506.2.70.1.1.1.12       # hh3cTransceiverCurRXPower 光模块当前的接收功率 单位为百分之一dBM
 79 |       - 1.3.6.1.4.1.25506.2.70.1.1.1.15       # hh3cTransceiverTemperature 光模块当前的温度 单位为摄氏度
 80 |       - 1.3.6.1.4.1.25506.2.70.1.1.1.20       # hh3cTransceiverTempHiWarn 温度预警上限值，单位为千分之一摄氏度
 81 |       - 1.3.6.1.4.1.25506.2.70.1.1.1.32       # hh3cTransceiverPwrOutHiWarn 输出功率预警上限值 单位为十分之一微瓦 为0时代表不支持
 82 |       - 1.3.6.1.4.1.25506.2.70.1.1.1.33       # hh3cTransceiverPwrOutLoWarn 输出功率预警下限值,单位为十分之一微瓦
 83 |       - 1.3.6.1.4.1.25506.2.70.1.1.1.36       # hh3cTransceiverRcvPwrHiWarn 输入功率预警上限值,单位为十分之一微瓦
 84 |       - 1.3.6.1.4.1.25506.2.70.1.1.1.37       # hh3cTransceiverRcvPwrLoWarn 输入功率预警下限值,单位为十分之一微瓦 
 85 | 
 86 |     max_repetitions: 50
 87 |     retries: 3
 88 |     timeout: 5s
 89 |     
 90 |     lookups:
 91 |       - source_indexes: [ifIndex]
 92 |         lookup: ifDescr
 93 |       - source_indexes: [ifIndex]
 94 |         lookup: ifAlias
 95 |       - source_indexes: [ifIndex]
 96 |         lookup: ifName
 97 |       - source_indexes: [ifIndex]
 98 |         lookup: ifPhysAddress
 99 |       - source_indexes: [ifIndex]
100 |         lookup: ifAdminStatus
101 |       - source_indexes: [ifIndex]
102 |         lookup: ifOperStatus
103 |       - source_indexes: [ifIndex]
104 |         lookup: ifHighSpeed
105 |     
106 |     overrides:
107 |       ifDescr:
108 |         ignore: true
109 |       ifAlias:
110 |         ignore: true
111 |       ifName:
112 |         ignore: true
113 |       ifPhysAddress:
114 |         ignore: true
115 |       ifAdminStatus:
116 |         ignore: true
117 |       ifOperStatus:
118 |         ignore: true
119 |       ifHighSpeed:
120 |         ignore: true


--------------------------------------------------------------------------------
/generator/h3c/switch/generator_h3c_sw.yml:
--------------------------------------------------------------------------------
  1 | auths:
  2 |   h3c_auth:  # 认证模块名称
  3 |     version: 2  # snmp v2c版本
  4 |     community: public  # snmp 团体名
  5 | 
  6 | modules:
  7 |   h3c_common:  # 华三公共指标模块名称
  8 |     walk:
  9 |       # 交换机基础信息
 10 |       - 1.3.6.1.2.1.1.1                       # sysDescr - 设备描述
 11 |       - 1.3.6.1.2.1.1.5                       # sysName - 系统名称
 12 |       - 1.3.6.1.2.1.1.3                       # sysUpTime - 设备上电时间
 13 |       # 实体CPU和内存信息
 14 |       - 1.3.6.1.4.1.25506.2.6.1.1.1.1.6       # hh3cEntityExtCpuUsage - 实体 CPU 实时利用率
 15 |       - 1.3.6.1.4.1.25506.2.6.1.1.1.1.8       # hh3cEntityExtMemUsage - 实体内存实时利用率百分比
 16 |       # 实体风扇和电源状态信息
 17 |       - 1.3.6.1.2.1.47.1.1.1.1.5              # entPhysicalClass - 实体类型
 18 |       - 1.3.6.1.2.1.47.1.1.1.1.7              # entPhysicalName - 实体名称
 19 |       # prometheus 通过合并查询实现
 20 |       - 1.3.6.1.4.1.25506.2.6.1.1.1.1.19      # hh3cEntityExtErrorStatus - 实体错误状态
 21 |       # 实体传感器温度信息
 22 |       - 1.3.6.1.4.1.25506.2.6.1.1.1.1.12      # hh3cEntityExtTemperature - 实体温度
 23 |       # 存储介质信息
 24 |       - 1.3.6.1.4.1.25506.2.5.1.1.4.1.1.4     # hh3cFlhPartSpace - 存储设备分区容量 单位byte
 25 |       - 1.3.6.1.4.1.25506.2.5.1.1.4.1.1.5     # hh3cFlhPartSpaceFree - 存储介质分区大小
 26 |       - 1.3.6.1.4.1.25506.2.5.1.1.4.1.1.10    # hh3cFlhPartName - 存储设备分区名称
 27 | 
 28 |     max_repetitions: 20
 29 |     retries: 3
 30 |     timeout: 5s
 31 | 
 32 |     lookups:
 33 |       # hh3cEntityExtPhysicalIndex = entPhysicalIndex
 34 |       - source_indexes: [entPhysicalIndex]
 35 |         lookup: entPhysicalClass
 36 |       - source_indexes: [entPhysicalIndex]
 37 |         lookup: entPhysicalName
 38 | 
 39 |     overrides:
 40 |       entPhysicalClass:
 41 |         ignore: true
 42 |   
 43 |   h3c_stack:  # 华三堆叠信息
 44 |     walk:
 45 |       # 堆叠信息
 46 |       - 1.3.6.1.4.1.25506.2.91.1.7            # hh3cStackTopology - 堆叠系统的拓扑类型
 47 |       - 1.3.6.1.4.1.25506.2.91.1.2            # hh3cStackMemberNum - 本IRF系统目前包含的堆叠设备数量
 48 |       
 49 |     max_repetitions: 20
 50 |     retries: 3
 51 |     timeout: 5s
 52 | 
 53 |   h3c_interface:
 54 |     walk:
 55 |       # 接口信息 - 索引 ifIndex
 56 |       - 1.3.6.1.2.1.2.2.1.2                   # ifDescr - 接口描述
 57 |       - 1.3.6.1.2.1.31.1.1.1.18               # ifAlias - 接口别名
 58 |       - 1.3.6.1.2.1.31.1.1.1.1                # ifName - 接口名字
 59 |       - 1.3.6.1.2.1.2.2.1.6                   # ifPhysAddress - 接口物理地址
 60 |       - 1.3.6.1.2.1.2.2.1.7                   # ifAdminStatus - 接口默认状态
 61 |       - 1.3.6.1.2.1.2.2.1.8                   # ifOperStatus - 接口运行状态
 62 |       - 1.3.6.1.2.1.2.2.1.13                  # ifInDiscards - 入方向丢包统计
 63 |       - 1.3.6.1.2.1.2.2.1.14                  # ifInErrors - 入方向错包统计
 64 |       - 1.3.6.1.2.1.2.2.1.19                  # ifOutDiscards - 出方向丢包统计
 65 |       - 1.3.6.1.2.1.2.2.1.20                  # ifOutErrors - 出方向错包统计
 66 |       - 1.3.6.1.2.1.31.1.1.1.6                # ifHCInOctets - 入方向报文统计
 67 |       - 1.3.6.1.2.1.31.1.1.1.10               # ifHCOutOctets - 出方向报文统计
 68 |       - 1.3.6.1.2.1.31.1.1.1.15               # ifHighSpeed - 接口当前带宽
 69 | 
 70 |       # 光模块信息 - 索引 ifIndex
 71 |       - hh3cTransceiverInfoTable
 72 | 
 73 |     max_repetitions: 60
 74 |     retries: 3
 75 |     timeout: 5s
 76 |     
 77 |     lookups:
 78 |       - source_indexes: [ifIndex]
 79 |         lookup: ifDescr
 80 |       - source_indexes: [ifIndex]
 81 |         lookup: ifAlias
 82 |       - source_indexes: [ifIndex]
 83 |         lookup: ifName
 84 |       - source_indexes: [ifIndex]
 85 |         lookup: ifPhysAddress
 86 |       - source_indexes: [ifIndex]
 87 |         lookup: ifAdminStatus
 88 |       - source_indexes: [ifIndex]
 89 |         lookup: ifOperStatus
 90 |       - source_indexes: [ifIndex]
 91 |         lookup: ifHighSpeed
 92 |     
 93 |     overrides:
 94 |       ifDescr:
 95 |         ignore: true
 96 |       ifAlias:
 97 |         ignore: true
 98 |       ifName:
 99 |         ignore: true
100 |       ifPhysAddress:
101 |         ignore: true
102 |       ifAdminStatus:
103 |         ignore: true
104 |       ifOperStatus:
105 |         ignore: true
106 |       ifHighSpeed:
107 |         ignore: true


--------------------------------------------------------------------------------
/generator/h3c/wireless/README.md:
--------------------------------------------------------------------------------
 1 | 本目录中generator.yml是适配了华三无线H3C WX3500X系列的无线控制器。
 2 | 已完成测试：H3C WX3510X 其他型号未做测试，理论上讲WX3500X系列通用。
 3 | 
 4 | 华三无线产品mib库下载链接：
 5 | 根据对应的系统版本下载对应的MIB，如：Comware V7
 6 | 
 7 | 下载路径：首页 > 产品与解决方案 > 智能联接 > 操作系统 > ComwareV7 > MIB > MIB
 8 | 
 9 | 链接：https://www.h3c.com/cn/d_201806/1089291_473262_0.htm
10 | 
11 | 
12 | 根据对应的系统版本下载对应的MIB，如：Comware V5
13 | 
14 | 下载路径：首页 > 产品与解决方案 > 智能联接 > 操作系统 > ComwareV5 > MIB > MIB
15 | 
16 | 链接：https://www.h3c.com/cn/d_200905/635750_473262_0.htm
17 | 
18 | 华三无线产品mib OID信息参考链接：
19 | 根据对应的版本做参考，如：Comware V7 或者找客服拿对应的mib OID对照表
20 | 链接：
21 | 
22 | mibs文件夹中，我已经提前下载好Comware V7的mib库文件，需要自行解压得到mib后缀的文件。
23 | 
24 | 推荐版本1(Comware V7)：Comware-V7-MIB.zip
25 | 推荐版本2(Comware V5)：Comware-V5-MIB.zip
26 | 
27 | 至于下载那个版本的，需要查看你AC中目前的对应什么版本号。
28 | 通过测试的H3C WX2560H、H3C WX3510X用的是Comware-V7-MIB.zip 理论上其他版本AC也通用。


--------------------------------------------------------------------------------
/generator/h3c/wireless/generator-demo.yml:
--------------------------------------------------------------------------------
 1 | auths:    # 认证模块
 2 |   public_v2:  # 认证模块名称 可自定义 在prometheus.yml中需要配置参数auth对应这个名称
 3 |     version: 2  # 定义SNMP Agent的版本为v2c 支持v3
 4 |     community: public   # SNMP Agent的团体名设置和AC中设置的团体名需一致
 5 | 
 6 | modules:    # 指标模块
 7 |   H3C_AC:   # 指标模块名称 可自定义
 8 |     walk:
 9 |       - 1.3.6.1.2.1.1.3                       # SysUpTime - 设备运行时间
10 |       - 1.3.6.1.2.1.2.2.1.1                   # ifIndex - 接口索引
11 |       - 1.3.6.1.2.1.2.2.1.2                   # ifDescr - 接口描述
12 |       - 1.3.6.1.2.1.2.2.1.5                   # ifSpeed - 接口带宽
13 |       - 1.3.6.1.2.1.2.2.1.8                   # ifOperStatus - 接口当前状态
14 |       - 1.3.6.1.2.1.2.2.1.13                  # ifInDiscards - 接口接收丢弃包
15 |       - 1.3.6.1.2.1.2.2.1.14                  # ifInErrors - 接口接收错误包数
16 |       - 1.3.6.1.2.1.2.2.1.19                  # ifOutDiscards - 接口发送丢弃包
17 |       - 1.3.6.1.2.1.2.2.1.20                  # ifOutErrors - 接受发生错误包
18 |       - 1.3.6.1.2.1.31.1.1.1.1                # ifName - AC接口名称
19 |       - 1.3.6.1.2.1.31.1.1.1.6                # ifHCInOctets - AC接口接收字节数
20 |       - 1.3.6.1.2.1.31.1.1.1.10               # ifHCOutOctets - AC接口发送字节数
21 |       - 1.3.6.1.2.1.31.1.1.1.15               # ifHighSpeed - 接口带宽
22 |       - 1.3.6.1.2.1.31.1.1.1.18               # ifAlias - 接口别名
23 |       # - 1.3.6.1.4.1.25506.2.75.1.1.2.3.1.1  # hh3cDot11ACIfIndex - AC接口索引
24 |       #- 1.3.6.1.4.1.25506.2.75.2.1.1.1.2      # hh3cDot11APIPAddress - AP的IP
25 |       #- 1.3.6.1.4.1.25506.2.75.2.1.1.1.3      # hh3cDot11APMacAddress -AP的Mac
26 |       #- 1.3.6.1.4.1.25506.2.75.2.1.1.1.7      # hh3cDot11APCpuUsage - AP的CPU实时利用率     
27 |       - 1.3.6.1.4.1.25506.2.75.2.1.1.1.1      # hh3cDot11APID - AP接口索引ID
28 |       - 1.3.6.1.4.1.25506.2.75.2.1.1.1.4      # hh3cDot11APOperationStatus - AP与AC的关联状态
29 |       - 1.3.6.1.4.1.25506.2.75.2.1.1.1.5      # hh3cDot11APTemplateNameOfAP - 设定的AP名称
30 |       - 1.3.6.1.4.1.25506.2.75.2.1.6.1.1      # hh3cDot11APIfIndex hh3cDot11APObjID - AP接口索引
31 |       - 1.3.6.1.4.1.25506.2.75.2.1.2.1.2      # hh3cDot11CurrAPIPAddress - AP的IP
32 |       - 1.3.6.1.4.1.25506.2.75.2.1.2.1.3      # hh3cDot11CurrAPMacAddress - AP的Mac
33 |       - 1.3.6.1.4.1.25506.2.75.2.1.2.1.9      # hh3cDot11CurrAPModelName - AP类型名称
34 |       - 1.3.6.1.4.1.25506.2.75.1.1.2.1        # hh3cDot11APConnectCount - 当前AC连接的AP总数量 - gauge
35 |       - 1.3.6.1.4.1.25506.2.75.1.1.2.2        # hh3cDot11StationConnectCount - 当前所有AP在线终端总数 - gauge
36 |       - 1.3.6.1.4.1.25506.2.75.2.1.2.1.7      # hh3cDot11CurrAPStationAssocCount - AP当前连接STA数量 - gauge
37 |       - 1.3.6.1.4.1.25506.2.75.2.1.10.1.1     # hh3cDot11APSysUpTime2 - AP启动时间 - gauge
38 |       - 1.3.6.1.4.1.25506.2.75.2.1.10.1.2     # hh3cDot11APCPURTUsage2 - AP实时CPU利用率(周期1分钟)
39 |       - 1.3.6.1.4.1.25506.2.75.2.1.10.1.4     # hh3cDot11APMemRTUsage2 - AP实时内存利用率(周期1分钟)
40 |       - 1.3.6.1.4.1.25506.2.6.1.1.1.1.6       # hh3cEntityExtCpuUsage - AC的CPU实时利用率 - gauge
41 |       - 1.3.6.1.4.1.25506.2.6.1.1.1.1.8       # hh3cEntityExtMemUsage - AC的内存实时利用率 - gauge      
42 | 
43 |     max_repetitions: 25
44 |     retries: 3
45 |     timeout: 5s
46 |   
47 |     lookups:
48 |       - source_indexes: [ifIndex]
49 |         lookup: ifAlias
50 |       - source_indexes: [ifIndex]
51 |         lookup: ifDescr
52 |       - source_indexes: [ifIndex]
53 |         lookup: ifOperStatus
54 |       - source_indexes: [hh3cDot11APObjID]
55 |         lookup: 1.3.6.1.4.1.25506.2.75.2.1.1.1.5
56 |         drop_source_indexes: true
57 |       - source_indexes: [hh3cDot11APObjID]
58 |         lookup: 1.3.6.1.4.1.25506.2.75.2.1.1.1.4
59 |         drop_source_indexes: true
60 |       - source_indexes: [hh3cDot11APObjID]
61 |         lookup: 1.3.6.1.4.1.25506.2.75.2.1.2.1.9
62 |         drop_source_indexes: true
63 |       - source_indexes: [hh3cDot11APObjID]
64 |         lookup: 1.3.6.1.4.1.25506.2.75.2.1.2.1.2
65 |         drop_source_indexes: true
66 |       - source_indexes: [hh3cDot11APObjID]
67 |         lookup: 1.3.6.1.4.1.25506.2.75.2.1.2.1.3
68 |         drop_source_indexes: true
69 | 
70 |     overrides:
71 |       ifAlias:
72 |         ignore: true
73 |       ifDescr:
74 |         ignore: true
75 |       ifOperStatus:
76 |         ignore: true
77 |       hh3cDot11APTemplateNameOfAP:
78 |         ignore: true
79 |         type: DisplayString
80 |       hh3cDot11APOperationStatus:
81 |         ignore: true
82 |       hh3cDot11CurrAPMacAddress:
83 |         ignore: true
84 |       hh3cDot11CurrAPModelName:
85 |         ignore: true
86 |         type: DisplayString
87 |       hh3cDot11CurrAPIPAddress:
88 |         ignore: true


--------------------------------------------------------------------------------
/generator/h3c/wireless/generator_h3c_wireless.yml:
--------------------------------------------------------------------------------
 1 | auths:
 2 |   h3c_ac:
 3 |     version: 2
 4 |     community: public
 5 | 
 6 | modules:
 7 |   h3c_wireless:
 8 |     walk:
 9 |       # AC基本信息
10 |       - 1.3.6.1.2.1.1.1                       # sysDescr - 设备描述
11 |       - 1.3.6.1.2.1.1.5                       # sysName - 系统名称
12 |       - 1.3.6.1.2.1.1.3                       # sysUpTime - 设备上电时间
13 |       # AC接口信息
14 |       - 1.3.6.1.2.1.2.2.1.1                   # ifIndex - 接口索引
15 |       - 1.3.6.1.2.1.2.2.1.2                   # ifDescr - 接口描述
16 |       - 1.3.6.1.2.1.31.1.1.1.18               # ifAlias - 接口别名
17 |       - 1.3.6.1.2.1.31.1.1.1.1                # ifName - 接口名称
18 |       - 1.3.6.1.2.1.31.1.1.1.15               # ifHighSpeed - 接口带宽
19 |       - 1.3.6.1.2.1.2.2.1.8                   # ifOperStatus - 接口当前状态
20 |       - 1.3.6.1.2.1.2.2.1.13                  # ifInDiscards - 入方向丢包统计
21 |       - 1.3.6.1.2.1.2.2.1.14                  # ifInErrors - 入方向错包统计
22 |       - 1.3.6.1.2.1.2.2.1.19                  # ifOutDiscards - 出方向丢包统计
23 |       - 1.3.6.1.2.1.2.2.1.20                  # ifOutErrors - 出方向错包统计
24 |       - 1.3.6.1.2.1.31.1.1.1.6                # ifHCInOctets - 入方向报文统计
25 |       - 1.3.6.1.2.1.31.1.1.1.10               # ifHCOutOctets - 出方向报文统计
26 |       # AP基础信息
27 |       - 1.3.6.1.4.1.25506.2.75.2.1.1.1.1      # hh3cDot11APID - AP接口索引ID
28 |       - 1.3.6.1.4.1.25506.2.75.2.1.1.1.4      # hh3cDot11APOperationStatus - AP与AC的关联状态
29 |       - 1.3.6.1.4.1.25506.2.75.2.1.1.1.5      # hh3cDot11APTemplateNameOfAP - 设定的AP名称
30 |       - 1.3.6.1.4.1.25506.2.75.2.1.6.1.1      # hh3cDot11APIfIndex hh3cDot11APObjID - AP接口索引
31 |       - 1.3.6.1.4.1.25506.2.75.2.1.2.1.2      # hh3cDot11CurrAPIPAddress - AP的IP
32 |       - 1.3.6.1.4.1.25506.2.75.2.1.2.1.3      # hh3cDot11CurrAPMacAddress - AP的Mac
33 |       - 1.3.6.1.4.1.25506.2.75.2.1.2.1.9      # hh3cDot11CurrAPModelName - AP类型名称
34 |       - 1.3.6.1.4.1.25506.2.75.1.1.2.1        # hh3cDot11APConnectCount - 当前AC连接的AP总数量
35 |       - 1.3.6.1.4.1.25506.2.75.1.1.2.2        # hh3cDot11StationConnectCount - 当前所有AP在线终端总数
36 |       - 1.3.6.1.4.1.25506.2.75.2.1.2.1.7      # hh3cDot11CurrAPStationAssocCount - AP当前连接STA数量
37 |       - 1.3.6.1.4.1.25506.2.75.2.1.10.1.1     # hh3cDot11APSysUpTime2 - AP启动时间
38 |       - 1.3.6.1.4.1.25506.2.75.2.1.10.1.2     # hh3cDot11APCPURTUsage2 - AP实时CPU利用率(周期1分钟)
39 |       - 1.3.6.1.4.1.25506.2.75.2.1.10.1.4     # hh3cDot11APMemRTUsage2 - AP实时内存利用率(周期1分钟)
40 |       - 1.3.6.1.4.1.25506.2.6.1.1.1.1.6       # hh3cEntityExtCpuUsage - AC的CPU实时利用率
41 |       - 1.3.6.1.4.1.25506.2.6.1.1.1.1.8       # hh3cEntityExtMemUsage - AC的内存实时利用率
42 | 
43 |     max_repetitions: 25
44 |     retries: 3
45 |     timeout: 5s
46 |   
47 |     lookups:
48 |       - source_indexes: [ifIndex]
49 |         lookup: ifAlias
50 |       - source_indexes: [ifIndex]
51 |         lookup: ifDescr
52 |       - source_indexes: [ifIndex]
53 |         lookup: ifOperStatus
54 |       - source_indexes: [hh3cDot11APObjID]
55 |         lookup: 1.3.6.1.4.1.25506.2.75.2.1.1.1.5
56 |         drop_source_indexes: true
57 |       - source_indexes: [hh3cDot11APObjID]
58 |         lookup: 1.3.6.1.4.1.25506.2.75.2.1.1.1.4
59 |         drop_source_indexes: true
60 |       - source_indexes: [hh3cDot11APObjID]
61 |         lookup: 1.3.6.1.4.1.25506.2.75.2.1.2.1.9
62 |         drop_source_indexes: true
63 |       - source_indexes: [hh3cDot11APObjID]
64 |         lookup: 1.3.6.1.4.1.25506.2.75.2.1.2.1.2
65 |         drop_source_indexes: true
66 |       - source_indexes: [hh3cDot11APObjID]
67 |         lookup: 1.3.6.1.4.1.25506.2.75.2.1.2.1.3
68 |         drop_source_indexes: true
69 | 
70 |     overrides:
71 |       ifAlias:
72 |         ignore: true
73 |       ifDescr:
74 |         ignore: true
75 |       ifOperStatus:
76 |         ignore: true
77 |       hh3cDot11APTemplateNameOfAP:
78 |         ignore: true
79 |         type: DisplayString
80 |       hh3cDot11APOperationStatus:
81 |         ignore: true
82 |       hh3cDot11CurrAPMacAddress:
83 |         ignore: true
84 |       hh3cDot11CurrAPModelName:
85 |         ignore: true
86 |         type: DisplayString
87 |       hh3cDot11CurrAPIPAddress:
88 |         ignore: true


--------------------------------------------------------------------------------
/generator/h3c/wireless/h3c-ac.yml:
--------------------------------------------------------------------------------
 1 | # Prometheus通过文件发现机制定义的采集目标
 2 | # /root/monitor/prometheus/targets/h3c-ac.yml
 3 | - labels:
 4 |     module: h3c_ac    # generator.yml中定义的指标模块名称，如果有多个可以写多个模块名
 5 |     auth: public_v2     # generator.yml中定义的认证模块名
 6 |     brand: H3C       # 可删除可自定义
 7 |     hostname: XX-XXXX-CORE  # 可删除可自定义
 8 |     model: H3C WX3510X        # 可删除可自定义
 9 |   targets:
10 |     - 172.17.14.1   # 核心  # 需要采集的无线控制器管理IP


--------------------------------------------------------------------------------
/generator/h3c/wireless/prometheus.yml:
--------------------------------------------------------------------------------
 1 | # 全局配置文件
 2 | global:
 3 |   # ......
 4 | # 告警插件配置
 5 | alerting:
 6 |   # ......
 7 | # 按照设定参数进行扫描加载，用于自定义报警规则，其报警媒介和route路由由alertmanager插件实现
 8 | rule_files:
 9 |   # ......
10 | 
11 | # 设定采集对象，这里既有静态设置也有设置服务发现
12 | scrape_configs:
13 |   # ......
14 |   
15 |   - job_name: "h3c_wireless"
16 |     scrape_interval: 15s
17 |     scrape_timeout: 10s
18 |     file_sd_configs:
19 |       - files:
20 |         - /root/monitor/prometheus/targets/h3c-ac.yml
21 |         refresh_interval: 2m
22 |     metrics_path: /snmp
23 |     relabel_configs:
24 |     - source_labels: ["__address__"]
25 |       target_label: __param_target
26 |     - source_labels: ["__param_target"]
27 |       target_label: instance
28 |     - target_label: __address__
29 |       replacement: 172.17.40.54:9116 # snmp_exporter 服务IP地址
30 |     - source_labels: ["module"] # 从自定义的目标标签获取指标模块名称
31 |       target_label: __param_module
32 |     - source_labels: ["auth"] # 从自定义的目标标签获取认证模块名称
33 |       target_label: __param_auth


--------------------------------------------------------------------------------
/generator/huawei/switch/README.md:
--------------------------------------------------------------------------------
 1 | # 华为交换机通过SNMP协议配置OID信息，采集对应OID指标信息生成器的配置文件
 2 | 
 3 | ## 模块说明
 4 | 
 5 | generator-demo.yml中的文件就是配置生成器配置文件，如果需要自定义采集指标，请自行查阅官方MIB库信息，根据对应的OID拿到自己需要的指标信息。
 6 | 
 7 | 华为官方MIB信息查询：https://info.support.huawei.com/info-finder/tool/zh/enterprise/mib
 8 | 
 9 | - huawei_common 这对华为通用交换机的常规指标采集
10 | - huawei_core 针对华为核心交换机的指标采集 基于CloudEngine S12700E-4
11 | 
12 | ## MIB库文件
13 | 
14 | - MIB_V200R022C00SPC500.zip 压缩包基于CloudEngine S12700E-4版本的MIB库文件，文件解压缩有一个文件说明该MIB库支持的交换机固件版本列表


--------------------------------------------------------------------------------
/generator/huawei/switch/network-switch.yml:
--------------------------------------------------------------------------------
 1 | # Prometheus通过文件发现机制定义的采集目标
 2 | - labels:
 3 |     module: huawei_common,huawei_core   # generator.yml中定义的指标模块名称，如果有多个可以写多个模块名
 4 |     auth: public_v2     # generator.yml中定义的认证模块名
 5 |     brand: Huawei       # 可删除可自定义
 6 |     hostname: XX-XXXX-CORE  # 可删除可自定义
 7 |     model: S12700E-4        # 可删除可自定义
 8 |   targets:
 9 |     - 172.17.14.1   # 核心  # 需要采集的交换机管理IP
10 | - labels:
11 |     module: HUAWEI
12 |     auth: public_v2
13 |     brand: Huawei
14 |     hostname: XXXX-XXX-XX-AG
15 |     model: S5720-36C-EI-AC
16 |   targets:
17 |     - 172.17.14.2   # 汇聚


--------------------------------------------------------------------------------
/generator/huawei/switch/prometheus.yml:
--------------------------------------------------------------------------------
 1 | # 全局配置文件
 2 | global:
 3 |   # ......
 4 | # 告警插件配置
 5 | alerting:
 6 |   # ......
 7 | # 按照设定参数进行扫描加载，用于自定义报警规则，其报警媒介和route路由由alertmanager插件实现
 8 | rule_files:
 9 |   # ......
10 | 
11 | # 设定采集对象，这里既有静态设置也有设置服务发现
12 | scrape_configs:
13 |   # ......
14 |   
15 |   # 采集华为交换机信息
16 |   - job_name: "snmp"
17 |     scrape_interval: 30s  # 针对SNMP采集节点 覆盖全局配置15s
18 |     scrape_timeout: 20s
19 |     file_sd_configs:
20 |       - files: 
21 |         - /root/monitor/prometheus/targets/network-*.yml
22 |         refresh_interval: 2m
23 |     metrics_path: /snmp
24 |     relabel_configs:
25 |     - source_labels: ["__address__"]
26 |       target_label: __param_target
27 |     - source_labels: ["__param_target"]
28 |       target_label: instance
29 |     # prometheus采集目标直接修改为snmp_exporter 服务IP地址
30 |     - target_label: __address__
31 |       replacement: 172.17.40.54:9116 # snmp_exporter 服务IP地址
32 |     - source_labels: ["module"] # 从自定义的目标标签获取指标模块名称
33 |       target_label: __param_module
34 |     - source_labels: ["auth"] # 从自定义的目标标签获取认证模块名称
35 |       target_label: __param_auth


--------------------------------------------------------------------------------
/generator/huawei/wireless/README.md:
--------------------------------------------------------------------------------
 1 | 本目录中generator.yml是适配了华为无线AC6000系列的无线控制器。
 2 | 已完成测试：AC6003、AC6005、AC6508、AC6605，其他型号未做测试，理论上讲AC6000系列通用。
 3 | 
 4 | 华为无线产品mib库下载链接：
 5 | 根据对应的版本下载对应的MIB，如：AC6605 V200R019C00SPC500
 6 | 
 7 | 下载路径：技术支持 > 无线局域网 > AC > AC6000 > 软件 > 选择版本过滤
 8 | 
 9 | 链接：https://support.huawei.com/enterprise/zh/software/250730566-ESW2000205621
10 | 
11 | 
12 | 华为无线产品mib OID信息参考链接：
13 | 根据对应的版本做参考，如：WLAN AC V200R019C10 MIB参考
14 | 链接：https://support.huawei.com/enterprise/zh/doc/EDOC1100156646
15 | 
16 | mibs文件夹中，我已经提前下载好对应两个比较推荐的版本的mib库文件，需要自行解压得到mib后缀的文件。
17 | 
18 | 推荐版本1(V200R019C00SPC500)：MIB_WLAN_V200R019C00SPC500.zip
19 | 推荐版本2(V200R022C00SPC100)：MIB_WLAN_V200R022C00SPC100.zip
20 | 
21 | 至于下载那个版本的，需要查看你AC中目前的对应什么版本号。
22 | 由于我测试的AC6005、AC6508、AC6605都升级到V200R019C00SPC500版本，
23 | 故都是使用MIB_WLAN_V200R019C00SPC500.zip中的库文件。


--------------------------------------------------------------------------------
/generator/huawei/wireless/generator-demo.yml:
--------------------------------------------------------------------------------
 1 | auths:
 2 |   public_v2:  # 认证模块名称
 3 |     version: 2  # snmp v2c版本
 4 |     community: public  # snmp 团体名
 5 | 
 6 | modules:
 7 |   huawei_ac:
 8 |     walk:
 9 |       - 1.3.6.1.4.1.2011.5.25.31.1.1.1.1.5    # AC的CPU利用率
10 |       - 1.3.6.1.4.1.2011.5.25.31.1.1.1.1.7    # AC的内存利用率
11 |       - 1.3.6.1.4.1.2011.5.25.31.1.1.1.1.10   # AC的启动时间
12 |       - 1.3.6.1.2.1.1.5                       # AC的设备名称
13 |       - 1.3.6.1.4.1.2011.5.25.31.1.1.1.1.11   # AC的温度
14 |       - 1.3.6.1.2.1.2.2.1.1                   # ifIndex 接口索引
15 |       - 1.3.6.1.2.1.2.2.1.2                   # IfDescr 描述接口的字符串
16 |       #- 1.3.6.1.2.1.2.2.1.3                   # ifType 接口类型
17 |       - 1.3.6.1.2.1.2.2.1.5                   # ifSpeed 估计的接口当前带宽 单位是bit/s
18 |       - 1.3.6.1.2.1.31.1.1.1.15               # ifHighSpeed 接口当前带宽的估计值 单位为1,000,000 bit/s
19 |       - 1.3.6.1.2.1.31.1.1.1.18               # ifAlias 网络管理员指定的接口别名
20 |       - 1.3.6.1.2.1.2.2.1.8                   # ifOperStatus 接口当前的状态
21 |       - 1.3.6.1.2.1.2.2.1.13                  # ifInDiscards 入方向的被丢弃的报文个数
22 |       - 1.3.6.1.2.1.2.2.1.14                  # ifInErrors 出错而不会被送往上层协议的报文/传输单元个数
23 |       - 1.3.6.1.2.1.2.2.1.19                  # ifOutDiscards 出方向的被丢弃的报文个数
24 |       - 1.3.6.1.2.1.2.2.1.20                  # ifOutErrors 出错而不会被传送的报文/传输单元个数
25 |       #- 1.3.6.1.2.1.31.1.1.1.1                # ifName 由本地设备分配的接口名
26 |       - 1.3.6.1.2.1.31.1.1.1.6                # ifHCInOctets 接口上接收到的字节总数 
27 |       - 1.3.6.1.2.1.31.1.1.1.10               # ifHCOutOctets 接口发送的字节总数
28 |       - hwWlanIDIndexedApId                   # AP的ID
29 |       - 1.3.6.1.4.1.2011.6.139.13.3.10.1.5    # AP的名称
30 |       - 1.3.6.1.4.1.2011.6.139.13.3.10.1.7    # AP的状态
31 |       - 1.3.6.1.4.1.2011.6.139.13.3.10.1.2    # AP的Mac
32 |       - 1.3.6.1.4.1.2011.6.139.13.3.10.1.4    # AP的型号
33 |       - 1.3.6.1.4.1.2011.6.139.13.3.10.1.14   # AP的IP
34 |       - 1.3.6.1.4.1.2011.6.139.13.3.10.1.19   # AP的运行时间
35 |       - 1.3.6.1.4.1.2011.6.139.13.3.10.1.22   # AP的上线时长
36 |       - 1.3.6.1.4.1.2011.6.139.13.3.10.1.41   # AP的内存使用率
37 |       - 1.3.6.1.4.1.2011.6.139.13.3.10.1.42   # AP的CPU使用率
38 |       - 1.3.6.1.4.1.2011.6.139.13.3.10.1.44   # AP的温度
39 |       - 1.3.6.1.4.1.2011.6.139.13.3.10.1.45   # AP的在线用户数
40 |       - 1.3.6.1.4.1.2011.6.139.13.3.10.1.48   # AP的用户上线失败率
41 |       - 1.3.6.1.4.1.2011.6.139.13.3.10.1.49   # AP的用户掉线率
42 |       - 1.3.6.1.4.1.2011.6.139.13.3.10.1.50   # AP的粘性用户比率
43 |       - 1.3.6.1.4.1.2011.6.139.13.3.10.1.57   # AP的以太接口上行速率
44 |       - 1.3.6.1.4.1.2011.6.139.13.3.10.1.58   # AP的以太接口下行速率
45 |       - 1.3.6.1.4.1.2011.6.139.13.3.10.1.65   # AP上行端口接收总的字节数
46 |       - 1.3.6.1.4.1.2011.6.139.13.3.10.1.66   # AP上行端口发送总的字节数
47 |       - 1.3.6.1.4.1.2011.6.139.13.3.10.1.65   # AP上行端口接收总的字节数
48 |       - 1.3.6.1.4.1.2011.6.139.13.3.10.1.66   # AP上行端口发送总的字节数
49 | 
50 |     max_repetitions: 50
51 |     retries: 3
52 |     timeout: 5s
53 |   
54 |     lookups:
55 |       - source_indexes: [ifIndex]
56 |         lookup: ifAlias
57 |       - source_indexes: [ifIndex]
58 |         lookup: ifDescr
59 |       - source_indexes: [ifIndex]
60 |         lookup: ifOperStatus
61 |       - source_indexes: [hwWlanIDIndexedApId]
62 |         lookup: 1.3.6.1.4.1.2011.6.139.13.3.10.1.5
63 |       - source_indexes: [hwWlanIDIndexedApId]
64 |         lookup: 1.3.6.1.4.1.2011.6.139.13.3.10.1.7
65 |       - source_indexes: [hwWlanIDIndexedApId]
66 |         lookup: 1.3.6.1.4.1.2011.6.139.13.3.10.1.2
67 |       - source_indexes: [hwWlanIDIndexedApId]
68 |         lookup: 1.3.6.1.4.1.2011.6.139.13.3.10.1.4
69 |       - source_indexes: [hwWlanIDIndexedApId]
70 |         lookup: 1.3.6.1.4.1.2011.6.139.13.3.10.1.14
71 | 
72 |     overrides:
73 |       ifAlias:
74 |         ignore: true
75 |       ifDescr:
76 |         ignore: true
77 |       ifOperStatus:
78 |         ignore: true
79 |       hwWlanIDIndexedApName:
80 |         ignore: true
81 |         type: DisplayString
82 |       hwWlanIDIndexedApRunState:
83 |         ignore: true
84 |       hwWlanIDIndexedApMac:
85 |         ignore: true
86 |       hwWlanIDIndexedApTypeInfo:
87 |         ignore: true
88 |         type: DisplayString
89 |       hwWlanIDIndexedApIpAddress:
90 |         ignore: true


--------------------------------------------------------------------------------
/generator/huawei/wireless/huawei-ac.yml:
--------------------------------------------------------------------------------
 1 | # Prometheus通过文件发现机制定义的采集目标
 2 | # /root/monitor/prometheus/targets/h3c-ac.yml
 3 | - labels:
 4 |     module: huawei_ac    # generator.yml中定义的指标模块名称，如果有多个可以写多个模块名
 5 |     auth: public_v2     # generator.yml中定义的认证模块名
 6 |     brand: H3C       # 可删除可自定义
 7 |     hostname: XX-XXXX-CORE  # 可删除可自定义
 8 |     model: AC6005        # 可删除可自定义
 9 |   targets:
10 |     - 172.17.14.1   # 需要采集的无线控制器管理IP


--------------------------------------------------------------------------------
/generator/huawei/wireless/prometheus.yml:
--------------------------------------------------------------------------------
 1 | # 全局配置文件
 2 | global:
 3 |   # ......
 4 | # 告警插件配置
 5 | alerting:
 6 |   # ......
 7 | # 按照设定参数进行扫描加载，用于自定义报警规则，其报警媒介和route路由由alertmanager插件实现
 8 | rule_files:
 9 |   # ......
10 | 
11 | # 设定采集对象，这里既有静态设置也有设置服务发现
12 | scrape_configs:
13 |   # ......
14 |   
15 |   # 采集华为AC信息
16 |   - job_name: "huawei_wireless"
17 |     scrape_interval: 15s
18 |     scrape_timeout: 10s
19 |     file_sd_configs:
20 |       - files:
21 |         - /root/monitor/prometheus/targets/huawei-*.yml
22 |         refresh_interval: 2m
23 |     metrics_path: /snmp
24 |     relabel_configs:
25 |     - source_labels: ["__address__"]
26 |       target_label: __param_target
27 |     - source_labels: ["__param_target"]
28 |       target_label: instance
29 |     - target_label: __address__
30 |       replacement: 172.17.40.54:9116 # snmp_exporter 服务IP地址
31 |     - source_labels: ["module"] # 从自定义的目标标签获取指标模块名称
32 |       target_label: __param_module
33 |     - source_labels: ["auth"] # 从自定义的目标标签获取认证模块名称
34 |       target_label: __param_auth


--------------------------------------------------------------------------------
/generator/ruijie/wireless/README.md:
--------------------------------------------------------------------------------
 1 | 本目录中generator.yml是适配了锐捷无线RG-WS6XXX系列的无线控制器。
 2 | 已完成测试：RG-WS6008、RG-WS6108、RG-WS6512其他型号未做测试，理论上讲RGOS 11.X系列通用。
 3 | 
 4 | 锐捷无线产品mib库下载链接：
 5 | 根据对应的版本下载对应的MIB
 6 | 
 7 | 下载路径：智能客服 > 转人工 > 提交需要获取的MIB文件的设备型号与版本号 > 等待人工回复
 8 | 
 9 | 说明：
10 | 
11 | 锐捷无线AC AP MIB OID节点获取？
12 | MIB库文件需要申请并签署保密协议，详细咨询4008111000。


--------------------------------------------------------------------------------
/generator/ruijie/wireless/generator-demo.yml:
--------------------------------------------------------------------------------
 1 | auths:  # 认证模块
 2 |   public_v2:  # 认证模块名称 可自定义 在prometheus.yml中需要配置参数auth对应这个名称
 3 |     version: 2  # 定义SNMP Agent的版本为v2c 支持v3
 4 |     community: public   # SNMP Agent的团体名设置和AC中设置的团体名需一致
 5 | 
 6 | modules:  # 指标模块
 7 |   ruijie_ac:  # 指标模块名称 可自定义
 8 |     walk:
 9 |       - 1.3.6.1.4.1.4881.1.1.10.2.56.1.1.8      # AC的设备名称 - ruijieAcAcName
10 |       #- 1.3.6.1.4.1.4881.1.1.10.2.56.1.1.21     # AC的温度 - ruijieAcTemp
11 |       - ruijieCPUUtilization5Sec                # AC的CPU利用率 - ruijieCPUUtilization5Sec
12 |       - ruijieCPUUtilization1Min                # AC的CPU利用率 - ruijieCPUUtilization1Min
13 |       - ruijieCPUUtilization5Min                # AC的CPU利用率 - ruijieCPUUtilization5Min
14 |       - 1.3.6.1.4.1.4881.1.1.10.2.35.1.1.1.3    # AC的内存利用率 - ruijieMemoryPoolCurrentUtilization
15 |       - 1.3.6.1.4.1.4881.1.1.10.2.1.1.27        # AC的运行时长 - ruijieSystemUptime
16 |       - 1.3.6.1.2.1.2.2.1.1                     # ifIndex - 接口索引
17 |       - 1.3.6.1.2.1.2.2.1.2                     # IfDescr - 描述接口的字符串
18 |       - 1.3.6.1.2.1.2.2.1.5                     # ifSpeed - 估计的接口当前带宽，单位是bit/s
19 |       - 1.3.6.1.2.1.31.1.1.1.15                 # ifHighSpeed - 接口当前带宽的估计值 单位为1,000,000 bit/s
20 |       - 1.3.6.1.2.1.31.1.1.1.18                 # ifAlias - 网络管理员指定的接口别名
21 |       - 1.3.6.1.2.1.2.2.1.8                     # ifOperStatus - 接口当前的状态
22 |       - 1.3.6.1.2.1.2.2.1.13                    # ifInDiscards - 入方向的被丢弃的报文个数
23 |       - 1.3.6.1.2.1.2.2.1.14                    # ifInErrors - 出错而不会被送往上层协议的报文/传输单元个数
24 |       - 1.3.6.1.2.1.2.2.1.19                    # ifOutDiscards - 出方向的被丢弃的报文个数
25 |       - 1.3.6.1.2.1.2.2.1.20                    # ifOutErrors - 出错而不会被传送的报文/传输单元个数
26 |       - 1.3.6.1.2.1.31.1.1.1.6                  # ifHCInOctets - 接口上接收到的字节总数 包括成帧的字符 该节点有64bit 是ifInOctets的扩充
27 |       - 1.3.6.1.2.1.31.1.1.1.10                 # ifHCOutOctets - 接口上发送到的字节总数 包括成帧字符 该节点有64bit 是ifOutOctets的扩充
28 |       - 1.3.6.1.4.1.4881.1.1.10.2.56.2.1.1.1.2  # AP的名称 - ruijieApApName
29 |       - 1.3.6.1.4.1.4881.1.1.10.2.56.2.1.1.1.48 # AP的状态 - ruijieApState
30 |       #- 1.3.6.1.4.1.4881.1.1.10.2.56.2.1.1.1.1  # AP的Mac - ruijieApMacAddr
31 |       - 1.3.6.1.4.1.4881.1.1.10.2.56.2.1.1.1.39 # AP的型号 - ruijieApPID
32 |       - 1.3.6.1.4.1.4881.1.1.10.2.56.2.1.1.1.33 # AP的IP - ruijieApIp
33 |       - 1.3.6.1.4.1.4881.1.1.10.2.56.2.1.1.1.52 # AP的上线时长 - ruijieApUptimeMs
34 |       - 1.3.6.1.4.1.4881.1.1.10.2.56.2.1.1.1.51 # AP的启动时间 - ruijieApLinkOnTimeIntervalMs
35 |       - 1.3.6.1.4.1.4881.1.1.10.2.1.1.49.1.6    # AP的内存使用率 - ruijieSystemApMemoryPoolCurrentUtilization
36 |       - 1.3.6.1.4.1.4881.1.1.10.2.1.1.49.1.4    # AP的CPU使用率(5s) - ruijieSystemApCPUUtilizationCurrent
37 |       - 1.3.6.1.4.1.4881.1.1.10.2.1.1.49.1.5    # AP的CPU使用率(5m) - ruijieSystemApCPUUtilizationAverage
38 |       #- 1.3.6.1.4.1.4881.1.1.10.2.1.1.49.1.9    # AP的温度 - ruijieSystemAPDeviceTemperature
39 |       - 1.3.6.1.4.1.4881.1.1.10.2.56.2.1.1.1.34 # AP的在线用户数 - ruijieApStaNum
40 |       - 1.3.6.1.4.1.4881.1.1.10.2.56.1.1.15     # 当前连接到本AC的无线用户数 - ruijieAcStaNum
41 |       - 1.3.6.1.4.1.4881.1.1.10.2.56.1.1.11     # 当前连接到本AC的AP数 - ruijieAcApNum
42 | 
43 |     max_repetitions: 25
44 |     retries: 3
45 |     timeout: 5s
46 |   
47 |     lookups:
48 |       - source_indexes: [ifIndex]
49 |         lookup: ifAlias
50 |       - source_indexes: [ifIndex]
51 |         lookup: ifDescr
52 |       - source_indexes: [ifIndex]
53 |         lookup: ifOperStatus
54 |       - source_indexes: [ruijieApMacAddr]
55 |         lookup: 1.3.6.1.4.1.4881.1.1.10.2.56.2.1.1.1.2
56 |       - source_indexes: [ruijieApMacAddr]
57 |         lookup: 1.3.6.1.4.1.4881.1.1.10.2.56.2.1.1.1.48
58 |       - source_indexes: [ruijieApMacAddr]
59 |         lookup: 1.3.6.1.4.1.4881.1.1.10.2.56.2.1.1.1.39
60 |       - source_indexes: [ruijieApMacAddr]
61 |         lookup: 1.3.6.1.4.1.4881.1.1.10.2.56.2.1.1.1.33
62 |       - source_indexes: [ruijieSystemApStatMacAddr]
63 |         lookup: 1.3.6.1.4.1.4881.1.1.10.2.56.2.1.1.1.2
64 |       - source_indexes: [ruijieSystemApStatMacAddr]
65 |         lookup: 1.3.6.1.4.1.4881.1.1.10.2.56.2.1.1.1.48
66 |       - source_indexes: [ruijieSystemApStatMacAddr]
67 |         lookup: 1.3.6.1.4.1.4881.1.1.10.2.56.2.1.1.1.39
68 |       - source_indexes: [ruijieSystemApStatMacAddr]
69 |         lookup: 1.3.6.1.4.1.4881.1.1.10.2.56.2.1.1.1.33
70 | 
71 |     overrides:
72 |       ifAlias:
73 |         ignore: true
74 |       ifDescr:
75 |         ignore: true
76 |       ifOperStatus:
77 |         ignore: true
78 |       ruijieApApName:
79 |         ignore: true
80 |         type: DisplayString
81 |       ruijieApState:
82 |         ignore: true
83 |       ruijieApPID:
84 |         ignore: true
85 |         type: DisplayString
86 |       ruijieApIp:
87 |         ignore: true


--------------------------------------------------------------------------------
/generator/ruijie/wireless/prometheus.yml:
--------------------------------------------------------------------------------
 1 | # 全局配置文件
 2 | global:
 3 |   # ......
 4 | # 告警插件配置
 5 | alerting:
 6 |   # ......
 7 | # 按照设定参数进行扫描加载，用于自定义报警规则，其报警媒介和route路由由alertmanager插件实现
 8 | rule_files:
 9 |   # ......
10 | 
11 | # 设定采集对象，这里既有静态设置也有设置服务发现
12 | scrape_configs:
13 |   # ......
14 |   
15 |   # 采集华为AC信息
16 |   - job_name: "ruijie_wireless"
17 |     scrape_interval: 15s
18 |     scrape_timeout: 10s
19 |     file_sd_configs:
20 |       - files:
21 |         - /root/monitor/prometheus/targets/ruijie-*.yml
22 |         refresh_interval: 2m
23 |     metrics_path: /snmp
24 |     relabel_configs:
25 |     - source_labels: ["__address__"]
26 |       target_label: __param_target
27 |     - source_labels: ["__param_target"]
28 |       target_label: instance
29 |     - target_label: __address__
30 |       replacement: 172.17.40.54:9116 # snmp_exporter 服务IP地址
31 |     - source_labels: ["module"] # 从自定义的目标标签获取指标模块名称
32 |       target_label: __param_module
33 |     - source_labels: ["auth"] # 从自定义的目标标签获取认证模块名称
34 |       target_label: __param_auth


--------------------------------------------------------------------------------
/generator/ruijie/wireless/ruijie-ac.yml:
--------------------------------------------------------------------------------
 1 | # Prometheus通过文件发现机制定义的采集目标
 2 | # /root/monitor/prometheus/targets/h3c-ac.yml
 3 | - labels:
 4 |     module: ruijie_ac    # generator.yml中定义的指标模块名称，如果有多个可以写多个模块名
 5 |     auth: public_v2     # generator.yml中定义的认证模块名
 6 |     brand: Ruijie       # 可删除可自定义
 7 |     hostname: XX-XXXX-XXX  # 可删除可自定义
 8 |     model: RG-WS6008        # 可删除可自定义
 9 |   targets:
10 |     - 172.17.14.1   # 需要采集的无线控制器管理IP


--------------------------------------------------------------------------------
/generator/sangfor/ac/README.md:
--------------------------------------------------------------------------------
1 | 深信服AC采集配置
2 | 
3 | 配置无法生成，深信服AC还是通过API获取数据并监控比较合适


--------------------------------------------------------------------------------
/generator/sangfor/ac/generator.yml:
--------------------------------------------------------------------------------
 1 | auths:  # 认证模块
 2 |   public_v2:  # 认证模块名称 可自定义 在prometheus.yml中需要配置参数auth对应这个名称
 3 |     version: 2  # 定义SNMP Agent的版本为v2c 支持v3
 4 |     community: public   # SNMP Agent的团体名设置和AC中设置的团体名需一致
 5 | 
 6 | modules:  # 指标模块
 7 |   # 深信服AD设备信息抓取
 8 |   sangfor_ac: # 指标模块名称 可自定义
 9 |     walk:
10 |       - 1.3.6.1.4.1.35047.1.1         # 深信服AC设备名称和model
11 |       - 1.3.6.1.4.1.35047.1.2         # 深信服AC 系统时间
12 |       - 1.3.6.1.4.1.35047.1.3         # 深信服AC CPU使用率
13 |       - 1.3.6.1.4.1.35047.1.10        # 深信服AC 剩余内存
14 |       - 1.3.6.1.4.1.35047.1.11        # 深信服AC 总内存
15 |       - 1.3.6.1.4.1.35047.1.5.1.4     # 深信服AC 磁盘已使用空间
16 |       - 1.3.6.1.4.1.35047.1.5.1.5     # 深信服AC 磁盘可使用空间
17 |       - 1.3.6.1.4.1.35047.1.5.1.6     # 深信服AC 磁盘占用率
18 |       - 1.3.6.1.4.1.35047.1.7         # 深信服AC 双机状态
19 |       - 1.3.6.1.4.1.35047.2.1.1.1     # 深信服AC numOfCurOnLine 实时在线用户数
20 |       - 1.3.6.1.4.1.35047.2.1.1.2     # 深信服AC numOfMaxOnLine 最大在线用户数
21 |       - 1.3.6.1.4.1.35047.2.1.1.3     # 深信服AC numOfCurOU 当前用户组数量
22 |       - 1.3.6.1.4.1.35047.2.1.1.4     # 深信服AC numOfMaxOU 最大用户组数量
23 |       - 1.3.6.1.4.1.35047.2.1.1.5     # 深信服AC numOfMaxSession 最大会话数
24 |       - 1.3.6.1.4.1.35047.2.1.1.6     # 深信服AC numOfSession 实时会话数
25 |       - version                       # 深信服AC version 版本
26 |       - 1.3.6.1.4.1.35047.2.1.1.8     # 深信服AC 网关模式
27 |       - 1.3.6.1.4.1.35047.2.1.2.1.2   # 深信服AC 接口名称
28 |       - 1.3.6.1.4.1.35047.2.1.2.1.3   # 深信服AC 接口区域
29 |       - 1.3.6.1.4.1.35047.2.1.2.1.4   # 深信服AC 接口状态
30 |       - 1.3.6.1.4.1.35047.2.1.2.1.5   # 深信服AC 每秒发送数据包数量
31 |       - 1.3.6.1.4.1.35047.2.1.2.1.6   # 深信服AC 每秒接收数据包数量
32 |       - 1.3.6.1.4.1.35047.2.1.2.1.7   # 深信服AC 每秒发送字节
33 |       - 1.3.6.1.4.1.35047.2.1.2.1.8   # 深信服AC 每秒接收字节
34 |       - 1.3.6.1.4.1.35047.2.1.4.1.3   # 深信服AC 网管序列号状态
35 |       - 1.3.6.1.4.1.35047.2.1.4.1.4   # 深信服AC 网关杀毒序列号状态
36 |       - 1.3.6.1.4.1.35047.2.1.4.1.5   # 深信服AC 多功能序列号状态
37 |       - 1.3.6.1.4.1.35047.2.1.4.1.6   # 深信服AC 跨运营商序列号状态
38 |       - 1.3.6.1.4.1.35047.2.1.4.1.7   # 深信服AC 软件升级序列号状态
39 |       - 1.3.6.1.4.1.35047.2.1.4.1.8   # 深信服AC 安全桌面序列号状态
40 |       - 1.3.6.1.4.1.35047.2.1.4.1.9   # 深信服AC URL/应用规则库升级序列号状态
41 |       - 1.3.6.1.4.1.35047.2.1.5.1.2   # 深信服AC 日志信息拦截数/记录数
42 |       - 1.3.6.1.4.1.35047.2.1.5.1.3   # 深信服AC 日志信息网页访问
43 |       - 1.3.6.1.4.1.35047.2.1.5.1.4   # 深信服AC 日志信息邮件收发
44 |       - 1.3.6.1.4.1.35047.2.1.5.1.5   # 深信服AC 日志信息外发文件
45 |       - 1.3.6.1.4.1.35047.2.1.5.1.6   # 深信服AC 日志信息论坛发帖
46 |       - 1.3.6.1.4.1.35047.2.1.5.1.7   # 深信服AC 日志信息聊天内容
47 |     
48 |     max_repetitions: 50
49 |     retries: 3
50 |     timeout: 5s
51 | 
52 |     lookups:
53 |       - source_indexes: [1.3.6.1.4.1.35047.2.1.2.1.1]
54 |         lookup: 1.3.6.1.4.1.35047.2.1.2.1.2
55 |       - source_indexes: [1.3.6.1.4.1.35047.2.1.2.1.1]
56 |         lookup: 1.3.6.1.4.1.35047.2.1.2.1.3
57 | 
58 |     overrides:
59 |       1.3.6.1.4.1.35047.2.1.2.1.2:
60 |         type: DisplayString
61 |         ignore: true
62 |       1.3.6.1.4.1.35047.2.1.2.1.3:
63 |         type: DisplayString
64 |         ignore: true


--------------------------------------------------------------------------------
/generator/sangfor/ac/sangfor-ac.txt:
--------------------------------------------------------------------------------
 1 | .1.3.6.1.4.1.35047.1.3    Cpu使用率
 2 | .1.3.6.1.4.1.35047.1.4    Free memory. 剩余内存
 3 | .1.3.6.1.4.1.35047.1.9    sfSysTotalMemory
 4 | .1.3.6.1.4.1.35047.1.5.1.4   sfDiskUsed 已使用空间
 5 | .1.3.6.1.4.1.35047.1.5.1.5   sfDiskAvail 可使用空间
 6 | .1.3.6.1.4.1.35047.1.5.1.6   sfDiskUsedPercent 磁盘占用率
 7 | 
 8 | .1.3.6.1.4.1.35047.2.1.1.1    实时在线用户数
 9 | .1.3.6.1.4.1.35047.2.1.1.2    最大在线用户数
10 | .1.3.6.1.4.1.35047.2.1.1.3    当前用户组数量
11 | .1.3.6.1.4.1.35047.2.1.1.4    最大用户组数量
12 | .1.3.6.1.4.1.35047.2.1.1.5    最大会话数
13 | .1.3.6.1.4.1.35047.2.1.1.6    实时会话数
14 | 
15 | .1.3.6.1.4.1.35047.2.1.2.1.2   interface name 接口名称
16 | .1.3.6.1.4.1.35047.2.1.2.1.3   link area 接口区域
17 | .1.3.6.1.4.1.35047.2.1.2.1.4   link status 接口状态
18 | .1.3.6.1.4.1.35047.2.1.2.1.5   Number of send packets per second 每秒发送数据包数量
19 | .1.3.6.1.4.1.35047.2.1.2.1.6   Number of receive packets per second 每秒接收数据包数量
20 | .1.3.6.1.4.1.35047.2.1.2.1.7   Number of send Bytes per second 每秒发送字节
21 | .1.3.6.1.4.1.35047.2.1.2.1.8   Number of receive Bytes per second 每秒接收字节
22 | 
23 | 
24 | .1.3.6.1.4.1.35047.2.1.5.1.2    block/record  拦截数/记录数
25 | .1.3.6.1.4.1.35047.2.1.5.1.3    http get 网页访问
26 | .1.3.6.1.4.1.35047.2.1.5.1.4    send mail or receive mail 邮件收发
27 | .1.3.6.1.4.1.35047.2.1.5.1.5    send file or receive file 外发文件
28 | .1.3.6.1.4.1.35047.2.1.5.1.6    web BBS 论坛发帖
29 | .1.3.6.1.4.1.35047.2.1.5.1.7    IM chat 聊天内容


--------------------------------------------------------------------------------
/generator/sangfor/ad/README.md:
--------------------------------------------------------------------------------
1 | 本目录中generator.yml是适配了深信服AD设备。
2 | 已完成测试：AD7.0.8R4版本测试，其他版本未做测试，理论上讲7.0.x系列通用。
3 | 
4 | 深信服mib库下载链接：
5 | 
6 | 下载路径：AD > 系统管理 > SNMP > 下载MIB库


--------------------------------------------------------------------------------
/generator/sangfor/ad/generator.yml:
--------------------------------------------------------------------------------
 1 | auths:  # 认证模块
 2 |   public_v2:  # 认证模块名称 可自定义 在prometheus.yml中需要配置参数auth对应这个名称
 3 |     version: 2  # 定义SNMP Agent的版本为v2c 支持v3
 4 |     community: public   # SNMP Agent的团体名设置和AC中设置的团体名需一致
 5 | 
 6 | modules:  # 指标模块
 7 |   # 深信服AD设备信息抓取
 8 |   sangfor_ad: # 指标模块名称 可自定义
 9 |     walk:
10 |       - adSysName
11 |       - adCpuCostRate         # 深信服AD CPU使用率
12 |       - adMemCostRate         # 深信服AD 内存使用率
13 |       - sfIntCpuTemp          # 深信服 设备温度
14 |       - sfDiskTemp            # 深信服 磁盘
15 |       - sfFanSpeed            # 深信服 设备风扇
16 |       - adDiskCostRate        # 深信服AD 磁盘使用率
17 |       - sfDeviceStatus        # 深信服 磁盘状态 1为正常
18 |       - sfFanState            # 深信服 风扇状态 2 3为正常
19 |       - sfPowerState          # 深信服 电源状态 2为正常
20 |       - adConns               # AD系统并发连接数
21 |       - adNewConns            # AD系统新建连接数
22 |       - adVsHealthStatus      # 虚拟服务的健康状态
23 |       - adVsHealthNodeCnt     # 虚拟服务的健康节点个数
24 |       - adUplinkThroughput    # 所有链路上行流量（整型）
25 |       - adDownlinkThroughput  # 所有链路下行流量 （整型）
26 |       - adUptime              # AD设备运行时间
27 |       - adDevicePattern       # AD运行模式 单机是3
28 |       - adStandByState        # AD双机主备状态
29 |       - adLinkName            # AD链路名称
30 |       - adLinkType            # AD链路类型
31 |       - adLinkIfName          # AD链路引用的网口
32 |       - adLinkStatus          # 链路状态，0为离线，1为正常
33 |       - adLinkBitIn           # 链路上行流量
34 |       - adLinkBitOut          # 链路下行流量
35 |       - adLinkNumber          # 设备链路个数
36 |     
37 |     max_repetitions: 25
38 |     retries: 3
39 |     timeout: 5s
40 | 
41 |     lookups:
42 |       - source_indexes: [LinkIndex]
43 |         lookup: adLinkType
44 |       - source_indexes: [LinkIndex]
45 |         lookup: adLinkIfName
46 |       - source_indexes: [LinkIndex]
47 |         lookup: adLinkName
48 | 
49 |     overrides:
50 |       adSysName:
51 |         type: DisplayString
52 |       adLinkName:
53 |         type: DisplayString
54 |         ignore: true
55 |       adLinkIfName:
56 |         type: DisplayString
57 |         ignore: true
58 |       sfCpuTemp:
59 |         type: DisplayString
60 |       adLinkType:
61 |         type: DisplayString
62 |         ignore: true
63 |       adVsHealthStatus:
64 |         type: DisplayString
65 |       sfFanState:
66 |         type: DisplayString
67 |       sfPowerState:
68 |         type: DisplayString


--------------------------------------------------------------------------------
/generator/sangfor/ad/prometheus.yml:
--------------------------------------------------------------------------------
 1 | # 全局配置文件
 2 | global:
 3 |   # ......
 4 | # 告警插件配置
 5 | alerting:
 6 |   # ......
 7 | # 按照设定参数进行扫描加载，用于自定义报警规则，其报警媒介和route路由由alertmanager插件实现
 8 | rule_files:
 9 |   # ......
10 | 
11 | # 设定采集对象，这里既有静态设置也有设置服务发现
12 | scrape_configs:
13 |   # ......
14 |   
15 |   # 采集华为AC信息
16 |   - job_name: "sangfor_ad"
17 |     scrape_interval: 15s
18 |     scrape_timeout: 10s
19 |     # file_sd_configs:  # 可启动文件自动发现机制
20 |     #  - files:
21 |     #    - /opt/monitor/prometheus/targets/huawei-ac.yml  # 定义AC的目标IP
22 |     #    refresh_interval: 2m
23 |     static_configs:
24 |       - targets:
25 |         - 192.168.1.2  # 深信服设备管理IP 如深信服AD的管理IP
26 |         # - tcp://192.168.1.3:1161  # SNMP设备使用自定义的TCP端口传输
27 |     metrics_path: /snmp
28 |     params:
29 |       auth: [public_v2]   # generator.yml中自定义的认证模块名称
30 |       module: [sangfor_ad]    # generator.yml中自定义的指标模块名称
31 |     relabel_configs:
32 |     - source_labels: ["__address__"]
33 |       target_label: __param_target
34 |     - source_labels: ["__param_target"]
35 |       target_label: instance
36 |     - target_label: __address__
37 |       replacement: 127.0.0.1:9116  # snmp_exporter服务器真实IP地址和端口 需要修改为snmp_exporter运行的监听地址和端口


--------------------------------------------------------------------------------
/generator/sangfor/af/README.md:
--------------------------------------------------------------------------------
1 | 还未完成AF的适配


--------------------------------------------------------------------------------
/generator/synology/README.md:
--------------------------------------------------------------------------------
 1 | ## 群晖 `NAS` 监控
 2 | 💻📊🔥✅
 3 | 监控群晖 `NAS`
 4 | 
 5 | - 整体监控架构基于 Prometheus 体系
 6 | - 指标采集器基于 SNMP Exporter
 7 | - 仪表板基于 Grafana
 8 | 
 9 | ## 监控架构
10 | 
11 | ![监控架构](img/image.png)
12 | 
13 | 上图中的架构，采集对象是群晖 `NAS` 采集器使用 SNMP Exporter，基于 vmagent 采集，可以使用的时序库 `Prometheus` 或者 `VictoriaMetrics`，最终使用 `Grafana` 呈现群晖的整体状态。
14 | 
15 | ## 推荐架构
16 | 
17 | ![推荐架构](img/image-1.png)
18 | 
19 | 我这边建议使用夜莺生态架构，可以满足绝大部分的场景需求，符合大部分企业对可观测性的要求。
20 | 
21 | ## 该采集如何使用可查看公众号文章
22 | 
23 | 该仪表板主要采集群晖 `NAS` 指标，实现对群晖 `NAS` 各组件和内部信息的监控和告警，这里主要以 `SNMP` 协议开启采集。
24 | 
25 | ## SNMP
26 | 
27 | 以 `SNMP Exporter` 作为采集器，群晖 `NAS` 开启 `SNMP` 服务，这里以 `SNMP v3` 版本为例，需要提前在群晖 NAS 上开启 `SNMP v3`，如下所示：
28 | 
29 | ![SNMP v3](img/1.jpg)
30 | 
31 | Grafana Dashboard ID:  [`22265`](https://grafana.com/grafana/dashboards/22265)
32 | 
33 | ## 如何采集
34 | 
35 | - 利用我生成好的配置文件
36 | - 修改 `SNMP` 认证模块参数
37 | - 抓取配置配置好
38 | 
39 | 采集配置文件：[采集配置](snmp/snmp_synology_nas.yml)
40 | 
41 | 修改 `snmp_synology_nas.yml` 文件中的头部认证模块：
42 | 
43 | ```yaml
44 | auths:
45 |   synology_v3:
46 |     community: public
47 |     security_level: authPriv
48 |     username: monitor
49 |     password: Mrot@2024neo
50 |     auth_protocol: SHA
51 |     priv_protocol: AES
52 |     priv_password: Mrot@2024mei
53 |     version: 3
54 | ```
55 | 
56 | 在群晖NAS中你配置的 `SNMP v3` 版本的参数在配置文件中修改下，修改完成后，可以直接保存即可，把你的配置文件放到 SNMP Exporter 采集配置中，即可实现采集。
57 | 
58 | 配置抓取任务：
59 | 
60 | ```yaml
61 | scrape_configs:
62 |   - job_name: "synology"
63 |     scrape_interval: 15s
64 |     scrape_timeout: 10s
65 |     file_sd_configs:
66 |       - files:
67 |         - /etc/victoriametrics/vmagent/synology-nas.yml
68 |         # refresh_interval: 2m vmagent 不支持这个参数 prometheus 中可使用这个参数
69 |     relabel_configs:
70 |     - source_labels: ["__address__"]
71 |       target_label: __param_target
72 |     - source_labels: ["__param_target"]
73 |       target_label: instance
74 |     - target_label: __address__
75 |       replacement: 172.17.40.13:9116
76 |     - source_labels: ["module"]
77 |       target_label: __param_module
78 |     - source_labels: ["auth"]
79 |       target_label: __param_auth
80 |       
81 | 
82 | 
83 | # /etc/victoriametrics/vmagent/synology-nas.yml
84 | # Prometheus 通过文件发现机制定义的采集目标
85 | - labels:
86 |     module: synology_common,synology_interface
87 |     auth: synology_v3
88 |     brand: Synology
89 |   targets:
90 |     - 172.17.40.140
91 | ```
92 | 
93 | ## 更多信息
94 | 
95 | 如果需要了解关于监控的更多信息，还请关注公众号：网络小斐，下面是公众号二维码。
96 | 
97 | ![公众号](img/qrcode.jpg)


--------------------------------------------------------------------------------
/generator/synology/img/1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/robotneo/networkdevice-monitor/97baface3f3c458e687fb43f4b537b2236185453/generator/synology/img/1.jpg


--------------------------------------------------------------------------------
/generator/synology/img/image-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/robotneo/networkdevice-monitor/97baface3f3c458e687fb43f4b537b2236185453/generator/synology/img/image-1.png


--------------------------------------------------------------------------------
/generator/synology/img/image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/robotneo/networkdevice-monitor/97baface3f3c458e687fb43f4b537b2236185453/generator/synology/img/image.png


--------------------------------------------------------------------------------
/generator/synology/img/qrcode.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/robotneo/networkdevice-monitor/97baface3f3c458e687fb43f4b537b2236185453/generator/synology/img/qrcode.jpg


--------------------------------------------------------------------------------
/generator/test/generator_haikang_monitor.yml:
--------------------------------------------------------------------------------
 1 | auths:
 2 |   haikang_auth:
 3 |     version: 3
 4 |     username: dnt
 5 |     # noAuthNoPriv authNoPriv authPriv
 6 |     security_level: authPriv
 7 |     password: Dnt@jiankong241
 8 |     # MD5, SHA, SHA224, SHA256, SHA384, or SHA512
 9 |     auth_protocol: SHA
10 |     # DES, AES, AES192, AES256, AES192C, or AES256C
11 |     priv_protocol: AES
12 |     priv_password: Dnt@jiankong241
13 |     # context_name: context
14 | 
15 | modules:  # 指标模块
16 |   haikang_metrics:
17 |     walk:
18 |       - 1.3.6.1.4.1.39165.1.1       # 设备类型
19 |       - 1.3.6.1.4.1.39165.1.2       # 硬件版本
20 |       - 1.3.6.1.4.1.39165.1.3       # 软件版本
21 |       - 1.3.6.1.4.1.39165.1.4       # MAC地址
22 |       - 1.3.6.1.4.1.39165.1.5       # 厂商代码
23 |       - 1.3.6.1.4.1.39165.1.6       # 厂商名称
24 |       - 1.3.6.1.4.1.39165.1.7       # CPU利用率
25 |       # - 1.3.6.1.4.1.39165.1.8       # 硬盘大小
26 |       - 1.3.6.1.4.1.39165.1.9       # 硬盘使用率
27 |       # - 1.3.6.1.4.1.39165.1.10      # 内存大小
28 |       - 1.3.6.1.4.1.39165.1.11      # 内存使用率
29 |       - 1.3.6.1.4.1.39165.1.12      # 设备重启
30 |       - 1.3.6.1.4.1.39165.1.13      # 动态IP地址
31 |       - 1.3.6.1.4.1.39165.1.14      # 动态掩码
32 |       - 1.3.6.1.4.1.39165.1.15      # 动态网关
33 |       - 1.3.6.1.4.1.39165.1.16      # 静态IP地址
34 |       - 1.3.6.1.4.1.39165.1.17      # 静态掩码
35 |       - 1.3.6.1.4.1.39165.1.18      # 静态网关
36 |       - 1.3.6.1.4.1.39165.1.19      # 系统时间
37 |       - 1.3.6.1.4.1.39165.1.20      # 视频输入通道数
38 |       - 1.3.6.1.4.1.39165.1.21      # 视频编码格式
39 |       - 1.3.6.1.4.1.39165.1.22      # 视频网传格式
40 |       - 1.3.6.1.4.1.39165.1.23      # 有无音频能力
41 |       - 1.3.6.1.4.1.39165.1.24      # 音频输入数目
42 |       - 1.3.6.1.4.1.39165.1.25      # 音频输出数目
43 |       - 1.3.6.1.4.1.39165.1.26      # 透明通道数目
44 |       - 1.3.6.1.4.1.39165.1.27      # 是否支持本地存储
45 |       - 1.3.6.1.4.1.39165.1.28      # 是否支持RTST回看
46 |       - 1.3.6.1.4.1.39165.1.29      # 支持的网络接入类型
47 |       - 1.3.6.1.4.1.39165.1.30      # 告警输入通道数目
48 |       - 1.3.6.1.4.1.39165.1.31      # 告警输出通道数目
49 | 
50 |     max_repetitions: 25
51 |     # 查询失败时的最大重复次数，查询的总时间为 timeout * retries
52 |     retries: 3
53 |     # 每个单独的 SNMP 查询返回数据的超时时间（秒）
54 |     timeout: 5s
55 |     allow_nonincreasing_oids: false
56 |     use_unconnected_udp_socket: false
57 |   
58 |     lookups:  # 不是表量
59 | 
60 |     overrides:
61 |       1.3.6.1.4.1.39165.1.7:
62 |         ignore: false
63 |         regex_extracts:
64 |           '':   # 指标名称保持不变
65 |             - regex: '([0-9]+) (.*)'
66 |               value: '$1'   # float64
67 |       1.3.6.1.4.1.39165.1.9:
68 |         ignore: false
69 |         regex_extracts:
70 |           '':   # 指标名称保持不变
71 |             - regex: '([0-9]+) (.*)'
72 |               value: '$1'   # float64
73 |       1.3.6.1.4.1.39165.1.11:
74 |         ignore: false
75 |         regex_extracts:
76 |           '':   # 指标名称保持不变
77 |             - regex: '([0-9]+) (.*)'
78 |               value: '$1'   # float64
79 |       1.3.6.1.4.1.39165.1.13:
80 |         type: InetAddressIPv4
81 |       1.3.6.1.4.1.39165.1.14:
82 |         type: InetAddressIPv4
83 |       1.3.6.1.4.1.39165.1.15:
84 |         type: InetAddressIPv4
85 |       1.3.6.1.4.1.39165.1.16:
86 |         type: InetAddressIPv4
87 |       1.3.6.1.4.1.39165.1.17:
88 |         type: InetAddressIPv4
89 |       1.3.6.1.4.1.39165.1.18:
90 |         type: InetAddressIPv4


--------------------------------------------------------------------------------
/prometheus/rules/prod/blackbox.yml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: 网络协议服务状态-告警
 3 |   rules:
 4 |   - alert: 站点可用性
 5 |     expr: probe_success{job="blackbox_exporter"} == 0
 6 |     for: 1m
 7 |     labels:
 8 |       alertype: domain
 9 |       severity: Critical
10 |     annotations:
11 |       description: "**{{ $labels.env }}_{{ $labels.name }}({{ $labels.project }})：站点无法访问**  \n  > {{ $labels.instance }}"
12 | 
13 |   - alert: 站点1h可用性低于80%
14 |     expr: sum_over_time(probe_success{job="blackbox_exporter"}[1h])/count_over_time(probe_success{job="blackbox_exporter"}[1h]) * 100 < 80
15 |     for: 3m
16 |     labels:
17 |       alertype: domain
18 |       severity: warning
19 |     annotations:
20 |       description: "**{{ $labels.env }}_{{ $labels.name }}({{ $labels.project }})：站点1h可用性：{{ $value | humanize }}%**  \n  > {{ $labels.instance }}"
21 | 
22 |   - alert: 站点状态异常
23 |     expr: (probe_success{job="blackbox_exporter"} == 0 and probe_http_status_code > 499) or probe_http_status_code == 0
24 |     for: 1m
25 |     labels:
26 |       alertype: domain
27 |       severity: warning
28 |     annotations:
29 |       description: "**{{ $labels.env }}_{{ $labels.name }}({{ $labels.project }})：站点状态异常：{{ $value }}**  \n  > {{ $labels.instance }}"
30 | 
31 |   - alert: 站点耗时过高
32 |     expr: probe_duration_seconds > 0.5
33 |     for: 2m
34 |     labels:
35 |       alertype: domain
36 |       severity: warning
37 |     annotations:
38 |       description: "**{{ $labels.env }}_{{ $labels.name }}({{ $labels.project }})：当前站点耗时：{{ $value | humanize }}s**  \n  > {{ $labels.instance }}"
39 | 
40 |   - alert: SSL证书有效期
41 |     expr: (probe_ssl_earliest_cert_expiry-time()) / 3600 / 24 < 15
42 |     for: 2m
43 |     labels:
44 |       alertype: domain
45 |       severity: warning
46 |     annotations:
47 |       description: "**{{ $labels.env }}_{{ $labels.name }}({{ $labels.project }})：证书有效期剩余{{ $value | humanize }}天**  \n  > {{ $labels.instance }}"
48 | 
49 |   - alert: 采集状态
50 |     expr: up{job=~"blackbox_exporter|blackbox"} == 0
51 |     for: 3m
52 |     labels:
53 |       alertype: itself
54 |       severity: Critical
55 |     annotations:
56 |       description: "**{{ $labels.job }}：异常**  \n  > {{ $labels.module }}-{{ $labels.name }}-{{ $labels.instance }}"


--------------------------------------------------------------------------------
/prometheus/rules/prod/idrac-status.yml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: 物理机iDrac状态-告警
 3 |   rules:
 4 |   - alert: 节点存活--杭州IT机房
 5 |     expr:  globalSystemStatus{job="iDrac_SNMP"} != 3
 6 |     for: 1m
 7 |     labels:
 8 |       alertype: dell
 9 |       severity: Critical
10 |     annotations:
11 |       description: "**物理机【{{ $labels.instance }}】状态异常**  \n  > 状态值 = {{ $value }}"
12 | 
13 |   - alert: 内存状态--杭州IT机房
14 |     expr:  memoryDeviceStatus{job="iDrac_SNMP"} != 3
15 |     for: 1m  
16 |     labels:
17 |       alertype: dell
18 |       severity: Critical
19 |     annotations:
20 |       description: "**物理机内存第【{{$labels.memoryDeviceIndex}}】根故障**  \n  > 状态值 = {{ $value }}"
21 | 
22 |   - alert: CPU状态--杭州IT机房
23 |     expr:  processorDeviceStatus{job="iDrac_SNMP"} != 3
24 |     for: 1m
25 |     labels:
26 |       alertype: dell
27 |       severity: Critical
28 |     annotations:
29 |       description: "**物理机CPU第【{{$labels.processorDeviceIndex}}】块故障**  \n  > 状态值 = {{ $value }}"
30 |   
31 |   - alert: 虚拟磁盘状态--杭州IT机房
32 |     expr:  virtualDiskState{job="iDrac_SNMP"} != 2
33 |     for: 1m  
34 |     labels:
35 |       alertype: dell
36 |       severity: warning
37 |     annotations:
38 |       description: "**物理机虚拟磁盘第【{{$labels.virtualDiskNumber}}】块故障**  \n  > 状态值 = {{ $value }}"
39 |   
40 |   - alert: 电源状态--杭州IT机房
41 |     expr:  systemPowerState{job="iDrac_SNMP"} != 4
42 |     for: 1m  
43 |     labels:
44 |       alertype: dell
45 |       severity: Critical
46 |     annotations:
47 |       description: "**物理机【{{ $labels.instance }}】电源故障**  \n  > 状态值 = {{ $value }}"
48 | 
49 |   - alert: 网卡状态--杭州IT机房
50 |     expr:  networkDeviceStatus{job="iDrac_SNMP"} != 3
51 |     for: 1m  
52 |     labels:
53 |       alertype: dell
54 |       severity: Critical
55 |     annotations:
56 |       description: "**物理机网卡第【{{$labels.networkDeviceIndex}}】块故障**  \n  > 状态值 = {{ $value }}"
57 | 
58 |   - alert: 存储状态--杭州IT机房
59 |     expr:  globalStorageStatus{job="iDrac_SNMP"} != 3
60 |     for: 1m  
61 |     labels:
62 |       alertype: dell
63 |       severity: warning
64 |     annotations:
65 |       description: "**物理机存储第【{{$labels.networkDeviceIndex}}】块故障**  \n  > 状态值 = {{ $value }}"
66 | 
67 |   - alert: 采集状态
68 |     expr: up{job=~"idrac_snmp"} == 0
69 |     for: 3m
70 |     labels:
71 |       alertype: itself
72 |       severity: Critical
73 |     annotations:
74 |       description: "**{{ $labels.job }}：异常**  \n  > {{ $labels.brand }}-{{ $labels.module }}-{{ $labels.instance }}"


--------------------------------------------------------------------------------
/prometheus/rules/prod/node-exporter.yml:
--------------------------------------------------------------------------------
  1 | groups:
  2 | - name: node_usage_record_rules
  3 |   interval: 1m
  4 |   rules:
  5 |   - record: cpu:usage:rate1m
  6 |     expr: (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[1m])) by (instance,vendor,account,group,name)) * 100
  7 |   - record: mem:usage:rate1m
  8 |     expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100
  9 | 
 10 | - name: Linux服务器状态-告警
 11 |   rules:
 12 |   - alert: VM内存使用率
 13 |     expr: 100 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90
 14 |     for: 5m
 15 |     labels:
 16 |       alertype: system
 17 |       severity: warning
 18 |     annotations:
 19 |       description: "**{{ $labels.name }}：内存使用率{{ $value | humanize }}%**  \n  > {{ $labels.group }}-{{ $labels.instance }}"
 20 | 
 21 |   - alert: VM_CPU使用率
 22 |     expr: 100 - (avg by(instance,name,group,account) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
 23 |     for: 5m
 24 |     labels:
 25 |       alertype: system
 26 |       severity: warning
 27 |     annotations:
 28 |       description: "**{{ $labels.name }}：CPU使用率{{ $value | humanize }}%**  \n  > {{ $labels.group }}-{{ $labels.instance }}"
 29 | 
 30 |   - alert: VM系统负载
 31 |     expr: node_load5 / on (instance,name,group,account) sum(count(node_cpu_seconds_total{mode='system'}) by (cpu,instance,name,group,account)) by(instance,name,group,account) > 1.7
 32 |     for: 10m
 33 |     labels:
 34 |       alertype: system
 35 |       severity: warning
 36 |     annotations:
 37 |       description: "**{{ $labels.name }}：系统负载{{ $value | humanize }}倍**  \n  > {{ $labels.group }}-{{ $labels.instance }}"
 38 | 
 39 |   - alert: VM磁盘使用率
 40 |     expr: |
 41 |       100 - (node_filesystem_avail_bytes/node_filesystem_size_bytes{fstype=~"ext.?|xfs",mountpoint!~".*pods.*|/var/lib/docker/devicemapper/mnt/.*"} * 100) > 85
 42 |     for: 5m
 43 |     labels:
 44 |       alertype: system
 45 |       severity: warning
 46 |     annotations:
 47 |       description: "**{{ $labels.name }}_{{ $labels.mountpoint }}：磁盘使用率{{ $value | humanize }}%**  \n  > {{ $labels.group }}-{{ $labels.instance }}"
 48 | 
 49 |   - alert: VM主机重启
 50 |     expr: node_time_seconds - node_boot_time_seconds < 600
 51 |     for: 1m
 52 |     labels:
 53 |       alertype: system
 54 |       severity: warning
 55 |     annotations:
 56 |       description: "**{{ $labels.name }}：主机重启**  \n  > {{ $labels.group }}-{{ $labels.instance }}"
 57 | 
 58 |   - alert: VM文件系统只读
 59 |     expr: node_filesystem_readonly == 1
 60 |     for: 1m
 61 |     labels:
 62 |       alertype: system
 63 |       severity: warning
 64 |     annotations:
 65 |       description: "**{{ $labels.name }}-{{ $labels.mountpoint }}：文件系统只读**  \n  > {{ $labels.group }}-{{ $labels.instance }}"
 66 | 
 67 |   - alert: K8S节点POD磁盘使用率
 68 |     expr: 100 - (node_filesystem_avail_bytes/node_filesystem_size_bytes{mountpoint=~"/var/lib/docker/devicemapper/mnt/.*"} * 100) > 85
 69 |     for: 5m
 70 |     labels:
 71 |       alertype: system
 72 |       severity: warning
 73 |     annotations:
 74 |       description: "**{{ $labels.name }}_{{ $labels.mountpoint }}：磁盘使用率{{ $value | humanize }}%**  \n  > {{ $labels.group }}-{{ $labels.instance }}"
 75 | 
 76 |   - alert: NFS磁盘使用率
 77 |     expr: 100 - (node_filesystem_avail_bytes/node_filesystem_size_bytes{fstype="nfs"} * 100) > 90
 78 |     for: 5m
 79 |     labels:
 80 |       alertype: system
 81 |       severity: warning
 82 |     annotations:
 83 |       description: "**{{ $labels.name }}_{{ $labels.mountpoint }}：磁盘使用率{{ $value | humanize }}%**  \n  > {{ $labels.group }}-{{ $labels.instance }}"
 84 | 
 85 |   - alert: VM磁盘读写容量
 86 |     expr: (irate(node_disk_read_bytes_total[5m]) ) /1024 /1024  > 80 or (irate(node_disk_written_bytes_total[5m]) ) /1024 /1024 > 80
 87 |     for: 8m
 88 |     labels:
 89 |       alertype: disk
 90 |       severity: warning
 91 |     annotations:
 92 |       description: "**{{ $labels.name }}_{{ $labels.device }}：当前IO为{{ $value | humanize }}MB/s**  \n  > {{ $labels.group }}-{{ $labels.instance }}"
 93 | 
 94 |   - alert: VM网络流入（下载）数据过多
 95 |     expr: sum by(device,instance, name, group, account) (irate(node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo.*|cni.*'}[5m])) / 1024 / 1024 > 70
 96 |     for: 5m
 97 |     labels:
 98 |       alertype: network
 99 |       severity: warning
100 |     annotations:
101 |       description: "**{{ $labels.name }}：流入数据为{{ $value | humanize }}MB/s**  \n  > {{ $labels.group }}-{{ $labels.instance }}"
102 | 
103 |   - alert: VM网络流出（上传）数据过多
104 |     expr: sum by(device,instance, name, group, account) (irate(node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo.*|cni.*'}[5m])) / 1024 / 1024 > 70
105 |     for: 5m
106 |     labels:
107 |       alertype: network
108 |       severity: warning
109 |     annotations:
110 |       description: "**{{ $labels.name }}：流出数据为{{ $value | humanize }}MB/s**  \n  > {{ $labels.group }}-{{ $labels.instance }}"
111 | 
112 | - name: Exporter服务状态-告警
113 |   rules:
114 |   - alert: Exporter状态
115 |     expr: up{job=~"windows_exporter|node_exporter"} == 0
116 |     for: 3m
117 |     labels:
118 |       alertype: itself
119 |       severity: Critical
120 |     annotations:
121 |       description: "**{{ $labels.job }}：异常**  \n  > {{ $labels.group }}-{{ $labels.name }}-{{ $labels.instance }}"


--------------------------------------------------------------------------------
/prometheus/rules/prod/sangfor-ad-status.yml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: 出口链路服务状态-告警
 3 |   rules:
 4 |   - alert: CMCC-Office下行出口带宽
 5 |     expr: (adLinkBitOut{adLinkName="CMCC-Office"} / 1000000) > 480
 6 |     for: 15m
 7 |     labels:
 8 |       alertype: network
 9 |       severity: High
10 |     annotations:
11 |       description: "**{{ $labels.adLinkName }}：下行带宽(15分钟内持续)使用超限**  \n  > 当前带宽 = {{ $value }}Mbps  \n  {{ $labels.adLinkIfName }}-{{ $labels.instance }}"
12 | 
13 |   - alert: CTCC-Office下行出口带宽
14 |     expr: (adLinkBitOut{adLinkName="CTCC-Office"} / 1000000) > 280
15 |     for: 10m
16 |     labels:
17 |       alertype: network
18 |       severity: High
19 |     annotations:
20 |       description: "**{{ $labels.adLinkName }}：下行带宽(10分钟内持续)使用超限**  \n  > 当前带宽 = {{ $value }}Mbps  \n  {{ $labels.adLinkIfName }}-{{ $labels.instance }}"
21 | 
22 |   - alert: CTCC-Server下行出口带宽
23 |     expr: (adLinkBitOut{adLinkName="CTCC-Server"} / 1000000) > 95
24 |     for: 10m
25 |     labels:
26 |       alertype: network
27 |       severity: High
28 |     annotations:
29 |       description: "**{{ $labels.adLinkName }}：下行带宽(10分钟内持续)使用超限**  \n  > 当前带宽 = {{ $value }}Mbps  \n  {{ $labels.adLinkIfName }}-{{ $labels.instance }}"
30 | 
31 |   - alert: CMCC-Office上行出口带宽
32 |     expr: (adLinkBitIn{adLinkName="CMCC-Office"} / 1000000) > 480
33 |     for: 15m
34 |     labels:
35 |       alertype: network
36 |       severity: warning
37 |     annotations:
38 |       description: "**{{ $labels.adLinkName }}：上行带宽(15分钟内持续)使用超限**  \n  > 当前带宽 = {{ $value }}Mbps \n  {{ $labels.adLinkName }}-{{ $labels.instance }}"
39 | 
40 |   - alert: CTCC-Office上行出口带宽
41 |     expr: (adLinkBitIn{adLinkName="CTCC-Office"} / 1000000) > 280
42 |     for: 10m
43 |     labels:
44 |       alertype: network
45 |       severity: warning
46 |     annotations:
47 |       description: "**{{ $labels.adLinkName }}：上行带宽(10分钟内持续)使用超限**  \n  > 当前带宽 = {{ $value }}Mbps  \n  {{ $labels.adLinkName }}-{{ $labels.instance }}"
48 | 
49 |   - alert: CTCC-Server上行出口带宽
50 |     expr: (adLinkBitIn{adLinkName="CTCC-Server"} / 1000000) > 95
51 |     for: 10m
52 |     labels:
53 |       alertype: network
54 |       severity: warning
55 |     annotations:
56 |       description: "**{{ $labels.adLinkName }}：上行带宽(10分钟内持续)使用超限**  \n  > 当前带宽 = {{ $value }}Mbps  \n  {{ $labels.adLinkName }}-{{ $labels.instance }}"
57 | 
58 |   - alert: 出口链路状态-故障
59 |     expr: adLinkStatus == 0
60 |     for: 0m
61 |     labels:
62 |       alertype: network
63 |       severity: Critical
64 |     annotations:
65 |       description: "**{{ $labels.adLinkName }}：出口链路离线**  \n  > 故障值 = {{ $value }}  \n  {{ $labels.adLinkName }}-{{ $labels.instance }}"
66 | 
67 |   - alert: 出口链路状态-繁忙
68 |     expr: adLinkStatus == 2
69 |     for: 2m
70 |     labels:
71 |       alertype: network
72 |       severity: High
73 |     annotations:
74 |       description: "**{{ $labels.adLinkName }}：出口链路(10分钟内持续)繁忙**  \n  > 故障值 = {{ $value }}  \n  {{ $labels.adLinkName }}-{{ $labels.instance }}"


--------------------------------------------------------------------------------
/prometheus/rules/prod/switch-status.yml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: 杭州华为交换机信息-告警
 3 |   rules: 
 4 |   - alert: 设备板卡温度告警
 5 |     expr:  0 < hwEntityTemperature >= hwEntityTemperatureThreshold
 6 |     for: 1m
 7 |     labels:
 8 |       alertype: switch
 9 |       severity: Critical
10 |     annotations:
11 |       description: "**设备板卡温度高门限**  \n  >  状态值 = {{ $value }}%"
12 | 
13 |   - alert: 交换机设备风扇状态
14 |     expr: hwEntityFanSpeed == 0
15 |     for: 1m
16 |     labels:
17 |       alertype: switch
18 |       severity: Critical
19 |     annotations:
20 |       description: "**{{ $labels.hostname }} - 设备风扇转速为 {{ $value }}% **  \n  > 风扇状态值：{{ $labels.hwEntityFanState }} - 风扇在位状态值：{{ $labels.hwEntityFanPresent }}"
21 |   
22 |   - alert: CPU使用率超限
23 |     expr: 0 < hwEntityCpuUsage{job=~"huawei_sw|huawei_sw"} >= 60
24 |     for: 5m
25 |     labels:
26 |       alertype: switch
27 |       severity: Critical
28 |     annotations:
29 |       description: "**{{ $labels.hostname }} - 交换机CPU使用率超限** \n  > 当前使用率 = {{ $value }}%"
30 |   
31 |   - alert: 内存使用率超限
32 |     expr: 0 < hwEntityMemUsage{job=~"huawei_sw|huawei_sw"} >= 85
33 |     for: 10m
34 |     labels:
35 |       alertype: switch
36 |       severity: Critical
37 |     annotations:
38 |       description: "**{{$labels.hwWlanIDIndexedApName}} - 交换机内存使用率超限**  \n  > 当前使用率 = {{ $value }}%"
39 | 
40 |   - alert: 核心交换机CSS集群状态
41 |     expr: hwCssMemberConfigEnable{job=~"huawei_sw"} != 1
42 |     for: 0m
43 |     labels:
44 |       alertype: switch
45 |       severity: Critical
46 |     annotations:
47 |       description: "**{{$labels.hostname}} CSS集群状态异常**  \n  > 当前状态值 = {{ $value }}"


--------------------------------------------------------------------------------
/prometheus/rules/prod/windows-status.yml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: Windows服务器状态-告警
 3 |   rules: 
 4 |   - alert: Windows Server 收集器不成功
 5 |     expr: windows_exporter_collector_success == 0
 6 |     for: 0m
 7 |     labels:
 8 |       alertype: windows
 9 |       severity: High
10 |     annotations:
11 |       description: "**Collector {{ $labels.collector }} was not successful**  \n  >  状态值 = {{ $value }}%"
12 | 
13 |   - alert: Windows 远程桌面状态不正常
14 |     expr: windows_service_status{exported_name="termservice", status="ok"} != 1
15 |     for: 1m
16 |     labels:
17 |       alertype: windows
18 |       severity: High
19 |     annotations:
20 |       description: "**远程桌面服务状态异常**  \n  > 状态值 = {{ $value }}"
21 |   
22 |   - alert: Windows服务器CPU使用率超过 90%
23 |     expr: 100 - (avg by (instance) (rate(windows_cpu_time_total{mode="idle"}[2m])) * 100) > 90
24 |     for: 0m
25 |     labels:
26 |       alertype: windows
27 |       severity: warning
28 |     annotations:
29 |       description: "**CPU使用率超过 90%**  \n  > 使用率 = {{ $value }}%"
30 |   
31 |   - alert: Windows服务器内存使用率超过 90%
32 |     expr: 100 - ((windows_os_physical_memory_free_bytes / windows_cs_physical_memory_bytes) * 100) > 90
33 |     for: 2m
34 |     labels:
35 |       alertype: windows
36 |       severity: warning
37 |     annotations:
38 |       description: "**内存使用率超过 90%**  \n  > 使用率 = {{ $value }}%"
39 | 
40 |   - alert: Windows服务器磁盘使用率超过 90%
41 |     expr: 100.0 - 100 * ((windows_logical_disk_free_bytes / 1024 / 1024 ) / (windows_logical_disk_size_bytes / 1024 / 1024)) > 90
42 |     for: 2m
43 |     labels:
44 |       alertype: windows
45 |       severity: warning
46 |     annotations:
47 |       description: "**磁盘使用率超过90%**  \n  > 使用率 = {{ $value }}%  \n  volume = {{ $labels.volume }}"


--------------------------------------------------------------------------------
/prometheus/rules/vm/alerts-vmalert.yml:
--------------------------------------------------------------------------------
 1 | # File contains default list of alerts for vmalert service.
 2 | # The alerts below are just recommendations and may require some updates
 3 | # and threshold calibration according to every specific setup.
 4 | groups:
 5 |   # Alerts group for vmalert assumes that Grafana dashboard
 6 |   # https://grafana.com/grafana/dashboards/14950/ is installed.
 7 |   # Pls update the `dashboard` annotation according to your setup.
 8 |   - name: vmalert
 9 |     interval: 30s
10 |     rules:
11 |       - alert: ConfigurationReloadFailure
12 |         expr: vmalert_config_last_reload_successful != 1
13 |         labels:
14 |           severity: warning
15 |         annotations:
16 |           summary: "Configuration reload failed for vmalert instance {{ $labels.instance }}"
17 |           description: "Configuration hot-reload failed for vmalert on instance {{ $labels.instance }}.
18 |             Check vmalert's logs for detailed error message."
19 | 
20 |       - alert: AlertingRulesError
21 |         expr: sum(increase(vmalert_alerting_rules_errors_total[5m])) without(alertname, id) > 0
22 |         for: 5m
23 |         labels:
24 |           severity: warning
25 |         annotations:
26 |           dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=13&var-instance={{ $labels.instance }}&var-group={{ $labels.group }}"
27 |           summary: "Alerting rules are failing for vmalert instance {{ $labels.instance }}"
28 |           description: "Alerting rules execution is failing for group \"{{ $labels.group }}\".
29 |             Check vmalert's logs for detailed error message."
30 | 
31 |       - alert: RecordingRulesError
32 |         expr: sum(increase(vmalert_recording_rules_errors_total[5m])) without(recording, id) > 0
33 |         for: 5m
34 |         labels:
35 |           severity: warning
36 |         annotations:
37 |           dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=30&var-instance={{ $labels.instance }}&var-group={{ $labels.group }}"
38 |           summary: "Recording rules are failing for vmalert instance {{ $labels.instance }}"
39 |           description: "Recording rules execution is failing for group \"{{ $labels.group }}\".
40 |             Check vmalert's logs for detailed error message."
41 | 
42 |       - alert: RecordingRulesNoData
43 |         expr: sum(vmalert_recording_rules_last_evaluation_samples) without(id) < 1
44 |         for: 30m
45 |         labels:
46 |           severity: info
47 |         annotations:
48 |           dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=33&var-group={{ $labels.group }}"
49 |           summary: "Recording rule {{ $labels.recording }} ({{ $labels.group }}) produces no data"
50 |           description: "Recording rule \"{{ $labels.recording }}\" from group \"{{ $labels.group }}\" 
51 |             produces 0 samples over the last 30min. It might be caused by a misconfiguration 
52 |             or incorrect query expression."
53 | 
54 |       - alert: TooManyMissedIterations
55 |         expr: increase(vmalert_iteration_missed_total[5m]) > 0
56 |         for: 15m
57 |         labels:
58 |           severity: warning
59 |         annotations:
60 |           summary: "vmalert instance {{ $labels.instance }} is missing rules evaluations"
61 |           description: "vmalert instance {{ $labels.instance }} is missing rules evaluations for group \"{{ $labels.group }}\".
62 |             The group evaluation time takes longer than the configured evaluation interval. This may result in missed 
63 |             alerting notifications or recording rules samples. Try increasing evaluation interval or concurrency of
64 |             group \"{{ $labels.group }}\". See https://docs.victoriametrics.com/vmalert/#groups. 
65 |             If rule expressions are taking longer than expected, please see https://docs.victoriametrics.com/troubleshooting/#slow-queries."
66 | 
67 |       - alert: RemoteWriteErrors
68 |         expr: increase(vmalert_remotewrite_errors_total[5m]) > 0
69 |         for: 15m
70 |         labels:
71 |           severity: warning
72 |         annotations:
73 |           summary: "vmalert instance {{ $labels.instance }} is failing to push metrics to remote write URL"
74 |           description: "vmalert instance {{ $labels.instance }} is failing to push metrics generated via alerting 
75 |             or recording rules to the configured remote write URL. Check vmalert's logs for detailed error message."
76 | 
77 |       - alert: AlertmanagerErrors
78 |         expr: increase(vmalert_alerts_send_errors_total[5m]) > 0
79 |         for: 15m
80 |         labels:
81 |           severity: warning
82 |         annotations:
83 |           summary: "vmalert instance {{ $labels.instance }} is failing to send notifications to Alertmanager"
84 |           description: "vmalert instance {{ $labels.instance }} is failing to send alert notifications to \"{{ $labels.addr }}\".
85 |             Check vmalert's logs for detailed error message."


--------------------------------------------------------------------------------
/prometheus/rules/vm/alerts-vmauth.yml:
--------------------------------------------------------------------------------
 1 | # File contains default list of alerts for vmauth service.
 2 | # The alerts below are just recommendations and may require some updates
 3 | # and threshold calibration according to every specific setup.
 4 | groups:
 5 |   - name: vmauth
 6 |     interval: 30s
 7 |     rules:
 8 |       - alert: ConcurrentRequestsLimitReached
 9 |         expr: sum(increase(vmauth_concurrent_requests_limit_reached_total[1m])) by (instance) > 0
10 |         for: 3m
11 |         labels:
12 |           severity: warning
13 |         annotations:
14 |           summary: "vmauth ({{ $labels.instance }}) reached concurrent requests limit"
15 |           description: "Possible solutions: increase the limit with flag: -maxConcurrentRequests, 
16 |           deploy additional vmauth replicas, check requests latency at backend service. 
17 |           See more details at https://docs.victoriametrics.com/vmauth/#concurrency-limiting"
18 |       - alert: UserConcurrentRequestsLimitReached
19 |         expr: sum(increase(vmauth_user_concurrent_requests_limit_reached_total[1m])) by (username) > 0
20 |         for: 3m
21 |         labels:
22 |           severity: warning
23 |         annotations:
24 |           summary: "vmauth has reached concurrent requests limit for username {{ $labels.username }}"
25 |           description: "Possible solutions: increase limit with flag: -maxConcurrentPerUserRequests, 
26 |           deploy additional vmauth replicas, check requests latency at backend service."


--------------------------------------------------------------------------------
/victoriametrics/README.md:
--------------------------------------------------------------------------------
1 | ## VictoriaMetrics 生态组件部署方案
2 | 
3 | 这个目录主要记录 VictoriaMetrics 生态组件的部署方案和脚本，可以作为测试环境和生产环境中部署参考。


--------------------------------------------------------------------------------
/victoriametrics/binary/PrometheusAlert/README.md:
--------------------------------------------------------------------------------
 1 | ## 二进制部署单节点 PrometheusAlert 脚本
 2 | 
 3 | 安装完成后，关于启动参数和配置文件说明：
 4 | 
 5 | PrometheusAlert 二进制文件放置在新创建目录：`/opt/PrometheusAlert` 目录中
 6 | 
 7 | PrometheusAlert 的配置参数文件在：`/opt/PrometheusAlert/conf/app.conf` 文件中，如果需要开启飞书、钉钉、企业微信等 webhook 配置可直接修该文件。
 8 | 
 9 | 二进制文件启动都使用 systemd 管理进程，可直接执行下面的命令查看prometheus进程状态：
10 | 
11 | - 状态：sudo systemctl status prometheusalert.service
12 | - 停止：sudo systemctl stop prometheusalert.service
13 | - 启动：sudo systemctl start prometheusalert.service
14 | - 重启：sudo systemctl restart prometheusalert.service
15 | - 开机自启：sudo systemctl enable prometheusalert.service
16 | 
17 | 更多关于 PrometheusAlert 的教程请查看官方文档：[PrometheusAlert文档](https://feiyu563.gitbook.io/)


--------------------------------------------------------------------------------
/victoriametrics/binary/PrometheusAlert/install-prometheusalert.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | set -e
  3 | 
  4 | # 函数：安装依赖工具
  5 | install_dependencies() {
  6 |     if [ "$OS" == "ubuntu" ] || [ "$OS" == "debian" ]; then
  7 |         apt-get update && apt-get install -y curl wget unzip net-tools
  8 |     elif [ "$OS" == "centos" ] || [ "$OS" == "rocky" ]; then
  9 |         dnf update -y && dnf install -y curl wget unzip net-tools
 10 |     else
 11 |         echo "Unsupported operating system."
 12 |         exit 1
 13 |     fi
 14 | }
 15 | 
 16 | # 函数：设置系统服务和用户
 17 | setup_system() {
 18 |     # 创建 PrometheusAlert 安装目录
 19 |     mkdir -p /opt/PrometheusAlert
 20 | 
 21 |     # 检查 prometheusalert 组是否存在，不存在则创建
 22 |     if ! getent group prometheusalert > /dev/null 2>&1; then
 23 |         groupadd --system prometheusalert
 24 |     fi
 25 | 
 26 |     # 检查 prometheusalert 用户是否存在，不存在则创建
 27 |     if ! id -u prometheusalert > /dev/null 2>&1; then
 28 |         useradd --system --home-dir /opt/PrometheusAlert --no-create-home --gid prometheusalert prometheusalert
 29 |     fi
 30 | 
 31 |     chown -R prometheusalert:prometheusalert /opt/PrometheusAlert
 32 | }
 33 | 
 34 | # 确定操作系统类型
 35 | OS="unknown"
 36 | if [ -f /etc/os-release ]; then
 37 |     . /etc/os-release
 38 |     OS=$ID
 39 | fi
 40 | 
 41 | # 安装依赖工具
 42 | install_dependencies
 43 | 
 44 | # 设置系统服务和用户
 45 | setup_system
 46 | 
 47 | # 获取 PrometheusAlert 最新版本
 48 | PA_VERSION=$(curl -s "https://api.github.com/repos/feiyu563/PrometheusAlert/releases/latest" | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/' | sed 's/^v//')
 49 | 
 50 | # 下载 PrometheusAlert 二进制文件
 51 | echo "Downloading PrometheusAlert v${PA_VERSION}..."
 52 | wget https://github.com/feiyu563/PrometheusAlert/releases/download/v${PA_VERSION}/linux.zip -O /tmp/prometheusalert.zip
 53 | 
 54 | # 如果下载速度慢，提示用户科学上网
 55 | if [ $? -ne 0 ]; then
 56 |     echo "Download failed or too slow. Consider using a VPN or proxy to download faster."
 57 |     exit 1
 58 | fi
 59 | 
 60 | # 解压到 /tmp
 61 | unzip /tmp/prometheusalert.zip -d /tmp
 62 | 
 63 | # 拷贝 linux 目录中的所有文件到 /opt/PrometheusAlert
 64 | cp -r /tmp/linux/* /opt/PrometheusAlert/
 65 | 
 66 | # 添加执行权限
 67 | chmod +x /opt/PrometheusAlert/PrometheusAlert
 68 | 
 69 | # 删除临时文件
 70 | rm -rf /tmp/prometheusalert.zip /tmp/linux
 71 | 
 72 | # 检查并加载配置文件
 73 | if [ -f /opt/PrometheusAlert/conf/app.conf ]; then
 74 |     echo "Configuration file found in /opt/PrometheusAlert/conf/app.conf"
 75 | else
 76 |     echo "Configuration file not found in /opt/PrometheusAlert/conf/. Please check."
 77 |     exit 1
 78 | fi
 79 | 
 80 | # 确保配置文件权限正确
 81 | chown -R prometheusalert:prometheusalert /opt/PrometheusAlert
 82 | 
 83 | # 创建 systemd 单元文件
 84 | cat > /etc/systemd/system/prometheusalert.service <<EOF
 85 | [Unit]
 86 | Description=PrometheusAlert Service
 87 | After=network.target
 88 | 
 89 | [Service]
 90 | Type=simple
 91 | User=prometheusalert
 92 | Group=prometheusalert
 93 | ExecStart=/opt/PrometheusAlert/PrometheusAlert
 94 | WorkingDirectory=/opt/PrometheusAlert
 95 | Restart=on-failure
 96 | 
 97 | [Install]
 98 | WantedBy=multi-user.target
 99 | EOF
100 | 
101 | # 重新加载 systemd 配置并启动服务
102 | systemctl daemon-reload
103 | systemctl enable prometheusalert.service
104 | systemctl start prometheusalert.service
105 | 
106 | echo "PrometheusAlert installation and service setup complete."


--------------------------------------------------------------------------------
/victoriametrics/binary/alertmanager/README.md:
--------------------------------------------------------------------------------
 1 | ## 二进制部署单节点 alertmanager 脚本
 2 | 
 3 | 安装完成后，关于启动参数和配置文件说明：
 4 | 
 5 | alertmanager 和 amtool 二进制文件分别放置在系统目录：`/usr/bin` 目录中
 6 | 
 7 | 环境变量启动参数在：`/etc/alertmanager/alertmanager.conf` 文件中，如果需要修改 alertmanager 的启动参数，可直接把参数放在这个文件中的固定格式，然后重启 alertmanager 进程即可。
 8 | 
 9 | alertmanager 的配置参数文件在：`/etc/alertmanager/alertmanager.yml` 文件中，如果需要配置告警分组、告警路由、告警抑制可通过修改这个文件。
10 | 
11 | 二进制文件启动都使用 systemd 管理进程，可直接执行下面的命令查看prometheus进程状态：
12 | 
13 | - 状态：sudo systemctl status alertmanager.service
14 | - 停止：sudo systemctl stop alertmanager.service
15 | - 启动：sudo systemctl start alertmanager.service
16 | - 重启：sudo systemctl restart alertmanager.service
17 | - 开机自启：sudo systemctl enable alertmanager.service


--------------------------------------------------------------------------------
/victoriametrics/binary/alertmanager/install-alertmanager.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | set -e
  3 | 
  4 | # 函数：安装依赖工具
  5 | install_dependencies() {
  6 |     if [ "$OS" == "ubuntu" ] || [ "$OS" == "debian" ]; then
  7 |         apt-get update && apt-get install -y curl wget net-tools
  8 |     elif [ "$OS" == "centos" ] || [ "$OS" == "rocky" ]; then
  9 |         dnf update -y && dnf install -y curl wget net-tools
 10 |     else
 11 |         echo "Unsupported operating system."
 12 |         exit 1
 13 |     fi
 14 | }
 15 | 
 16 | # 函数：设置系统服务和用户
 17 | setup_system() {
 18 |     # 创建 alertmanager 配置文件目录
 19 |     mkdir -p /etc/alertmanager
 20 |     # 创建 alertmanager 临时数据缓存目录
 21 |     mkdir -p /var/lib/alertmanager
 22 | 
 23 |     # 检查 alertmanager 组是否存在，不存在则创建
 24 |     if ! getent group alertmanager > /dev/null 2>&1; then
 25 |         groupadd --system alertmanager
 26 |     fi
 27 | 
 28 |     # 检查 alertmanager 用户是否存在，不存在则创建
 29 |     if ! id -u alertmanager > /dev/null 2>&1; then
 30 |         useradd --system --home-dir /var/lib/alertmanager --no-create-home --gid alertmanager alertmanager
 31 |     fi
 32 | 
 33 |     chown -R alertmanager:alertmanager /var/lib/alertmanager
 34 | }
 35 | 
 36 | # 确定操作系统类型
 37 | OS="unknown"
 38 | if [ -f /etc/os-release ]; then
 39 |     . /etc/os-release
 40 |     OS=$ID
 41 | fi
 42 | 
 43 | # 安装依赖工具
 44 | install_dependencies
 45 | 
 46 | # 设置系统服务和用户
 47 | setup_system
 48 | 
 49 | # 获取 Alertmanager 最新版本
 50 | AM_VERSION=$(curl -s "https://api.github.com/repos/prometheus/alertmanager/releases/latest" | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/' | sed 's/^v//')
 51 | 
 52 | # 下载并解压 Alertmanager
 53 | wget "https://github.com/prometheus/alertmanager/releases/download/v${AM_VERSION}/alertmanager-${AM_VERSION}.linux-amd64.tar.gz" -O /tmp/alertmanager.tar.gz
 54 | 
 55 | # 解压文件
 56 | tar -xzvf /tmp/alertmanager.tar.gz -C /tmp
 57 | 
 58 | # 复制解压的 alertmanager.yml 文件到 /etc/alertmanager
 59 | cp /tmp/alertmanager-${AM_VERSION}.linux-amd64/alertmanager.yml /etc/alertmanager/
 60 | 
 61 | # 移动可执行文件到 /usr/bin
 62 | mv /tmp/alertmanager-${AM_VERSION}.linux-amd64/alertmanager /usr/bin/
 63 | mv /tmp/alertmanager-${AM_VERSION}.linux-amd64/amtool /usr/bin/
 64 | 
 65 | # 清理临时文件
 66 | rm -rf /tmp/alertmanager-${AM_VERSION}.linux-amd64
 67 | rm /tmp/alertmanager.tar.gz
 68 | 
 69 | # 确保配置文件权限正确
 70 | chown -R alertmanager:alertmanager /etc/alertmanager
 71 | 
 72 | # 创建 systemd 单元文件
 73 | cat > /etc/systemd/system/alertmanager.service <<EOF
 74 | [Unit]
 75 | Description=Alertmanager for Prometheus
 76 | After=network.target
 77 | 
 78 | [Service]
 79 | Type=simple
 80 | User=alertmanager
 81 | Group=alertmanager
 82 | StartLimitBurst=5
 83 | StartLimitInterval=0
 84 | Restart=on-failure
 85 | RestartSec=1
 86 | EnvironmentFile=-/etc/alertmanager/alertmanager.conf
 87 | ExecStart=/usr/bin/alertmanager \$ARGS
 88 | ExecStop=/bin/kill -s SIGTERM \$MAINPID
 89 | ExecReload=/bin/kill -HUP \$MAINPID
 90 | LimitNOFILE=1048576
 91 | LimitNPROC=1048576
 92 | LimitCORE=infinity
 93 | WorkingDirectory=/var/lib/alertmanager
 94 | ReadWritePaths=/var/lib/alertmanager
 95 | StandardOutput=syslog
 96 | StandardError=syslog
 97 | SyslogIdentifier=alertmanager
 98 | PrivateTmp=yes
 99 | ProtectHome=yes
100 | NoNewPrivileges=yes
101 | ProtectSystem=strict
102 | ProtectControlGroups=true
103 | ProtectKernelModules=true
104 | ProtectKernelTunables=yes
105 | 
106 | [Install]
107 | WantedBy=multi-user.target
108 | EOF
109 | 
110 | # 创建 alertmanager.conf 文件
111 | cat > /etc/alertmanager/alertmanager.conf <<EOF
112 | ARGS="--web.listen-address=0.0.0.0:9093 --config.file=/etc/alertmanager/alertmanager.yml --storage.path=/var/lib/alertmanager"
113 | EOF
114 | 
115 | # 重新加载 systemd 配置并启动服务
116 | systemctl daemon-reload
117 | systemctl enable alertmanager.service
118 | systemctl start alertmanager.service
119 | 
120 | echo "Alertmanager installation and service setup complete."


--------------------------------------------------------------------------------
/victoriametrics/binary/blackbox_exporter/README.md:
--------------------------------------------------------------------------------
 1 | ## 二进制部署单节点 node_exporter 脚本
 2 | 
 3 | 安装完成后，关于启动参数和配置文件说明：
 4 | 
 5 | blackbox_exporter 二进制文件放置在新创建目录：`/opt/blackbox_exporter` 目录中
 6 | 
 7 | blackbox_exporter 的模块配置参数文件在：`/opt/blackbox_exporter/blackbox.yml` 文件中，如果需要修改模块配置，可直接修改该文件。
 8 | 
 9 | 二进制文件启动都使用 systemd 管理进程，可直接执行下面的命令查看prometheus进程状态：
10 | 
11 | - 状态：sudo systemctl status blackbox_exporter.service
12 | - 停止：sudo systemctl stop blackbox_exporter.service
13 | - 启动：sudo systemctl start blackbox_exporter.service
14 | - 重启：sudo systemctl restart blackbox_exporter.service
15 | - 开机自启：sudo systemctl enable blackbox_exporter.service
16 | 
17 | 更多关于 blackbox_exporter 的教程请查看官方文档：[blackbox_exporter文档](https://github.com/prometheus/blackbox_exporter)


--------------------------------------------------------------------------------
/victoriametrics/binary/blackbox_exporter/install-blackbox.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -e
 3 | 
 4 | # 函数：安装依赖工具
 5 | install_dependencies() {
 6 |     if [ "$OS" == "ubuntu" ] || [ "$OS" == "debian" ]; then
 7 |         apt-get update && apt-get install -y curl wget tar net-tools
 8 |     elif [ "$OS" == "centos" ] || [ "$OS" == "rocky" ]; then
 9 |         dnf update -y && dnf install -y curl wget tar net-tools
10 |     else
11 |         echo "Unsupported operating system."
12 |         exit 1
13 |     fi
14 | }
15 | 
16 | # 函数：设置系统服务和目录
17 | setup_system() {
18 |     # 创建 /opt/blackbox_exporter 目录
19 |     mkdir -p /opt/blackbox_exporter
20 | }
21 | 
22 | # 函数：获取最新版本
23 | get_latest_version() {
24 |     curl -s "https://api.github.com/repos/prometheus/blackbox_exporter/releases/latest" | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/' | sed 's/^v//'
25 | }
26 | 
27 | # 确定操作系统类型
28 | OS="unknown"
29 | if [ -f /etc/os-release ]; then
30 |     . /etc/os-release
31 |     OS=$ID
32 | fi
33 | 
34 | # 安装依赖工具
35 | install_dependencies
36 | 
37 | # 设置系统服务和目录
38 | setup_system
39 | 
40 | # 获取 blackbox_exporter 最新版本
41 | BB_VERSION=$(get_latest_version)
42 | echo "Downloading blackbox_exporter v${BB_VERSION}..."
43 | 
44 | # 下载 blackbox_exporter 二进制文件
45 | wget https://github.com/prometheus/blackbox_exporter/releases/download/v${BB_VERSION}/blackbox_exporter-${BB_VERSION}.linux-amd64.tar.gz -O /tmp/blackbox_exporter.tar.gz
46 | 
47 | # 解压缩并将二进制文件和配置文件移动到 /opt/blackbox_exporter 目录
48 | tar -xzvf /tmp/blackbox_exporter.tar.gz -C /tmp/
49 | mv /tmp/blackbox_exporter-${BB_VERSION}.linux-amd64/blackbox_exporter /opt/blackbox_exporter/
50 | mv /tmp/blackbox_exporter-${BB_VERSION}.linux-amd64/blackbox.yml /opt/blackbox_exporter/
51 | 
52 | # 添加执行权限
53 | chmod +x /opt/blackbox_exporter/blackbox_exporter
54 | 
55 | # 删除临时文件
56 | rm -rf /tmp/blackbox_exporter.tar.gz /tmp/blackbox_exporter-${BB_VERSION}.linux-amd64
57 | 
58 | # 创建 systemd 单元文件
59 | cat > /etc/systemd/system/blackbox_exporter.service <<EOF
60 | [Unit]
61 | Description=Blackbox Exporter Service
62 | After=network.target
63 | 
64 | [Service]
65 | Type=simple
66 | User=root
67 | Group=root
68 | ExecStart=/opt/blackbox_exporter/blackbox_exporter --config.file=/opt/blackbox_exporter/blackbox.yml
69 | WorkingDirectory=/opt/blackbox_exporter
70 | KillMode=process
71 | Restart=on-failure
72 | 
73 | [Install]
74 | WantedBy=multi-user.target
75 | EOF
76 | 
77 | # 重新加载 systemd 配置并启动服务
78 | sudo systemctl daemon-reload
79 | sudo systemctl enable blackbox_exporter.service
80 | sudo systemctl start blackbox_exporter.service
81 | 
82 | echo "blackbox_exporter installation and service setup complete."


--------------------------------------------------------------------------------
/victoriametrics/binary/categraf/README.md:
--------------------------------------------------------------------------------
1 | ## Categraf 二进制部署脚本
2 | 
3 | 第一次运行直接安装，第二次运行实现版本更新


--------------------------------------------------------------------------------
/victoriametrics/binary/categraf/install-categraf-cgo.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # 默认Categraf安装目录
 4 | default_dir="/opt/categraf"
 5 | 
 6 | # 提示用户输入安装目录
 7 | read -p "Please enter the installation directory for Categraf (default is /opt/categraf): " user_input
 8 | 
 9 | # 如果用户输入为空，则使用默认目录
10 | categraf_dir="${user_input:-$default_dir}"
11 | 
12 | # 获取最新版本号
13 | latest_version=$(curl -s https://api.github.com/repos/flashcatcloud/categraf/releases/latest | grep "tag_name" | cut -d '"' -f 4)
14 | # 下载最新版本的Categraf特定包链接
15 | latest_url="https://github.com/flashcatcloud/categraf/releases/download/$latest_version/categraf-$latest_version-linux-amd64-with-cgo-plugin.tar.gz"
16 | 
17 | # 检查Categraf是否已经部署
18 | if [ ! -d "$categraf_dir" ]; then
19 |     echo "Categraf is not deployed. Downloading and deploying latest version..."
20 |     # 创建目标目录并切换到目标目录
21 |     mkdir -p "$categraf_dir" && cd "$categraf_dir" || { echo "Error: Failed to create or change directory." >&2; exit 1; }
22 |     # 下载最新版本的Categraf并解压到指定目录
23 |     wget -qO- "$latest_url" | tar xvz --strip-components=1
24 |     echo "Categraf deployed successfully in $categraf_dir."
25 |     # 复制 categraf.service 到 /etc/systemd/system/ 并启动服务及设置开机自启动
26 |     if [ -f "${categraf_dir}/conf/categraf.service" ]; then
27 |         mv "${categraf_dir}/conf/categraf.service" /etc/systemd/system/
28 |         systemctl daemon-reload
29 |         systemctl start categraf
30 |         systemctl enable categraf
31 |         echo "Categraf service is started and enabled on boot."
32 |     else
33 |         echo "The categraf.service file does not exist. Please check the installation."
34 |     fi
35 | else
36 |     echo "Categraf is already deployed in $categraf_dir. Checking for updates..."
37 |     # 获取当前部署的Categraf版本
38 |     current_version=$("$categraf_dir/categraf" --version | awk '{print $3}')
39 |     # 检查是否是最新版本
40 |     if [ "$current_version" != "$latest_version" ]; then
41 |         echo "Updating Categraf from version $current_version to $latest_version..."
42 |         # 使用categraf --update命令更新到最新版本
43 |         cd $categraf_dir
44 |         ./categraf --update_url $latest_url --update 
45 |         echo "Categraf updated successfully to version $latest_version in $categraf_dir."
46 |     else
47 |         echo "Categraf is already up to date in $categraf_dir."
48 |     fi
49 | fi


--------------------------------------------------------------------------------
/victoriametrics/binary/categraf/install-categraf.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # 定义Categraf安装目录
 4 | categraf_dir="/opt/categraf"
 5 | # 获取最新版本号
 6 | latest_version=$(curl -s https://api.github.com/repos/flashcatcloud/categraf/releases/latest | grep "tag_name" | cut -d '"' -f 4)
 7 | # 下载最新版本的Categraf链接
 8 | # latest_url="https://github.com/flashcatcloud/categraf/releases/download/$latest_version/categraf-$latest_version-linux-amd64.tar.gz"
 9 | latest_url="https://download.flashcat.cloud/categraf-$latest_version-linux-amd64.tar.gz"
10 | # 定义下载文件的名称
11 | categraf_archive="categraf-$latest_version-linux-amd64.tar.gz"
12 | 
13 | # 检查Categraf是否已经部署
14 | if [ ! -d "$categraf_dir" ]; then
15 |     echo "Categraf is not deployed. Downloading and deploying latest version..."
16 |     
17 |     # 创建目标目录
18 |     mkdir -p "$categraf_dir" || { echo "Error: Failed to create directory $categraf_dir." >&2; exit 1; }
19 | 
20 |     # 下载文件到安装目录
21 |     echo "Downloading Categraf $latest_version..."
22 |     wget --show-progress "$latest_url" -O "$categraf_dir/$categraf_archive" || { echo "Error: Failed to download Categraf." >&2; exit 1; }
23 | 
24 |     # 切换到安装目录
25 |     cd "$categraf_dir" || { echo "Error: Failed to change to directory $categraf_dir." >&2; exit 1; }
26 | 
27 |     # 解压下载的文件
28 |     echo "Extracting Categraf..."
29 |     tar -xzf "$categraf_archive" --strip-components=1 || { echo "Error: Failed to extract Categraf." >&2; exit 1; }
30 | 
31 |     # 清理下载的压缩文件
32 |     rm "$categraf_archive"
33 | 
34 |     # 使用新的 --install 命令安装服务
35 |     echo "Installing Categraf as a service..."
36 |     sudo ./categraf --install || { echo "Error: Failed to install Categraf service." >&2; exit 1; }
37 |     
38 |     # 启动并设置 Categraf 服务为开机启动
39 |     sudo systemctl start categraf
40 |     sudo systemctl enable categraf
41 |     echo "Categraf service is started and enabled on boot."
42 | 
43 | else
44 |     echo "Categraf is already deployed. Checking for updates..."
45 | 
46 |     # 获取当前部署的Categraf版本
47 |     current_version=$("$categraf_dir/categraf" --version | awk '{print $3}')
48 |     
49 |     # 检查是否是最新版本
50 |     if [ "$current_version" != "$latest_version" ]; then
51 |         echo "Updating Categraf from version $current_version to $latest_version..."
52 | 
53 |         # 下载新的版本
54 |         wget --show-progress "$latest_url" -O "$categraf_dir/$categraf_archive" || { echo "Error: Failed to download new version." >&2; exit 1; }
55 | 
56 |         # 切换到安装目录
57 |         cd "$categraf_dir" || { echo "Error: Failed to change to directory $categraf_dir." >&2; exit 1; }
58 | 
59 |         # 解压并更新
60 |         tar -xzf "$categraf_archive" --strip-components=1 || { echo "Error: Failed to extract new version." >&2; exit 1; }
61 | 
62 |         # 清理下载的压缩文件
63 |         rm "$categraf_archive"
64 | 
65 |         # 安装新版本
66 |         sudo ./categraf --install || { echo "Error: Failed to update Categraf service." >&2; exit 1; }
67 | 
68 |         echo "Categraf updated successfully."
69 |     else
70 |         echo "Categraf is already up to date."
71 |     fi
72 | fi


--------------------------------------------------------------------------------
/victoriametrics/binary/categraf/update-config.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | # 更新后的路径
  4 | BASE_DIR="/opt/categraf"
  5 | 
  6 | # 配置文件路径
  7 | CONFIG_FILE="$BASE_DIR/conf/config.toml"
  8 | NVIDIA_SMI_CONFIG_FILE="$BASE_DIR/conf/input.nvidia_smi/nvidia_smi.toml"
  9 | EXPORTER_CONFIG_FILE="$BASE_DIR/conf/input.dcgm/exporter.toml"
 10 | EXEC_CONFIG_FILE="$BASE_DIR/conf/input.exec/exec.toml"
 11 | 
 12 | # 检查配置文件是否存在
 13 | if [ ! -f "$CONFIG_FILE" ]; then
 14 |   echo "配置文件 $CONFIG_FILE 不存在。"
 15 |   exit 1
 16 | fi
 17 | 
 18 | if [ ! -f "$NVIDIA_SMI_CONFIG_FILE" ]; then
 19 |   echo "配置文件 $NVIDIA_SMI_CONFIG_FILE 不存在。"
 20 |   exit 1
 21 | fi
 22 | 
 23 | if [ ! -f "$EXPORTER_CONFIG_FILE" ]; then
 24 |   echo "配置文件 $EXPORTER_CONFIG_FILE 不存在。"
 25 |   exit 1
 26 | fi
 27 | 
 28 | if [ ! -f "$EXEC_CONFIG_FILE" ]; then
 29 |   echo "配置文件 $EXEC_CONFIG_FILE 不存在。"
 30 |   exit 1
 31 | fi
 32 | 
 33 | # 检查是否安装 dcgmi 命令
 34 | if ! command -v dcgmi &> /dev/null; then
 35 |   echo "系统中未安装 dcgmi 命令，开始安装..."
 36 |   
 37 |   # 删除旧的 apt-key
 38 |   sudo apt-key del 7fa2af80
 39 |   
 40 |   # 获取系统版本信息
 41 |   distribution=$(. /etc/os-release;echo $ID$VERSION_ID | sed -e 's/\.//g')
 42 |   
 43 |   # 下载并安装 CUDA keyring
 44 |   wget https://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_64/cuda-keyring_1.1-1_all.deb
 45 |   sudo dpkg -i cuda-keyring_1.1-1_all.deb
 46 |   
 47 |   # 更新 apt 源
 48 |   sudo apt-get update
 49 |   
 50 |   # 安装 datacenter-gpu-manager
 51 |   sudo apt-get install -y datacenter-gpu-manager
 52 |   
 53 |   # 启用和重启 nvidia-dcgm 服务
 54 |   sudo systemctl --now enable nvidia-dcgm
 55 |   sudo systemctl --now restart nvidia-dcgm
 56 |   
 57 |   echo "dcgmi 命令安装完成。"
 58 | fi
 59 | 
 60 | # 更新 config.toml 文件中的参数
 61 | sed -i 's|^file_name = "stdout"|file_name = "'"$BASE_DIR"'/logs/categraf.log"|' "$CONFIG_FILE"
 62 | sed -i 's|^\(url = \)".*prometheus/v1/write"$|\1"http://10.6.212.9:17000/prometheus/v1/write"|' "$CONFIG_FILE"
 63 | sed -i 's|^\(url = \)".*v1/n9e/heartbeat"$|\1"http://10.6.212.9:17000/v1/n9e/heartbeat"|' "$CONFIG_FILE"
 64 | 
 65 | echo "配置文件已更新：$CONFIG_FILE"
 66 | 
 67 | # 更新 nvidia_smi.toml 文件中的参数
 68 | sed -i 's|^nvidia_smi_command = ""|nvidia_smi_command = "nvidia-smi"|' "$NVIDIA_SMI_CONFIG_FILE"
 69 | sed -i 's|^# interval = 15|interval = 15|' "$NVIDIA_SMI_CONFIG_FILE"
 70 | 
 71 | echo "配置文件已更新：$NVIDIA_SMI_CONFIG_FILE"
 72 | 
 73 | # 更新 exporter.toml 文件中的参数
 74 | sed -i 's|^#\[\[instances\]\]|\[\[instances\]\]|' "$EXPORTER_CONFIG_FILE"
 75 | sed -i 's|^# collectors = "conf/input.dcgm/default-counters.csv"|collectors = "conf/input.dcgm/dcp-metrics-included.csv"|' "$EXPORTER_CONFIG_FILE"
 76 | 
 77 | echo "配置文件已更新：$EXPORTER_CONFIG_FILE"
 78 | 
 79 | # 更新 exec.toml 文件中的参数
 80 | sed -i 's|^# interval = 15|interval = 15|' "$EXEC_CONFIG_FILE"
 81 | sed -i 's|^\(commands = \[\)|\1\n    "'"$BASE_DIR"'/scripts/*.py"|' "$EXEC_CONFIG_FILE"
 82 | sed -i 's|^# data_format = "influx"|data_format = "prometheus"|' "$EXEC_CONFIG_FILE"
 83 | 
 84 | echo "配置文件已更新：$EXEC_CONFIG_FILE"
 85 | 
 86 | # # 新建日志目录和脚本目录
 87 | # LOGS_DIR="$BASE_DIR/logs"
 88 | # SCRIPTS_DIR="$BASE_DIR/scripts"
 89 | 
 90 | # mkdir -pv "$LOGS_DIR" "$SCRIPTS_DIR"
 91 | 
 92 | # echo "目录已创建：$LOGS_DIR 和 $SCRIPTS_DIR"
 93 | 
 94 | # 重启 Categraf 服务
 95 | echo "正在重启 Categraf 服务..."
 96 | sudo systemctl restart categraf
 97 | 
 98 | # 检查服务状态
 99 | echo "检查 Categraf 服务状态..."
100 | sudo systemctl status categraf --no-pager
101 | 
102 | echo "Categraf 配置和重启完成。"
103 | 


--------------------------------------------------------------------------------
/victoriametrics/binary/grafana/README.md:
--------------------------------------------------------------------------------
 1 | ## 本地部署安装 Grafana 教程
 2 | 
 3 | ### Ubuntu
 4 | 
 5 | APT软件库安装
 6 | 
 7 | ```bash
 8 | sudo apt-get install -y apt-transport-https software-properties-common wget
 9 | 
10 | sudo mkdir -p /etc/apt/keyrings/
11 | wget -q -O - https://apt.grafana.com/gpg.key | gpg --dearmor | sudo tee /etc/apt/keyrings/grafana.gpg > /dev/null
12 | 
13 | echo "deb [signed-by=/etc/apt/keyrings/grafana.gpg] https://apt.grafana.com stable main" | sudo tee -a /etc/apt/sources.list.d/grafana.list
14 | 
15 | sudo apt-get update
16 | 
17 | sudo apt list --all-versions grafana-enterprise
18 | 
19 | sudo apt-get install grafana-enterprise
20 | ```
21 | 
22 | ### CentOS
23 | 
24 | ```bash
25 | wget -q -O gpg.key https://rpm.grafana.com/gpg.key
26 | sudo rpm --import gpg.key
27 | 
28 | sudo vim /etc/yum.repos.d/grafana.repo
29 | 
30 | [grafana]
31 | name=grafana
32 | baseurl=https://rpm.grafana.com
33 | repo_gpgcheck=1
34 | enabled=1
35 | gpgcheck=1
36 | gpgkey=https://rpm.grafana.com/gpg.key
37 | sslverify=1
38 | sslcacert=/etc/pki/tls/certs/ca-bundle.crt
39 | 
40 | # 开源版
41 | sudo dnf install grafana
42 | # 企业版
43 | sudo dnf install grafana-enterprise
44 | ```
45 | 
46 | 如果下载很慢可寻找国内镜像源进行替换下载


--------------------------------------------------------------------------------
/victoriametrics/binary/network_exporter/README.md:
--------------------------------------------------------------------------------
 1 | ## 二进制部署单节点 network_exporter 脚本
 2 | 
 3 | 安装完成后，关于启动参数和配置文件说明：
 4 | 
 5 | network_exporter 二进制文件放置在新创建目录：`/opt/network_exporter` 目录中
 6 | 
 7 | network_exporter 的配置参数文件在：`/opt/network_exporter/network_exporter.yml` 文件中，如果需要配置探测目标，以及自定义 network_exporter 目标，可直接修该文件。
 8 | 
 9 | 二进制文件启动都使用 systemd 管理进程，可直接执行下面的命令查看prometheus进程状态：
10 | 
11 | - 状态：sudo systemctl status network_exporter.service
12 | - 停止：sudo systemctl stop network_exporter.service
13 | - 启动：sudo systemctl start network_exporter.service
14 | - 重启：sudo systemctl restart network_exporter.service
15 | - 开机自启：sudo systemctl enable network_exporter.service
16 | 
17 | 更多关于 network_exporter 的教程请查看官方文档：[network_exporter文档](https://github.com/syepes/network_exporter)


--------------------------------------------------------------------------------
/victoriametrics/binary/network_exporter/install-network.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -e
 3 | 
 4 | # 函数：安装依赖工具
 5 | install_dependencies() {
 6 |     if [ "$OS" == "ubuntu" ] || [ "$OS" == "debian" ]; then
 7 |         apt-get update && apt-get install -y curl wget tar net-tools
 8 |     elif [ "$OS" == "centos" ] || [ "$OS" == "rocky" ]; then
 9 |         dnf update -y && dnf install -y curl wget tar net-tools
10 |     else
11 |         echo "Unsupported operating system."
12 |         exit 1
13 |     fi
14 | }
15 | 
16 | # 函数：设置系统服务和目录
17 | setup_system() {
18 |     # 创建 /opt/network_exporter 目录
19 |     mkdir -p /opt/network_exporter
20 | }
21 | 
22 | # 函数：获取最新版本
23 | get_latest_version() {
24 |     curl -s "https://api.github.com/repos/syepes/network_exporter/releases/latest" | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/' | sed 's/^v//'
25 | }
26 | 
27 | # 确定操作系统类型
28 | OS="unknown"
29 | if [ -f /etc/os-release ]; then
30 |     . /etc/os-release
31 |     OS=$ID
32 | fi
33 | 
34 | # 安装依赖工具
35 | install_dependencies
36 | 
37 | # 设置系统服务和目录
38 | setup_system
39 | 
40 | # 获取 network_exporter 最新版本
41 | NE_VERSION=$(get_latest_version)
42 | echo "Downloading network_exporter v${NE_VERSION}..."
43 | 
44 | # 下载 network_exporter 二进制文件
45 | wget https://github.com/syepes/network_exporter/releases/download/${NE_VERSION}/network_exporter_${NE_VERSION}.Linux_x86_64.tar.gz -O /tmp/network_exporter.tar.gz
46 | 
47 | # 解压并仅保留需要的文件
48 | tar -xzvf /tmp/network_exporter.tar.gz -C /opt/network_exporter/
49 | 
50 | # 添加执行权限
51 | chmod +x /opt/network_exporter/network_exporter
52 | 
53 | # 删除临时文件
54 | rm -rf /tmp/network_exporter.tar.gz /tmp/network_exporter_${NE_VERSION}.Linux_x86_64
55 | 
56 | # 创建 systemd 单元文件
57 | cat > /etc/systemd/system/network_exporter.service <<EOF
58 | [Unit]
59 | Description=Network Exporter Service
60 | After=network.target
61 | 
62 | [Service]
63 | Type=simple
64 | User=root
65 | Group=root
66 | ExecStart=/opt/network_exporter/network_exporter --web.listen-address=:9427 --config.file=/opt/network_exporter/network_exporter.yml --log.level=info
67 | WorkingDirectory=/opt/network_exporter
68 | KillMode=process
69 | Restart=on-failure
70 | 
71 | [Install]
72 | WantedBy=multi-user.target
73 | EOF
74 | 
75 | # 重新加载 systemd 配置并启动服务
76 | sudo systemctl daemon-reload
77 | sudo systemctl enable network_exporter.service
78 | sudo systemctl start network_exporter.service
79 | 
80 | echo "network_exporter installation and service setup complete."


--------------------------------------------------------------------------------
/victoriametrics/binary/node_exporter/README.md:
--------------------------------------------------------------------------------
 1 | ## 二进制部署单节点 node_exporter 脚本
 2 | 
 3 | 安装完成后，关于启动参数和配置文件说明：
 4 | 
 5 | node_exporter 二进制文件放置在新创建目录：`/usr/sbin` 目录中
 6 | 
 7 | node_exporter 的配置参数文件在：`/etc/sysconfig/node_exporter` 文件中，如果需要修改启动参数，可直接修改该文件。
 8 | 
 9 | 二进制文件启动都使用 systemd 管理进程，可直接执行下面的命令查看prometheus进程状态：
10 | 
11 | - 状态：sudo systemctl status node_exporter.service
12 | - 停止：sudo systemctl stop node_exporter.service
13 | - 启动：sudo systemctl start node_exporter.service
14 | - 重启：sudo systemctl restart node_exporter.service
15 | - 开机自启：sudo systemctl enable node_exporter.service
16 | 
17 | 更多关于 node_exporter 的教程请查看官方文档：[node_exporter文档](https://github.com/prometheus/node_exporter)


--------------------------------------------------------------------------------
/victoriametrics/binary/node_exporter/install-node.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | set -e
  3 | 
  4 | # 函数：安装依赖工具
  5 | install_dependencies() {
  6 |     if [ "$OS" == "ubuntu" ] || [ "$OS" == "debian" ]; then
  7 |         apt-get update && apt-get install -y curl wget tar net-tools
  8 |     elif [ "$OS" == "centos" ] || [ "$OS" == "rocky" ]; then
  9 |         dnf update -y && dnf install -y curl wget tar net-tools
 10 |     else
 11 |         echo "Unsupported operating system."
 12 |         exit 1
 13 |     fi
 14 | }
 15 | 
 16 | # 函数：设置系统服务和用户
 17 | setup_system() {
 18 |     # 检查node_exporter组是否存在，不存在则创建
 19 |     if ! getent group node_exporter > /dev/null 2>&1; then
 20 |         groupadd --system node_exporter
 21 |     fi
 22 | 
 23 |     # 检查node_exporter用户是否存在，不存在则创建
 24 |     if ! id -u node_exporter > /dev/null 2>&1; then
 25 |         useradd --system --no-create-home --shell /sbin/nologin --gid node_exporter node_exporter
 26 |     fi
 27 | 
 28 |     # 创建textfile_collector目录并设置权限
 29 |     mkdir -p /var/lib/node_exporter/textfile_collector
 30 |     chown -R node_exporter:node_exporter /var/lib/node_exporter
 31 | }
 32 | 
 33 | # 函数：确定操作系统类型
 34 | setup_config_path() {
 35 |     if [ -f /etc/os-release ]; then
 36 |         . /etc/os-release
 37 |         if [[ "$ID" == "ubuntu" ]]; then
 38 |             CONFIG_PATH="/etc/default/node_exporter"
 39 |         elif [[ "$ID" == "centos" ]] || [[ "$ID" == "rocky" ]]; then
 40 |             CONFIG_PATH="/etc/sysconfig/node_exporter"
 41 |         else
 42 |             echo "Unsupported operating system."
 43 |             exit 1
 44 |         fi
 45 |     else
 46 |         echo "Cannot detect the operating system."
 47 |         exit 1
 48 |     fi
 49 | }
 50 | 
 51 | # 确定操作系统类型
 52 | OS="unknown"
 53 | if [ -f /etc/os-release ]; then
 54 |     . /etc/os-release
 55 |     OS=$ID
 56 | fi
 57 | 
 58 | # 安装依赖工具
 59 | install_dependencies
 60 | 
 61 | # 设置系统服务和用户
 62 | setup_system
 63 | 
 64 | # 获取 node_exporter 最新版本
 65 | NE_VERSION=$(curl -s https://api.github.com/repos/prometheus/node_exporter/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/' | sed 's/^v//')
 66 | 
 67 | # 下载并安装 node_exporter
 68 | wget https://github.com/prometheus/node_exporter/releases/download/v${NE_VERSION}/node_exporter-${NE_VERSION}.linux-amd64.tar.gz -O /tmp/node_exporter.tar.gz
 69 | tar -xzvf /tmp/node_exporter.tar.gz -C /tmp
 70 | mv /tmp/node_exporter-${NE_VERSION}.linux-amd64/node_exporter /usr/sbin/
 71 | chmod +x /usr/sbin/node_exporter
 72 | 
 73 | # 写入配置文件
 74 | setup_config_path
 75 | cat > "$CONFIG_PATH" <<EOF
 76 | # Node Exporter configuration
 77 | OPTIONS="--collector.textfile.directory=/var/lib/node_exporter/textfile_collector"
 78 | EOF
 79 | 
 80 | # 创建 systemd 服务文件
 81 | cat > /etc/systemd/system/node_exporter.service <<EOF
 82 | [Unit]
 83 | Description=Node Exporter
 84 | Requires=node_exporter.socket
 85 | After=network.target
 86 | 
 87 | [Service]
 88 | Type=simple
 89 | User=node_exporter
 90 | Group=node_exporter
 91 | EnvironmentFile=/etc/default/node_exporter
 92 | ExecStart=/usr/sbin/node_exporter --web.systemd-socket \$OPTIONS
 93 | WorkingDirectory=/var/lib/node_exporter
 94 | KillMode=process
 95 | Restart=on-failure
 96 | 
 97 | [Install]
 98 | WantedBy=multi-user.target
 99 | EOF
100 | 
101 | # 创建 systemd socket 文件
102 | cat > /etc/systemd/system/node_exporter.socket <<EOF
103 | [Unit]
104 | Description=Node Exporter
105 | 
106 | [Socket]
107 | ListenStream=9100
108 | 
109 | [Install]
110 | WantedBy=sockets.target
111 | EOF
112 | 
113 | # 创建 /var/lib/node_exporter/textfile_collector 目录并设置权限
114 | mkdir -p /var/lib/node_exporter/textfile_collector
115 | chown -R node_exporter:node_exporter /var/lib/node_exporter/textfile_collector
116 | 
117 | # 启用并启动服务
118 | systemctl daemon-reload
119 | systemctl enable node_exporter.service
120 | systemctl enable node_exporter.socket
121 | systemctl start node_exporter.socket
122 | systemctl start node_exporter.service
123 | 
124 | # 清理临时文件
125 | rm -rf /tmp/node_exporter*
126 | 
127 | echo "Node Exporter installation and service setup complete."


--------------------------------------------------------------------------------
/victoriametrics/binary/prometheus/README.md:
--------------------------------------------------------------------------------
 1 | ## 二进制部署单节点 Prometheus 脚本
 2 | 
 3 | 安装完成后，关于启动参数和配置文件说明：
 4 | 
 5 | prometheus 和 promtool 二进制文件分别放置在系统目录：`/usr/bin` 目录中
 6 | 
 7 | 环境变量启动参数在：`/etc/prometheus/single/prometheus.conf` 文件中，如果需要修改 prometheus 的启动参数，可直接把参数放在这个文件中的固定格式，然后重启 prometheus 进程即可。
 8 | 
 9 | prometheus 的配置参数文件在：`/etc/prometheus/single/prometheus.yml` 文件中，如果需要配置抓取任务、告警规则文件、以及 Alertmanager 地址和全局标签可在这个文件中修改冲重加载即可。
10 | 
11 | 二进制文件启动都使用 systemd 管理进程，可直接执行下面的命令查看prometheus进程状态：
12 | 
13 | - 状态：sudo systemctl status prometheus.service
14 | - 停止：sudo systemctl stop prometheus.service
15 | - 启动：sudo systemctl start prometheus.service
16 | - 重启：sudo systemctl restart prometheus.service
17 | - 开机自启：sudo systemctl enable prometheus.service


--------------------------------------------------------------------------------
/victoriametrics/binary/prometheus/install-promsingle.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | set -e
  3 | 
  4 | # 函数：安装依赖工具
  5 | install_dependencies() {
  6 |     if [ "$OS" == "ubuntu" ] || [ "$OS" == "debian" ]; then
  7 |         apt-get update && apt-get install -y curl wget net-tools
  8 |     elif [ "$OS" == "centos" ] || [ "$OS" == "rocky" ]; then
  9 |         dnf update -y && dnf install -y curl wget net-tools
 10 |     else
 11 |         echo "Unsupported operating system."
 12 |         exit 1
 13 |     fi
 14 | }
 15 | 
 16 | # 函数：设置系统服务和用户
 17 | setup_system() {
 18 |     # 创建Prometheus配置文件目录
 19 |     mkdir -p /etc/prometheus/single
 20 |     # 创建Prometheus数据保存目录
 21 |     mkdir -p /var/lib/prometheus
 22 | 
 23 |     # 检查Prometheus组是否存在，不存在则创建
 24 |     if ! getent group prometheus > /dev/null 2>&1; then
 25 |         groupadd --system prometheus
 26 |     fi
 27 | 
 28 |     # 检查Prometheus用户是否存在，不存在则创建
 29 |     if ! id -u prometheus > /dev/null 2>&1; then
 30 |         useradd --system --home-dir /var/lib/prometheus --no-create-home --gid prometheus prometheus
 31 |     fi
 32 | 
 33 |     chown -R prometheus:prometheus /var/lib/prometheus
 34 | }
 35 | 
 36 | # 确定操作系统类型
 37 | OS="unknown"
 38 | if [ -f /etc/os-release ]; then
 39 |     . /etc/os-release
 40 |     OS=$ID
 41 | fi
 42 | 
 43 | # 安装依赖工具
 44 | install_dependencies
 45 | 
 46 | # 设置系统服务和用户
 47 | setup_system
 48 | 
 49 | # 获取Prometheus最新版本
 50 | PROM_VERSION=$(curl -s "https://api.github.com/repos/prometheus/prometheus/tags" | grep '"name":' | head -n 1 | awk -F '"' '{print $4}' | sed 's/^v//')
 51 | 
 52 | # 下载并安装Prometheus
 53 | wget https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/prometheus-${PROM_VERSION}.linux-amd64.tar.gz -O /tmp/prometheus.tar.gz
 54 | 
 55 | # 解压Prometheus文件
 56 | tar -xzvf /tmp/prometheus.tar.gz -C /tmp
 57 | 
 58 | # 移动Prometheus可执行文件到/usr/bin目录
 59 | mv /tmp/prometheus-${PROM_VERSION}.linux-amd64/prometheus /usr/bin/
 60 | mv /tmp/prometheus-${PROM_VERSION}.linux-amd64/promtool /usr/bin/
 61 | chmod +x /usr/bin/prometheus /usr/bin/promtool
 62 | 
 63 | # 将解压后的 prometheus.yml 配置文件复制到 /etc/prometheus/single 目录
 64 | cp /tmp/prometheus-${PROM_VERSION}.linux-amd64/prometheus.yml /etc/prometheus/single/
 65 | 
 66 | # 将 consoles 和 console_libraries 目录复制到 /var/lib/prometheus
 67 | cp -r /tmp/prometheus-${PROM_VERSION}.linux-amd64/consoles /var/lib/prometheus/
 68 | cp -r /tmp/prometheus-${PROM_VERSION}.linux-amd64/console_libraries /var/lib/prometheus/
 69 | 
 70 | # 清理临时文件
 71 | rm -rf /tmp/prometheus-${PROM_VERSION}.linux-amd64
 72 | rm /tmp/prometheus.tar.gz
 73 | 
 74 | # 设置systemd服务
 75 | cat> /etc/systemd/system/prometheus.service <<EOF
 76 | [Unit]
 77 | Description=Prometheus Monitoring System and Time Series Database
 78 | After=network.target
 79 | 
 80 | [Service]
 81 | Type=simple
 82 | User=prometheus
 83 | Group=prometheus
 84 | WorkingDirectory=/var/lib/prometheus
 85 | ReadWritePaths=/var/lib/prometheus
 86 | StartLimitBurst=5
 87 | StartLimitInterval=0
 88 | Restart=on-failure
 89 | RestartSec=5
 90 | EnvironmentFile=-/etc/prometheus/single/prometheus.conf
 91 | ExecStart=/usr/bin/prometheus \$ARGS
 92 | ExecStop=/bin/kill -s SIGTERM \$MAINPID
 93 | ExecReload=/bin/kill -HUP \$MAINPID
 94 | ProtectSystem=full
 95 | LimitNOFILE=1048576
 96 | LimitNPROC=1048576
 97 | LimitCORE=infinity
 98 | StandardOutput=syslog
 99 | StandardError=syslog
100 | SyslogIdentifier=prometheus
101 | PrivateTmp=yes
102 | ProtectHome=yes
103 | NoNewPrivileges=yes
104 | ProtectSystem=strict
105 | ProtectControlGroups=true
106 | ProtectKernelModules=true
107 | ProtectKernelTunables=yes
108 | 
109 | [Install]
110 | WantedBy=multi-user.target
111 | EOF
112 | 
113 | # 创建prometheus.conf文件，包含启动参数
114 | cat> /etc/prometheus/single/prometheus.conf <<EOF
115 | ARGS="--web.listen-address=0.0.0.0:9090 --config.file=/etc/prometheus/single/prometheus.yml --storage.tsdb.path=/var/lib/prometheus/data --storage.tsdb.retention=90d --web.enable-lifecycle --web.enable-admin-api --web.enable-remote-write-receiver --web.console.templates=/var/lib/prometheus/consoles --web.console.libraries=/var/lib/prometheus/console_libraries"
116 | EOF
117 | 
118 | # 设置权限
119 | chown -R prometheus:prometheus /var/lib/prometheus
120 | chown -R prometheus:prometheus /etc/prometheus/single
121 | 
122 | # 重新加载systemd配置并启动服务
123 | sudo systemctl daemon-reload
124 | sudo systemctl enable prometheus.service
125 | sudo systemctl restart prometheus.service
126 | 
127 | # 验证Prometheus是否运行
128 | ps aux | grep prometheus
129 | 
130 | echo "Prometheus installation and service setup complete."
131 | 


--------------------------------------------------------------------------------
/victoriametrics/binary/victoriametrics/README.md:
--------------------------------------------------------------------------------
 1 | ## 二进制部署单节点 VictoriaMetrics 脚本
 2 | 
 3 | 安装完成后，关于启动参数和配置文件说明：
 4 | 
 5 | victoria-metrics-prod 二进制文件放置在系统目录：`/usr/bin` 目录中
 6 | 
 7 | 环境变量启动参数在：`/etc/victoriametrics/single/vmsingle.conf` 文件中，如果需要修改 VictoriaMetrics 的启动参数，可直接把参数放在这个文件中的固定格式，然后重启 VictoriaMetrics 进程即可。
 8 | 
 9 | 时序库数据保存的文件路径：`/var/lib/victoria-metrics-data` 目录中。
10 | 
11 | VictoriaMetrics 单节点支持抓取任务，标签重置等功能，只需要在环境变量启动参数中添加：`-promscrape.config=scrape.yaml` 参数即可实现主动 `pull` 抓取目标指标数据。
12 | 
13 | 一般建议把文件：`/etc/victoriametrics/single/scrape.yaml` 放在和 `/etc/victoriametrics/single/vmsingle.conf` 文件同目录下，如何写 `scrape.yaml` 可以直接查看文档：[抓取配置](https://docs.victoriametrics.com/scrape_config_examples/) 但是由于我们的架构是使用 `vmagent` 故在这里VictoriaMetrics 就是很纯粹的当作时序库存储数据即可，其他抓取任务配置、标签重置、聚合等都交给 `vmagent` ，而告警规则配置交给 `vmalert` 即可，从而 VictoriaMetrics 解放双手，纯粹做单一事务。
14 | 
15 | 二进制文件启动都使用 systemd 管理进程，可直接执行下面的命令查看prometheus进程状态：
16 | 
17 | - 状态：sudo systemctl status victoria-metrics.service
18 | - 停止：sudo systemctl stop victoria-metrics.service
19 | - 启动：sudo systemctl start victoria-metrics.service
20 | - 重启：sudo systemctl restart victoria-metrics.service
21 | - 开机自启：sudo systemctl enable victoria-metrics.service


--------------------------------------------------------------------------------
/victoriametrics/binary/victoriametrics/install-vmsingle.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | set -e
  3 | 
  4 | # 函数：安装依赖工具
  5 | install_dependencies() {
  6 |     if [ "$OS" == "ubuntu" ] || [ "$OS" == "debian" ]; then
  7 |         apt-get update && apt-get install -y curl wget net-tools
  8 |     elif [ "$OS" == "centos" ]; then
  9 |         yum update && yum install -y curl wget net-tools
 10 |     else
 11 |         echo "Unsupported operating system."
 12 |         exit 1
 13 |     fi
 14 | }
 15 | 
 16 | # 函数：设置系统服务和用户
 17 | setup_system() {
 18 |     # 创建victoriametrics配置文件目录
 19 |     mkdir -p /etc/victoriametrics/single
 20 |     # 创建victoriametrics数据保存目录
 21 |     mkdir -p /var/lib/victoria-metrics-data
 22 | 
 23 |     # 检查victoriametrics组是否存在，不存在则创建
 24 |     if ! getent group victoriametrics > /dev/null 2>&1; then
 25 |         groupadd --system victoriametrics
 26 |     fi
 27 | 
 28 |     # 检查victoriametrics用户是否存在，不存在则创建
 29 |     if ! id -u victoriametrics > /dev/null 2>&1; then
 30 |         useradd --system --home-dir /var/lib/victoria-metrics-data --no-create-home --gid victoriametrics victoriametrics
 31 |     fi
 32 | 
 33 |     chown -R victoriametrics:victoriametrics /var/lib/victoria-metrics-data
 34 | }
 35 | 
 36 | # 确定操作系统类型
 37 | OS="unknown"
 38 | if [ -f /etc/os-release ]; then
 39 |     . /etc/os-release
 40 |     OS=$ID
 41 | fi
 42 | 
 43 | # 安装依赖工具
 44 | install_dependencies
 45 | 
 46 | # 设置系统服务和用户
 47 | setup_system
 48 | 
 49 | # 获取VictoriaMetrics最新版本
 50 | VM_VERSION=$(curl -s "https://api.github.com/repos/VictoriaMetrics/VictoriaMetrics/tags" | grep '"name":' | head -n 1 | awk -F '"' '{print $4}')
 51 | 
 52 | # 下载并安装VictoriaMetrics
 53 | wget https://github.com/VictoriaMetrics/VictoriaMetrics/releases/download/${VM_VERSION}/victoria-metrics-linux-amd64-${VM_VERSION}.tar.gz -O /tmp/victoria-metrics.tar.gz
 54 | 
 55 | tar -xzvf /tmp/victoria-metrics.tar.gz -C /tmp
 56 | mv /tmp/victoria-metrics-prod /usr/bin/
 57 | chmod +x /usr/bin/victoria-metrics-prod
 58 | 
 59 | # 清理 /tmp 目录中的压缩文件和解压后的文件
 60 | rm -rf /tmp/victoria-metrics.tar.gz /tmp/victoria-metrics-prod*
 61 | 
 62 | cat> /etc/systemd/system/victoria-metrics.service <<EOF
 63 | [Unit]
 64 | Description=VictoriaMetrics is a fast, cost-effective and scalable monitoring solution and time series database.
 65 | # https://docs.victoriametrics.com
 66 | After=network.target
 67 | 
 68 | [Service]
 69 | Type=simple
 70 | User=victoriametrics
 71 | Group=victoriametrics
 72 | WorkingDirectory=/var/lib/victoria-metrics-data
 73 | ReadWritePaths=/var/lib/victoria-metrics-data
 74 | StartLimitBurst=5
 75 | StartLimitInterval=0
 76 | Restart=on-failure
 77 | RestartSec=5
 78 | EnvironmentFile=-/etc/victoriametrics/single/vmsingle.conf
 79 | ExecStart=/usr/bin/victoria-metrics-prod \$ARGS
 80 | ExecStop=/bin/kill -s SIGTERM \$MAINPID
 81 | ExecReload=/bin/kill -HUP \$MAINPID
 82 | # See docs https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#tuning
 83 | ProtectSystem=full
 84 | LimitNOFILE=1048576
 85 | LimitNPROC=1048576
 86 | LimitCORE=infinity
 87 | StandardOutput=syslog
 88 | StandardError=syslog
 89 | SyslogIdentifier=vmsingle
 90 | PrivateTmp=yes
 91 | ProtectHome=yes
 92 | NoNewPrivileges=yes
 93 | ProtectSystem=strict
 94 | ProtectControlGroups=true
 95 | ProtectKernelModules=true
 96 | ProtectKernelTunables=yes
 97 | 
 98 | [Install]
 99 | WantedBy=multi-user.target
100 | EOF
101 | 
102 | cat> /etc/victoriametrics/single/vmsingle.conf <<EOF
103 | ARGS="-storageDataPath=/var/lib/victoria-metrics-data -retentionPeriod=90d -httpListenAddr=:8428 -selfScrapeInterval=15s -vmui.defaultTimezone=Local"
104 | EOF
105 | 
106 | chown -R victoriametrics:victoriametrics /var/lib/victoria-metrics-data
107 | chown -R victoriametrics:victoriametrics /etc/victoriametrics/single
108 | 
109 | sudo systemctl enable victoria-metrics.service
110 | sudo systemctl restart victoria-metrics.service
111 | ps aux | grep victoria-metrics
112 | 
113 | echo "VictoriaMetrics installation and service setup complete."


--------------------------------------------------------------------------------
/victoriametrics/binary/victoriametrics/vmsingle.conf:
--------------------------------------------------------------------------------
1 | # See https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#list-of-command-line-flags to get more information about supported command-line flags
2 | # 
3 | # If you use IPv6 pleas add "-enableTCP6" to args line
4 | ARGS="-storageDataPath=/var/lib/victoria-metrics-data -retentionPeriod=90d -httpListenAddr=:8428 -selfScrapeInterval=15s -vmui.defaultTimezone=Local"


--------------------------------------------------------------------------------
/victoriametrics/binary/vmagent/README.md:
--------------------------------------------------------------------------------
 1 | ## 二进制部署 vmagent 脚本
 2 | 
 3 | 安装完成后，关于启动参数和配置文件说明：
 4 | 
 5 | vmagent-prod 二进制文件放置在系统目录：`/usr/bin` 目录中
 6 | 
 7 | 环境变量启动参数在：`/etc/victoriametrics/vmagent/vmagent.conf` 文件中，如果需要修改 vmagent 的启动参数，可直接把参数放在这个文件中的固定格式，然后重启 vmagent 进程即可。
 8 | 
 9 | 时序库数据保存的文件路径：`/var/lib/vmagent-remotewrite-data` 目录中。
10 | 
11 | vmagent 作为一个前置代理，可以替代 VictoriaMetrics 的部分功能，例如抓取任务、标签重置、临时缓存、各种 **`pull`** 和 **`push`** 协议对接等等。这里的抓取任务配置文件：`/etc/victoriametrics/vmagent/scrape.yml` 。
12 | 
13 | 二进制文件启动都使用 systemd 管理进程，可直接执行下面的命令查看prometheus进程状态：
14 | 
15 | - 状态：sudo systemctl status vmagent.service
16 | - 停止：sudo systemctl stop vmagent.service
17 | - 启动：sudo systemctl start vmagent.service
18 | - 重启：sudo systemctl restart vmagent.service
19 | - 开机自启：sudo systemctl enable vmagent.service


--------------------------------------------------------------------------------
/victoriametrics/binary/vmagent/install-vmagent.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | set -e
  3 | 
  4 | # 函数：安装依赖工具
  5 | install_dependencies() {
  6 |     if [ "$OS" == "ubuntu" ] || [ "$OS" == "debian" ]; then
  7 |         apt-get update && apt-get install -y curl wget net-tools
  8 |     elif [ "$OS" == "centos" ]; then
  9 |         yum update && yum install -y curl wget net-tools
 10 |     else
 11 |         echo "Unsupported operating system."
 12 |         exit 1
 13 |     fi
 14 | }
 15 | 
 16 | # 函数：设置系统服务和用户
 17 | setup_system() {
 18 |     # 创建vmagent配置文件目录
 19 |     mkdir -p /etc/victoriametrics/vmagent
 20 |     # 创建vmagent临时数据缓存目录
 21 |     mkdir -p /var/lib/vmagent-remotewrite-data
 22 | 
 23 |     # 检查victoriametrics组是否存在，不存在则创建
 24 |     if ! getent group victoriametrics > /dev/null 2>&1; then
 25 |         groupadd --system victoriametrics
 26 |     fi
 27 | 
 28 |     # 检查victoriametrics用户是否存在，不存在则创建
 29 |     if ! id -u victoriametrics > /dev/null 2>&1; then
 30 |         useradd --system --home-dir /var/lib/vmagent-remotewrite-data --no-create-home --gid victoriametrics victoriametrics
 31 |     fi
 32 | 
 33 |     chown -R victoriametrics:victoriametrics /var/lib/vmagent-remotewrite-data
 34 | }
 35 | 
 36 | # 确定操作系统类型
 37 | OS="unknown"
 38 | if [ -f /etc/os-release ]; then
 39 |     . /etc/os-release
 40 |     OS=$ID
 41 | fi
 42 | 
 43 | # 安装依赖工具
 44 | install_dependencies
 45 | 
 46 | # 设置系统服务和用户
 47 | setup_system
 48 | 
 49 | # 获取vmagent最新版本
 50 | VM_VERSION=$(curl -s "https://api.github.com/repos/VictoriaMetrics/VictoriaMetrics/tags"  | grep '"name":' | head -n 1 | awk -F '"' '{print $4}')
 51 | 
 52 | # 下载并安装vmagent
 53 | wget https://github.com/VictoriaMetrics/VictoriaMetrics/releases/download/${VM_VERSION}/vmutils-linux-amd64-${VM_VERSION}.tar.gz -O /tmp/vmutils.tar.gz
 54 | 
 55 | cd /tmp && tar -xzvf /tmp/vmutils.tar.gz vmagent-prod
 56 | mv /tmp/vmagent-prod /usr/bin
 57 | chmod +x /usr/bin/vmagent-prod
 58 | 
 59 | # 清理 /tmp 目录中的压缩文件和解压后的临时文件
 60 | rm -rf /tmp/vmutils.tar.gz /tmp/vmagent-prod*
 61 | 
 62 | cat> /etc/systemd/system/vmagent.service <<EOF
 63 | [Unit]
 64 | Description=vmagent is a tiny but mighty agent which helps you collect metrics from various sources and store them in VictoriaMetrics or any other Prometheus-compatible storage systems that support the remote_write protocol.
 65 | # https://docs.victoriametrics.com/vmagent.html
 66 | After=network.target
 67 | 
 68 | [Service]
 69 | Type=simple
 70 | User=victoriametrics
 71 | Group=victoriametrics
 72 | StartLimitBurst=5
 73 | StartLimitInterval=0
 74 | Restart=on-failure
 75 | RestartSec=1
 76 | EnvironmentFile=-/etc/victoriametrics/vmagent/vmagent.conf
 77 | ExecStart=/usr/bin/vmagent-prod \$ARGS
 78 | ExecStop=/bin/kill -s SIGTERM \$MAINPID
 79 | ExecReload=/bin/kill -HUP \$MAINPID
 80 | # See docs https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#tuning
 81 | LimitNOFILE=1048576
 82 | LimitNPROC=1048576
 83 | LimitCORE=infinity
 84 | WorkingDirectory=/var/lib/vmagent-remotewrite-data
 85 | ReadWritePaths=/var/lib/vmagent-remotewrite-data
 86 | StandardOutput=syslog
 87 | StandardError=syslog
 88 | SyslogIdentifier=vmagent
 89 | PrivateTmp=yes
 90 | ProtectHome=yes
 91 | NoNewPrivileges=yes
 92 | ProtectSystem=strict
 93 | ProtectControlGroups=true
 94 | ProtectKernelModules=true
 95 | ProtectKernelTunables=yes
 96 | 
 97 | [Install]
 98 | WantedBy=multi-user.target
 99 | EOF
100 | 
101 | cat> /etc/victoriametrics/vmagent/vmagent.conf <<EOF
102 | ARGS="-promscrape.config=/etc/victoriametrics/vmagent/scrape.yml -remoteWrite.url=http://127.0.0.1:8428/api/v1/write -remoteWrite.tmpDataPath=/var/lib/vmagent-remotewrite-data -promscrape.suppressScrapeErrors"
103 | EOF
104 | 
105 | cat> /etc/victoriametrics/vmagent/scrape.yml <<EOF
106 | global:
107 |   scrape_interval: 10s
108 |   scrape_timeout: 30s
109 | 
110 | scrape_configs:
111 |   - job_name: 'vmagent'
112 |     static_configs:
113 |       - targets: ['127.0.0.1:8429']
114 |   - job_name: victoriametrics
115 |     static_configs:
116 |       - targets:
117 |         - http://127.0.0.1:8428/metrics
118 | EOF
119 | 
120 | chown -R victoriametrics:victoriametrics /var/lib/vmagent-remotewrite-data
121 | chown -R victoriametrics:victoriametrics /etc/victoriametrics/vmagent
122 | 
123 | sudo systemctl enable vmagent.service
124 | sudo systemctl restart vmagent.service
125 | ps aux | grep vmagent
126 | 
127 | echo "vmagent installation and service setup complete."


--------------------------------------------------------------------------------
/victoriametrics/binary/vmagent/vmagent.conf:
--------------------------------------------------------------------------------
 1 | # https://docs.victoriametrics.com/vmagent.html
 2 | #
 3 | # Example command line:
 4 | # /path/to/vmagent -promscrape.config=/path/to/prometheus.yml -remoteWrite.url=https://victoria-metrics-host:8428/api/v1/write
 5 | #
 6 | # Please note that to write scraped data from vmagent to VictoriaMetrics Cluster you should use url like in this example -remoteWrite.url=http://vminsert-ip:8480/insert/0/prometheus/ .
 7 | # See more information here https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#url-format.
 8 | # 
 9 | # If you only need to collect Influx data, then the following command is sufficient:
10 | #
11 | # /path/to/vmagent -remoteWrite.url=https://victoria-metrics-host:8428/api/v1/write
12 | #
13 | # Then send Influx data to http://vmagent-host:8429. See these https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-send-data-from-influxdb-compatible-agents-such-as-telegraf for more details.
14 | 
15 | ARGS="-promscrape.config=/etc/victoriametrics/vmagent/scrape.yml -remoteWrite.url=http://127.0.0.1:8428/api/v1/write -remoteWrite.tmpDataPath=/var/lib/vmagent-remotewrite-data"


--------------------------------------------------------------------------------
/victoriametrics/binary/vmalert/README.md:
--------------------------------------------------------------------------------
 1 | ## 二进制部署 vmalert 脚本
 2 | 
 3 | 安装完成后，关于启动参数和配置文件说明：
 4 | 
 5 | vmalert-prod 二进制文件放置在系统目录：`/usr/bin` 目录中
 6 | 
 7 | 环境变量启动参数在：`/etc/victoriametrics/vmalert/vmalert.conf` 文件中，如果需要修改 vmagent 的启动参数，可直接把参数放在这个文件中的固定格式，然后重启 vmalert 进程即可。
 8 | 
 9 | 时序库数据保存的文件路径：`/var/lib/vmagent-remotewrite-data` 目录中。
10 | 
11 | vmalert 作为一个前置代理，可以替代 VictoriaMetrics 的部分功能，例如抓取任务、标签重置、临时缓存、各种 **`pull`** 和 **`push`** 协议对接等等。这里的抓取任务配置文件：`/etc/victoriametrics/vmagent/scrape.yml` 。
12 | 
13 | 二进制文件启动都使用 systemd 管理进程，可直接执行下面的命令查看prometheus进程状态：
14 | 
15 | - 状态：sudo systemctl status vmalert.service
16 | - 停止：sudo systemctl stop vmalert.service
17 | - 启动：sudo systemctl start vmalert.service
18 | - 重启：sudo systemctl restart vmalert.service
19 | - 开机自启：sudo systemctl enable vmalert.service


--------------------------------------------------------------------------------
/victoriametrics/binary/vmalert/install-vmalert.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | set -e
  3 | 
  4 | # 函数：安装依赖工具
  5 | install_dependencies() {
  6 |     if [ "$OS" == "ubuntu" ] || [ "$OS" == "debian" ]; then
  7 |         apt-get update && apt-get install -y curl wget net-tools
  8 |     elif [ "$OS" == "centos" ]; then
  9 |         yum update && yum install -y curl wget net-tools
 10 |     else
 11 |         echo "Unsupported operating system."
 12 |         exit 1
 13 |     fi
 14 | }
 15 | 
 16 | # 函数：设置系统服务和用户
 17 | setup_system() {
 18 |     # 创建vmalert配置文件目录
 19 |     mkdir -p /etc/victoriametrics/vmalert
 20 | 
 21 |     # 检查victoriametrics组是否存在，不存在则创建
 22 |     if ! getent group victoriametrics > /dev/null 2>&1; then
 23 |         groupadd --system victoriametrics
 24 |     fi
 25 | 
 26 |     # 检查victoriametrics用户是否存在，不存在则创建
 27 |     if ! id -u victoriametrics > /dev/null 2>&1; then
 28 |         useradd --system --home-dir /var/lib/victoriametrics --no-create-home --gid victoriametrics victoriametrics
 29 |     fi
 30 | }
 31 | 
 32 | # 确定操作系统类型
 33 | OS="unknown"
 34 | if [ -f /etc/os-release ]; then
 35 |     . /etc/os-release
 36 |     OS=$ID
 37 | fi
 38 | 
 39 | # 安装依赖工具
 40 | install_dependencies
 41 | 
 42 | # 设置系统服务和用户
 43 | setup_system
 44 | 
 45 | # 获取vmalert最新版本
 46 | VM_VERSION=$(curl -s "https://api.github.com/repos/VictoriaMetrics/VictoriaMetrics/tags" | grep '"name":' | head -n 1 | awk -F '"' '{print $4}')
 47 | 
 48 | # 下载并安装vmalert
 49 | wget https://github.com/VictoriaMetrics/VictoriaMetrics/releases/download/${VM_VERSION}/vmutils-linux-amd64-${VM_VERSION}.tar.gz -O /tmp/vmutils.tar.gz
 50 | 
 51 | cd /tmp && tar -xzvf /tmp/vmutils.tar.gz vmalert-prod
 52 | mv /tmp/vmalert-prod /usr/bin
 53 | chmod +x /usr/bin/vmalert-prod
 54 | 
 55 | # 清理 /tmp 目录中的压缩文件和解压后的临时文件
 56 | rm -rf /tmp/vmutils.tar.gz /tmp/vmalert-prod*
 57 | 
 58 | cat> /etc/systemd/system/vmalert.service <<EOF
 59 | [Unit]
 60 | Description=vmalert executes a list of the given alerting or recording rules against configured address. It is heavily inspired by Prometheus implementation and aims to be compatible with its syntax.
 61 | # https://docs.victoriametrics.com/vmalert.html
 62 | After=network.target
 63 | 
 64 | [Service]
 65 | Type=simple
 66 | User=victoriametrics
 67 | Group=victoriametrics
 68 | StartLimitBurst=5
 69 | StartLimitInterval=0
 70 | Restart=on-failure
 71 | RestartSec=1
 72 | EnvironmentFile=-/etc/victoriametrics/vmalert/vmalert.conf
 73 | ExecStart=/usr/bin/vmalert-prod \$ARGS
 74 | ExecStop=/bin/kill -s SIGTERM \$MAINPID
 75 | ExecReload=/bin/kill -HUP \$MAINPID
 76 | # See docs https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#tuning
 77 | LimitNOFILE=1048576
 78 | LimitNPROC=1048576
 79 | LimitCORE=infinity
 80 | StandardOutput=syslog
 81 | StandardError=syslog
 82 | # StandardOutput=journal
 83 | # StandardError=journal
 84 | SyslogIdentifier=vmalert
 85 | 
 86 | [Install]
 87 | WantedBy=multi-user.target
 88 | EOF
 89 | 
 90 | cat> /etc/victoriametrics/vmalert/vmalert.conf <<EOF
 91 | ARGS="-rule=/etc/victoriametrics/vmalert/alerts.yml -datasource.url=http://127.0.0.1:8428 -notifier.url=http://127.0.0.1:9093 -remoteWrite.url=http://127.0.0.1:8428 -remoteRead.url=http://127.0.0.1:8428"
 92 | EOF
 93 | 
 94 | chown -R victoriametrics:victoriametrics /etc/victoriametrics/vmalert
 95 | 
 96 | systemctl enable vmalert.service
 97 | systemctl restart vmalert.service
 98 | ps aux | grep vmalert
 99 | 
100 | echo "vmalert installation and service setup complete."


--------------------------------------------------------------------------------
/victoriametrics/binary/vmalert/vmalert.conf:
--------------------------------------------------------------------------------
 1 | # https://docs.victoriametrics.com/vmalert.html
 2 | #
 3 | # https://docs.victoriametrics.com/vmalert.html#quickstart
 4 | # To start using vmalert you will need the following things:
 5 | #    - list of rules - PromQL/MetricsQL expressions to execute;
 6 | #    - datasource address - reachable VictoriaMetrics instance for rules execution;
 7 | #    - notifier address - reachable Alert Manager instance for processing, aggregating alerts and sending notifications.
 8 | #    - remote write address [optional] - remote write compatible storage address for storing recording rules results and alerts state in for of timeseries.
 9 | 
10 | ARGS="-rule=/etc/victoriametrics/vmalert/alerts.yml -datasource.url=http://127.0.0.1:8428 -notifier.url=http://127.0.0.1:9093 -remoteWrite.url=http://127.0.0.1:8428 -remoteRead.url=http://127.0.0.1:8428 -external.label=cluster=east-1 -external.label=replica=a"


--------------------------------------------------------------------------------
/victoriametrics/binary/vmauth/README.md:
--------------------------------------------------------------------------------
1 | ## 二进制部署 vmauth 脚本
2 | 
3 | 请切换到 root 下执行，或使用管理员用户
4 | 
5 | 启动参数配置文件：/etc/victoriametrics/vmauth/vmauth.conf
6 | 
7 | 需要根据自身的实际环境修改参数配置


--------------------------------------------------------------------------------
/victoriametrics/binary/vmauth/install-vmauth.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | set -e
  3 | 
  4 | # 函数：安装依赖工具
  5 | install_dependencies() {
  6 |     if [ "$OS" == "ubuntu" ] || [ "$OS" == "debian" ]; then
  7 |         apt-get update && apt-get install -y curl wget net-tools
  8 |     elif [ "$OS" == "centos" ]; then
  9 |         yum update && yum install -y curl wget net-tools
 10 |     else
 11 |         echo "Unsupported operating system."
 12 |         exit 1
 13 |     fi
 14 | }
 15 | 
 16 | # 函数：设置系统服务和用户
 17 | setup_system() {
 18 |     # 创建vmauth配置文件目录
 19 |     mkdir -p /etc/victoriametrics/vmauth
 20 | 
 21 |     # 检查victoriametrics组是否存在，不存在则创建
 22 |     if ! getent group victoriametrics > /dev/null 2>&1; then
 23 |         groupadd --system victoriametrics
 24 |     fi
 25 | 
 26 |     # 检查victoriametrics用户是否存在，不存在则创建
 27 |     if ! id -u victoriametrics > /dev/null 2>&1; then
 28 |         useradd --system --home-dir /var/lib/victoriametrics --no-create-home --gid victoriametrics victoriametrics
 29 |     fi
 30 | }
 31 | 
 32 | # 确定操作系统类型
 33 | OS="unknown"
 34 | if [ -f /etc/os-release ]; then
 35 |     . /etc/os-release
 36 |     OS=$ID
 37 | fi
 38 | 
 39 | # 安装依赖工具
 40 | install_dependencies
 41 | 
 42 | # 设置系统服务和用户
 43 | setup_system
 44 | 
 45 | # 获取vmauth最新版本
 46 | VM_VERSION=$(curl -s "https://api.github.com/repos/VictoriaMetrics/VictoriaMetrics/tags" | grep '"name":' | head -n 1 | awk -F '"' '{print $4}')
 47 | 
 48 | # 下载并安装vmauth
 49 | wget https://github.com/VictoriaMetrics/VictoriaMetrics/releases/download/${VM_VERSION}/vmutils-linux-amd64-${VM_VERSION}.tar.gz -O /tmp/vmutils.tar.gz
 50 | 
 51 | cd /tmp && tar -xzvf /tmp/vmutils.tar.gz vmauth-prod
 52 | mv /tmp/vmauth-prod /usr/bin
 53 | chmod +x /usr/bin/vmauth-prod
 54 | 
 55 | cat> /etc/systemd/system/vmauth.service <<EOF
 56 | [Unit]
 57 | Description=vmauth is used for authenticating and authorizing incoming requests.
 58 | After=network.target
 59 | 
 60 | [Service]
 61 | Type=simple
 62 | User=victoriametrics
 63 | Group=victoriametrics
 64 | StartLimitBurst=5
 65 | StartLimitInterval=0
 66 | Restart=on-failure
 67 | RestartSec=1
 68 | EnvironmentFile=-/etc/victoriametrics/vmauth/vmauth.conf
 69 | ExecStart=/usr/bin/vmauth-prod \$ARGS
 70 | ExecStop=/bin/kill -s SIGTERM \$MAINPID
 71 | ExecReload=/bin/kill -HUP \$MAINPID
 72 | LimitNOFILE=1048576
 73 | LimitNPROC=1048576
 74 | LimitCORE=infinity
 75 | StandardOutput=syslog
 76 | StandardError=syslog
 77 | SyslogIdentifier=vmauth
 78 | 
 79 | [Install]
 80 | WantedBy=multi-user.target
 81 | EOF
 82 | 
 83 | cat> /etc/victoriametrics/vmauth/vmauth.conf <<EOF
 84 | ARGS="-auth.config=/etc/victoriametrics/vmauth/config.yml"
 85 | EOF
 86 | 
 87 | cat> /etc/victoriametrics/vmauth/config.yml <<EOF
 88 | # balance load among vmselects
 89 | # see https://docs.victoriametrics.com/vmauth/#load-balancing
 90 | unauthorized_user:
 91 |   # 数据传入负载
 92 |   url_map:
 93 |   - src_paths:
 94 |     - "/insert/.+"
 95 |     url_prefix:
 96 |     # - "http://vminsert-1:8480/insert/0/prometheus"
 97 |     - "http://vminsert-1:8480/"
 98 |     - "http://vminsert-2:8480/"
 99 |     - "http://vminsert-3:8480/"
100 |   - src_paths:
101 |     - "/select/.+"
102 |     url_prefix:
103 |     - "http://vmselect-1:8481/"
104 |     - "http://vmselect-2:8481/"
105 |     - "http://vmselect-3:8481/"
106 |     retry_status_codes: [500, 502, 503]
107 |     load_balancing_policy: first_available
108 | EOF
109 | 
110 | chown -R victoriametrics:victoriametrics /etc/victoriametrics/vmauth
111 | 
112 | systemctl enable vmauth.service
113 | systemctl restart vmauth.service
114 | ps aux | grep vmauth
115 | 
116 | echo "vmauth installation and service setup complete."


--------------------------------------------------------------------------------
/victoriametrics/binary/vmauth/vmauth.conf:
--------------------------------------------------------------------------------
1 | # https://docs.victoriametrics.com/vmauth.html
2 | #
3 | 
4 | ARGS="-auth.config=/etc/victoriametrics/vmauth/config.yml"


--------------------------------------------------------------------------------
/victoriametrics/binary/vmauth/vmauth.service:
--------------------------------------------------------------------------------
 1 | [Unit]
 2 | Description=vmauth is a simple auth proxy, router and load balancer for VictoriaMetrics. 
 3 | # https://docs.victoriametrics.com/vmauth.html
 4 | After=network.target
 5 | 
 6 | [Service]
 7 | Type=simple
 8 | User=victoriametrics
 9 | Group=victoriametrics
10 | StartLimitBurst=5
11 | StartLimitInterval=0
12 | Restart=on-failure
13 | RestartSec=1
14 | EnvironmentFile=-/etc/victoriametrics/vmauth/vmauth.conf
15 | ExecStart=/usr/bin/vmauth-prod $ARGS
16 | ExecStop=/bin/kill -s SIGTERM $MAINPID
17 | ExecReload=/bin/kill -HUP $MAINPID
18 | # See docs https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#tuning
19 | LimitNOFILE=1048576
20 | LimitNPROC=1048576
21 | LimitCORE=infinity
22 | StandardOutput=syslog
23 | StandardError=syslog
24 | SyslogIdentifier=vmauth
25 | PrivateTmp=yes
26 | ProtectHome=yes
27 | NoNewPrivileges=yes
28 | ProtectSystem=strict
29 | ProtectControlGroups=true
30 | ProtectKernelModules=true
31 | ProtectKernelTunables=yes
32 | 
33 | [Install]
34 | WantedBy=multi-user.target


--------------------------------------------------------------------------------
/victoriametrics/binary/vminsert/README.md:
--------------------------------------------------------------------------------
1 | ## 二进制部署 vminsert 脚本
2 | 
3 | 启动参数配置文件：/etc/victoriametrics/vminsert/vminsert.conf


--------------------------------------------------------------------------------
/victoriametrics/binary/vminsert/install-vminsert.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | set -e
  3 | 
  4 | # 函数：安装依赖工具
  5 | install_dependencies() {
  6 |     if [ "$OS" == "ubuntu" ] || [ "$OS" == "debian" ]; then
  7 |         apt-get update && apt-get install -y curl wget net-tools
  8 |     elif [ "$OS" == "centos" ]; then
  9 |         yum update && yum install -y curl wget net-tools
 10 |     else
 11 |         echo "Unsupported operating system."
 12 |         exit 1
 13 |     fi
 14 | }
 15 | 
 16 | # 函数：设置系统服务和用户
 17 | setup_system() {
 18 |     # 创建vminsert配置文件目录
 19 |     mkdir -p /etc/victoriametrics/vminsert
 20 | 
 21 |     # 检查victoriametrics组是否存在，不存在则创建
 22 |     if ! getent group victoriametrics > /dev/null 2>&1; then
 23 |         groupadd --system victoriametrics
 24 |     fi
 25 | 
 26 |     # 检查victoriametrics用户是否存在，不存在则创建
 27 |     if ! id -u victoriametrics > /dev/null 2>&1; then
 28 |         useradd --system --home-dir /var/lib/victoriametrics --no-create-home --gid victoriametrics victoriametrics
 29 |     fi
 30 | }
 31 | 
 32 | # 确定操作系统类型
 33 | OS="unknown"
 34 | if [ -f /etc/os-release ]; then
 35 |     . /etc/os-release
 36 |     OS=$ID
 37 | fi
 38 | 
 39 | # 安装依赖工具
 40 | install_dependencies
 41 | 
 42 | # 设置系统服务和用户
 43 | setup_system
 44 | 
 45 | # 获取VictoriaMetrics集群最新版本
 46 | VM_VERSION=$(curl -s "https://api.github.com/repos/VictoriaMetrics/VictoriaMetrics/tags" | grep '"name":' | head -n 1 | awk -F '"' '{print $4}')
 47 | # 下载并安装VictoriaMetrics集群
 48 | wget https://github.com/VictoriaMetrics/VictoriaMetrics/releases/download/${VM_VERSION}/victoria-metrics-linux-amd64-${VM_VERSION}-cluster.tar.gz -O /tmp/vmcluster.tar.gz
 49 | 
 50 | cd /tmp && tar -xzvf /tmp/vmcluster.tar.gz vminsert-prod
 51 | mv /tmp/vminsert-prod /usr/bin
 52 | chmod +x /usr/bin/vminsert-prod
 53 | 
 54 | cat> /etc/systemd/system/vminsert.service <<EOF
 55 | [Unit]
 56 | Description=vminsert - accepts the ingested data and spreads it among vmstorage nodes according to consistent hashing over metric name and all its labels
 57 | # https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html
 58 | After=network.target
 59 | 
 60 | [Service]
 61 | Type=simple
 62 | User=victoriametrics
 63 | Group=victoriametrics
 64 | StartLimitBurst=5
 65 | StartLimitInterval=0
 66 | Restart=on-failure
 67 | RestartSec=1
 68 | EnvironmentFile=-/etc/victoriametrics/vminsert/vminsert.conf
 69 | ExecStart=/usr/bin/vminsert-prod \$ARGS
 70 | ExecStop=/bin/kill -s SIGTERM \$MAINPID
 71 | ExecReload=/bin/kill -HUP \$MAINPID
 72 | # See docs https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#tuning
 73 | LimitNOFILE=1048576
 74 | LimitNPROC=1048576
 75 | LimitCORE=infinity
 76 | StandardOutput=syslog
 77 | StandardError=syslog
 78 | SyslogIdentifier=vminsert
 79 | PrivateTmp=yes
 80 | ProtectHome=yes
 81 | NoNewPrivileges=yes
 82 | ProtectSystem=strict
 83 | ProtectControlGroups=true
 84 | ProtectKernelModules=true
 85 | ProtectKernelTunables=yes
 86 | 
 87 | 
 88 | [Install]
 89 | WantedBy=multi-user.target
 90 | EOF
 91 | 
 92 | cat> /etc/victoriametrics/vminsert/vminsert.conf <<EOF
 93 | ARGS="-storageNode=127.0.0.1:8400 -replicationFactor=2"
 94 | EOF
 95 | 
 96 | chown -R victoriametrics:victoriametrics /etc/victoriametrics/vminsert
 97 | 
 98 | systemctl enable vminsert.service
 99 | systemctl restart vminsert.service
100 | ps aux | grep vminsert
101 | 
102 | echo "vminsert installation and service setup complete."
103 | 


--------------------------------------------------------------------------------
/victoriametrics/binary/vminsert/vminsert.conf:
--------------------------------------------------------------------------------
1 | # https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html
2 | # 
3 | # https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#architecture-overview
4 | # 
5 | ARGS="-storageNode=127.0.0.1:8400"


--------------------------------------------------------------------------------
/victoriametrics/binary/vminsert/vminsert.service:
--------------------------------------------------------------------------------
 1 | [Unit]
 2 | Description=vminsert - accepts the ingested data and spreads it among vmstorage nodes according to consistent hashing over metric name and all its labels
 3 | # https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html
 4 | After=network.target
 5 | 
 6 | [Service]
 7 | Type=simple
 8 | User=victoriametrics
 9 | Group=victoriametrics
10 | StartLimitBurst=5
11 | StartLimitInterval=0
12 | Restart=on-failure
13 | RestartSec=1
14 | EnvironmentFile=-/etc/victoriametrics/vmcluster/vminsert.conf
15 | ExecStart=/usr/bin/vminsert-prod $ARGS
16 | ExecStop=/bin/kill -s SIGTERM $MAINPID
17 | ExecReload=/bin/kill -HUP $MAINPID
18 | # See docs https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#tuning
19 | LimitNOFILE=1048576
20 | LimitNPROC=1048576
21 | LimitCORE=infinity
22 | StandardOutput=syslog
23 | StandardError=syslog
24 | SyslogIdentifier=vminsert
25 | PrivateTmp=yes
26 | ProtectHome=yes
27 | NoNewPrivileges=yes
28 | ProtectSystem=strict
29 | ProtectControlGroups=true
30 | ProtectKernelModules=true
31 | ProtectKernelTunables=yes
32 | 
33 | 
34 | [Install]
35 | WantedBy=multi-user.target


--------------------------------------------------------------------------------
/victoriametrics/binary/vmselect/README.md:
--------------------------------------------------------------------------------
1 | ## 二进制部署 vmselect 脚本
2 | 
3 | 启动参数配置文件：/etc/victoriametrics/vmselect/vmselect.conf


--------------------------------------------------------------------------------
/victoriametrics/binary/vmselect/install-vmselect.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | set -e
  3 | 
  4 | # 函数：安装依赖工具
  5 | install_dependencies() {
  6 |     if [ "$OS" == "ubuntu" ] || [ "$OS" == "debian" ]; then
  7 |         apt-get update && apt-get install -y curl wget net-tools
  8 |     elif [ "$OS" == "centos" ]; then
  9 |         yum update && yum install -y curl wget net-tools
 10 |     else
 11 |         echo "Unsupported operating system."
 12 |         exit 1
 13 |     fi
 14 | }
 15 | 
 16 | # 函数：设置系统服务和用户
 17 | setup_system() {
 18 |     # 创建vmselect配置文件目录
 19 |     mkdir -p /etc/victoriametrics/vmselect
 20 | 
 21 |     # 检查victoriametrics组是否存在，不存在则创建
 22 |     if ! getent group victoriametrics > /dev/null 2>&1; then
 23 |         groupadd --system victoriametrics
 24 |     fi
 25 | 
 26 |     # 检查victoriametrics用户是否存在，不存在则创建
 27 |     if ! id -u victoriametrics > /dev/null 2>&1; then
 28 |         useradd --system --home-dir /var/lib/victoriametrics --no-create-home --gid victoriametrics victoriametrics
 29 |     fi
 30 | }
 31 | 
 32 | # 确定操作系统类型
 33 | OS="unknown"
 34 | if [ -f /etc/os-release ]; then
 35 |     . /etc/os-release
 36 |     OS=$ID
 37 | fi
 38 | 
 39 | # 安装依赖工具
 40 | install_dependencies
 41 | 
 42 | # 设置系统服务和用户
 43 | setup_system
 44 | 
 45 | # 获取VictoriaMetrics集群最新版本
 46 | VM_VERSION=$(curl -s "https://api.github.com/repos/VictoriaMetrics/VictoriaMetrics/tags" | grep '"name":' | head -n 1 | awk -F '"' '{print $4}')
 47 | # 下载并安装VictoriaMetrics集群
 48 | wget https://github.com/VictoriaMetrics/VictoriaMetrics/releases/download/${VM_VERSION}/victoria-metrics-linux-amd64-${VM_VERSION}-cluster.tar.gz -O /tmp/vmcluster.tar.gz
 49 | 
 50 | cd /tmp && tar -xzvf /tmp/vmcluster.tar.gz vmselect-prod
 51 | mv /tmp/vmselect-prod /usr/bin
 52 | chmod +x /usr/bin/vmselect-prod
 53 | 
 54 | cat> /etc/systemd/system/vmselect.service <<EOF
 55 | [Unit]
 56 | Description=vmselect - performs incoming queries by fetching the needed data from all the configured vmstorage nodes
 57 | # https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html
 58 | After=network.target
 59 | 
 60 | [Service]
 61 | Type=simple
 62 | User=victoriametrics
 63 | Group=victoriametrics
 64 | StartLimitBurst=5
 65 | StartLimitInterval=0
 66 | Restart=on-failure
 67 | RestartSec=1
 68 | EnvironmentFile=-/etc/victoriametrics/vmselect/vmselect.conf
 69 | ExecStart=/usr/bin/vmselect-prod \$ARGS
 70 | ExecStop=/bin/kill -s SIGTERM \$MAINPID
 71 | ExecReload=/bin/kill -HUP \$MAINPID
 72 | # See docs https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#tuning
 73 | LimitNOFILE=1048576
 74 | LimitNPROC=1048576
 75 | LimitCORE=infinity
 76 | StandardOutput=syslog
 77 | StandardError=syslog
 78 | SyslogIdentifier=vmselect
 79 | PrivateTmp=yes
 80 | ProtectHome=yes
 81 | NoNewPrivileges=yes
 82 | ProtectSystem=strict
 83 | ProtectControlGroups=true
 84 | ProtectKernelModules=true
 85 | ProtectKernelTunables=yes
 86 | 
 87 | [Install]
 88 | WantedBy=multi-user.target
 89 | EOF
 90 | 
 91 | cat> /etc/victoriametrics/vmselect/vmselect.conf <<EOF
 92 | ARGS="-storageNode=127.0.0.1:8401 -dedup.minScrapeInterval=1ms"
 93 | EOF
 94 | 
 95 | chown -R victoriametrics:victoriametrics /etc/victoriametrics/vmselect
 96 | 
 97 | systemctl enable vmselect.service
 98 | systemctl restart vmselect.service
 99 | ps aux | grep vmselect
100 | 
101 | echo "vmselect installation and service setup complete."


--------------------------------------------------------------------------------
/victoriametrics/binary/vmselect/vmselect.conf:
--------------------------------------------------------------------------------
1 | # https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html
2 | # 
3 | # https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#architecture-overview
4 | # 
5 | ARGS="-storageNode=127.0.0.1:8401 -dedup.minScrapeInterval=1ms"


--------------------------------------------------------------------------------
/victoriametrics/binary/vmselect/vmselect.service:
--------------------------------------------------------------------------------
 1 | [Unit]
 2 | Description=vmselect - performs incoming queries by fetching the needed data from all the configured vmstorage nodes
 3 | # https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html
 4 | After=network.target
 5 | 
 6 | [Service]
 7 | Type=simple
 8 | User=victoriametrics
 9 | Group=victoriametrics
10 | StartLimitBurst=5
11 | StartLimitInterval=0
12 | Restart=on-failure
13 | RestartSec=1
14 | EnvironmentFile=-/etc/victoriametrics/vmcluster/vmselect.conf
15 | ExecStart=/usr/bin/vmselect-prod $ARGS
16 | ExecStop=/bin/kill -s SIGTERM $MAINPID
17 | ExecReload=/bin/kill -HUP $MAINPID
18 | # See docs https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#tuning
19 | LimitNOFILE=1048576
20 | LimitNPROC=1048576
21 | LimitCORE=infinity
22 | StandardOutput=syslog
23 | StandardError=syslog
24 | SyslogIdentifier=vmselect
25 | PrivateTmp=yes
26 | ProtectHome=yes
27 | NoNewPrivileges=yes
28 | ProtectSystem=strict
29 | ProtectControlGroups=true
30 | ProtectKernelModules=true
31 | ProtectKernelTunables=yes
32 | 
33 | [Install]
34 | WantedBy=multi-user.target


--------------------------------------------------------------------------------
/victoriametrics/binary/vmstorage/README.md:
--------------------------------------------------------------------------------
1 | ## 二进制部署 vmstorage 脚本
2 | 
3 | 启动参数配置文件：/etc/victoriametrics/vmstorage/vmstorage.conf
4 | 
5 | 临时数据保存目录：/var/lib/victoria-metrics-cluster-data


--------------------------------------------------------------------------------
/victoriametrics/binary/vmstorage/install-vmstorage.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | set -e
  3 | 
  4 | # 函数：安装依赖工具
  5 | install_dependencies() {
  6 |     if [ "$OS" == "ubuntu" ] || [ "$OS" == "debian" ]; then
  7 |         apt-get update && apt-get install -y curl wget net-tools
  8 |     elif [ "$OS" == "centos" ]; then
  9 |         yum update && yum install -y curl wget net-tools
 10 |     else
 11 |         echo "Unsupported operating system."
 12 |         exit 1
 13 |     fi
 14 | }
 15 | 
 16 | # 函数：设置系统服务和用户
 17 | setup_system() {
 18 |     # 创建vmstorage配置文件目录
 19 |     mkdir -p /etc/victoriametrics/vmstorage
 20 |     # 创建victoriametrics集群数据保存目录
 21 |     mkdir -p /var/lib/victoria-metrics-cluster-data
 22 | 
 23 |     # 检查victoriametrics组是否存在，不存在则创建
 24 |     if ! getent group victoriametrics > /dev/null 2>&1; then
 25 |         groupadd --system victoriametrics
 26 |     fi
 27 | 
 28 |     # 检查victoriametrics用户是否存在，不存在则创建
 29 |     if ! id -u victoriametrics > /dev/null 2>&1; then
 30 |         useradd --system --home-dir /var/lib/victoriametrics --no-create-home --gid victoriametrics victoriametrics
 31 |     fi
 32 | 
 33 |     chown -R victoriametrics:victoriametrics /var/lib/victoria-metrics-cluster-data
 34 | }
 35 | 
 36 | # 确定操作系统类型
 37 | OS="unknown"
 38 | if [ -f /etc/os-release ]; then
 39 |     . /etc/os-release
 40 |     OS=$ID
 41 | fi
 42 | 
 43 | # 安装依赖工具
 44 | install_dependencies
 45 | 
 46 | # 设置系统服务和用户
 47 | setup_system
 48 | 
 49 | # 获取VictoriaMetrics集群最新版本
 50 | VM_VERSION=$(curl -s "https://api.github.com/repos/VictoriaMetrics/VictoriaMetrics/tags" | grep '"name":' | head -n 1 | awk -F '"' '{print $4}')
 51 | # 下载并安装VictoriaMetrics集群
 52 | wget https://github.com/VictoriaMetrics/VictoriaMetrics/releases/download/${VM_VERSION}/victoria-metrics-linux-amd64-${VM_VERSION}-cluster.tar.gz -O /tmp/vmcluster.tar.gz
 53 | 
 54 | cd /tmp && tar -xzvf /tmp/vmcluster.tar.gz vmstorage-prod
 55 | mv /tmp/vmstorage-prod /usr/bin
 56 | chmod +x /usr/bin/vmstorage-prod
 57 | 
 58 | cat> /etc/systemd/system/vmstorage.service <<EOF
 59 | [Unit]
 60 | Description=vmstorage - stores the raw data and returns the queried data on the given time range for the given label filters
 61 | # https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html
 62 | After=network.target
 63 | 
 64 | [Service]
 65 | Type=simple
 66 | User=victoriametrics
 67 | Group=victoriametrics
 68 | StartLimitBurst=5
 69 | StartLimitInterval=0
 70 | Restart=on-failure
 71 | RestartSec=1
 72 | EnvironmentFile=-/etc/victoriametrics/vmstorage/vmstorage.conf
 73 | ExecStart=/usr/bin/vmstorage-prod \$ARGS
 74 | ExecStop=/bin/kill -s SIGTERM \$MAINPID
 75 | ExecReload=/bin/kill -HUP \$MAINPID
 76 | # See docs https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#tuning
 77 | LimitNOFILE=1048576
 78 | LimitNPROC=1048576
 79 | LimitCORE=infinity
 80 | StandardOutput=syslog
 81 | StandardError=syslog
 82 | SyslogIdentifier=vmstorage
 83 | WorkingDirectory=/var/lib/victoria-metrics-cluster-data
 84 | ReadWritePaths=/var/lib/victoria-metrics-cluster-data
 85 | PrivateTmp=yes
 86 | ProtectHome=yes
 87 | NoNewPrivileges=yes
 88 | ProtectSystem=strict
 89 | ProtectControlGroups=true
 90 | ProtectKernelModules=true
 91 | ProtectKernelTunables=yes
 92 | 
 93 | [Install]
 94 | WantedBy=multi-user.target
 95 | EOF
 96 | 
 97 | cat> /etc/victoriametrics/vmstorage/vmstorage.conf <<EOF
 98 | ARGS="-storageDataPath=/var/lib/victoria-metrics-cluster-data -retentionPeriod=30d"
 99 | EOF
100 | 
101 | chown -R victoriametrics:victoriametrics /etc/victoriametrics/vmstorage
102 | chown -R victoriametrics:victoriametrics /var/lib/victoria-metrics-cluster-data
103 | 
104 | systemctl enable vmstorage.service
105 | systemctl restart vmstorage.service
106 | ps aux | grep vmstorage
107 | 
108 | echo "vmstorage installation and service setup complete."
109 | 


--------------------------------------------------------------------------------
/victoriametrics/binary/vmstorage/vmstorage.config:
--------------------------------------------------------------------------------
1 | # https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html
2 | # 
3 | # https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#architecture-overview
4 | # 
5 | ARGS="-storageDataPath=/var/lib/victoria-metrics-cluster-data"


--------------------------------------------------------------------------------
/victoriametrics/binary/vmstorage/vmstorage.service:
--------------------------------------------------------------------------------
 1 | [Unit]
 2 | Description=vmstorage - stores the raw data and returns the queried data on the given time range for the given label filters
 3 | # https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html
 4 | After=network.target
 5 | 
 6 | [Service]
 7 | Type=simple
 8 | User=victoriametrics
 9 | Group=victoriametrics
10 | StartLimitBurst=5
11 | StartLimitInterval=0
12 | Restart=on-failure
13 | RestartSec=1
14 | EnvironmentFile=-/etc/victoriametrics/vmcluster/vmstorage.conf
15 | ExecStart=/usr/bin/vmstorage-prod $ARGS
16 | ExecStop=/bin/kill -s SIGTERM $MAINPID
17 | ExecReload=/bin/kill -HUP $MAINPID
18 | # See docs https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#tuning
19 | LimitNOFILE=1048576
20 | LimitNPROC=1048576
21 | LimitCORE=infinity
22 | StandardOutput=syslog
23 | StandardError=syslog
24 | SyslogIdentifier=vmstorage
25 | WorkingDirectory=/var/lib/victoria-metrics-cluster-data
26 | ReadWritePaths=/var/lib/victoria-metrics-cluster-data
27 | PrivateTmp=yes
28 | ProtectHome=yes
29 | NoNewPrivileges=yes
30 | ProtectSystem=strict
31 | ProtectControlGroups=true
32 | ProtectKernelModules=true
33 | ProtectKernelTunables=yes
34 | 
35 | [Install]
36 | WantedBy=multi-user.target


--------------------------------------------------------------------------------
/victoriametrics/deploy-cluster/alertmanager/alertmanager.yml:
--------------------------------------------------------------------------------
1 | route:
2 |   receiver: blackhole
3 | 
4 | receivers:
5 |   - name: blackhole


--------------------------------------------------------------------------------
/victoriametrics/deploy-cluster/provisioning/dashboards/dashboard.yml:
--------------------------------------------------------------------------------
1 | apiVersion: 1
2 | 
3 | providers:
4 | - name: Prometheus
5 |   orgId: 1
6 |   folder: ''
7 |   type: file
8 |   options:
9 |     path: /var/lib/grafana/dashboards


--------------------------------------------------------------------------------
/victoriametrics/deploy-cluster/provisioning/datasources/prometheus-datasource/prometheus-datasource.yml:
--------------------------------------------------------------------------------
 1 | apiVersion: 1
 2 | 
 3 | datasources:
 4 |     # - name: VictoriaMetrics
 5 |     #   type: prometheus
 6 |     #   access: proxy
 7 |     #   url: http://victoriametrics:8428
 8 |     #   isDefault: true
 9 |     #   jsonData:
10 |     #     prometheusType: Prometheus
11 |     #     prometheusVersion: 2.24.0
12 |     - name: VictoriaMetrics - cluster
13 |       type: prometheus
14 |       access: proxy
15 |       url: http://vmauth:8427/select/0/prometheus
16 |       isDefault: false
17 |       jsonData:
18 |         prometheusType: Prometheus
19 |         prometheusVersion: 2.24.0


--------------------------------------------------------------------------------
/victoriametrics/deploy-cluster/vmagent/prometheus-cluster.yml:
--------------------------------------------------------------------------------
 1 | global:
 2 |   scrape_interval: 30s
 3 |   scrape_timeout: 10s
 4 | 
 5 | scrape_configs:
 6 |   - job_name: 'vmagent'
 7 |     static_configs:
 8 |       - targets: ['vmagent:8429']
 9 |   - job_name: 'vmalert'
10 |     static_configs:
11 |       - targets: ['vmalert:8880']
12 |   - job_name: 'vmauth'
13 |     static_configs:
14 |       - targets: ['vmauth:8427']
15 |   - job_name: 'vminsert'
16 |     static_configs:
17 |       - targets: ['vminsert-1:8480', 'vminsert-2:8480', 'vminsert-3:8480']
18 |   - job_name: 'vmselect'
19 |     static_configs:
20 |       - targets: ['vmselect-1:8481', 'vmselect-2:8481', 'vmselect-3:8481']
21 |   - job_name: 'vmstorage'
22 |     static_configs:
23 |       - targets: ['vmstorage-1:8482', 'vmstorage-2:8482', 'vmstorage-3:8482']


--------------------------------------------------------------------------------
/victoriametrics/deploy-cluster/vmalert/alerts-health.yml:
--------------------------------------------------------------------------------
 1 | # File contains default list of alerts for various VM components.
 2 | # The following alerts are recommended for use for any VM installation.
 3 | # The alerts below are just recommendations and may require some updates
 4 | # and threshold calibration according to every specific setup.
 5 | groups:
 6 |   - name: vm-health
 7 |     # note the `job` filter and update accordingly to your setup
 8 |     rules:
 9 |       - alert: TooManyRestarts
10 |         expr: changes(process_start_time_seconds{job=~".*(victoriametrics|vmselect|vminsert|vmstorage|vmagent|vmalert|vmsingle|vmalertmanager|vmauth).*"}[15m]) > 2
11 |         labels:
12 |           severity: critical
13 |         annotations:
14 |           summary: "{{ $labels.job }} too many restarts (instance {{ $labels.instance }})"
15 |           description: "Job {{ $labels.job }} (instance {{ $labels.instance }}) has restarted more than twice in the last 15 minutes.
16 |             It might be crashlooping."
17 | 
18 |       - alert: ServiceDown
19 |         expr: up{job=~".*(victoriametrics|vmselect|vminsert|vmstorage|vmagent|vmalert|vmsingle|vmalertmanager|vmauth).*"} == 0
20 |         for: 2m
21 |         labels:
22 |           severity: critical
23 |         annotations:
24 |           summary: "Service {{ $labels.job }} is down on {{ $labels.instance }}"
25 |           description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes."
26 | 
27 |       - alert: ProcessNearFDLimits
28 |         expr: (process_max_fds - process_open_fds) < 100
29 |         for: 5m
30 |         labels:
31 |           severity: critical
32 |         annotations:
33 |           summary: "Number of free file descriptors is less than 100 for \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") for the last 5m"
34 |           description: "Exhausting OS file descriptors limit can cause severe degradation of the process.Consider to increase the limit as fast as possible."
35 | 
36 |       - alert: TooHighMemoryUsage
37 |         expr: (min_over_time(process_resident_memory_anon_bytes[10m]) / vm_available_memory_bytes) > 0.8
38 |         for: 5m
39 |         labels:
40 |           severity: critical
41 |         annotations:
42 |           summary: "It is more than 80% of memory used by \"{{ $labels.job }}\"(\"{{ $labels.instance }}\")"
43 |           description: "Too high memory usage may result into multiple issues such as OOMs or degraded performance.
44 |            Consider to either increase available memory or decrease the load on the process."
45 | 
46 |       - alert: TooHighCPUUsage
47 |         expr: rate(process_cpu_seconds_total[5m]) / process_cpu_cores_available > 0.9
48 |         for: 5m
49 |         labels:
50 |           severity: critical
51 |         annotations:
52 |           summary: "More than 90% of CPU is used by \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") during the last 5m"
53 |           description: "Too high CPU usage may be a sign of insufficient resources and make process unstable.
54 |                Consider to either increase available CPU resources or decrease the load on the process."
55 | 
56 |       - alert: TooManyLogs
57 |         expr: sum(increase(vm_log_messages_total{level="error"}[5m])) without (app_version, location) > 0
58 |         for: 15m
59 |         labels:
60 |           severity: warning
61 |         annotations:
62 |           summary: "Too many logs printed for job \"{{ $labels.job }}\" ({{ $labels.instance }})"
63 |           description: "Logging rate for job \"{{ $labels.job }}\" ({{ $labels.instance }}) is {{ $value }} for last 15m.Worth to check logs for specific error messages."
64 | 
65 |       - alert: TooManyTSIDMisses
66 |         expr: rate(vm_missing_tsids_for_metric_id_total[5m]) > 0
67 |         for: 10m
68 |         labels:
69 |           severity: critical
70 |         annotations:
71 |           summary: "Too many TSID misses for job \"{{ $labels.job }}\" ({{ $labels.instance }})"
72 |           description: "The rate of TSID misses during query lookups is too high for \"{{ $labels.job }}\" ({{ $labels.instance }}).Make sure you're running VictoriaMetrics of v1.85.3 or higher.Related issue https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3502"
73 | 
74 |       - alert: ConcurrentInsertsHitTheLimit
75 |         expr: avg_over_time(vm_concurrent_insert_current[1m]) >= vm_concurrent_insert_capacity
76 |         for: 15m
77 |         labels:
78 |           severity: warning
79 |         annotations:
80 |           summary: "{{ $labels.job }} on instance {{ $labels.instance }} is constantly hitting concurrent inserts limit"
81 |           description: "The limit of concurrent inserts on instance {{ $labels.instance }} depends on the number of CPUs.\n
82 |             Usually, when component constantly hits the limit it is likely the component is overloaded and requires more CPU.
83 |             In some cases for components like vmagent or vminsert the alert might trigger if there are too many clients
84 |             making write attempts. If vmagent's or vminsert's CPU usage and network saturation are at normal level, then 
85 |             it might be worth adjusting `-maxConcurrentInserts` cmd-line flag."


--------------------------------------------------------------------------------
/victoriametrics/deploy-cluster/vmalert/alerts-vmalert.yml:
--------------------------------------------------------------------------------
 1 | # File contains default list of alerts for vmalert service.
 2 | # The alerts below are just recommendations and may require some updates
 3 | # and threshold calibration according to every specific setup.
 4 | groups:
 5 |   # Alerts group for vmalert assumes that Grafana dashboard
 6 |   # https://grafana.com/grafana/dashboards/14950/ is installed.
 7 |   # Pls update the `dashboard` annotation according to your setup.
 8 |   - name: vmalert
 9 |     interval: 30s
10 |     rules:
11 |       - alert: ConfigurationReloadFailure
12 |         expr: vmalert_config_last_reload_successful != 1
13 |         labels:
14 |           severity: warning
15 |         annotations:
16 |           summary: "Configuration reload failed for vmalert instance {{ $labels.instance }}"
17 |           description: "Configuration hot-reload failed for vmalert on instance {{ $labels.instance }}.
18 |             Check vmalert's logs for detailed error message."
19 | 
20 |       - alert: AlertingRulesError
21 |         expr: sum(increase(vmalert_alerting_rules_errors_total[5m])) without(alertname, id) > 0
22 |         for: 5m
23 |         labels:
24 |           severity: warning
25 |         annotations:
26 |           dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=13&var-instance={{ $labels.instance }}&var-group={{ $labels.group }}"
27 |           summary: "Alerting rules are failing for vmalert instance {{ $labels.instance }}"
28 |           description: "Alerting rules execution is failing for group \"{{ $labels.group }}\".
29 |             Check vmalert's logs for detailed error message."
30 | 
31 |       - alert: RecordingRulesError
32 |         expr: sum(increase(vmalert_recording_rules_errors_total[5m])) without(recording, id) > 0
33 |         for: 5m
34 |         labels:
35 |           severity: warning
36 |         annotations:
37 |           dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=30&var-instance={{ $labels.instance }}&var-group={{ $labels.group }}"
38 |           summary: "Recording rules are failing for vmalert instance {{ $labels.instance }}"
39 |           description: "Recording rules execution is failing for group \"{{ $labels.group }}\".
40 |             Check vmalert's logs for detailed error message."
41 | 
42 |       - alert: RecordingRulesNoData
43 |         expr: sum(vmalert_recording_rules_last_evaluation_samples) without(recording, id) < 1
44 |         for: 30m
45 |         labels:
46 |           severity: info
47 |         annotations:
48 |           dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=33&var-group={{ $labels.group }}"
49 |           summary: "Recording rule {{ $labels.recording }} ({ $labels.group }}) produces no data"
50 |           description: "Recording rule \"{{ $labels.recording }}\" from group \"{{ $labels.group }}\" 
51 |             produces 0 samples over the last 30min. It might be caused by a misconfiguration 
52 |             or incorrect query expression."
53 | 
54 |       - alert: TooManyMissedIterations
55 |         expr: increase(vmalert_iteration_missed_total[5m]) > 0
56 |         for: 15m
57 |         labels:
58 |           severity: warning
59 |         annotations:
60 |           summary: "vmalert instance {{ $labels.instance }} is missing rules evaluations"
61 |           description: "vmalert instance {{ $labels.instance }} is missing rules evaluations for group \"{{ $labels.group }}\".
62 |             The group evaluation time takes longer than the configured evaluation interval. This may result in missed 
63 |             alerting notifications or recording rules samples. Try increasing evaluation interval or concurrency of
64 |             group \"{{ $labels.group }}\". See https://docs.victoriametrics.com/vmalert/#groups. 
65 |             If rule expressions are taking longer than expected, please see https://docs.victoriametrics.com/troubleshooting/#slow-queries."
66 | 
67 |       - alert: RemoteWriteErrors
68 |         expr: increase(vmalert_remotewrite_errors_total[5m]) > 0
69 |         for: 15m
70 |         labels:
71 |           severity: warning
72 |         annotations:
73 |           summary: "vmalert instance {{ $labels.instance }} is failing to push metrics to remote write URL"
74 |           description: "vmalert instance {{ $labels.instance }} is failing to push metrics generated via alerting 
75 |             or recording rules to the configured remote write URL. Check vmalert's logs for detailed error message."
76 | 
77 |       - alert: AlertmanagerErrors
78 |         expr: increase(vmalert_alerts_send_errors_total[5m]) > 0
79 |         for: 15m
80 |         labels:
81 |           severity: warning
82 |         annotations:
83 |           summary: "vmalert instance {{ $labels.instance }} is failing to send notifications to Alertmanager"
84 |           description: "vmalert instance {{ $labels.instance }} is failing to send alert notifications to \"{{ $labels.addr }}\".
85 |             Check vmalert's logs for detailed error message."


--------------------------------------------------------------------------------
/victoriametrics/deploy-cluster/vmauth/auth-cluster.yml:
--------------------------------------------------------------------------------
 1 | # balance load among vmselects
 2 | # see https://docs.victoriametrics.com/vmauth/#load-balancing
 3 | unauthorized_user:
 4 |   # 数据传入负载
 5 |   url_map:
 6 |   - src_paths:
 7 |     - "/insert/.+"
 8 |     url_prefix:
 9 |     # - "http://vminsert-1:8480/insert/0/prometheus"
10 |     - "http://vminsert-1:8480/"
11 |     - "http://vminsert-2:8480/"
12 |     - "http://vminsert-3:8480/"
13 |   - src_paths:
14 |     - "/select/.+"
15 |     url_prefix:
16 |     - "http://vmselect-1:8481/"
17 |     - "http://vmselect-2:8481/"
18 |     - "http://vmselect-3:8481/"
19 |     retry_status_codes: [500, 502, 503]
20 |     load_balancing_policy: first_available


--------------------------------------------------------------------------------
/victoriametrics/deploy-n9e/compose.yml:
--------------------------------------------------------------------------------
 1 | services:
 2 |   mysql:
 3 |     image: "mysql:8"
 4 |     container_name: mysql
 5 |     hostname: mysql
 6 |     restart: always
 7 |     environment:
 8 |       TZ: Asia/Shanghai
 9 |       MYSQL_ROOT_PASSWORD: 1234
10 |     volumes:
11 |       - mysqldata:/var/lib/mysql/
12 |       - ./initsql:/docker-entrypoint-initdb.d/
13 |       - ./mysql/my.cnf:/etc/my.cnf
14 |     networks:
15 |       - nightingale
16 |     ports:
17 |       - "3306:3306"
18 | 
19 |   redis:
20 |     image: "redis:6.2"
21 |     container_name: redis
22 |     hostname: redis
23 |     restart: always
24 |     environment:
25 |       TZ: Asia/Shanghai
26 |     networks:
27 |       - nightingale
28 |     ports:
29 |       - "6379:6379"
30 | 
31 |   victoriametrics:
32 |     image: victoriametrics/victoria-metrics:v1.100.1
33 |     container_name: victoriametrics
34 |     hostname: victoriametrics
35 |     restart: always
36 |     environment:
37 |       TZ: Asia/Shanghai
38 |     ports:
39 |       - "8428:8428"
40 |     networks:
41 |       - nightingale
42 |     command:
43 |       - "--loggerTimezone=Asia/Shanghai"
44 | 
45 |   nightingale:
46 |     image: flashcatcloud/nightingale:latest
47 |     container_name: nightingale
48 |     hostname: nightingale
49 |     restart: always
50 |     environment:
51 |       GIN_MODE: release
52 |       TZ: Asia/Shanghai
53 |       WAIT_HOSTS: mysql:3306, redis:6379
54 |     volumes:
55 |       - ./nightingale:/app/etc
56 |     networks:
57 |       - nightingale
58 |     ports:
59 |       - "17000:17000"
60 |       - "20090:20090"
61 |     depends_on:
62 |       - mysql
63 |       - redis
64 |       - victoriametrics
65 |     command: >
66 |       sh -c "/app/n9e"
67 | 
68 | # 使用VictoriaMetrics作为数据源配置的Grafana实例
69 |   grafana:
70 |     container_name: grafana
71 |     hostname: grafana
72 |     image: grafana/grafana:10.4.1
73 |     depends_on:
74 |       - "victoriametrics"
75 |     ports:
76 |       - 3000:3000
77 |     volumes:
78 |       - grafanadata:/var/lib/grafana
79 |       - ./provisioning/datasources/prometheus-datasource:/etc/grafana/provisioning/datasources
80 |       - ./provisioning/dashboards:/etc/grafana/provisioning/dashboards
81 |       - ./dashboards/victoriametrics.json:/var/lib/grafana/dashboards/vm.json
82 |     networks:
83 |       - nightingale
84 |     restart: always
85 | 
86 | volumes:
87 |   mysqldata: {}
88 |   grafanadata: {}
89 | 
90 | networks:
91 |   nightingale:
92 |     driver: bridge


--------------------------------------------------------------------------------
/victoriametrics/deploy-n9e/initsql/c-init.sql:
--------------------------------------------------------------------------------
1 | GRANT ALL ON *.* TO 'root'@'127.0.0.1' IDENTIFIED BY '1234';
2 | GRANT ALL ON *.* TO 'root'@'localhost' IDENTIFIED BY '1234';
3 | GRANT ALL ON *.* TO 'root'@'%' IDENTIFIED BY '1234';


--------------------------------------------------------------------------------
/victoriametrics/deploy-n9e/mysql/my.cnf:
--------------------------------------------------------------------------------
1 | [mysqld]
2 | pid-file	 = /var/run/mysqld/mysqld.pid
3 | socket		 = /var/run/mysqld/mysqld.sock
4 | datadir		 = /var/lib/mysql
5 | bind-address = 0.0.0.0


--------------------------------------------------------------------------------
/victoriametrics/deploy-n9e/nightingale/script/notify.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: UTF-8 -*-
 3 | import sys
 4 | import json
 5 | 
 6 | class Sender(object):
 7 |     @classmethod
 8 |     def send_email(cls, payload):
 9 |         # already done in go code
10 |         pass
11 | 
12 |     @classmethod
13 |     def send_wecom(cls, payload):
14 |         # already done in go code
15 |         pass
16 | 
17 |     @classmethod
18 |     def send_dingtalk(cls, payload):
19 |         # already done in go code
20 |         pass
21 | 
22 |     @classmethod
23 |     def send_feishu(cls, payload):
24 |         # already done in go code
25 |         pass
26 | 
27 |     @classmethod
28 |     def send_mm(cls, payload):
29 |         # already done in go code
30 |         pass
31 | 
32 |     @classmethod
33 |     def send_sms(cls, payload):
34 |         users = payload.get('event').get("notify_users_obj")
35 |         phones = {}
36 |         for u in users:
37 |             if u.get("phone"):
38 |                 phones[u.get("phone")] = 1
39 |         if phones:
40 |             print("send_sms not implemented, phones: {}".format(phones.keys()))
41 | 
42 |     @classmethod
43 |     def send_voice(cls, payload):
44 |         users = payload.get('event').get("notify_users_obj")
45 |         phones = {}
46 |         for u in users:
47 |             if u.get("phone"):
48 |                 phones[u.get("phone")] = 1
49 |         if phones:
50 |             print("send_voice not implemented, phones: {}".format(phones.keys()))
51 | 
52 | def main():
53 |     payload = json.load(sys.stdin)
54 |     with open(".payload", 'w') as f:
55 |         f.write(json.dumps(payload, indent=4))
56 |     for ch in payload.get('event').get('notify_channels'):
57 |         send_func_name = "send_{}".format(ch.strip())
58 |         if not hasattr(Sender, send_func_name):
59 |             print("function: {} not found", send_func_name)
60 |             continue
61 |         send_func = getattr(Sender, send_func_name)
62 |         send_func(payload)
63 | 
64 | def hello():
65 |     print("hello nightingale")
66 | 
67 | if __name__ == "__main__":
68 |     if len(sys.argv) == 1:
69 |         main()
70 |     elif sys.argv[1] == "hello":
71 |         hello()
72 |     else:
73 |         print("I am confused")


--------------------------------------------------------------------------------
/victoriametrics/deploy-n9e/nightingale/script/notify_feishu.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: UTF-8 -*-
 3 | import sys
 4 | import json
 5 | import requests
 6 | 
 7 | class Sender(object):
 8 |     @classmethod
 9 |     def send_email(cls, payload):
10 |         # already done in go code
11 |         pass
12 | 
13 |     @classmethod
14 |     def send_wecom(cls, payload):
15 |         # already done in go code
16 |         pass
17 | 
18 |     @classmethod
19 |     def send_dingtalk(cls, payload):
20 |         # already done in go code
21 |         pass
22 | 
23 |     @classmethod
24 |     def send_ifeishu(cls, payload):
25 |         users = payload.get('event').get("notify_users_obj")
26 |         tokens = {}
27 |         phones = {}
28 | 
29 |         for u in users:
30 |             if u.get("phone"):
31 |                 phones[u.get("phone")] = 1
32 | 
33 |             contacts = u.get("contacts")
34 |             if contacts.get("feishu_robot_token", ""):
35 |                 tokens[contacts.get("feishu_robot_token", "")] = 1
36 |         
37 |         headers = {
38 |             "Content-Type": "application/json;charset=utf-8",
39 |             "Host": "open.feishu.cn"
40 |         }
41 | 
42 |         for t in tokens:
43 |             url = "https://open.feishu.cn/open-apis/bot/v2/hook/{}".format(t)
44 |             body = {
45 |                 "msg_type": "text",
46 |                 "content": {
47 |                     "text": payload.get('tpls').get("feishu", "feishu not found")
48 |                 },
49 |                 "at": {
50 |                     "atMobiles": list(phones.keys()),
51 |                     "isAtAll": False
52 |                 }
53 |             }
54 | 
55 |             response = requests.post(url, headers=headers, data=json.dumps(body))
56 |             print(f"notify_ifeishu: token={t} status_code={response.status_code} response_text={response.text}")
57 | 
58 |     @classmethod
59 |     def send_mm(cls, payload):
60 |         # already done in go code
61 |         pass
62 | 
63 |     @classmethod
64 |     def send_sms(cls, payload):
65 |         pass
66 | 
67 |     @classmethod
68 |     def send_voice(cls, payload):
69 |         pass
70 | 
71 | def main():
72 |     payload = json.load(sys.stdin)
73 |     with open(".payload", 'w') as f:
74 |         f.write(json.dumps(payload, indent=4))
75 |     for ch in payload.get('event').get('notify_channels'):
76 |         send_func_name = "send_{}".format(ch.strip())
77 |         if not hasattr(Sender, send_func_name):
78 |             print("function: {} not found", send_func_name)
79 |             continue
80 |         send_func = getattr(Sender, send_func_name)
81 |         send_func(payload)
82 | 
83 | def hello():
84 |     print("hello nightingale")
85 | 
86 | if __name__ == "__main__":
87 |     if len(sys.argv) == 1:
88 |         main()
89 |     elif sys.argv[1] == "hello":
90 |         hello()
91 |     else:
92 |         print("I am confused")


--------------------------------------------------------------------------------
/victoriametrics/deploy-n9e/provisioning/dashboards/dashboard.yml:
--------------------------------------------------------------------------------
 1 | apiVersion: 1
 2 | 
 3 | providers:
 4 | - name: Prometheus
 5 |   orgId: 1
 6 |   folder: ''
 7 |   type: file
 8 |   options:
 9 |     path: /var/lib/grafana/dashboards
10 | 


--------------------------------------------------------------------------------
/victoriametrics/deploy-n9e/provisioning/datasources/prometheus-datasource/prometheus-datasource.yml:
--------------------------------------------------------------------------------
 1 | apiVersion: 1
 2 | 
 3 | datasources:
 4 |     - name: VictoriaMetrics
 5 |       type: prometheus
 6 |       access: proxy
 7 |       url: http://victoriametrics:8428
 8 |       isDefault: true
 9 |       jsonData:
10 |         prometheusType: Prometheus
11 |         prometheusVersion: 2.24.0
12 | 


--------------------------------------------------------------------------------
/victoriametrics/deploy/docker-prometheus/README.md:
--------------------------------------------------------------------------------
 1 | ## docker compose 部署单节点Prometheus
 2 | 
 3 | 使用 Grafana + Prometheus + alertmanager 组合。
 4 | 
 5 | 使用前请修改Grafana配置文件中密码，当前admin密码为admin
 6 | 
 7 | 使用外部配置文件挂载到容器内部
 8 | 
 9 | 文件结构：
10 | 
11 | ```bash
12 | docker-prometheus/
13 | ├── alertmanager
14 | │   └── config.yml
15 | ├── docker-compose.yml
16 | ├── grafana
17 | │   ├── config.monitoring
18 | │   └── provisioning
19 | └── prometheus
20 |     ├── alert.yml
21 |     └── prometheus.yml
22 | ```


--------------------------------------------------------------------------------
/victoriametrics/deploy/docker-prometheus/alertmanager/alertmanager.yml:
--------------------------------------------------------------------------------
 1 | global:
 2 |   # 阿里邮箱
 3 |   smtp_smarthost: 'smtp.qiye.aliyun.com:465'
 4 |   # 发邮件的邮箱
 5 |   smtp_from: 'your-email@example.com'
 6 |   # 发邮件的邮箱用户名，也就是你的邮箱　　　　　
 7 |   smtp_auth_username: 'your-email@example.com'
 8 |   # 发邮件的邮箱密码
 9 |   smtp_auth_password: 'your-password'
10 |   # 进行tls验证
11 |   smtp_require_tls: true
12 | 
13 | route:
14 |   group_by: ['alertname']
15 |   group_wait: 10s
16 |   group_interval: 10s
17 |   repeat_interval: 10m
18 |   receiver: live-monitoring
19 | 
20 | receivers:
21 | - name: 'live-monitoring'
22 |   # 收邮件的邮箱
23 |   email_configs:
24 |   - to: 'your-email@example.com'


--------------------------------------------------------------------------------
/victoriametrics/deploy/docker-prometheus/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | services:
 2 |   # prometheus 部署
 3 |   prometheus:
 4 |     container_name: prometheus
 5 |     image: prom/prometheus:latest
 6 |     restart: always
 7 |     volumes:
 8 |       - /etc/localtime:/etc/localtime:ro
 9 |       - $PWD/prometheus/:/etc/prometheus/
10 |       - prometheus_data:/prometheus
11 |     command:
12 |         - '--config.file=/etc/prometheus/prometheus.yml'
13 |         - '--storage.tsdb.path=/prometheus'
14 |         - '--web.console.libraries=/usr/share/prometheus/console_libraries'
15 |         - '--web.console.templates=/usr/share/prometheus/consoles'
16 |     networks:
17 |         - monitoring
18 |     expose:
19 |       - '9090'
20 |     ports:
21 |       - 9090:9090
22 | 
23 |   # alertmanager 部署
24 |   alertmanager:
25 |     container_name: alertmanager
26 |     image: prom/alertmanager:latest
27 |     restart: always
28 |     volumes:
29 |       - /etc/localtime:/etc/localtime:ro
30 |       - $PWD/alertmanager/:/etc/alertmanager/
31 |     command:
32 |       - '--config.file=/etc/alertmanager/alertmanager.yml'
33 |       - '--storage.path=/alertmanager'
34 |     networks:
35 |       - monitoring
36 |     expose:
37 |       - '9093'
38 |     ports:
39 |       - 9093:9093
40 | 
41 |   # grafana 部署
42 |   grafana:
43 |     container_name: grafana
44 |     image: grafana/grafana:latest
45 |     restart: always
46 |     volumes:
47 |       - /etc/localtime:/etc/localtime:ro
48 |       - grafana_data:/var/lib/grafana
49 |       - $PWD/grafana/provisioning/datasources/prometheus-datasource:/etc/grafana/provisioning/datasources
50 |       - $PWD/grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards
51 |     networks:
52 |       - monitoring
53 |     ports:
54 |       - 3000:3000
55 |     depends_on:
56 |       - prometheus
57 | 
58 | volumes:
59 |   prometheus_data: {}
60 |   grafana_data: {}
61 | 
62 | networks:
63 |   monitoring:
64 |     driver: bridge


--------------------------------------------------------------------------------
/victoriametrics/deploy/docker-prometheus/grafana/provisioning/dashboards/dashboard.yml:
--------------------------------------------------------------------------------
 1 | apiVersion: 1
 2 | 
 3 | providers:
 4 | - name: Prometheus
 5 |   orgId: 1
 6 |   folder: ''
 7 |   type: file
 8 |   options:
 9 |     path: /var/lib/grafana/dashboards
10 | 


--------------------------------------------------------------------------------
/victoriametrics/deploy/docker-prometheus/grafana/provisioning/datasources/prometheus-datasource/prometheus-datasource.yml:
--------------------------------------------------------------------------------
 1 | apiVersion: 1
 2 | 
 3 | datasources:
 4 |     - name: Prometheus
 5 |       type: prometheus
 6 |       access: proxy
 7 |       url: http://prometheus:9090
 8 |       isDefault: true
 9 |       jsonData:
10 |         prometheusType: Prometheus
11 |         prometheusVersion: 2.24.0


--------------------------------------------------------------------------------
/victoriametrics/deploy/docker-prometheus/prometheus/alert.yml:
--------------------------------------------------------------------------------
 1 | groups:
 2 | - name: example
 3 |   rules:
 4 | 
 5 |   # Alert for any instance that is unreachable for >2 minutes.
 6 |   - alert: service_down
 7 |     expr: up == 0
 8 |     for: 2m
 9 |     labels:
10 |       severity: warning
11 |     annotations:
12 |       summary: "Instance {{ $labels.instance }} down"
13 |       description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes."


--------------------------------------------------------------------------------
/victoriametrics/deploy/docker-prometheus/prometheus/prometheus.yml:
--------------------------------------------------------------------------------
 1 | # my global config
 2 | global:
 3 |   scrape_interval:     15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
 4 |   evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
 5 |   # scrape_timeout is set to the global default (10s).
 6 | 
 7 | # Alertmanager configuration
 8 | alerting:
 9 |   alertmanagers:
10 |   - static_configs:
11 |     - targets: ['alertmanager:9093']
12 |       # - alertmanager:9093
13 | 
14 | # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
15 | rule_files:
16 |   - "alert.yml"
17 |   # - "first_rules.yml"
18 |   # - "second_rules.yml"
19 | 
20 | # A scrape configuration containing exactly one endpoint to scrape:
21 | # Here it's Prometheus itself.
22 | scrape_configs:
23 |   # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
24 |   - job_name: 'prometheus'
25 |     # Override the global default and scrape targets from this job every 5 seconds.
26 |     scrape_interval: 5s
27 |     static_configs:
28 |       - targets: ['localhost:9090']


--------------------------------------------------------------------------------
/victoriametrics/deploy/victoriametrics/README.md:
--------------------------------------------------------------------------------
1 | ## docker compose 部署单节点VictoriMetrics


--------------------------------------------------------------------------------
/victoriametrics/deploy/victoriametrics/alert/alerts-health.yml:
--------------------------------------------------------------------------------
 1 | # File contains default list of alerts for various VM components.
 2 | # The following alerts are recommended for use for any VM installation.
 3 | # The alerts below are just recommendations and may require some updates
 4 | # and threshold calibration according to every specific setup.
 5 | groups:
 6 |   - name: vm-health
 7 |     # note the `job` filter and update accordingly to your setup
 8 |     rules:
 9 |       - alert: TooManyRestarts
10 |         expr: changes(process_start_time_seconds{job=~".*(victoriametrics|vmselect|vminsert|vmstorage|vmagent|vmalert|vmsingle|vmalertmanager|vmauth).*"}[15m]) > 2
11 |         labels:
12 |           severity: critical
13 |         annotations:
14 |           summary: "{{ $labels.job }} too many restarts (instance {{ $labels.instance }})"
15 |           description: "Job {{ $labels.job }} (instance {{ $labels.instance }}) has restarted more than twice in the last 15 minutes.
16 |             It might be crashlooping."
17 | 
18 |       - alert: ServiceDown
19 |         expr: up{job=~".*(victoriametrics|vmselect|vminsert|vmstorage|vmagent|vmalert|vmsingle|vmalertmanager|vmauth).*"} == 0
20 |         for: 2m
21 |         labels:
22 |           severity: critical
23 |         annotations:
24 |           summary: "Service {{ $labels.job }} is down on {{ $labels.instance }}"
25 |           description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes."
26 | 
27 |       - alert: ProcessNearFDLimits
28 |         expr: (process_max_fds - process_open_fds) < 100
29 |         for: 5m
30 |         labels:
31 |           severity: critical
32 |         annotations:
33 |           summary: "Number of free file descriptors is less than 100 for \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") for the last 5m"
34 |           description: "Exhausting OS file descriptors limit can cause severe degradation of the process.Consider to increase the limit as fast as possible."
35 | 
36 |       - alert: TooHighMemoryUsage
37 |         expr: (min_over_time(process_resident_memory_anon_bytes[10m]) / vm_available_memory_bytes) > 0.8
38 |         for: 5m
39 |         labels:
40 |           severity: critical
41 |         annotations:
42 |           summary: "It is more than 80% of memory used by \"{{ $labels.job }}\"(\"{{ $labels.instance }}\")"
43 |           description: "Too high memory usage may result into multiple issues such as OOMs or degraded performance.
44 |            Consider to either increase available memory or decrease the load on the process."
45 | 
46 |       - alert: TooHighCPUUsage
47 |         expr: rate(process_cpu_seconds_total[5m]) / process_cpu_cores_available > 0.9
48 |         for: 5m
49 |         labels:
50 |           severity: critical
51 |         annotations:
52 |           summary: "More than 90% of CPU is used by \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") during the last 5m"
53 |           description: "Too high CPU usage may be a sign of insufficient resources and make process unstable.
54 |                Consider to either increase available CPU resources or decrease the load on the process."
55 | 
56 |       - alert: TooManyLogs
57 |         expr: sum(increase(vm_log_messages_total{level="error"}[5m])) without (app_version, location) > 0
58 |         for: 15m
59 |         labels:
60 |           severity: warning
61 |         annotations:
62 |           summary: "Too many logs printed for job \"{{ $labels.job }}\" ({{ $labels.instance }})"
63 |           description: "Logging rate for job \"{{ $labels.job }}\" ({{ $labels.instance }}) is {{ $value }} for last 15m.Worth to check logs for specific error messages."
64 | 
65 |       - alert: TooManyTSIDMisses
66 |         expr: rate(vm_missing_tsids_for_metric_id_total[5m]) > 0
67 |         for: 10m
68 |         labels:
69 |           severity: critical
70 |         annotations:
71 |           summary: "Too many TSID misses for job \"{{ $labels.job }}\" ({{ $labels.instance }})"
72 |           description: "The rate of TSID misses during query lookups is too high for \"{{ $labels.job }}\" ({{ $labels.instance }}).Make sure you're running VictoriaMetrics of v1.85.3 or higher.Related issue https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3502"
73 | 
74 |       - alert: ConcurrentInsertsHitTheLimit
75 |         expr: avg_over_time(vm_concurrent_insert_current[1m]) >= vm_concurrent_insert_capacity
76 |         for: 15m
77 |         labels:
78 |           severity: warning
79 |         annotations:
80 |           summary: "{{ $labels.job }} on instance {{ $labels.instance }} is constantly hitting concurrent inserts limit"
81 |           description: "The limit of concurrent inserts on instance {{ $labels.instance }} depends on the number of CPUs.\n
82 |             Usually, when component constantly hits the limit it is likely the component is overloaded and requires more CPU.
83 |             In some cases for components like vmagent or vminsert the alert might trigger if there are too many clients
84 |             making write attempts. If vmagent's or vminsert's CPU usage and network saturation are at normal level, then 
85 |             it might be worth adjusting `-maxConcurrentInserts` cmd-line flag."


--------------------------------------------------------------------------------
/victoriametrics/deploy/victoriametrics/alert/alerts-vmalert.yml:
--------------------------------------------------------------------------------
 1 | # File contains default list of alerts for vmalert service.
 2 | # The alerts below are just recommendations and may require some updates
 3 | # and threshold calibration according to every specific setup.
 4 | groups:
 5 |   # Alerts group for vmalert assumes that Grafana dashboard
 6 |   # https://grafana.com/grafana/dashboards/14950/ is installed.
 7 |   # Pls update the `dashboard` annotation according to your setup.
 8 |   - name: vmalert
 9 |     interval: 30s
10 |     rules:
11 |       - alert: ConfigurationReloadFailure
12 |         expr: vmalert_config_last_reload_successful != 1
13 |         labels:
14 |           severity: warning
15 |         annotations:
16 |           summary: "Configuration reload failed for vmalert instance {{ $labels.instance }}"
17 |           description: "Configuration hot-reload failed for vmalert on instance {{ $labels.instance }}.
18 |             Check vmalert's logs for detailed error message."
19 | 
20 |       - alert: AlertingRulesError
21 |         expr: sum(increase(vmalert_alerting_rules_errors_total[5m])) without(alertname, id) > 0
22 |         for: 5m
23 |         labels:
24 |           severity: warning
25 |         annotations:
26 |           dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=13&var-instance={{ $labels.instance }}&var-group={{ $labels.group }}"
27 |           summary: "Alerting rules are failing for vmalert instance {{ $labels.instance }}"
28 |           description: "Alerting rules execution is failing for group \"{{ $labels.group }}\".
29 |             Check vmalert's logs for detailed error message."
30 | 
31 |       - alert: RecordingRulesError
32 |         expr: sum(increase(vmalert_recording_rules_errors_total[5m])) without(recording, id) > 0
33 |         for: 5m
34 |         labels:
35 |           severity: warning
36 |         annotations:
37 |           dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=30&var-instance={{ $labels.instance }}&var-group={{ $labels.group }}"
38 |           summary: "Recording rules are failing for vmalert instance {{ $labels.instance }}"
39 |           description: "Recording rules execution is failing for group \"{{ $labels.group }}\".
40 |             Check vmalert's logs for detailed error message."
41 | 
42 |       - alert: RecordingRulesNoData
43 |         expr: sum(vmalert_recording_rules_last_evaluation_samples) without(recording, id) < 1
44 |         for: 30m
45 |         labels:
46 |           severity: info
47 |         annotations:
48 |           dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=33&var-group={{ $labels.group }}"
49 |           summary: "Recording rule {{ $labels.recording }} ({ $labels.group }}) produces no data"
50 |           description: "Recording rule \"{{ $labels.recording }}\" from group \"{{ $labels.group }}\" 
51 |             produces 0 samples over the last 30min. It might be caused by a misconfiguration 
52 |             or incorrect query expression."
53 | 
54 |       - alert: TooManyMissedIterations
55 |         expr: increase(vmalert_iteration_missed_total[5m]) > 0
56 |         for: 15m
57 |         labels:
58 |           severity: warning
59 |         annotations:
60 |           summary: "vmalert instance {{ $labels.instance }} is missing rules evaluations"
61 |           description: "vmalert instance {{ $labels.instance }} is missing rules evaluations for group \"{{ $labels.group }}\".
62 |             The group evaluation time takes longer than the configured evaluation interval. This may result in missed 
63 |             alerting notifications or recording rules samples. Try increasing evaluation interval or concurrency of
64 |             group \"{{ $labels.group }}\". See https://docs.victoriametrics.com/vmalert.html#groups. 
65 |             If rule expressions are taking longer than expected, please see https://docs.victoriametrics.com/Troubleshooting.html#slow-queries."
66 | 
67 |       - alert: RemoteWriteErrors
68 |         expr: increase(vmalert_remotewrite_errors_total[5m]) > 0
69 |         for: 15m
70 |         labels:
71 |           severity: warning
72 |         annotations:
73 |           summary: "vmalert instance {{ $labels.instance }} is failing to push metrics to remote write URL"
74 |           description: "vmalert instance {{ $labels.instance }} is failing to push metrics generated via alerting 
75 |             or recording rules to the configured remote write URL. Check vmalert's logs for detailed error message."
76 | 
77 |       - alert: AlertmanagerErrors
78 |         expr: increase(vmalert_alerts_send_errors_total[5m]) > 0
79 |         for: 15m
80 |         labels:
81 |           severity: warning
82 |         annotations:
83 |           summary: "vmalert instance {{ $labels.instance }} is failing to send notifications to Alertmanager"
84 |           description: "vmalert instance {{ $labels.instance }} is failing to send alert notifications to \"{{ $labels.addr }}\".
85 |             Check vmalert's logs for detailed error message."


--------------------------------------------------------------------------------
/victoriametrics/deploy/victoriametrics/alertmanager/alertmanager.yml:
--------------------------------------------------------------------------------
1 | route:
2 |   receiver: blackhole
3 | 
4 | receivers:
5 |   - name: blackhole


--------------------------------------------------------------------------------
/victoriametrics/deploy/victoriametrics/docker-compose.yml:
--------------------------------------------------------------------------------
  1 | services:
  2 |   #  vmagent 前置代理
  3 |   #  --promscrape.config 参数文件中定义了需要抓取的目标
  4 |   #  --remoteWrite.url 把抓取到的数据转储到时序数据库
  5 |   vmagent:
  6 |     container_name: vmagent
  7 |     image: victoriametrics/vmagent:latest
  8 |     depends_on:
  9 |       - "victoriametrics"
 10 |     ports:
 11 |       - 8429:8429
 12 |     volumes:
 13 |       - vmagentdata:/vmagentdata
 14 |       - ./scrape/prometheus.yml:/etc/prometheus/prometheus.yml
 15 |     command:
 16 |       - "--promscrape.config=/etc/prometheus/prometheus.yml"
 17 |       - "--remoteWrite.url=http://victoriametrics:8428/api/v1/write"
 18 |     networks:
 19 |       - vm_net
 20 |     restart: always
 21 |   
 22 |   # VictoriaMetrics实例，一个负责存储指标和处理读请求的单一进程
 23 |   victoriametrics:
 24 |     container_name: victoriametrics
 25 |     image: victoriametrics/victoria-metrics:stable
 26 |     ports:
 27 |       - 8428:8428
 28 |       - 8089:8089
 29 |       - 8089:8089/udp
 30 |       - 2003:2003
 31 |       - 2003:2003/udp
 32 |       - 4242:4242
 33 |     volumes:
 34 |       - vmdata:/storage
 35 |     command:
 36 |       - "--storageDataPath=/storage"
 37 |       - "--graphiteListenAddr=:2003"
 38 |       - "--opentsdbListenAddr=:4242"
 39 |       - "--httpListenAddr=:8428"
 40 |       - "--influxListenAddr=:8089"
 41 |       - "--vmalert.proxyURL=http://vmalert:8880"
 42 |     networks:
 43 |       - vm_net
 44 |     restart: always
 45 | 
 46 |   # 使用VictoriaMetrics作为数据源配置的Grafana实例
 47 |   grafana:
 48 |     container_name: grafana
 49 |     image: grafana/grafana:latest
 50 |     depends_on:
 51 |       - "victoriametrics"
 52 |     ports:
 53 |       - 3000:3000
 54 |     volumes:
 55 |       - grafanadata:/var/lib/grafana
 56 |       - ./provisioning/datasources/prometheus-datasource:/etc/grafana/provisioning/datasources
 57 |       - ./provisioning/dashboards:/etc/grafana/provisioning/dashboards
 58 |       - ./dashboards/victoriametrics.json:/var/lib/grafana/dashboards/vm.json
 59 |       - ./dashboards/vmagent.json:/var/lib/grafana/dashboards/vmagent.json
 60 |       - ./dashboards/vmalert.json:/var/lib/grafana/dashboards/vmalert.json
 61 |     networks:
 62 |       - vm_net
 63 |     restart: always
 64 | 
 65 |   # vmalert执行警报和记录规则
 66 |   vmalert:
 67 |     container_name: vmalert
 68 |     image: victoriametrics/vmalert:stable
 69 |     depends_on:
 70 |       - "victoriametrics"
 71 |       - "alertmanager"
 72 |     ports:
 73 |       - 8880:8880
 74 |     volumes:
 75 |       - ./alert/alerts.yml:/etc/alerts/alerts.yml
 76 |       - ./alert/alerts-health.yml:/etc/alerts/alerts-health.yml
 77 |       - ./alert/alerts-vmagent.yml:/etc/alerts/alerts-vmagent.yml
 78 |       - ./alert/alerts-vmalert.yml:/etc/alerts/alerts-vmalert.yml
 79 |     command:
 80 |       - "--datasource.url=http://victoriametrics:8428/"
 81 |       - "--remoteRead.url=http://victoriametrics:8428/"
 82 |       - "--remoteWrite.url=http://victoriametrics:8428/"
 83 |       - "--notifier.url=http://alertmanager:9093/"
 84 |       - "--rule=/etc/alerts/*.yml"
 85 |       # 在Grafana中显示警报的来源
 86 |       - "--external.url=http://127.0.0.1:3000" # 容器外的Grafana
 87 |       # 在复制粘贴这行时，请注意在 $expr 中使用 $$ 进行转义
 88 |       - '--external.alert.source=explore?orgId=1&left={"datasource":"VictoriaMetrics","queries":[{"expr":{{$$expr|jsonEscape|queryEscape}},"refId":"A"}],"range":{"from":"now-1h","to":"now"}}'
 89 |     networks:
 90 |       - vm_net
 91 |     restart: always
 92 | 
 93 |   # Alertmanager 接收来自 vmalert 的警报通知
 94 |   # 并根据 --config.file 分发它们
 95 |   alertmanager:
 96 |     container_name: alertmanager
 97 |     image: prom/alertmanager:latest
 98 |     volumes:
 99 |       - ./alertmanager/alertmanager.yml:/config/alertmanager.yml
100 |     command:
101 |       - "--config.file=/config/alertmanager.yml"
102 |     ports:
103 |       - 9093:9093
104 |     networks:
105 |       - vm_net
106 |     restart: always
107 | 
108 | volumes:
109 |   vmagentdata: {}
110 |   vmdata: {}
111 |   grafanadata: {}
112 | networks:
113 |   vm_net:


--------------------------------------------------------------------------------
/victoriametrics/deploy/victoriametrics/provisioning/dashboards/dashboard.yml:
--------------------------------------------------------------------------------
 1 | apiVersion: 1
 2 | 
 3 | providers:
 4 | - name: Prometheus
 5 |   orgId: 1
 6 |   folder: ''
 7 |   type: file
 8 |   options:
 9 |     path: /var/lib/grafana/dashboards
10 | 


--------------------------------------------------------------------------------
/victoriametrics/deploy/victoriametrics/provisioning/datasources/prometheus-datasource/prometheus-datasource.yml:
--------------------------------------------------------------------------------
 1 | apiVersion: 1
 2 | 
 3 | datasources:
 4 |     - name: VictoriaMetrics
 5 |       type: prometheus
 6 |       access: proxy
 7 |       url: http://victoriametrics:8428
 8 |       isDefault: true
 9 |       jsonData:
10 |         prometheusType: Prometheus
11 |         prometheusVersion: 2.24.0
12 | 
13 |     - name: VictoriaMetrics - cluster
14 |       type: prometheus
15 |       access: proxy
16 |       url: http://vmauth:8427/select/0/prometheus
17 |       isDefault: false
18 |       jsonData:
19 |         prometheusType: Prometheus
20 |         prometheusVersion: 2.24.0
21 | 


--------------------------------------------------------------------------------
/victoriametrics/deploy/victoriametrics/provisioning/datasources/victoriametrics-datasource/victoriametrics-datasource.yml:
--------------------------------------------------------------------------------
 1 | apiVersion: 1
 2 | 
 3 | # List of data sources to insert/update depending on what's
 4 | # available in the database.
 5 | datasources:
 6 |   # <string, required> Name of the VictoriaMetrics datasource
 7 |   # displayed in Grafana panels and queries.
 8 |   - name: VictoriaMetrics
 9 |     # <string, required> Sets the data source type.
10 |     type: victoriametrics-datasource
11 |       # <string, required> Sets the access mode, either
12 |       # proxy or direct (Server or Browser in the UI).
13 |     # Some data sources are incompatible with any setting
14 |     # but proxy (Server).
15 |     access: proxy
16 |     # <string> Sets default URL of the single node version of VictoriaMetrics
17 |     url: http://victoriametrics:8428
18 |     # <string> Sets the pre-selected datasource for new panels.
19 |     # You can set only one default data source per organization.
20 |     isDefault: true
21 | 
22 |     # <string, required> Name of the VictoriaMetrics datasource
23 |     # displayed in Grafana panels and queries.
24 |   - name: VictoriaMetrics - cluster
25 |     # <string, required> Sets the data source type.
26 |     type: victoriametrics-datasource
27 |     # <string, required> Sets the access mode, either
28 |     # proxy or direct (Server or Browser in the UI).
29 |     # Some data sources are incompatible with any setting
30 |     # but proxy (Server).
31 |     access: proxy
32 |     # <string> Sets default URL of the cluster version of VictoriaMetrics
33 |     url: http://vmauth:8427/select/0/prometheus
34 |     # <string> Sets the pre-selected datasource for new panels.
35 |     # You can set only one default data source per organization.
36 |     isDefault: false
37 | 


--------------------------------------------------------------------------------
/victoriametrics/deploy/victoriametrics/scrape/prometheus.yml:
--------------------------------------------------------------------------------
 1 | global:
 2 |   scrape_interval: 10s
 3 | 
 4 | scrape_configs:
 5 |   - job_name: 'vmagent'
 6 |     static_configs:
 7 |       - targets: ['vmagent:8429']
 8 |   - job_name: 'victoriametrics'
 9 |     static_configs:
10 |       - targets: ['victoriametrics:8428']
11 |   # - job_name: 'vmalert'
12 |   #   static_configs:
13 |   #     - targets: ['vmalert:8880']


--------------------------------------------------------------------------------
/victoriametrics/deploy/victoriametrics/single-victoriametrics.yml:
--------------------------------------------------------------------------------
 1 | services:
 2 |   #  指标采集
 3 |   #  它从 "--promscrape.config" 中定义的目标中抓取数据
 4 |   #  并将它们转发到 "--remoteWrite.url"
 5 |   vmagent:
 6 |     container_name: vmagent
 7 |     image: victoriametrics/vmagent:v1.100.0
 8 |     depends_on:
 9 |       - "victoriametrics"
10 |     ports:
11 |       - 8429:8429
12 |     volumes:
13 |       - vmagentdata:/vmagentdata
14 |       - ./scrape/prometheus.yml:/etc/prometheus/prometheus.yml
15 |     command:
16 |       - "--promscrape.config=/etc/prometheus/prometheus.yml"
17 |       - "--remoteWrite.url=http://victoriametrics:8428/api/v1/write"
18 |     networks:
19 |       - vm_net
20 |     restart: always
21 |   
22 |   # VictoriaMetrics实例，一个负责存储指标和处理读请求的单一进程
23 |   victoriametrics:
24 |     container_name: victoriametrics
25 |     image: victoriametrics/victoria-metrics:v1.100.0
26 |     ports:
27 |       - 8428:8428
28 |       - 8089:8089
29 |       - 8089:8089/udp
30 |       - 2003:2003
31 |       - 2003:2003/udp
32 |       - 4242:4242
33 |     volumes:
34 |       - vmdata:/storage
35 |     command:
36 |       - "--storageDataPath=/storage"
37 |       - "--graphiteListenAddr=:2003"
38 |       - "--opentsdbListenAddr=:4242"
39 |       - "--httpListenAddr=:8428"
40 |       - "--influxListenAddr=:8089"
41 |     networks:
42 |       - vm_net
43 |     restart: always
44 | 
45 |   # 使用VictoriaMetrics作为数据源配置的Grafana实例
46 |   grafana:
47 |     container_name: grafana
48 |     image: grafana/grafana:10.4.1
49 |     depends_on:
50 |       - "victoriametrics"
51 |     ports:
52 |       - 3000:3000
53 |     volumes:
54 |       - grafanadata:/var/lib/grafana
55 |       - ./provisioning/datasources/prometheus-datasource:/etc/grafana/provisioning/datasources
56 |       - ./provisioning/dashboards:/etc/grafana/provisioning/dashboards
57 |       - ./dashboards/victoriametrics.json:/var/lib/grafana/dashboards/vm.json
58 |       - ./dashboards/vmagent.json:/var/lib/grafana/dashboards/vmagent.json
59 |     networks:
60 |       - vm_net
61 |     restart: always
62 | 
63 | volumes:
64 |   vmagentdata: {}
65 |   vmdata: {}
66 |   grafanadata: {}
67 | networks:
68 |   vm_net:


--------------------------------------------------------------------------------
/victoriametrics/promxy/alert/alerts-health.yml:
--------------------------------------------------------------------------------
 1 | # File contains default list of alerts for various VM components.
 2 | # The following alerts are recommended for use for any VM installation.
 3 | # The alerts below are just recommendations and may require some updates
 4 | # and threshold calibration according to every specific setup.
 5 | groups:
 6 |   - name: vm-health
 7 |     # note the `job` filter and update accordingly to your setup
 8 |     rules:
 9 |       - alert: TooManyRestarts
10 |         expr: changes(process_start_time_seconds{job=~".*(victoriametrics|vmselect|vminsert|vmstorage|vmagent|vmalert|vmsingle|vmalertmanager|vmauth).*"}[15m]) > 2
11 |         labels:
12 |           severity: critical
13 |         annotations:
14 |           summary: "{{ $labels.job }} too many restarts (instance {{ $labels.instance }})"
15 |           description: "Job {{ $labels.job }} (instance {{ $labels.instance }}) has restarted more than twice in the last 15 minutes.
16 |             It might be crashlooping."
17 | 
18 |       - alert: ServiceDown
19 |         expr: up{job=~".*(victoriametrics|vmselect|vminsert|vmstorage|vmagent|vmalert|vmsingle|vmalertmanager|vmauth).*"} == 0
20 |         for: 2m
21 |         labels:
22 |           severity: critical
23 |         annotations:
24 |           summary: "Service {{ $labels.job }} is down on {{ $labels.instance }}"
25 |           description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes."
26 | 
27 |       - alert: ProcessNearFDLimits
28 |         expr: (process_max_fds - process_open_fds) < 100
29 |         for: 5m
30 |         labels:
31 |           severity: critical
32 |         annotations:
33 |           summary: "Number of free file descriptors is less than 100 for \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") for the last 5m"
34 |           description: "Exhausting OS file descriptors limit can cause severe degradation of the process.Consider to increase the limit as fast as possible."
35 | 
36 |       - alert: TooHighMemoryUsage
37 |         expr: (min_over_time(process_resident_memory_anon_bytes[10m]) / vm_available_memory_bytes) > 0.8
38 |         for: 5m
39 |         labels:
40 |           severity: critical
41 |         annotations:
42 |           summary: "It is more than 80% of memory used by \"{{ $labels.job }}\"(\"{{ $labels.instance }}\")"
43 |           description: "Too high memory usage may result into multiple issues such as OOMs or degraded performance.
44 |            Consider to either increase available memory or decrease the load on the process."
45 | 
46 |       - alert: TooHighCPUUsage
47 |         expr: rate(process_cpu_seconds_total[5m]) / process_cpu_cores_available > 0.9
48 |         for: 5m
49 |         labels:
50 |           severity: critical
51 |         annotations:
52 |           summary: "More than 90% of CPU is used by \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") during the last 5m"
53 |           description: "Too high CPU usage may be a sign of insufficient resources and make process unstable.
54 |                Consider to either increase available CPU resources or decrease the load on the process."
55 | 
56 |       - alert: TooManyLogs
57 |         expr: sum(increase(vm_log_messages_total{level="error"}[5m])) without (app_version, location) > 0
58 |         for: 15m
59 |         labels:
60 |           severity: warning
61 |         annotations:
62 |           summary: "Too many logs printed for job \"{{ $labels.job }}\" ({{ $labels.instance }})"
63 |           description: "Logging rate for job \"{{ $labels.job }}\" ({{ $labels.instance }}) is {{ $value }} for last 15m.Worth to check logs for specific error messages."
64 | 
65 |       - alert: TooManyTSIDMisses
66 |         expr: rate(vm_missing_tsids_for_metric_id_total[5m]) > 0
67 |         for: 10m
68 |         labels:
69 |           severity: critical
70 |         annotations:
71 |           summary: "Too many TSID misses for job \"{{ $labels.job }}\" ({{ $labels.instance }})"
72 |           description: "The rate of TSID misses during query lookups is too high for \"{{ $labels.job }}\" ({{ $labels.instance }}).Make sure you're running VictoriaMetrics of v1.85.3 or higher.Related issue https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3502"
73 | 
74 |       - alert: ConcurrentInsertsHitTheLimit
75 |         expr: avg_over_time(vm_concurrent_insert_current[1m]) >= vm_concurrent_insert_capacity
76 |         for: 15m
77 |         labels:
78 |           severity: warning
79 |         annotations:
80 |           summary: "{{ $labels.job }} on instance {{ $labels.instance }} is constantly hitting concurrent inserts limit"
81 |           description: "The limit of concurrent inserts on instance {{ $labels.instance }} depends on the number of CPUs.\n
82 |             Usually, when component constantly hits the limit it is likely the component is overloaded and requires more CPU.
83 |             In some cases for components like vmagent or vminsert the alert might trigger if there are too many clients
84 |             making write attempts. If vmagent's or vminsert's CPU usage and network saturation are at normal level, then 
85 |             it might be worth adjusting `-maxConcurrentInserts` cmd-line flag."


--------------------------------------------------------------------------------
/victoriametrics/promxy/alert/alerts-vmalert.yml:
--------------------------------------------------------------------------------
 1 | # File contains default list of alerts for vmalert service.
 2 | # The alerts below are just recommendations and may require some updates
 3 | # and threshold calibration according to every specific setup.
 4 | groups:
 5 |   # Alerts group for vmalert assumes that Grafana dashboard
 6 |   # https://grafana.com/grafana/dashboards/14950/ is installed.
 7 |   # Pls update the `dashboard` annotation according to your setup.
 8 |   - name: vmalert
 9 |     interval: 30s
10 |     rules:
11 |       - alert: ConfigurationReloadFailure
12 |         expr: vmalert_config_last_reload_successful != 1
13 |         labels:
14 |           severity: warning
15 |         annotations:
16 |           summary: "Configuration reload failed for vmalert instance {{ $labels.instance }}"
17 |           description: "Configuration hot-reload failed for vmalert on instance {{ $labels.instance }}.
18 |             Check vmalert's logs for detailed error message."
19 | 
20 |       - alert: AlertingRulesError
21 |         expr: sum(increase(vmalert_alerting_rules_errors_total[5m])) without(alertname, id) > 0
22 |         for: 5m
23 |         labels:
24 |           severity: warning
25 |         annotations:
26 |           dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=13&var-instance={{ $labels.instance }}&var-group={{ $labels.group }}"
27 |           summary: "Alerting rules are failing for vmalert instance {{ $labels.instance }}"
28 |           description: "Alerting rules execution is failing for group \"{{ $labels.group }}\".
29 |             Check vmalert's logs for detailed error message."
30 | 
31 |       - alert: RecordingRulesError
32 |         expr: sum(increase(vmalert_recording_rules_errors_total[5m])) without(recording, id) > 0
33 |         for: 5m
34 |         labels:
35 |           severity: warning
36 |         annotations:
37 |           dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=30&var-instance={{ $labels.instance }}&var-group={{ $labels.group }}"
38 |           summary: "Recording rules are failing for vmalert instance {{ $labels.instance }}"
39 |           description: "Recording rules execution is failing for group \"{{ $labels.group }}\".
40 |             Check vmalert's logs for detailed error message."
41 | 
42 |       - alert: RecordingRulesNoData
43 |         expr: sum(vmalert_recording_rules_last_evaluation_samples) without(recording, id) < 1
44 |         for: 30m
45 |         labels:
46 |           severity: info
47 |         annotations:
48 |           dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=33&var-group={{ $labels.group }}"
49 |           summary: "Recording rule {{ $labels.recording }} ({ $labels.group }}) produces no data"
50 |           description: "Recording rule \"{{ $labels.recording }}\" from group \"{{ $labels.group }}\" 
51 |             produces 0 samples over the last 30min. It might be caused by a misconfiguration 
52 |             or incorrect query expression."
53 | 
54 |       - alert: TooManyMissedIterations
55 |         expr: increase(vmalert_iteration_missed_total[5m]) > 0
56 |         for: 15m
57 |         labels:
58 |           severity: warning
59 |         annotations:
60 |           summary: "vmalert instance {{ $labels.instance }} is missing rules evaluations"
61 |           description: "vmalert instance {{ $labels.instance }} is missing rules evaluations for group \"{{ $labels.group }}\".
62 |             The group evaluation time takes longer than the configured evaluation interval. This may result in missed 
63 |             alerting notifications or recording rules samples. Try increasing evaluation interval or concurrency of
64 |             group \"{{ $labels.group }}\". See https://docs.victoriametrics.com/vmalert.html#groups. 
65 |             If rule expressions are taking longer than expected, please see https://docs.victoriametrics.com/Troubleshooting.html#slow-queries."
66 | 
67 |       - alert: RemoteWriteErrors
68 |         expr: increase(vmalert_remotewrite_errors_total[5m]) > 0
69 |         for: 15m
70 |         labels:
71 |           severity: warning
72 |         annotations:
73 |           summary: "vmalert instance {{ $labels.instance }} is failing to push metrics to remote write URL"
74 |           description: "vmalert instance {{ $labels.instance }} is failing to push metrics generated via alerting 
75 |             or recording rules to the configured remote write URL. Check vmalert's logs for detailed error message."
76 | 
77 |       - alert: AlertmanagerErrors
78 |         expr: increase(vmalert_alerts_send_errors_total[5m]) > 0
79 |         for: 15m
80 |         labels:
81 |           severity: warning
82 |         annotations:
83 |           summary: "vmalert instance {{ $labels.instance }} is failing to send notifications to Alertmanager"
84 |           description: "vmalert instance {{ $labels.instance }} is failing to send alert notifications to \"{{ $labels.addr }}\".
85 |             Check vmalert's logs for detailed error message."


--------------------------------------------------------------------------------
/victoriametrics/promxy/alertmanager/alertmanager.yml:
--------------------------------------------------------------------------------
1 | route:
2 |   receiver: blackhole
3 | 
4 | receivers:
5 |   - name: blackhole


--------------------------------------------------------------------------------
/victoriametrics/promxy/cmd/promxy/config.yaml:
--------------------------------------------------------------------------------
  1 | ##
  2 | ## 常规的 Prometheus 配置
  3 | ##
  4 | global:
  5 |   # 评估间隔
  6 |   evaluation_interval: 5s
  7 |   # 外部标签
  8 |   external_labels:
  9 |     source: promxy
 10 | 
 11 | # 规则文件指定一组通配符。所有匹配的文件中都会读取规则和警报。
 12 | rule_files:
 13 | - "*rule"
 14 | 
 15 | # Alerting 指定与 Alertmanager 相关的设置。
 16 | alerting:
 17 |   alertmanagers:
 18 |   - scheme: http
 19 |     static_configs:
 20 |     - targets:
 21 |       - "127.0.0.1:12345"
 22 | 
 23 | # remote_write 配置用于 promxy 作为其本地 Appender 使用，意味着 promxy 将发送所有“写入”（而不是导出）的指标到这里。
 24 | # 这些包括：记录规则、警报规则上的指标等。
 25 | remote_write:
 26 |   - url: http://localhost:8083/receive
 27 | 
 28 | ##
 29 | ### Promxy 配置
 30 | ##
 31 | promxy:
 32 |   server_groups:
 33 |     # 所有上游 Prometheus 服务发现机制都使用相同的标记，全部在 https://github.com/prometheus/prometheus/blob/master/discovery/config/config.go#L33 中定义
 34 |     - static_configs:
 35 |         - targets:
 36 |           - localhost:9090
 37 |       # 要添加到从此 server_group 检索到的指标的标签
 38 |       labels:
 39 |         sg: localhost_9090
 40 |       # 在 server_group 中的主机之间合并时间序列值的反亲和性
 41 |       anti_affinity: 10s
 42 |       # 等待服务器响应头的时间，单位毫秒
 43 |       timeout: 5s
 44 |       # 控制是否使用 remote_read 还是 prom API 用于获取远程 RAW 数据（例如矩阵选择器）
 45 |       # 注意，某些 Prometheus 实现（例如 VictoriaMetrics）不支持 remote_read。
 46 |       remote_read: true
 47 |       # 配置发送远程读取请求的路径。默认为 "api/v1/read"
 48 |       remote_read_path: api/v1/read
 49 |       # path_prefix 定义要添加到此 servergroup 中所有查询的前缀
 50 |       # 这可以使用 __path_prefix__ 进行重标记
 51 |       path_prefix: /example/prefix
 52 |       # query_params 将以下查询参数映射添加到下游请求。
 53 |       # 最初的用例是将 `nocache=1` 添加到 VictoriaMetrics 下游
 54 |       query_params:
 55 |         nocache: 1
 56 |       # 配置用于请求的协议方案。默认为 http
 57 |       scheme: http
 58 |       # promxy 与 server_groups 中的主机通信时的 HTTP 客户端选项
 59 |       http_client:
 60 |         # 连接下游的等待时间，默认为 200 毫秒。
 61 |         dial_timeout: 1s
 62 |         tls_config:
 63 |           insecure_skip_verify: true
 64 | 
 65 |       # relative_time_range 定义相对于当前时间的时间范围，此 server_group 包含该范围内的数据。
 66 |       # 这是完全可选的，start/end 也都是可选的
 67 |       # 例如，如果此 servergroup 仅包含最近的 3 小时数据
 68 |       # "start" 将为 -3h，而 end 将被省略
 69 |       relative_time_range:
 70 |         start: -3h
 71 |         end: -1h
 72 |         truncate: false
 73 | 
 74 |       # 在合并样本流时，将优先考虑给定时间戳的最大值
 75 |       prefer_max: false
 76 | 
 77 |       # absolute_time_range 定义此 server_group 包含的绝对时间范围。
 78 |       # 这是完全可选的，start/end 也都是可选的
 79 |       # 例如，如果 servergroup 已被弃用且不再接收数据
 80 |       # 您可以设置其具有数据的特定时间。
 81 |       absolute_time_range:
 82 |         start: '2009-10-10T23:00:00Z'
 83 |         end: '2009-10-11T23:00:00Z'
 84 |         truncate: true
 85 | 
 86 |     # 可以有任意数量的其他 server_groups
 87 |     - static_configs:
 88 |         - targets:
 89 |           - localhost:9091
 90 |       labels:
 91 |         sg: localhost_9091
 92 |       anti_affinity: 10s
 93 |       scheme: http
 94 |       http_client:
 95 |         tls_config:
 96 |           insecure_skip_verify: true
 97 |       # ignore_error 将使给定安全组的响应“可选”
 98 |       # 这意味着如果此 servergroup 返回错误而其他 servergroup 不返回，则总体查询仍然可以成功
 99 |       ignore_error: true
100 | 


--------------------------------------------------------------------------------
/victoriametrics/promxy/docker-compose.yaml:
--------------------------------------------------------------------------------
 1 | services:
 2 |   #  指标采集
 3 |   #  它从 "--promscrape.config" 中定义的目标中抓取数据
 4 |   #  并将它们转发到 "--remoteWrite.url"
 5 |   vmagent:
 6 |     container_name: vmagent
 7 |     image: victoriametrics/vmagent:v1.100.0
 8 |     depends_on:
 9 |       - "victoriametrics"
10 |     ports:
11 |       - 8429:8429
12 |     volumes:
13 |       - vmagentdata:/vmagentdata
14 |       - ./scrape/prometheus.yml:/etc/prometheus/prometheus.yml
15 |     command:
16 |       - "--promscrape.config=/etc/prometheus/prometheus.yml"
17 |       - "--remoteWrite.url=http://victoriametrics:8428/api/v1/write"
18 |     networks:
19 |       - vm_net
20 |     restart: always
21 |   
22 |   # VictoriaMetrics实例，一个负责存储指标和处理读请求的单一进程
23 |   victoriametrics:
24 |     container_name: victoriametrics
25 |     image: victoriametrics/victoria-metrics:v1.100.0
26 |     ports:
27 |       - 8428:8428
28 |       - 8089:8089
29 |       - 8089:8089/udp
30 |       - 2003:2003
31 |       - 2003:2003/udp
32 |       - 4242:4242
33 |     volumes:
34 |       - vmdata:/storage
35 |     command:
36 |       - "--storageDataPath=/storage"
37 |       - "--graphiteListenAddr=:2003"
38 |       - "--opentsdbListenAddr=:4242"
39 |       - "--httpListenAddr=:8428"
40 |       - "--influxListenAddr=:8089"
41 |     networks:
42 |       - vm_net
43 |     restart: always
44 |   
45 |   promxy:
46 |     container_name: promxy
47 |     image: quay.io/jacksontj/promxy
48 |     hostname: promxy
49 |     ports:
50 |       - "8082:8082"
51 |     volumes:
52 |       - promxydata:/var/log
53 |       - ./cmd/promxy/config.yaml:/etc/promxy/config.yaml
54 |     command:
55 |       - --config=/etc/promxy/config.yaml
56 |       - --log-level=info
57 |       - --web.enable-lifecycle
58 |     networks:
59 |       - vm_net
60 | 
61 |   # 使用VictoriaMetrics作为数据源配置的Grafana实例
62 |   grafana:
63 |     container_name: grafana
64 |     image: grafana/grafana:10.4.1
65 |     depends_on:
66 |       - "victoriametrics"
67 |     ports:
68 |       - 3000:3000
69 |     volumes:
70 |       - grafanadata:/var/lib/grafana
71 |       - ./provisioning/datasources/prometheus-datasource:/etc/grafana/provisioning/datasources
72 |       - ./provisioning/dashboards:/etc/grafana/provisioning/dashboards
73 |       - ./dashboards/victoriametrics.json:/var/lib/grafana/dashboards/vm.json
74 |       - ./dashboards/vmagent.json:/var/lib/grafana/dashboards/vmagent.json
75 |     networks:
76 |       - vm_net
77 |     restart: always
78 | 
79 | volumes:
80 |   vmagentdata: {}
81 |   vmdata: {}
82 |   promxydata: {}
83 |   grafanadata: {}
84 | networks:
85 |   vm_net:


--------------------------------------------------------------------------------
/victoriametrics/promxy/install-promxy.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | set -e
  3 | 
  4 | # 函数：安装依赖工具
  5 | install_dependencies() {
  6 |     if [ "$OS" == "ubuntu" ] || [ "$OS" == "debian" ]; then
  7 |         apt-get update && apt-get install -y curl wget net-tools jq
  8 |     elif [ "$OS" == "centos" ]; then
  9 |         yum update && yum install -y curl wget net-tools jq
 10 |     else
 11 |         echo "Unsupported operating system."
 12 |         exit 1
 13 |     fi
 14 | }
 15 | 
 16 | # 函数：设置系统服务和用户
 17 | setup_system() {
 18 |     # 创建promxy配置文件目录
 19 |     mkdir -p /etc/promxy
 20 |     # 创建promxy数据保存目录
 21 |     mkdir -p /var/lib/promxy
 22 | 
 23 |     # 检查promxy组是否存在，不存在则创建
 24 |     if ! getent group promxy > /dev/null 2>&1; then
 25 |         groupadd --system promxy
 26 |     fi
 27 | 
 28 |     # 检查promxy用户是否存在，不存在则创建
 29 |     if ! id -u promxy > /dev/null 2>&1; then
 30 |         useradd --system --home-dir /var/lib/promxy --no-create-home --gid promxy promxy
 31 |     fi
 32 | 
 33 |     chown -R promxy:promxy /var/lib/promxy
 34 | }
 35 | 
 36 | # 确定操作系统类型
 37 | OS="unknown"
 38 | if [ -f /etc/os-release ]; then
 39 |     . /etc/os-release
 40 |     OS=$ID
 41 | fi
 42 | 
 43 | # 安装依赖工具
 44 | install_dependencies
 45 | 
 46 | # 设置系统服务和用户
 47 | setup_system
 48 | 
 49 | # 获取Promxy最新版本
 50 | PROMXY_VERSION=$(curl -s "https://api.github.com/repos/jacksontj/promxy/tags" | jq -r '.[0].name')
 51 | 
 52 | # 下载并安装Promxy
 53 | wget https://github.com/jacksontj/promxy/releases/download/${PROMXY_VERSION}/promxy-${PROMXY_VERSION}-linux-amd64 -O /usr/local/promxy
 54 | chmod +x /usr/local/promxy
 55 | 
 56 | cat> /etc/systemd/system/promxy.service <<EOF
 57 | [Unit]
 58 | Description=promxy is a Prometheus proxy server
 59 | After=network.target
 60 | 
 61 | [Service]
 62 | Type=simple
 63 | User=promxy
 64 | Group=promxy
 65 | WorkingDirectory=/var/lib/promxy
 66 | ReadWritePaths=/var/lib/promxy
 67 | StartLimitBurst=5
 68 | StartLimitInterval=0
 69 | Restart=on-failure
 70 | RestartSec=5
 71 | ExecStart=/usr/local/promxy --bind-addr=:8082 --config=/etc/promxy/config.yaml
 72 | ExecStop=/bin/kill -s SIGTERM \$MAINPID
 73 | ExecReload=/bin/kill -HUP \$MAINPID
 74 | ProtectSystem=full
 75 | LimitNOFILE=1048576
 76 | LimitNPROC=1048576
 77 | LimitCORE=infinity
 78 | StandardOutput=syslog
 79 | StandardError=syslog
 80 | SyslogIdentifier=promxy
 81 | PrivateTmp=yes
 82 | ProtectHome=yes
 83 | NoNewPrivileges=yes
 84 | ProtectSystem=strict
 85 | ProtectControlGroups=true
 86 | ProtectKernelModules=true
 87 | ProtectKernelTunables=yes
 88 | 
 89 | [Install]
 90 | WantedBy=multi-user.target
 91 | EOF
 92 | 
 93 | cat> /etc/promxy/config.yaml <<EOF
 94 | promxy:
 95 |   server_groups:
 96 |     - static_configs:
 97 |         - targets:
 98 |           - 192.168.1.102:8428 # 如果涉及多个VM节点，继续下面追加即可
 99 |       path_prefix: /prometheus # 追加请求前缀
100 | EOF
101 | 
102 | chown -R promxy:promxy /var/lib/promxy
103 | chown -R promxy:promxy /etc/promxy
104 | 
105 | systemctl enable promxy.service
106 | systemctl restart promxy.service
107 | ps aux | grep promxy
108 | 
109 | echo "promxy installation and service setup complete."
110 | 


--------------------------------------------------------------------------------
/victoriametrics/promxy/provisioning/dashboards/dashboard.yml:
--------------------------------------------------------------------------------
 1 | apiVersion: 1
 2 | 
 3 | providers:
 4 | - name: Prometheus
 5 |   orgId: 1
 6 |   folder: ''
 7 |   type: file
 8 |   options:
 9 |     path: /var/lib/grafana/dashboards
10 | 


--------------------------------------------------------------------------------
/victoriametrics/promxy/provisioning/datasources/prometheus-datasource/prometheus-datasource.yml:
--------------------------------------------------------------------------------
 1 | apiVersion: 1
 2 | 
 3 | datasources:
 4 |     - name: VictoriaMetrics
 5 |       type: prometheus
 6 |       access: proxy
 7 |       url: http://victoriametrics:8428
 8 |       isDefault: true
 9 |       jsonData:
10 |         prometheusType: Prometheus
11 |         prometheusVersion: 2.24.0
12 | 
13 |     - name: VictoriaMetrics - cluster
14 |       type: prometheus
15 |       access: proxy
16 |       url: http://vmauth:8427/select/0/prometheus
17 |       isDefault: false
18 |       jsonData:
19 |         prometheusType: Prometheus
20 |         prometheusVersion: 2.24.0
21 | 


--------------------------------------------------------------------------------
/victoriametrics/promxy/scrape/prometheus.yml:
--------------------------------------------------------------------------------
 1 | global:
 2 |   scrape_interval: 10s
 3 | 
 4 | scrape_configs:
 5 |   - job_name: 'vmagent'
 6 |     static_configs:
 7 |       - targets: ['vmagent:8429']
 8 |   - job_name: 'victoriametrics'
 9 |     static_configs:
10 |       - targets: ['victoriametrics:8428']
11 |   # - job_name: 'vmalert'
12 |   #   static_configs:
13 |   #     - targets: ['vmalert:8880']


--------------------------------------------------------------------------------