├── .gitignore ├── LICENSE ├── README.md ├── UPDATE.md ├── generator.png ├── generator ├── README.md ├── f5 │ └── generator_ad.yml ├── h3c │ ├── generator-demo.yml │ ├── generator.yml │ ├── switch │ │ └── generator_h3c_sw.yml │ └── wireless │ │ ├── README.md │ │ ├── generator-demo.yml │ │ ├── generator_h3c_wireless.yml │ │ ├── grafana.json │ │ ├── h3c-ac.yml │ │ └── prometheus.yml ├── hillstone │ └── firewall │ │ └── generator_hillstone_firewall.yml ├── huawei │ ├── generator.yml │ ├── generator_v3.yml │ ├── switch │ │ ├── README.md │ │ ├── generator-demo.yml │ │ ├── generator_huawei_switch.yml │ │ ├── grafana.json │ │ ├── network-switch.yml │ │ └── prometheus.yml │ └── wireless │ │ ├── README.md │ │ ├── generator-demo.yml │ │ ├── grafana.json │ │ ├── huawei-ac.yml │ │ └── prometheus.yml ├── ruijie │ └── wireless │ │ ├── README.md │ │ ├── generator-demo.yml │ │ ├── generator-ruijie.yml │ │ ├── grafana-ruijie.json │ │ ├── prometheus.yml │ │ └── ruijie-ac.yml ├── sangfor │ ├── ac │ │ ├── README.md │ │ ├── generator.yml │ │ └── sangfor-ac.txt │ ├── ad │ │ ├── README.md │ │ ├── generator.yml │ │ ├── grafana.json │ │ └── prometheus.yml │ └── af │ │ └── README.md ├── synology │ ├── README.md │ ├── generator │ │ └── generator_synology_nas.yml │ ├── grafana │ │ ├── Synology NAS Details Dashboard for Prometheus.json │ │ └── Synology NAS Overview Dashboard for Prometheus.json │ ├── img │ │ ├── 1.jpg │ │ ├── image-1.png │ │ ├── image.png │ │ └── qrcode.jpg │ └── snmp │ │ └── snmp_synology_nas.yml └── test │ ├── generator_demo.yaml │ ├── generator_demo.yml │ ├── generator_haikang_monitor.yml │ ├── generator_huawei_switch.yml │ └── snmp.yml ├── prometheus └── rules │ ├── prod │ ├── blackbox.yml │ ├── idrac-status.yml │ ├── node-exporter.yml │ ├── sangfor-ad-status.yml │ ├── switch-status.yml │ └── windows-status.yml │ └── vm │ ├── alerts-health.yml │ ├── alerts-vmagent.yml │ ├── alerts-vmalert.yml │ ├── alerts-vmauth.yml │ └── alerts.yml └── victoriametrics ├── README.md ├── 
binary ├── PrometheusAlert │ ├── README.md │ └── install-prometheusalert.sh ├── alertmanager │ ├── README.md │ └── install-alertmanager.sh ├── blackbox_exporter │ ├── README.md │ └── install-blackbox.sh ├── categraf │ ├── README.md │ ├── install-categraf-cgo.sh │ ├── install-categraf.sh │ └── update-config.sh ├── grafana │ └── README.md ├── network_exporter │ ├── README.md │ └── install-network.sh ├── node_exporter │ ├── README.md │ └── install-node.sh ├── prometheus │ ├── README.md │ └── install-promsingle.sh ├── victoriametrics │ ├── README.md │ ├── install-vmsingle.sh │ └── vmsingle.conf ├── vmagent │ ├── README.md │ ├── install-vmagent.sh │ └── vmagent.conf ├── vmalert │ ├── README.md │ ├── install-vmalert.sh │ └── vmalert.conf ├── vmauth │ ├── README.md │ ├── install-vmauth.sh │ ├── vmauth.conf │ └── vmauth.service ├── vminsert │ ├── README.md │ ├── install-vminsert.sh │ ├── vminsert.conf │ └── vminsert.service ├── vmselect │ ├── README.md │ ├── install-vmselect.sh │ ├── vmselect.conf │ └── vmselect.service └── vmstorage │ ├── README.md │ ├── install-vmstorage.sh │ ├── vmstorage.config │ └── vmstorage.service ├── deploy-cluster ├── alertmanager │ └── alertmanager.yml ├── dashboards │ ├── victoriametrics-cluster.json │ ├── vmagent.json │ └── vmalert.json ├── docker-compose.yml ├── provisioning │ ├── dashboards │ │ └── dashboard.yml │ └── datasources │ │ └── prometheus-datasource │ │ └── prometheus-datasource.yml ├── vmagent │ └── prometheus-cluster.yml ├── vmalert │ ├── alerts-cluster.yml │ ├── alerts-health.yml │ ├── alerts-vmagent.yml │ └── alerts-vmalert.yml └── vmauth │ └── auth-cluster.yml ├── deploy-n9e ├── compose.yml ├── dashboards │ └── victoriametrics.json ├── initsql │ ├── a-n9e.sql │ └── c-init.sql ├── mysql │ └── my.cnf ├── nightingale │ ├── config.toml │ ├── metrics.yml │ └── script │ │ ├── notify.py │ │ ├── notify_feishu.py │ │ └── rule_converter.py └── provisioning │ ├── dashboards │ └── dashboard.yml │ └── datasources │ └── 
prometheus-datasource │ └── prometheus-datasource.yml ├── deploy ├── docker-prometheus │ ├── README.md │ ├── alertmanager │ │ └── alertmanager.yml │ ├── docker-compose.yml │ ├── grafana │ │ └── provisioning │ │ │ ├── dashboards │ │ │ └── dashboard.yml │ │ │ └── datasources │ │ │ └── prometheus-datasource │ │ │ └── prometheus-datasource.yml │ └── prometheus │ │ ├── alert.yml │ │ └── prometheus.yml └── victoriametrics │ ├── README.md │ ├── alert │ ├── alerts-health.yml │ ├── alerts-vmagent.yml │ ├── alerts-vmalert.yml │ └── alerts.yml │ ├── alertmanager │ └── alertmanager.yml │ ├── dashboards │ ├── victoriametrics.json │ ├── vmagent.json │ └── vmalert.json │ ├── docker-compose.yml │ ├── provisioning │ ├── dashboards │ │ └── dashboard.yml │ └── datasources │ │ ├── prometheus-datasource │ │ └── prometheus-datasource.yml │ │ └── victoriametrics-datasource │ │ └── victoriametrics-datasource.yml │ ├── scrape │ └── prometheus.yml │ └── single-victoriametrics.yml └── promxy ├── alert ├── alerts-health.yml ├── alerts-vmagent.yml ├── alerts-vmalert.yml └── alerts.yml ├── alertmanager └── alertmanager.yml ├── cmd └── promxy │ └── config.yaml ├── dashboards ├── victoriametrics.json ├── vmagent.json └── vmalert.json ├── docker-compose.yaml ├── install-promxy.sh ├── provisioning ├── dashboards │ └── dashboard.yml └── datasources │ └── prometheus-datasource │ └── prometheus-datasource.yml └── scrape └── prometheus.yml /.gitignore: -------------------------------------------------------------------------------- 1 | # 忽略所有 .DS_Store 文件 2 | .DS_Store -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 yanghua 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including 
without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### 关于网络设备的监控和告警(基于Prometheus + SNMP Exporter + Grafana) 2 | 3 | - 适配品牌类型:华为无线、华三无线、锐捷无线;欢迎有相关品牌的无线产品资源的联系我 4 | - 目前还更新适配:深信服AD,华为交换机,华三交换机,锐捷交换机等 5 | - [snmp_exporter](https://github.com/prometheus/snmp_exporter)版本:0.25.0 6 | 7 | #### 更新日志 8 | 9 | 版本更新日志: 10 | 11 | [更新日志](UPDATE.md) 12 | 13 | #### 目录介绍 14 | 15 | - 顶级目录下面是各品牌的英文名称,如:h3c、huawei、ruijie等 16 | - 品牌名称下就是mibs文件夹,放置了相关品牌的mib库文件 17 | - 品牌目录下的info.txt是说明信息,generator.yml文件是已经适配对应品牌并测试好的常规SNMP导出配置生成器,里面的指标都是常用无线数据指标:AC的CPU使用率、内存使用率、温度、启动时间等,AP的内存使用率、CPU使用率、温度、状态、上线时间、承载用户数、型号、名称、IP、MAC等指标数据,详情指标直接到generator.yml中查看。 18 | 19 | > 如果generator.yml文件中的指标不满足你的监控需求,可自定义编写,添加自定义指标,满足自身监控需求,也可以反馈issue中,如果我觉得合适会添加适配。 20 | 21 | #### 使用配置 22 | 23 | ##### 前提 24 | 25 | - Prometheus搭建好,这里我不提供搭建教程,如有需要可到我知乎和微信公众号查看:网络小斐。 26 | - AC配置好SNMP Agent,推荐使用v2c版本,如果对安全需求很大可开启v3版本。 27 | - 准备好一台单独的Linux服务器,系统推荐CentOS 7.9,用来单独部署SNMP Exporter。 28 | 29 | ##### 搭建 30 | 31 | 
Linux首先需要部署git,当然你也可以直接从github下载源码包,上传到服务器中,这里默认用git拉snmp_exporter源码包到服务器本地。 32 | 33 | ```bash 34 | Ubuntu下载依赖包: 35 | sudo apt-get install unzip build-essential libsnmp-dev 36 | 37 | CentOS下载依赖包: 38 | sudo yum install gcc gcc-c++ make net-snmp net-snmp-utils net-snmp-libs net-snmp-devel 39 | ``` 40 | 41 | 这里用CentOS 7.9作为演示: 42 | 43 | ```bash 44 | # 下载git 45 | sudo yum install -y git curl wget 46 | # curl 更新 47 | yum -y install epel-release 48 | wget http://mirror.city-fan.org/ftp/contrib/yum-repo/rhel7/x86_64/city-fan.org-release-3-9.rhel7.noarch.rpm 49 | rpm -ivh city-fan.org-release-3-9.rhel7.noarch.rpm 50 | 51 | vim /etc/yum.repos.d/city-fan.org.repo 52 | 53 | # 把enabled=0修改为enabled=1 54 | [city-fan.org] 55 | name=city-fan.org repository for Red Hat Enterprise Linux (and clones) $releasever ($basearch) 56 | #baseurl=http://mirror.city-fan.org/ftp/contrib/yum-repo/rhel$releasever/$basearch 57 | mirrorlist=http://mirror.city-fan.org/ftp/contrib/yum-repo/mirrorlist-rhel$releasever 58 | enabled=1 59 | gpgcheck=1 60 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-city-fan.org file:///etc/pki/rpm-gpg/RPM-GPG-KEY-city-fan.org-rhel-7 61 | 62 | 63 | yum update curl --enablerepo=city-fan.org -y 64 | curl --version 65 | 66 | # 安装golang 1.20.x https://golang.google.cn/dl/ 67 | wget https://golang.google.cn/dl/go1.20.8.linux-amd64.tar.gz 68 | # 解压安装 69 | tar -zxvf go1.20.8.linux-amd64.tar.gz -C /usr/local 70 | # 将go添加到环境变量 71 | vim /etc/profile 72 | 73 | if [ -n "${BASH_VERSION-}" ] ; then 74 | if [ -f /etc/bashrc ] ; then 75 | # Bash login shells run only /etc/profile 76 | # Bash non-login shells run only /etc/bashrc 77 | # Check for double sourcing is done in /etc/bashrc. 78 | . 
/etc/bashrc 79 | fi 80 | fi 81 | #go 环境变量 82 | export GO111MODULE=on 83 | export GOPROXY=https://goproxy.cn,direct 84 | export GOROOT=/usr/local/go 85 | export GOPATH=$HOME/go 86 | export PATH=$PATH:$GOROOT/bin:$GOPATH/bin 87 | 88 | # 应用环境变量 89 | source /etc/profile 90 | 91 | # 拉取snmp_exporter 92 | git clone https://github.com/prometheus/snmp_exporter.git 93 | # 进入目录snmp_exporter 94 | cd snmp_exporter/ 95 | # 构建snmp_exporter二进制可执行文件 96 | go build 97 | # 查看生成的二进制可执行文件 98 | ls -lsh snmp_exporter 99 | 100 | # 进入生成器目录构建二进制可执行文件(此时已在 snmp_exporter 目录内) 101 | cd generator/ 102 | # 国内网络下载mib公共库报错 忽略即可 make: *** [mibs/apc-powernet-mib] 错误 22 103 | make generator mibs 104 | 105 | # mibs文件夹中放入对应品牌的无线设备mib库文件即可 106 | # 把对应的generator.yml文件放入 ../snmp_exporter/generator/ 目录下 107 | export MIBDIRS=/root/snmp_exporter/generator/mibs 108 | ./generator --fail-on-parse-errors generate 109 | 110 | mv snmp.yml ../ 111 | 112 | # 重启snmp_exporter 113 | systemctl restart snmp_exporter 114 | ``` 115 | ##### ./generator generate 案例 116 | 117 | ![generate](generator.png) 118 | 119 | ##### Prometheus.yml如何添加Job 120 | 121 | 查看目录中prometheus.yml文件中配置案例 122 | 123 | grafana.json只是根据案例中的指标写出的json模版,适配每个环境下的监控需要做一定的修改。 124 | 125 | grafana模版针对AP上的在线终端数,AP的CPU利用率和内存利用率做了排序,前20优先显示在Grafana中。 -------------------------------------------------------------------------------- /UPDATE.md: -------------------------------------------------------------------------------- 1 | #### 记录版本更新日志 2 | 3 | - 2023年9月19日 17:00 第一版发布华为和华三无线监控 4 | - 2023年9月19日 20:00 更新grafana查询语句,修改AP的终端数、CPU、内存等展示排序前20 解决多AP(30以上)下展示混乱问题 5 | - 2023年9月20日 15:30 更新深信服AD相关版本的监控 6 | - 2023年9月28日 11:00 更新第一版锐捷无线相关的 generator.yml 和 prometheus.yml 的配置信息 7 | - 2023年11月16日 14:00 更新第二版锐捷无线相关的 generator.yml 和 grafana.json 的配置信息 -------------------------------------------------------------------------------- /generator.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/robotneo/networkdevice-monitor/97baface3f3c458e687fb43f4b537b2236185453/generator.png -------------------------------------------------------------------------------- /generator/README.md: -------------------------------------------------------------------------------- 1 | # 关于物理机器和网络设备利用SNMP协议获取设备信息,通过snmp_exporter生成器生成snmp.yaml采集OID配置文件。 2 | 3 | ## 模块说明 4 | 5 | generator.yml 文件是针对 generator 目录下所有设备信息采集的配置文件,统一生成 SNMP 设备的 snmp.yml 指标采集文件。 -------------------------------------------------------------------------------- /generator/f5/generator_ad.yml: -------------------------------------------------------------------------------- 1 | auths: 2 | f5_auth: 3 | version: 2 4 | community: public 5 | 6 | modules: 7 | # F5负载均衡常规指标 8 | f5_common: 9 | walk: 10 | - 1.3.6.1.4.1.3375.2.1.6.7 # sysSystemUptimeInSec 运行时间 11 | - 1.3.6.1.2.1.1.5 # sysName 主机名 12 | - 1.3.6.1.2.1.1.1 # sysDescr 系统信息 13 | - 1.3.6.1.4.1.3375.2.1.4.2 # sysProductVersion 系统版本 14 | - 1.3.6.1.4.1.3375.2.1.4.4 # sysProductEdition 系统补丁版本 15 | - 1.3.6.1.4.1.3375.2.1.3.2.1.2.1.2 # sysChassisFanStatus 风扇状态 16 | - 1.3.6.1.4.1.3375.2.1.3.2.1.2.1.3 # sysChassisFanSpeed 风扇速度 17 | - 1.3.6.1.4.1.3375.2.1.3.2.2.2.1.2 # sysChassisPowerSupplyStatus 电源状态 18 | - 1.3.6.1.4.1.3375.2.1.3.2.3.2.1.2 # sysChassisTempTemperature 设备温度传感器 19 | 20 | max_repetitions: 25 21 | retries: 3 22 | timeout: 5s 23 | 24 | lookups: 25 | # 风扇下标 sysChassisFanIndex 26 | # 电源下标 sysChassisPowerSupplyIndex 27 | # 温度传感器下标 sysChassisTempIndex 28 | - source_indexes: [sysChassisFanIndex] 29 | lookup: sysChassisFanSpeed 30 | 31 | overrides: 32 | sysChassisFanSpeed: 33 | ignore: false 34 | 35 | # F5负载均衡接口信息 36 | f5_interface: 37 | walk: 38 | - 1.3.6.1.4.1.3375.2.1.2.4.4.3.1.1 # sysInterfaceStatName 接口名称 39 | - 1.3.6.1.4.1.3375.2.1.2.4.4.3.1.2 # sysInterfaceStatPktsIn 接口下行包总数 40 | - 1.3.6.1.4.1.3375.2.1.2.4.4.3.1.3 # sysInterfaceStatBytesIn 接口下行包总字节 41 | - 1.3.6.1.4.1.3375.2.1.2.4.4.3.1.4 # sysInterfaceStatPktsOut 接口上行包总数 
42 | - 1.3.6.1.4.1.3375.2.1.2.4.4.3.1.5 # sysInterfaceStatBytesOut 接口上行包总字节 43 | - 1.3.6.1.4.1.3375.2.1.2.4.4.3.1.8 # sysInterfaceStatErrorsIn 接口下行错包 44 | - 1.3.6.1.4.1.3375.2.1.2.4.4.3.1.9 # sysInterfaceStatErrorsOut 接口上行错包 45 | - 1.3.6.1.4.1.3375.2.1.2.4.4.3.1.10 # sysInterfaceStatDropsIn 接口下行丢包 46 | - 1.3.6.1.4.1.3375.2.1.2.4.4.3.1.11 # sysInterfaceStatDropsOut 接口上行丢包 47 | - 1.3.6.1.4.1.3375.2.1.2.4.4.3.1.15 # sysInterfaceStatPauseActive 接口流控制帧状态 48 | 49 | max_repetitions: 25 50 | retries: 3 51 | timeout: 5s 52 | 53 | lookups: 54 | # 接口下标 sysInterfaceStatName 55 | - source_indexes: [sysInterfaceStatName] 56 | lookup: sysInterfaceStatPauseActive 57 | 58 | overrides: 59 | sysInterfaceStatPauseActive: 60 | ignore: true 61 | 62 | # F5负载均衡应用信息 63 | f5_app: 64 | walk: 65 | - 1.3.6.1.4.1.3375.2.2.5.1.2.1.23 # ltmPoolMemberCnt 指定池成员总数 66 | - 1.3.6.1.4.1.3375.2.2.5.1.2.1.8 # ltmPoolActiveMemberCnt 指定池活跃成员总数 67 | - 1.3.6.1.4.1.3375.2.2.5.2.3.1.31 # ltmPoolStatCurSessions 指定池的当前会话数量 68 | - 1.3.6.1.4.1.3375.2.2.10.1.2.1.9 # ltmVirtualServEnabled 虚拟服务器状态 69 | - 1.3.6.1.4.1.3375.2.2.10.2.3.1.12 # ltmVirtualServStatClientCurConns 客户端到当前虚拟服务器连接数 70 | - 1.3.6.1.4.1.3375.2.2.10.2.3.1.33 # ltmVirtualServStatVsUsageRatio5m 虚拟服务器5分钟使用率 71 | 72 | max_repetitions: 25 73 | retries: 3 74 | timeout: 5s 75 | 76 | lookups: 77 | 78 | overrides: -------------------------------------------------------------------------------- /generator/h3c/generator-demo.yml: -------------------------------------------------------------------------------- 1 | auths: 2 | # 认证模块名 3 | h3c_auth: 4 | # SNMP版本使用v2c版本 5 | version: 2 6 | # SNMP协议v2c版本设置团体名为public 7 | community: public 8 | 9 | modules: 10 | # 华三公共指标模块名称 核心层/汇聚层/接入层 11 | h3c_common: 12 | walk: 13 | # 交换机基础信息 14 | - 1.3.6.1.2.1.1.1 # sysDescr - 设备描述 15 | - 1.3.6.1.2.1.1.5 # sysName - 系统名称 16 | - 1.3.6.1.2.1.1.3 # sysUpTime - 设备上电时间 17 | # 实体CPU和内存信息 18 | - 1.3.6.1.4.1.25506.2.6.1.1.1.1.6 # hh3cEntityExtCpuUsage - 实体 CPU 实时利用率 19 | - 
1.3.6.1.4.1.25506.2.6.1.1.1.1.8 # hh3cEntityExtMemUsage - 实体内存实时利用率百分比 20 | # 实体风扇和电源状态信息 21 | - 1.3.6.1.2.1.47.1.1.1.1.5 # entPhysicalClass - 实体类型 22 | - 1.3.6.1.2.1.47.1.1.1.1.7 # entPhysicalName - 实体名称 23 | # prometheus 通过合并查询实现 24 | - 1.3.6.1.4.1.25506.2.6.1.1.1.1.19 # hh3cEntityExtErrorStatus - 实体错误状态 25 | # 实体传感器温度信息 26 | - 1.3.6.1.4.1.25506.2.6.1.1.1.1.12 # hh3cEntityExtTemperature - 实体温度 27 | # 存储介质信息 28 | - 1.3.6.1.4.1.25506.2.5.1.1.4.1.1.4 # hh3cFlhPartSpace - 存储设备分区容量 单位byte 29 | - 1.3.6.1.4.1.25506.2.5.1.1.4.1.1.5 # hh3cFlhPartSpaceFree - 存储介质分区大小 30 | - 1.3.6.1.4.1.25506.2.5.1.1.4.1.1.10 # hh3cFlhPartName - 存储设备分区名称 31 | 32 | max_repetitions: 20 33 | retries: 3 34 | timeout: 5s 35 | 36 | lookups: 37 | # hh3cEntityExtPhysicalIndex = entPhysicalIndex 38 | - source_indexes: [entPhysicalIndex] 39 | lookup: entPhysicalClass 40 | - source_indexes: [entPhysicalIndex] 41 | lookup: entPhysicalName 42 | 43 | overrides: 44 | entPhysicalClass: 45 | ignore: true 46 | 47 | # 华三交换机堆叠模块 48 | h3c_stack: # 核心层/汇聚层 49 | walk: 50 | # 堆叠信息 51 | - 1.3.6.1.4.1.25506.2.91.1.7 # hh3cStackTopology - 堆叠系统的拓扑类型 52 | - 1.3.6.1.4.1.25506.2.91.1.2 # hh3cStackMemberNum - 本IRF系统目前包含的堆叠设备数量 53 | 54 | max_repetitions: 10 55 | retries: 3 56 | timeout: 5s 57 | 58 | # 华三交换机接口信息模块 59 | h3c_interface: # 核心层/汇聚层/接入层 60 | walk: 61 | # 接口信息 - 索引 ifIndex 62 | - 1.3.6.1.2.1.2.2.1.2 # ifDescr - 接口描述 63 | - 1.3.6.1.2.1.31.1.1.1.18 # ifAlias - 接口别名 64 | - 1.3.6.1.2.1.31.1.1.1.1 # ifName - 接口名字 65 | - 1.3.6.1.2.1.2.2.1.6 # ifPhysAddress - 接口物理地址 66 | - 1.3.6.1.2.1.2.2.1.7 # ifAdminStatus - 接口默认状态 67 | - 1.3.6.1.2.1.2.2.1.8 # ifOperStatus - 接口运行状态 68 | - 1.3.6.1.2.1.2.2.1.13 # ifInDiscards - 入方向丢包统计 69 | - 1.3.6.1.2.1.2.2.1.14 # ifInErrors - 入方向错包统计 70 | - 1.3.6.1.2.1.2.2.1.19 # ifOutDiscards - 出方向丢包统计 71 | - 1.3.6.1.2.1.2.2.1.20 # ifOutErrors - 出方向错包统计 72 | - 1.3.6.1.2.1.31.1.1.1.6 # ifHCInOctets - 入方向报文统计 73 | - 1.3.6.1.2.1.31.1.1.1.10 # ifHCOutOctets - 出方向报文统计 74 | - 1.3.6.1.2.1.31.1.1.1.15 # 
ifHighSpeed - 接口当前带宽 75 | 76 | # 光模块信息 - 索引 ifIndex 77 | - 1.3.6.1.4.1.25506.2.70.1.1.1.9 # hh3cTransceiverCurTXPower 光模块当前的发送光功率 单位为百分之一dBM 78 | - 1.3.6.1.4.1.25506.2.70.1.1.1.12 # hh3cTransceiverCurRXPower 光模块当前的接收功率 单位为百分之一dBM 79 | - 1.3.6.1.4.1.25506.2.70.1.1.1.15 # hh3cTransceiverTemperature 光模块当前的温度 单位为摄氏度 80 | - 1.3.6.1.4.1.25506.2.70.1.1.1.20 # hh3cTransceiverTempHiWarn 温度预警上限值,单位为千分之一摄氏度 81 | - 1.3.6.1.4.1.25506.2.70.1.1.1.32 # hh3cTransceiverPwrOutHiWarn 输出功率预警上限值 单位为十分之一微瓦 为0时代表不支持 82 | - 1.3.6.1.4.1.25506.2.70.1.1.1.33 # hh3cTransceiverPwrOutLoWarn 输出功率预警下限值,单位为十分之一微瓦 83 | - 1.3.6.1.4.1.25506.2.70.1.1.1.36 # hh3cTransceiverRcvPwrHiWarn 输入功率预警上限值,单位为十分之一微瓦 84 | - 1.3.6.1.4.1.25506.2.70.1.1.1.37 # hh3cTransceiverRcvPwrLoWarn 输入功率预警下限值,单位为十分之一微瓦 85 | 86 | max_repetitions: 50 87 | retries: 3 88 | timeout: 5s 89 | 90 | lookups: 91 | - source_indexes: [ifIndex] 92 | lookup: ifDescr 93 | - source_indexes: [ifIndex] 94 | lookup: ifAlias 95 | - source_indexes: [ifIndex] 96 | lookup: ifName 97 | - source_indexes: [ifIndex] 98 | lookup: ifPhysAddress 99 | - source_indexes: [ifIndex] 100 | lookup: ifAdminStatus 101 | - source_indexes: [ifIndex] 102 | lookup: ifOperStatus 103 | - source_indexes: [ifIndex] 104 | lookup: ifHighSpeed 105 | 106 | overrides: 107 | ifDescr: 108 | ignore: true 109 | ifAlias: 110 | ignore: true 111 | ifName: 112 | ignore: true 113 | ifPhysAddress: 114 | ignore: true 115 | ifAdminStatus: 116 | ignore: true 117 | ifOperStatus: 118 | ignore: true 119 | ifHighSpeed: 120 | ignore: true -------------------------------------------------------------------------------- /generator/h3c/switch/generator_h3c_sw.yml: -------------------------------------------------------------------------------- 1 | auths: 2 | h3c_auth: # 认证模块名称 3 | version: 2 # snmp v2c版本 4 | community: public # snmp 团体名 5 | 6 | modules: 7 | h3c_common: # 华三公共指标模块名称 8 | walk: 9 | # 交换机基础信息 10 | - 1.3.6.1.2.1.1.1 # sysDescr - 设备描述 11 | - 1.3.6.1.2.1.1.5 # sysName - 系统名称 12 | - 
1.3.6.1.2.1.1.3 # sysUpTime - 设备上电时间 13 | # 实体CPU和内存信息 14 | - 1.3.6.1.4.1.25506.2.6.1.1.1.1.6 # hh3cEntityExtCpuUsage - 实体 CPU 实时利用率 15 | - 1.3.6.1.4.1.25506.2.6.1.1.1.1.8 # hh3cEntityExtMemUsage - 实体内存实时利用率百分比 16 | # 实体风扇和电源状态信息 17 | - 1.3.6.1.2.1.47.1.1.1.1.5 # entPhysicalClass - 实体类型 18 | - 1.3.6.1.2.1.47.1.1.1.1.7 # entPhysicalName - 实体名称 19 | # prometheus 通过合并查询实现 20 | - 1.3.6.1.4.1.25506.2.6.1.1.1.1.19 # hh3cEntityExtErrorStatus - 实体错误状态 21 | # 实体传感器温度信息 22 | - 1.3.6.1.4.1.25506.2.6.1.1.1.1.12 # hh3cEntityExtTemperature - 实体温度 23 | # 存储介质信息 24 | - 1.3.6.1.4.1.25506.2.5.1.1.4.1.1.4 # hh3cFlhPartSpace - 存储设备分区容量 单位byte 25 | - 1.3.6.1.4.1.25506.2.5.1.1.4.1.1.5 # hh3cFlhPartSpaceFree - 存储介质分区大小 26 | - 1.3.6.1.4.1.25506.2.5.1.1.4.1.1.10 # hh3cFlhPartName - 存储设备分区名称 27 | 28 | max_repetitions: 20 29 | retries: 3 30 | timeout: 5s 31 | 32 | lookups: 33 | # hh3cEntityExtPhysicalIndex = entPhysicalIndex 34 | - source_indexes: [entPhysicalIndex] 35 | lookup: entPhysicalClass 36 | - source_indexes: [entPhysicalIndex] 37 | lookup: entPhysicalName 38 | 39 | overrides: 40 | entPhysicalClass: 41 | ignore: true 42 | 43 | h3c_stack: # 华三堆叠信息 44 | walk: 45 | # 堆叠信息 46 | - 1.3.6.1.4.1.25506.2.91.1.7 # hh3cStackTopology - 堆叠系统的拓扑类型 47 | - 1.3.6.1.4.1.25506.2.91.1.2 # hh3cStackMemberNum - 本IRF系统目前包含的堆叠设备数量 48 | 49 | max_repetitions: 20 50 | retries: 3 51 | timeout: 5s 52 | 53 | h3c_interface: 54 | walk: 55 | # 接口信息 - 索引 ifIndex 56 | - 1.3.6.1.2.1.2.2.1.2 # ifDescr - 接口描述 57 | - 1.3.6.1.2.1.31.1.1.1.18 # ifAlias - 接口别名 58 | - 1.3.6.1.2.1.31.1.1.1.1 # ifName - 接口名字 59 | - 1.3.6.1.2.1.2.2.1.6 # ifPhysAddress - 接口物理地址 60 | - 1.3.6.1.2.1.2.2.1.7 # ifAdminStatus - 接口默认状态 61 | - 1.3.6.1.2.1.2.2.1.8 # ifOperStatus - 接口运行状态 62 | - 1.3.6.1.2.1.2.2.1.13 # ifInDiscards - 入方向丢包统计 63 | - 1.3.6.1.2.1.2.2.1.14 # ifInErrors - 入方向错包统计 64 | - 1.3.6.1.2.1.2.2.1.19 # ifOutDiscards - 出方向丢包统计 65 | - 1.3.6.1.2.1.2.2.1.20 # ifOutErrors - 出方向错包统计 66 | - 1.3.6.1.2.1.31.1.1.1.6 # ifHCInOctets - 入方向报文统计 67 | - 
1.3.6.1.2.1.31.1.1.1.10 # ifHCOutOctets - 出方向报文统计 68 | - 1.3.6.1.2.1.31.1.1.1.15 # ifHighSpeed - 接口当前带宽 69 | 70 | # 光模块信息 - 索引 ifIndex 71 | - hh3cTransceiverInfoTable 72 | 73 | max_repetitions: 60 74 | retries: 3 75 | timeout: 5s 76 | 77 | lookups: 78 | - source_indexes: [ifIndex] 79 | lookup: ifDescr 80 | - source_indexes: [ifIndex] 81 | lookup: ifAlias 82 | - source_indexes: [ifIndex] 83 | lookup: ifName 84 | - source_indexes: [ifIndex] 85 | lookup: ifPhysAddress 86 | - source_indexes: [ifIndex] 87 | lookup: ifAdminStatus 88 | - source_indexes: [ifIndex] 89 | lookup: ifOperStatus 90 | - source_indexes: [ifIndex] 91 | lookup: ifHighSpeed 92 | 93 | overrides: 94 | ifDescr: 95 | ignore: true 96 | ifAlias: 97 | ignore: true 98 | ifName: 99 | ignore: true 100 | ifPhysAddress: 101 | ignore: true 102 | ifAdminStatus: 103 | ignore: true 104 | ifOperStatus: 105 | ignore: true 106 | ifHighSpeed: 107 | ignore: true -------------------------------------------------------------------------------- /generator/h3c/wireless/README.md: -------------------------------------------------------------------------------- 1 | 本目录中generator.yml是适配了华三无线H3C WX3500X系列的无线控制器。 2 | 已完成测试:H3C WX3510X 其他型号未做测试,理论上讲WX3500X系列通用。 3 | 4 | 华三无线产品mib库下载链接: 5 | 根据对应的系统版本下载对应的MIB,如:Comware V7 6 | 7 | 下载路径:首页 > 产品与解决方案 > 智能联接 > 操作系统 > ComwareV7 > MIB > MIB 8 | 9 | 链接:https://www.h3c.com/cn/d_201806/1089291_473262_0.htm 10 | 11 | 12 | 根据对应的系统版本下载对应的MIB,如:Comware V5 13 | 14 | 下载路径:首页 > 产品与解决方案 > 智能联接 > 操作系统 > ComwareV5 > MIB > MIB 15 | 16 | 链接:https://www.h3c.com/cn/d_200905/635750_473262_0.htm 17 | 18 | 华三无线产品mib OID信息参考链接: 19 | 根据对应的版本做参考,如:Comware V7 或者找客服拿对应的mib OID对照表 20 | 链接: 21 | 22 | mibs文件夹中,我已经提前下载好Comware V7的mib库文件,需要自行解压得到mib后缀的文件。 23 | 24 | 推荐版本1(Comware V7):Comware-V7-MIB.zip 25 | 推荐版本2(Comware V5):Comware-V5-MIB.zip 26 | 27 | 至于下载那个版本的,需要查看你AC中目前的对应什么版本号。 28 | 通过测试的H3C WX2560H、H3C WX3510X用的是Comware-V7-MIB.zip 理论上其他版本AC也通用。 
-------------------------------------------------------------------------------- /generator/h3c/wireless/generator-demo.yml: -------------------------------------------------------------------------------- 1 | auths: # 认证模块 2 | public_v2: # 认证模块名称 可自定义 在prometheus.yml中需要配置参数auth对应这个名称 3 | version: 2 # 定义SNMP Agent的版本为v2c 支持v3 4 | community: public # SNMP Agent的团体名设置和AC中设置的团体名需一致 5 | 6 | modules: # 指标模块 7 | H3C_AC: # 指标模块名称 可自定义 8 | walk: 9 | - 1.3.6.1.2.1.1.3 # SysUpTime - 设备运行时间 10 | - 1.3.6.1.2.1.2.2.1.1 # ifIndex - 接口索引 11 | - 1.3.6.1.2.1.2.2.1.2 # ifDescr - 接口描述 12 | - 1.3.6.1.2.1.2.2.1.5 # ifSpeed - 接口带宽 13 | - 1.3.6.1.2.1.2.2.1.8 # ifOperStatus - 接口当前状态 14 | - 1.3.6.1.2.1.2.2.1.13 # ifInDiscards - 接口接收丢弃包 15 | - 1.3.6.1.2.1.2.2.1.14 # ifInErrors - 接口接收错误包数 16 | - 1.3.6.1.2.1.2.2.1.19 # ifOutDiscards - 接口发送丢弃包 17 | - 1.3.6.1.2.1.2.2.1.20 # ifOutErrors - 接受发生错误包 18 | - 1.3.6.1.2.1.31.1.1.1.1 # ifName - AC接口名称 19 | - 1.3.6.1.2.1.31.1.1.1.6 # ifHCInOctets - AC接口接收字节数 20 | - 1.3.6.1.2.1.31.1.1.1.10 # ifHCOutOctets - AC接口发送字节数 21 | - 1.3.6.1.2.1.31.1.1.1.15 # ifHighSpeed - 接口带宽 22 | - 1.3.6.1.2.1.31.1.1.1.18 # ifAlias - 接口别名 23 | # - 1.3.6.1.4.1.25506.2.75.1.1.2.3.1.1 # hh3cDot11ACIfIndex - AC接口索引 24 | #- 1.3.6.1.4.1.25506.2.75.2.1.1.1.2 # hh3cDot11APIPAddress - AP的IP 25 | #- 1.3.6.1.4.1.25506.2.75.2.1.1.1.3 # hh3cDot11APMacAddress -AP的Mac 26 | #- 1.3.6.1.4.1.25506.2.75.2.1.1.1.7 # hh3cDot11APCpuUsage - AP的CPU实时利用率 27 | - 1.3.6.1.4.1.25506.2.75.2.1.1.1.1 # hh3cDot11APID - AP接口索引ID 28 | - 1.3.6.1.4.1.25506.2.75.2.1.1.1.4 # hh3cDot11APOperationStatus - AP与AC的关联状态 29 | - 1.3.6.1.4.1.25506.2.75.2.1.1.1.5 # hh3cDot11APTemplateNameOfAP - 设定的AP名称 30 | - 1.3.6.1.4.1.25506.2.75.2.1.6.1.1 # hh3cDot11APIfIndex hh3cDot11APObjID - AP接口索引 31 | - 1.3.6.1.4.1.25506.2.75.2.1.2.1.2 # hh3cDot11CurrAPIPAddress - AP的IP 32 | - 1.3.6.1.4.1.25506.2.75.2.1.2.1.3 # hh3cDot11CurrAPMacAddress - AP的Mac 33 | - 1.3.6.1.4.1.25506.2.75.2.1.2.1.9 # hh3cDot11CurrAPModelName - AP类型名称 34 | - 
1.3.6.1.4.1.25506.2.75.1.1.2.1 # hh3cDot11APConnectCount - 当前AC连接的AP总数量 - gauge 35 | - 1.3.6.1.4.1.25506.2.75.1.1.2.2 # hh3cDot11StationConnectCount - 当前所有AP在线终端总数 - gauge 36 | - 1.3.6.1.4.1.25506.2.75.2.1.2.1.7 # hh3cDot11CurrAPStationAssocCount - AP当前连接STA数量 - gauge 37 | - 1.3.6.1.4.1.25506.2.75.2.1.10.1.1 # hh3cDot11APSysUpTime2 - AP启动时间 - gauge 38 | - 1.3.6.1.4.1.25506.2.75.2.1.10.1.2 # hh3cDot11APCPURTUsage2 - AP实时CPU利用率(周期1分钟) 39 | - 1.3.6.1.4.1.25506.2.75.2.1.10.1.4 # hh3cDot11APMemRTUsage2 - AP实时内存利用率(周期1分钟) 40 | - 1.3.6.1.4.1.25506.2.6.1.1.1.1.6 # hh3cEntityExtCpuUsage - AC的CPU实时利用率 - gauge 41 | - 1.3.6.1.4.1.25506.2.6.1.1.1.1.8 # hh3cEntityExtMemUsage - AC的内存实时利用率 - gauge 42 | 43 | max_repetitions: 25 44 | retries: 3 45 | timeout: 5s 46 | 47 | lookups: 48 | - source_indexes: [ifIndex] 49 | lookup: ifAlias 50 | - source_indexes: [ifIndex] 51 | lookup: ifDescr 52 | - source_indexes: [ifIndex] 53 | lookup: ifOperStatus 54 | - source_indexes: [hh3cDot11APObjID] 55 | lookup: 1.3.6.1.4.1.25506.2.75.2.1.1.1.5 56 | drop_source_indexes: true 57 | - source_indexes: [hh3cDot11APObjID] 58 | lookup: 1.3.6.1.4.1.25506.2.75.2.1.1.1.4 59 | drop_source_indexes: true 60 | - source_indexes: [hh3cDot11APObjID] 61 | lookup: 1.3.6.1.4.1.25506.2.75.2.1.2.1.9 62 | drop_source_indexes: true 63 | - source_indexes: [hh3cDot11APObjID] 64 | lookup: 1.3.6.1.4.1.25506.2.75.2.1.2.1.2 65 | drop_source_indexes: true 66 | - source_indexes: [hh3cDot11APObjID] 67 | lookup: 1.3.6.1.4.1.25506.2.75.2.1.2.1.3 68 | drop_source_indexes: true 69 | 70 | overrides: 71 | ifAlias: 72 | ignore: true 73 | ifDescr: 74 | ignore: true 75 | ifOperStatus: 76 | ignore: true 77 | hh3cDot11APTemplateNameOfAP: 78 | ignore: true 79 | type: DisplayString 80 | hh3cDot11APOperationStatus: 81 | ignore: true 82 | hh3cDot11CurrAPMacAddress: 83 | ignore: true 84 | hh3cDot11CurrAPModelName: 85 | ignore: true 86 | type: DisplayString 87 | hh3cDot11CurrAPIPAddress: 88 | ignore: true 
-------------------------------------------------------------------------------- /generator/h3c/wireless/generator_h3c_wireless.yml: -------------------------------------------------------------------------------- 1 | auths: 2 | h3c_ac: 3 | version: 2 4 | community: public 5 | 6 | modules: 7 | h3c_wireless: 8 | walk: 9 | # AC基本信息 10 | - 1.3.6.1.2.1.1.1 # sysDescr - 设备描述 11 | - 1.3.6.1.2.1.1.5 # sysName - 系统名称 12 | - 1.3.6.1.2.1.1.3 # sysUpTime - 设备上电时间 13 | # AC接口信息 14 | - 1.3.6.1.2.1.2.2.1.1 # ifIndex - 接口索引 15 | - 1.3.6.1.2.1.2.2.1.2 # ifDescr - 接口描述 16 | - 1.3.6.1.2.1.31.1.1.1.18 # ifAlias - 接口别名 17 | - 1.3.6.1.2.1.31.1.1.1.1 # ifName - 接口名称 18 | - 1.3.6.1.2.1.31.1.1.1.15 # ifHighSpeed - 接口带宽 19 | - 1.3.6.1.2.1.2.2.1.8 # ifOperStatus - 接口当前状态 20 | - 1.3.6.1.2.1.2.2.1.13 # ifInDiscards - 入方向丢包统计 21 | - 1.3.6.1.2.1.2.2.1.14 # ifInErrors - 入方向错包统计 22 | - 1.3.6.1.2.1.2.2.1.19 # ifOutDiscards - 出方向丢包统计 23 | - 1.3.6.1.2.1.2.2.1.20 # ifOutErrors - 出方向错包统计 24 | - 1.3.6.1.2.1.31.1.1.1.6 # ifHCInOctets - 入方向报文统计 25 | - 1.3.6.1.2.1.31.1.1.1.10 # ifHCOutOctets - 出方向报文统计 26 | # AP基础信息 27 | - 1.3.6.1.4.1.25506.2.75.2.1.1.1.1 # hh3cDot11APID - AP接口索引ID 28 | - 1.3.6.1.4.1.25506.2.75.2.1.1.1.4 # hh3cDot11APOperationStatus - AP与AC的关联状态 29 | - 1.3.6.1.4.1.25506.2.75.2.1.1.1.5 # hh3cDot11APTemplateNameOfAP - 设定的AP名称 30 | - 1.3.6.1.4.1.25506.2.75.2.1.6.1.1 # hh3cDot11APIfIndex hh3cDot11APObjID - AP接口索引 31 | - 1.3.6.1.4.1.25506.2.75.2.1.2.1.2 # hh3cDot11CurrAPIPAddress - AP的IP 32 | - 1.3.6.1.4.1.25506.2.75.2.1.2.1.3 # hh3cDot11CurrAPMacAddress - AP的Mac 33 | - 1.3.6.1.4.1.25506.2.75.2.1.2.1.9 # hh3cDot11CurrAPModelName - AP类型名称 34 | - 1.3.6.1.4.1.25506.2.75.1.1.2.1 # hh3cDot11APConnectCount - 当前AC连接的AP总数量 35 | - 1.3.6.1.4.1.25506.2.75.1.1.2.2 # hh3cDot11StationConnectCount - 当前所有AP在线终端总数 36 | - 1.3.6.1.4.1.25506.2.75.2.1.2.1.7 # hh3cDot11CurrAPStationAssocCount - AP当前连接STA数量 37 | - 1.3.6.1.4.1.25506.2.75.2.1.10.1.1 # hh3cDot11APSysUpTime2 - AP启动时间 38 | - 
1.3.6.1.4.1.25506.2.75.2.1.10.1.2 # hh3cDot11APCPURTUsage2 - AP实时CPU利用率(周期1分钟) 39 | - 1.3.6.1.4.1.25506.2.75.2.1.10.1.4 # hh3cDot11APMemRTUsage2 - AP实时内存利用率(周期1分钟) 40 | - 1.3.6.1.4.1.25506.2.6.1.1.1.1.6 # hh3cEntityExtCpuUsage - AC的CPU实时利用率 41 | - 1.3.6.1.4.1.25506.2.6.1.1.1.1.8 # hh3cEntityExtMemUsage - AC的内存实时利用率 42 | 43 | max_repetitions: 25 44 | retries: 3 45 | timeout: 5s 46 | 47 | lookups: 48 | - source_indexes: [ifIndex] 49 | lookup: ifAlias 50 | - source_indexes: [ifIndex] 51 | lookup: ifDescr 52 | - source_indexes: [ifIndex] 53 | lookup: ifOperStatus 54 | - source_indexes: [hh3cDot11APObjID] 55 | lookup: 1.3.6.1.4.1.25506.2.75.2.1.1.1.5 56 | drop_source_indexes: true 57 | - source_indexes: [hh3cDot11APObjID] 58 | lookup: 1.3.6.1.4.1.25506.2.75.2.1.1.1.4 59 | drop_source_indexes: true 60 | - source_indexes: [hh3cDot11APObjID] 61 | lookup: 1.3.6.1.4.1.25506.2.75.2.1.2.1.9 62 | drop_source_indexes: true 63 | - source_indexes: [hh3cDot11APObjID] 64 | lookup: 1.3.6.1.4.1.25506.2.75.2.1.2.1.2 65 | drop_source_indexes: true 66 | - source_indexes: [hh3cDot11APObjID] 67 | lookup: 1.3.6.1.4.1.25506.2.75.2.1.2.1.3 68 | drop_source_indexes: true 69 | 70 | overrides: 71 | ifAlias: 72 | ignore: true 73 | ifDescr: 74 | ignore: true 75 | ifOperStatus: 76 | ignore: true 77 | hh3cDot11APTemplateNameOfAP: 78 | ignore: true 79 | type: DisplayString 80 | hh3cDot11APOperationStatus: 81 | ignore: true 82 | hh3cDot11CurrAPMacAddress: 83 | ignore: true 84 | hh3cDot11CurrAPModelName: 85 | ignore: true 86 | type: DisplayString 87 | hh3cDot11CurrAPIPAddress: 88 | ignore: true -------------------------------------------------------------------------------- /generator/h3c/wireless/h3c-ac.yml: -------------------------------------------------------------------------------- 1 | # Prometheus通过文件发现机制定义的采集目标 2 | # /root/monitor/prometheus/targets/h3c-ac.yml 3 | - labels: 4 | module: h3c_ac # generator.yml中定义的指标模块名称,如果有多个可以写多个模块名 5 | auth: public_v2 # generator.yml中定义的认证模块名 6 | brand: H3C # 
可删除可自定义 7 | hostname: XX-XXXX-CORE # 可删除可自定义 8 | model: H3C WX3510X # 可删除可自定义 9 | targets: 10 | - 172.17.14.1 # 核心 # 需要采集的无线控制器管理IP -------------------------------------------------------------------------------- /generator/h3c/wireless/prometheus.yml: -------------------------------------------------------------------------------- 1 | # 全局配置文件 2 | global: 3 | # ...... 4 | # 告警插件配置 5 | alerting: 6 | # ...... 7 | # 按照设定参数进行扫描加载,用于自定义报警规则,其报警媒介和route路由由alertmanager插件实现 8 | rule_files: 9 | # ...... 10 | 11 | # 设定采集对象,这里既有静态设置也有设置服务发现 12 | scrape_configs: 13 | # ...... 14 | 15 | - job_name: "h3c_wireless" 16 | scrape_interval: 15s 17 | scrape_timeout: 10s 18 | file_sd_configs: 19 | - files: 20 | - /root/monitor/prometheus/targets/h3c-ac.yml 21 | refresh_interval: 2m 22 | metrics_path: /snmp 23 | relabel_configs: 24 | - source_labels: ["__address__"] 25 | target_label: __param_target 26 | - source_labels: ["__param_target"] 27 | target_label: instance 28 | - target_label: __address__ 29 | replacement: 172.17.40.54:9116 # snmp_exporter 服务IP地址 30 | - source_labels: ["module"] # 从自定义的目标标签获取指标模块名称 31 | target_label: __param_module 32 | - source_labels: ["auth"] # 从自定义的目标标签获取认证模块名称 33 | target_label: __param_auth -------------------------------------------------------------------------------- /generator/huawei/switch/README.md: -------------------------------------------------------------------------------- 1 | # 华为交换机通过SNMP协议配置OID信息,采集对应OID指标信息生成器的配置文件 2 | 3 | ## 模块说明 4 | 5 | generator-demo.yml中的文件就是配置生成器配置文件,如果需要自定义采集指标,请自行查阅官方MIB库信息,根据对应的OID拿到自己需要的指标信息。 6 | 7 | 华为官方MIB信息查询:https://info.support.huawei.com/info-finder/tool/zh/enterprise/mib 8 | 9 | - huawei_common 这对华为通用交换机的常规指标采集 10 | - huawei_core 针对华为核心交换机的指标采集 基于CloudEngine S12700E-4 11 | 12 | ## MIB库文件 13 | 14 | - MIB_V200R022C00SPC500.zip 压缩包基于CloudEngine S12700E-4版本的MIB库文件,文件解压缩有一个文件说明该MIB库支持的交换机固件版本列表 -------------------------------------------------------------------------------- 
/generator/huawei/switch/network-switch.yml: -------------------------------------------------------------------------------- 1 | # Prometheus通过文件发现机制定义的采集目标 2 | - labels: 3 | module: huawei_common,huawei_core # generator.yml中定义的指标模块名称,如果有多个可以写多个模块名 4 | auth: public_v2 # generator.yml中定义的认证模块名 5 | brand: Huawei # 可删除可自定义 6 | hostname: XX-XXXX-CORE # 可删除可自定义 7 | model: S12700E-4 # 可删除可自定义 8 | targets: 9 | - 172.17.14.1 # 核心 # 需要采集的交换机管理IP 10 | - labels: 11 | module: HUAWEI 12 | auth: public_v2 13 | brand: Huawei 14 | hostname: XXXX-XXX-XX-AG 15 | model: S5720-36C-EI-AC 16 | targets: 17 | - 172.17.14.2 # 汇聚 -------------------------------------------------------------------------------- /generator/huawei/switch/prometheus.yml: -------------------------------------------------------------------------------- 1 | # 全局配置文件 2 | global: 3 | # ...... 4 | # 告警插件配置 5 | alerting: 6 | # ...... 7 | # 按照设定参数进行扫描加载,用于自定义报警规则,其报警媒介和route路由由alertmanager插件实现 8 | rule_files: 9 | # ...... 10 | 11 | # 设定采集对象,这里既有静态设置也有设置服务发现 12 | scrape_configs: 13 | # ...... 
14 | 15 | # 采集华为交换机信息 16 | - job_name: "snmp" 17 | scrape_interval: 30s # 针对SNMP采集节点 覆盖全局配置15s 18 | scrape_timeout: 20s 19 | file_sd_configs: 20 | - files: 21 | - /root/monitor/prometheus/targets/network-*.yml 22 | refresh_interval: 2m 23 | metrics_path: /snmp 24 | relabel_configs: 25 | - source_labels: ["__address__"] 26 | target_label: __param_target 27 | - source_labels: ["__param_target"] 28 | target_label: instance 29 | # prometheus采集目标直接修改为snmp_exporter 服务IP地址 30 | - target_label: __address__ 31 | replacement: 172.17.40.54:9116 # snmp_exporter 服务IP地址 32 | - source_labels: ["module"] # 从自定义的目标标签获取指标模块名称 33 | target_label: __param_module 34 | - source_labels: ["auth"] # 从自定义的目标标签获取认证模块名称 35 | target_label: __param_auth -------------------------------------------------------------------------------- /generator/huawei/wireless/README.md: -------------------------------------------------------------------------------- 1 | 本目录中generator.yml是适配了华为无线AC6000系列的无线控制器。 2 | 已完成测试:AC6003、AC6005、AC6508、AC6605,其他型号未做测试,理论上讲AC6000系列通用。 3 | 4 | 华为无线产品mib库下载链接: 5 | 根据对应的版本下载对应的MIB,如:AC6605 V200R019C00SPC500 6 | 7 | 下载路径:技术支持 > 无线局域网 > AC > AC6000 > 软件 > 选择版本过滤 8 | 9 | 链接:https://support.huawei.com/enterprise/zh/software/250730566-ESW2000205621 10 | 11 | 12 | 华为无线产品mib OID信息参考链接: 13 | 根据对应的版本做参考,如:WLAN AC V200R019C10 MIB参考 14 | 链接:https://support.huawei.com/enterprise/zh/doc/EDOC1100156646 15 | 16 | mibs文件夹中,我已经提前下载好对应两个比较推荐的版本的mib库文件,需要自行解压得到mib后缀的文件。 17 | 18 | 推荐版本1(V200R019C00SPC500):MIB_WLAN_V200R019C00SPC500.zip 19 | 推荐版本2(V200R022C00SPC100):MIB_WLAN_V200R022C00SPC100.zip 20 | 21 | 至于下载那个版本的,需要查看你AC中目前的对应什么版本号。 22 | 由于我测试的AC6005、AC6508、AC6605都升级到V200R019C00SPC500版本, 23 | 故都是使用MIB_WLAN_V200R019C00SPC500.zip中的库文件。 -------------------------------------------------------------------------------- /generator/huawei/wireless/generator-demo.yml: -------------------------------------------------------------------------------- 1 | auths: 2 | public_v2: # 认证模块名称 3 | version: 2 # snmp 
v2c版本 4 | community: public # snmp 团体名 5 | 6 | modules: 7 | huawei_ac: 8 | walk: 9 | - 1.3.6.1.4.1.2011.5.25.31.1.1.1.1.5 # AC的CPU利用率 10 | - 1.3.6.1.4.1.2011.5.25.31.1.1.1.1.7 # AC的内存利用率 11 | - 1.3.6.1.4.1.2011.5.25.31.1.1.1.1.10 # AC的启动时间 12 | - 1.3.6.1.2.1.1.5 # AC的设备名称 13 | - 1.3.6.1.4.1.2011.5.25.31.1.1.1.1.11 # AC的温度 14 | - 1.3.6.1.2.1.2.2.1.1 # ifIndex 接口索引 15 | - 1.3.6.1.2.1.2.2.1.2 # IfDescr 描述接口的字符串 16 | #- 1.3.6.1.2.1.2.2.1.3 # ifType 接口类型 17 | - 1.3.6.1.2.1.2.2.1.5 # ifSpeed 估计的接口当前带宽 单位是bit/s 18 | - 1.3.6.1.2.1.31.1.1.1.15 # ifHighSpeed 接口当前带宽的估计值 单位为1,000,000 bit/s 19 | - 1.3.6.1.2.1.31.1.1.1.18 # ifAlias 网络管理员指定的接口别名 20 | - 1.3.6.1.2.1.2.2.1.8 # ifOperStatus 接口当前的状态 21 | - 1.3.6.1.2.1.2.2.1.13 # ifInDiscards 入方向的被丢弃的报文个数 22 | - 1.3.6.1.2.1.2.2.1.14 # ifInErrors 出错而不会被送往上层协议的报文/传输单元个数 23 | - 1.3.6.1.2.1.2.2.1.19 # ifOutDiscards 出方向的被丢弃的报文个数 24 | - 1.3.6.1.2.1.2.2.1.20 # ifOutErrors 出错而不会被传送的报文/传输单元个数 25 | #- 1.3.6.1.2.1.31.1.1.1.1 # ifName 由本地设备分配的接口名 26 | - 1.3.6.1.2.1.31.1.1.1.6 # ifHCInOctets 接口上接收到的字节总数 27 | - 1.3.6.1.2.1.31.1.1.1.10 # ifHCOutOctets 接口发送的字节总数 28 | - hwWlanIDIndexedApId # AP的ID 29 | - 1.3.6.1.4.1.2011.6.139.13.3.10.1.5 # AP的名称 30 | - 1.3.6.1.4.1.2011.6.139.13.3.10.1.7 # AP的状态 31 | - 1.3.6.1.4.1.2011.6.139.13.3.10.1.2 # AP的Mac 32 | - 1.3.6.1.4.1.2011.6.139.13.3.10.1.4 # AP的型号 33 | - 1.3.6.1.4.1.2011.6.139.13.3.10.1.14 # AP的IP 34 | - 1.3.6.1.4.1.2011.6.139.13.3.10.1.19 # AP的运行时间 35 | - 1.3.6.1.4.1.2011.6.139.13.3.10.1.22 # AP的上线时长 36 | - 1.3.6.1.4.1.2011.6.139.13.3.10.1.41 # AP的内存使用率 37 | - 1.3.6.1.4.1.2011.6.139.13.3.10.1.42 # AP的CPU使用率 38 | - 1.3.6.1.4.1.2011.6.139.13.3.10.1.44 # AP的温度 39 | - 1.3.6.1.4.1.2011.6.139.13.3.10.1.45 # AP的在线用户数 40 | - 1.3.6.1.4.1.2011.6.139.13.3.10.1.48 # AP的用户上线失败率 41 | - 1.3.6.1.4.1.2011.6.139.13.3.10.1.49 # AP的用户掉线率 42 | - 1.3.6.1.4.1.2011.6.139.13.3.10.1.50 # AP的粘性用户比率 43 | - 1.3.6.1.4.1.2011.6.139.13.3.10.1.57 # AP的以太接口上行速率 44 | - 1.3.6.1.4.1.2011.6.139.13.3.10.1.58 # AP的以太接口下行速率 45 | - 
1.3.6.1.4.1.2011.6.139.13.3.10.1.65 # AP上行端口接收总的字节数 46 | - 1.3.6.1.4.1.2011.6.139.13.3.10.1.66 # AP上行端口发送总的字节数 47 | - 1.3.6.1.4.1.2011.6.139.13.3.10.1.65 # AP上行端口接收总的字节数 48 | - 1.3.6.1.4.1.2011.6.139.13.3.10.1.66 # AP上行端口发送总的字节数 49 | 50 | max_repetitions: 50 51 | retries: 3 52 | timeout: 5s 53 | 54 | lookups: 55 | - source_indexes: [ifIndex] 56 | lookup: ifAlias 57 | - source_indexes: [ifIndex] 58 | lookup: ifDescr 59 | - source_indexes: [ifIndex] 60 | lookup: ifOperStatus 61 | - source_indexes: [hwWlanIDIndexedApId] 62 | lookup: 1.3.6.1.4.1.2011.6.139.13.3.10.1.5 63 | - source_indexes: [hwWlanIDIndexedApId] 64 | lookup: 1.3.6.1.4.1.2011.6.139.13.3.10.1.7 65 | - source_indexes: [hwWlanIDIndexedApId] 66 | lookup: 1.3.6.1.4.1.2011.6.139.13.3.10.1.2 67 | - source_indexes: [hwWlanIDIndexedApId] 68 | lookup: 1.3.6.1.4.1.2011.6.139.13.3.10.1.4 69 | - source_indexes: [hwWlanIDIndexedApId] 70 | lookup: 1.3.6.1.4.1.2011.6.139.13.3.10.1.14 71 | 72 | overrides: 73 | ifAlias: 74 | ignore: true 75 | ifDescr: 76 | ignore: true 77 | ifOperStatus: 78 | ignore: true 79 | hwWlanIDIndexedApName: 80 | ignore: true 81 | type: DisplayString 82 | hwWlanIDIndexedApRunState: 83 | ignore: true 84 | hwWlanIDIndexedApMac: 85 | ignore: true 86 | hwWlanIDIndexedApTypeInfo: 87 | ignore: true 88 | type: DisplayString 89 | hwWlanIDIndexedApIpAddress: 90 | ignore: true -------------------------------------------------------------------------------- /generator/huawei/wireless/huawei-ac.yml: -------------------------------------------------------------------------------- 1 | # Prometheus通过文件发现机制定义的采集目标 2 | # /root/monitor/prometheus/targets/h3c-ac.yml 3 | - labels: 4 | module: huawei_ac # generator.yml中定义的指标模块名称,如果有多个可以写多个模块名 5 | auth: public_v2 # generator.yml中定义的认证模块名 6 | brand: H3C # 可删除可自定义 7 | hostname: XX-XXXX-CORE # 可删除可自定义 8 | model: AC6005 # 可删除可自定义 9 | targets: 10 | - 172.17.14.1 # 需要采集的无线控制器管理IP -------------------------------------------------------------------------------- 
/generator/huawei/wireless/prometheus.yml: -------------------------------------------------------------------------------- 1 | # 全局配置文件 2 | global: 3 | # ...... 4 | # 告警插件配置 5 | alerting: 6 | # ...... 7 | # 按照设定参数进行扫描加载,用于自定义报警规则,其报警媒介和route路由由alertmanager插件实现 8 | rule_files: 9 | # ...... 10 | 11 | # 设定采集对象,这里既有静态设置也有设置服务发现 12 | scrape_configs: 13 | # ...... 14 | 15 | # 采集华为AC信息 16 | - job_name: "huawei_wireless" 17 | scrape_interval: 15s 18 | scrape_timeout: 10s 19 | file_sd_configs: 20 | - files: 21 | - /root/monitor/prometheus/targets/huawei-*.yml 22 | refresh_interval: 2m 23 | metrics_path: /snmp 24 | relabel_configs: 25 | - source_labels: ["__address__"] 26 | target_label: __param_target 27 | - source_labels: ["__param_target"] 28 | target_label: instance 29 | - target_label: __address__ 30 | replacement: 172.17.40.54:9116 # snmp_exporter 服务IP地址 31 | - source_labels: ["module"] # 从自定义的目标标签获取指标模块名称 32 | target_label: __param_module 33 | - source_labels: ["auth"] # 从自定义的目标标签获取认证模块名称 34 | target_label: __param_auth -------------------------------------------------------------------------------- /generator/ruijie/wireless/README.md: -------------------------------------------------------------------------------- 1 | 本目录中generator.yml是适配了锐捷无线RG-WS6XXX系列的无线控制器。 2 | 已完成测试:RG-WS6008、RG-WS6108、RG-WS6512其他型号未做测试,理论上讲RGOS 11.X系列通用。 3 | 4 | 锐捷无线产品mib库下载链接: 5 | 根据对应的版本下载对应的MIB 6 | 7 | 下载路径:智能客服 > 转人工 > 提交需要获取的MIB文件的设备型号与版本号 > 等待人工回复 8 | 9 | 说明: 10 | 11 | 锐捷无线AC AP MIB OID节点获取? 
12 | MIB库文件需要申请并签署保密协议,详细咨询4008111000。 -------------------------------------------------------------------------------- /generator/ruijie/wireless/generator-demo.yml: -------------------------------------------------------------------------------- 1 | auths: # 认证模块 2 | public_v2: # 认证模块名称 可自定义 在prometheus.yml中需要配置参数auth对应这个名称 3 | version: 2 # 定义SNMP Agent的版本为v2c 支持v3 4 | community: public # SNMP Agent的团体名设置和AC中设置的团体名需一致 5 | 6 | modules: # 指标模块 7 | ruijie_ac: # 指标模块名称 可自定义 8 | walk: 9 | - 1.3.6.1.4.1.4881.1.1.10.2.56.1.1.8 # AC的设备名称 - ruijieAcAcName 10 | #- 1.3.6.1.4.1.4881.1.1.10.2.56.1.1.21 # AC的温度 - ruijieAcTemp 11 | - ruijieCPUUtilization5Sec # AC的CPU利用率 - ruijieCPUUtilization5Sec 12 | - ruijieCPUUtilization1Min # AC的CPU利用率 - ruijieCPUUtilization1Min 13 | - ruijieCPUUtilization5Min # AC的CPU利用率 - ruijieCPUUtilization5Min 14 | - 1.3.6.1.4.1.4881.1.1.10.2.35.1.1.1.3 # AC的内存利用率 - ruijieMemoryPoolCurrentUtilization 15 | - 1.3.6.1.4.1.4881.1.1.10.2.1.1.27 # AC的运行时长 - ruijieSystemUptime 16 | - 1.3.6.1.2.1.2.2.1.1 # ifIndex - 接口索引 17 | - 1.3.6.1.2.1.2.2.1.2 # IfDescr - 描述接口的字符串 18 | - 1.3.6.1.2.1.2.2.1.5 # ifSpeed - 估计的接口当前带宽,单位是bit/s 19 | - 1.3.6.1.2.1.31.1.1.1.15 # ifHighSpeed - 接口当前带宽的估计值 单位为1,000,000 bit/s 20 | - 1.3.6.1.2.1.31.1.1.1.18 # ifAlias - 网络管理员指定的接口别名 21 | - 1.3.6.1.2.1.2.2.1.8 # ifOperStatus - 接口当前的状态 22 | - 1.3.6.1.2.1.2.2.1.13 # ifInDiscards - 入方向的被丢弃的报文个数 23 | - 1.3.6.1.2.1.2.2.1.14 # ifInErrors - 出错而不会被送往上层协议的报文/传输单元个数 24 | - 1.3.6.1.2.1.2.2.1.19 # ifOutDiscards - 出方向的被丢弃的报文个数 25 | - 1.3.6.1.2.1.2.2.1.20 # ifOutErrors - 出错而不会被传送的报文/传输单元个数 26 | - 1.3.6.1.2.1.31.1.1.1.6 # ifHCInOctets - 接口上接收到的字节总数 包括成帧的字符 该节点有64bit 是ifInOctets的扩充 27 | - 1.3.6.1.2.1.31.1.1.1.10 # ifHCOutOctets - 接口上发送到的字节总数 包括成帧字符 该节点有64bit 是ifOutOctets的扩充 28 | - 1.3.6.1.4.1.4881.1.1.10.2.56.2.1.1.1.2 # AP的名称 - ruijieApApName 29 | - 1.3.6.1.4.1.4881.1.1.10.2.56.2.1.1.1.48 # AP的状态 - ruijieApState 30 | #- 1.3.6.1.4.1.4881.1.1.10.2.56.2.1.1.1.1 # AP的Mac - ruijieApMacAddr 31 | - 
1.3.6.1.4.1.4881.1.1.10.2.56.2.1.1.1.39 # AP的型号 - ruijieApPID 32 | - 1.3.6.1.4.1.4881.1.1.10.2.56.2.1.1.1.33 # AP的IP - ruijieApIp 33 | - 1.3.6.1.4.1.4881.1.1.10.2.56.2.1.1.1.52 # AP的上线时长 - ruijieApUptimeMs 34 | - 1.3.6.1.4.1.4881.1.1.10.2.56.2.1.1.1.51 # AP的启动时间 - ruijieApLinkOnTimeIntervalMs 35 | - 1.3.6.1.4.1.4881.1.1.10.2.1.1.49.1.6 # AP的内存使用率 - ruijieSystemApMemoryPoolCurrentUtilization 36 | - 1.3.6.1.4.1.4881.1.1.10.2.1.1.49.1.4 # AP的CPU使用率(5s) - ruijieSystemApCPUUtilizationCurrent 37 | - 1.3.6.1.4.1.4881.1.1.10.2.1.1.49.1.5 # AP的CPU使用率(5m) - ruijieSystemApCPUUtilizationAverage 38 | #- 1.3.6.1.4.1.4881.1.1.10.2.1.1.49.1.9 # AP的温度 - ruijieSystemAPDeviceTemperature 39 | - 1.3.6.1.4.1.4881.1.1.10.2.56.2.1.1.1.34 # AP的在线用户数 - ruijieApStaNum 40 | - 1.3.6.1.4.1.4881.1.1.10.2.56.1.1.15 # 当前连接到本AC的无线用户数 - ruijieAcStaNum 41 | - 1.3.6.1.4.1.4881.1.1.10.2.56.1.1.11 # 当前连接到本AC的AP数 - ruijieAcApNum 42 | 43 | max_repetitions: 25 44 | retries: 3 45 | timeout: 5s 46 | 47 | lookups: 48 | - source_indexes: [ifIndex] 49 | lookup: ifAlias 50 | - source_indexes: [ifIndex] 51 | lookup: ifDescr 52 | - source_indexes: [ifIndex] 53 | lookup: ifOperStatus 54 | - source_indexes: [ruijieApMacAddr] 55 | lookup: 1.3.6.1.4.1.4881.1.1.10.2.56.2.1.1.1.2 56 | - source_indexes: [ruijieApMacAddr] 57 | lookup: 1.3.6.1.4.1.4881.1.1.10.2.56.2.1.1.1.48 58 | - source_indexes: [ruijieApMacAddr] 59 | lookup: 1.3.6.1.4.1.4881.1.1.10.2.56.2.1.1.1.39 60 | - source_indexes: [ruijieApMacAddr] 61 | lookup: 1.3.6.1.4.1.4881.1.1.10.2.56.2.1.1.1.33 62 | - source_indexes: [ruijieSystemApStatMacAddr] 63 | lookup: 1.3.6.1.4.1.4881.1.1.10.2.56.2.1.1.1.2 64 | - source_indexes: [ruijieSystemApStatMacAddr] 65 | lookup: 1.3.6.1.4.1.4881.1.1.10.2.56.2.1.1.1.48 66 | - source_indexes: [ruijieSystemApStatMacAddr] 67 | lookup: 1.3.6.1.4.1.4881.1.1.10.2.56.2.1.1.1.39 68 | - source_indexes: [ruijieSystemApStatMacAddr] 69 | lookup: 1.3.6.1.4.1.4881.1.1.10.2.56.2.1.1.1.33 70 | 71 | overrides: 72 | ifAlias: 73 | ignore: true 74 | 
ifDescr: 75 | ignore: true 76 | ifOperStatus: 77 | ignore: true 78 | ruijieApApName: 79 | ignore: true 80 | type: DisplayString 81 | ruijieApState: 82 | ignore: true 83 | ruijieApPID: 84 | ignore: true 85 | type: DisplayString 86 | ruijieApIp: 87 | ignore: true -------------------------------------------------------------------------------- /generator/ruijie/wireless/prometheus.yml: -------------------------------------------------------------------------------- 1 | # 全局配置文件 2 | global: 3 | # ...... 4 | # 告警插件配置 5 | alerting: 6 | # ...... 7 | # 按照设定参数进行扫描加载,用于自定义报警规则,其报警媒介和route路由由alertmanager插件实现 8 | rule_files: 9 | # ...... 10 | 11 | # 设定采集对象,这里既有静态设置也有设置服务发现 12 | scrape_configs: 13 | # ...... 14 | 15 | # 采集华为AC信息 16 | - job_name: "ruijie_wireless" 17 | scrape_interval: 15s 18 | scrape_timeout: 10s 19 | file_sd_configs: 20 | - files: 21 | - /root/monitor/prometheus/targets/ruijie-*.yml 22 | refresh_interval: 2m 23 | metrics_path: /snmp 24 | relabel_configs: 25 | - source_labels: ["__address__"] 26 | target_label: __param_target 27 | - source_labels: ["__param_target"] 28 | target_label: instance 29 | - target_label: __address__ 30 | replacement: 172.17.40.54:9116 # snmp_exporter 服务IP地址 31 | - source_labels: ["module"] # 从自定义的目标标签获取指标模块名称 32 | target_label: __param_module 33 | - source_labels: ["auth"] # 从自定义的目标标签获取认证模块名称 34 | target_label: __param_auth -------------------------------------------------------------------------------- /generator/ruijie/wireless/ruijie-ac.yml: -------------------------------------------------------------------------------- 1 | # Prometheus通过文件发现机制定义的采集目标 2 | # /root/monitor/prometheus/targets/h3c-ac.yml 3 | - labels: 4 | module: ruijie_ac # generator.yml中定义的指标模块名称,如果有多个可以写多个模块名 5 | auth: public_v2 # generator.yml中定义的认证模块名 6 | brand: Ruijie # 可删除可自定义 7 | hostname: XX-XXXX-XXX # 可删除可自定义 8 | model: RG-WS6008 # 可删除可自定义 9 | targets: 10 | - 172.17.14.1 # 需要采集的无线控制器管理IP 
-------------------------------------------------------------------------------- /generator/sangfor/ac/README.md: -------------------------------------------------------------------------------- 1 | 深信服AC采集配置 2 | 3 | 配置无法生成,深信服AC还是通过API获取数据并监控比较合适 -------------------------------------------------------------------------------- /generator/sangfor/ac/generator.yml: -------------------------------------------------------------------------------- 1 | auths: # 认证模块 2 | public_v2: # 认证模块名称 可自定义 在prometheus.yml中需要配置参数auth对应这个名称 3 | version: 2 # 定义SNMP Agent的版本为v2c 支持v3 4 | community: public # SNMP Agent的团体名设置和AC中设置的团体名需一致 5 | 6 | modules: # 指标模块 7 | # 深信服AD设备信息抓取 8 | sangfor_ac: # 指标模块名称 可自定义 9 | walk: 10 | - 1.3.6.1.4.1.35047.1.1 # 深信服AC设备名称和model 11 | - 1.3.6.1.4.1.35047.1.2 # 深信服AC 系统时间 12 | - 1.3.6.1.4.1.35047.1.3 # 深信服AC CPU使用率 13 | - 1.3.6.1.4.1.35047.1.10 # 深信服AC 剩余内存 14 | - 1.3.6.1.4.1.35047.1.11 # 深信服AC 总内存 15 | - 1.3.6.1.4.1.35047.1.5.1.4 # 深信服AC 磁盘已使用空间 16 | - 1.3.6.1.4.1.35047.1.5.1.5 # 深信服AC 磁盘可使用空间 17 | - 1.3.6.1.4.1.35047.1.5.1.6 # 深信服AC 磁盘占用率 18 | - 1.3.6.1.4.1.35047.1.7 # 深信服AC 双机状态 19 | - 1.3.6.1.4.1.35047.2.1.1.1 # 深信服AC numOfCurOnLine 实时在线用户数 20 | - 1.3.6.1.4.1.35047.2.1.1.2 # 深信服AC numOfMaxOnLine 最大在线用户数 21 | - 1.3.6.1.4.1.35047.2.1.1.3 # 深信服AC numOfCurOU 当前用户组数量 22 | - 1.3.6.1.4.1.35047.2.1.1.4 # 深信服AC numOfMaxOU 最大用户组数量 23 | - 1.3.6.1.4.1.35047.2.1.1.5 # 深信服AC numOfMaxSession 最大会话数 24 | - 1.3.6.1.4.1.35047.2.1.1.6 # 深信服AC numOfSession 实时会话数 25 | - version # 深信服AC version 版本 26 | - 1.3.6.1.4.1.35047.2.1.1.8 # 深信服AC 网关模式 27 | - 1.3.6.1.4.1.35047.2.1.2.1.2 # 深信服AC 接口名称 28 | - 1.3.6.1.4.1.35047.2.1.2.1.3 # 深信服AC 接口区域 29 | - 1.3.6.1.4.1.35047.2.1.2.1.4 # 深信服AC 接口状态 30 | - 1.3.6.1.4.1.35047.2.1.2.1.5 # 深信服AC 每秒发送数据包数量 31 | - 1.3.6.1.4.1.35047.2.1.2.1.6 # 深信服AC 每秒接收数据包数量 32 | - 1.3.6.1.4.1.35047.2.1.2.1.7 # 深信服AC 每秒发送字节 33 | - 1.3.6.1.4.1.35047.2.1.2.1.8 # 深信服AC 每秒接收字节 34 | - 1.3.6.1.4.1.35047.2.1.4.1.3 # 深信服AC 网管序列号状态 35 | - 1.3.6.1.4.1.35047.2.1.4.1.4 # 
深信服AC 网关杀毒序列号状态 36 | - 1.3.6.1.4.1.35047.2.1.4.1.5 # 深信服AC 多功能序列号状态 37 | - 1.3.6.1.4.1.35047.2.1.4.1.6 # 深信服AC 跨运营商序列号状态 38 | - 1.3.6.1.4.1.35047.2.1.4.1.7 # 深信服AC 软件升级序列号状态 39 | - 1.3.6.1.4.1.35047.2.1.4.1.8 # 深信服AC 安全桌面序列号状态 40 | - 1.3.6.1.4.1.35047.2.1.4.1.9 # 深信服AC URL/应用规则库升级序列号状态 41 | - 1.3.6.1.4.1.35047.2.1.5.1.2 # 深信服AC 日志信息拦截数/记录数 42 | - 1.3.6.1.4.1.35047.2.1.5.1.3 # 深信服AC 日志信息网页访问 43 | - 1.3.6.1.4.1.35047.2.1.5.1.4 # 深信服AC 日志信息邮件收发 44 | - 1.3.6.1.4.1.35047.2.1.5.1.5 # 深信服AC 日志信息外发文件 45 | - 1.3.6.1.4.1.35047.2.1.5.1.6 # 深信服AC 日志信息论坛发帖 46 | - 1.3.6.1.4.1.35047.2.1.5.1.7 # 深信服AC 日志信息聊天内容 47 | 48 | max_repetitions: 50 49 | retries: 3 50 | timeout: 5s 51 | 52 | lookups: 53 | - source_indexes: [1.3.6.1.4.1.35047.2.1.2.1.1] 54 | lookup: 1.3.6.1.4.1.35047.2.1.2.1.2 55 | - source_indexes: [1.3.6.1.4.1.35047.2.1.2.1.1] 56 | lookup: 1.3.6.1.4.1.35047.2.1.2.1.3 57 | 58 | overrides: 59 | 1.3.6.1.4.1.35047.2.1.2.1.2: 60 | type: DisplayString 61 | ignore: true 62 | 1.3.6.1.4.1.35047.2.1.2.1.3: 63 | type: DisplayString 64 | ignore: true -------------------------------------------------------------------------------- /generator/sangfor/ac/sangfor-ac.txt: -------------------------------------------------------------------------------- 1 | .1.3.6.1.4.1.35047.1.3 Cpu使用率 2 | .1.3.6.1.4.1.35047.1.4 Free memory. 
剩余内存 3 | .1.3.6.1.4.1.35047.1.9 sfSysTotalMemory 4 | .1.3.6.1.4.1.35047.1.5.1.4 sfDiskUsed 已使用空间 5 | .1.3.6.1.4.1.35047.1.5.1.5 sfDiskAvail 可使用空间 6 | .1.3.6.1.4.1.35047.1.5.1.6 sfDiskUsedPercent 磁盘占用率 7 | 8 | .1.3.6.1.4.1.35047.2.1.1.1 实时在线用户数 9 | .1.3.6.1.4.1.35047.2.1.1.2 最大在线用户数 10 | .1.3.6.1.4.1.35047.2.1.1.3 当前用户组数量 11 | .1.3.6.1.4.1.35047.2.1.1.4 最大用户组数量 12 | .1.3.6.1.4.1.35047.2.1.1.5 最大会话数 13 | .1.3.6.1.4.1.35047.2.1.1.6 实时会话数 14 | 15 | .1.3.6.1.4.1.35047.2.1.2.1.2 interface name 接口名称 16 | .1.3.6.1.4.1.35047.2.1.2.1.3 link area 接口区域 17 | .1.3.6.1.4.1.35047.2.1.2.1.4 link status 接口状态 18 | .1.3.6.1.4.1.35047.2.1.2.1.5 Number of send packets per second 每秒发送数据包数量 19 | .1.3.6.1.4.1.35047.2.1.2.1.6 Number of receive packets per second 每秒接收数据包数量 20 | .1.3.6.1.4.1.35047.2.1.2.1.7 Number of send Bytes per second 每秒发送字节 21 | .1.3.6.1.4.1.35047.2.1.2.1.8 Number of receive Bytes per second 每秒接收字节 22 | 23 | 24 | .1.3.6.1.4.1.35047.2.1.5.1.2 block/record 拦截数/记录数 25 | .1.3.6.1.4.1.35047.2.1.5.1.3 http get 网页访问 26 | .1.3.6.1.4.1.35047.2.1.5.1.4 send mail or receive mail 邮件收发 27 | .1.3.6.1.4.1.35047.2.1.5.1.5 send file or receive file 外发文件 28 | .1.3.6.1.4.1.35047.2.1.5.1.6 web BBS 论坛发帖 29 | .1.3.6.1.4.1.35047.2.1.5.1.7 IM chat 聊天内容 -------------------------------------------------------------------------------- /generator/sangfor/ad/README.md: -------------------------------------------------------------------------------- 1 | 本目录中generator.yml是适配了深信服AD设备。 2 | 已完成测试:AD7.0.8R4版本测试,其他版本未做测试,理论上讲7.0.x系列通用。 3 | 4 | 深信服mib库下载链接: 5 | 6 | 下载路径:AD > 系统管理 > SNMP > 下载MIB库 -------------------------------------------------------------------------------- /generator/sangfor/ad/generator.yml: -------------------------------------------------------------------------------- 1 | auths: # 认证模块 2 | public_v2: # 认证模块名称 可自定义 在prometheus.yml中需要配置参数auth对应这个名称 3 | version: 2 # 定义SNMP Agent的版本为v2c 支持v3 4 | community: public # SNMP Agent的团体名设置和AC中设置的团体名需一致 5 | 6 | modules: # 指标模块 7 | # 深信服AD设备信息抓取 8 
| sangfor_ad: # 指标模块名称 可自定义 9 | walk: 10 | - adSysName 11 | - adCpuCostRate # 深信服AD CPU使用率 12 | - adMemCostRate # 深信服AD 内存使用率 13 | - sfIntCpuTemp # 深信服 设备温度 14 | - sfDiskTemp # 深信服 磁盘 15 | - sfFanSpeed # 深信服 设备风扇 16 | - adDiskCostRate # 深信服AD 磁盘使用率 17 | - sfDeviceStatus # 深信服 磁盘状态 1为正常 18 | - sfFanState # 深信服 风扇状态 2 3为正常 19 | - sfPowerState # 深信服 电源状态 2为正常 20 | - adConns # AD系统并发连接数 21 | - adNewConns # AD系统新建连接数 22 | - adVsHealthStatus # 虚拟服务的健康状态 23 | - adVsHealthNodeCnt # 虚拟服务的健康节点个数 24 | - adUplinkThroughput # 所有链路上行流量(整型) 25 | - adDownlinkThroughput # 所有链路下行流量 (整型) 26 | - adUptime # AD设备运行时间 27 | - adDevicePattern # AD运行模式 单机是3 28 | - adStandByState # AD双机主备状态 29 | - adLinkName # AD链路名称 30 | - adLinkType # AD链路类型 31 | - adLinkIfName # AD链路引用的网口 32 | - adLinkStatus # 链路状态,0为离线,1为正常 33 | - adLinkBitIn # 链路上行流量 34 | - adLinkBitOut # 链路下行流量 35 | - adLinkNumber # 设备链路个数 36 | 37 | max_repetitions: 25 38 | retries: 3 39 | timeout: 5s 40 | 41 | lookups: 42 | - source_indexes: [LinkIndex] 43 | lookup: adLinkType 44 | - source_indexes: [LinkIndex] 45 | lookup: adLinkIfName 46 | - source_indexes: [LinkIndex] 47 | lookup: adLinkName 48 | 49 | overrides: 50 | adSysName: 51 | type: DisplayString 52 | adLinkName: 53 | type: DisplayString 54 | ignore: true 55 | adLinkIfName: 56 | type: DisplayString 57 | ignore: true 58 | sfCpuTemp: 59 | type: DisplayString 60 | adLinkType: 61 | type: DisplayString 62 | ignore: true 63 | adVsHealthStatus: 64 | type: DisplayString 65 | sfFanState: 66 | type: DisplayString 67 | sfPowerState: 68 | type: DisplayString -------------------------------------------------------------------------------- /generator/sangfor/ad/prometheus.yml: -------------------------------------------------------------------------------- 1 | # 全局配置文件 2 | global: 3 | # ...... 4 | # 告警插件配置 5 | alerting: 6 | # ...... 7 | # 按照设定参数进行扫描加载,用于自定义报警规则,其报警媒介和route路由由alertmanager插件实现 8 | rule_files: 9 | # ...... 10 | 11 | # 设定采集对象,这里既有静态设置也有设置服务发现 12 | scrape_configs: 13 | # ...... 
# 采集深信服AD信息
34 | 35 | - 利用我生成好的配置文件 36 | - 修改 `SNMP` 认证模块参数 37 | - 抓取配置配置好 38 | 39 | 采集配置文件:[采集配置](snmp/snmp_synology_nas.yml) 40 | 41 | 修改 `snmp_synology_nas.yml` 文件中的头部认证模块: 42 | 43 | ```yaml 44 | auths: 45 | synology_v3: 46 | community: public 47 | security_level: authPriv 48 | username: monitor 49 | password: Mrot@2024neo 50 | auth_protocol: SHA 51 | priv_protocol: AES 52 | priv_password: Mrot@2024mei 53 | version: 3 54 | ``` 55 | 56 | 在群晖NAS中你配置的 `SNMP v3` 版本的参数在配置文件中修改下,修改完成后,可以直接保存即可,把你的配置文件放到 SNMP Exporter 采集配置中,即可实现采集。 57 | 58 | 配置抓取任务: 59 | 60 | ```yaml 61 | scrape_configs: 62 | - job_name: "synology" 63 | scrape_interval: 15s 64 | scrape_timeout: 10s 65 | file_sd_configs: 66 | - files: 67 | - /etc/victoriametrics/vmagent/synology-nas.yml 68 | # refresh_interval: 2m vmagent 不支持这个参数 prometheus 中可使用这个参数 69 | relabel_configs: 70 | - source_labels: ["__address__"] 71 | target_label: __param_target 72 | - source_labels: ["__param_target"] 73 | target_label: instance 74 | - target_label: __address__ 75 | replacement: 172.17.40.13:9116 76 | - source_labels: ["module"] 77 | target_label: __param_module 78 | - source_labels: ["auth"] 79 | target_label: __param_auth 80 | 81 | 82 | 83 | # /etc/victoriametrics/vmagent/synology-nas.yml 84 | # Prometheus 通过文件发现机制定义的采集目标 85 | - labels: 86 | module: synology_common,synology_interface 87 | auth: synology_v3 88 | brand: Synology 89 | targets: 90 | - 172.17.40.140 91 | ``` 92 | 93 | ## 更多信息 94 | 95 | 如果需要了解关于监控的更多信息,还请关注公众号:网络小斐,下面是公众号二维码。 96 | 97 | ![公众号](img/qrcode.jpg) -------------------------------------------------------------------------------- /generator/synology/img/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/robotneo/networkdevice-monitor/97baface3f3c458e687fb43f4b537b2236185453/generator/synology/img/1.jpg -------------------------------------------------------------------------------- /generator/synology/img/image-1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/robotneo/networkdevice-monitor/97baface3f3c458e687fb43f4b537b2236185453/generator/synology/img/image-1.png -------------------------------------------------------------------------------- /generator/synology/img/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/robotneo/networkdevice-monitor/97baface3f3c458e687fb43f4b537b2236185453/generator/synology/img/image.png -------------------------------------------------------------------------------- /generator/synology/img/qrcode.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/robotneo/networkdevice-monitor/97baface3f3c458e687fb43f4b537b2236185453/generator/synology/img/qrcode.jpg -------------------------------------------------------------------------------- /generator/test/generator_haikang_monitor.yml: -------------------------------------------------------------------------------- 1 | auths: 2 | haikang_auth: 3 | version: 3 4 | username: dnt 5 | # noAuthNoPriv authNoPriv authPriv 6 | security_level: authPriv 7 | password: Dnt@jiankong241 8 | # MD5, SHA, SHA224, SHA256, SHA384, or SHA512 9 | auth_protocol: SHA 10 | # DES, AES, AES192, AES256, AES192C, or AES256C 11 | priv_protocol: AES 12 | priv_password: Dnt@jiankong241 13 | # context_name: context 14 | 15 | modules: # 指标模块 16 | haikang_metrics: 17 | walk: 18 | - 1.3.6.1.4.1.39165.1.1 # 设备类型 19 | - 1.3.6.1.4.1.39165.1.2 # 硬件版本 20 | - 1.3.6.1.4.1.39165.1.3 # 软件版本 21 | - 1.3.6.1.4.1.39165.1.4 # MAC地址 22 | - 1.3.6.1.4.1.39165.1.5 # 厂商代码 23 | - 1.3.6.1.4.1.39165.1.6 # 厂商名称 24 | - 1.3.6.1.4.1.39165.1.7 # CPU利用率 25 | # - 1.3.6.1.4.1.39165.1.8 # 硬盘大小 26 | - 1.3.6.1.4.1.39165.1.9 # 硬盘使用率 27 | # - 1.3.6.1.4.1.39165.1.10 # 内存大小 28 | - 1.3.6.1.4.1.39165.1.11 # 内存使用率 29 | - 1.3.6.1.4.1.39165.1.12 # 设备重启 30 | - 
1.3.6.1.4.1.39165.1.13 # 动态IP地址 31 | - 1.3.6.1.4.1.39165.1.14 # 动态掩码 32 | - 1.3.6.1.4.1.39165.1.15 # 动态网关 33 | - 1.3.6.1.4.1.39165.1.16 # 静态IP地址 34 | - 1.3.6.1.4.1.39165.1.17 # 静态掩码 35 | - 1.3.6.1.4.1.39165.1.18 # 静态网关 36 | - 1.3.6.1.4.1.39165.1.19 # 系统时间 37 | - 1.3.6.1.4.1.39165.1.20 # 视频输入通道数 38 | - 1.3.6.1.4.1.39165.1.21 # 视频编码格式 39 | - 1.3.6.1.4.1.39165.1.22 # 视频网传格式 40 | - 1.3.6.1.4.1.39165.1.23 # 有无音频能力 41 | - 1.3.6.1.4.1.39165.1.24 # 音频输入数目 42 | - 1.3.6.1.4.1.39165.1.25 # 音频输出数目 43 | - 1.3.6.1.4.1.39165.1.26 # 透明通道数目 44 | - 1.3.6.1.4.1.39165.1.27 # 是否支持本地存储 45 | - 1.3.6.1.4.1.39165.1.28 # 是否支持RTST回看 46 | - 1.3.6.1.4.1.39165.1.29 # 支持的网络接入类型 47 | - 1.3.6.1.4.1.39165.1.30 # 告警输入通道数目 48 | - 1.3.6.1.4.1.39165.1.31 # 告警输出通道数目 49 | 50 | max_repetitions: 25 51 | # 查询失败时的最大重复次数,查询的总时间为 timeout * retries 52 | retries: 3 53 | # 每个单独的 SNMP 查询返回数据的超时时间(秒) 54 | timeout: 5s 55 | allow_nonincreasing_oids: false 56 | use_unconnected_udp_socket: false 57 | 58 | lookups: # 不是表量 59 | 60 | overrides: 61 | 1.3.6.1.4.1.39165.1.7: 62 | ignore: false 63 | regex_extracts: 64 | '': # 指标名称保持不变 65 | - regex: '([0-9]+) (.*)' 66 | value: '$1' # float64 67 | 1.3.6.1.4.1.39165.1.9: 68 | ignore: false 69 | regex_extracts: 70 | '': # 指标名称保持不变 71 | - regex: '([0-9]+) (.*)' 72 | value: '$1' # float64 73 | 1.3.6.1.4.1.39165.1.11: 74 | ignore: false 75 | regex_extracts: 76 | '': # 指标名称保持不变 77 | - regex: '([0-9]+) (.*)' 78 | value: '$1' # float64 79 | 1.3.6.1.4.1.39165.1.13: 80 | type: InetAddressIPv4 81 | 1.3.6.1.4.1.39165.1.14: 82 | type: InetAddressIPv4 83 | 1.3.6.1.4.1.39165.1.15: 84 | type: InetAddressIPv4 85 | 1.3.6.1.4.1.39165.1.16: 86 | type: InetAddressIPv4 87 | 1.3.6.1.4.1.39165.1.17: 88 | type: InetAddressIPv4 89 | 1.3.6.1.4.1.39165.1.18: 90 | type: InetAddressIPv4 -------------------------------------------------------------------------------- /prometheus/rules/prod/blackbox.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: 
网络协议服务状态-告警 3 | rules: 4 | - alert: 站点可用性 5 | expr: probe_success{job="blackbox_exporter"} == 0 6 | for: 1m 7 | labels: 8 | alertype: domain 9 | severity: Critical 10 | annotations: 11 | description: "**{{ $labels.env }}_{{ $labels.name }}({{ $labels.project }}):站点无法访问** \n > {{ $labels.instance }}" 12 | 13 | - alert: 站点1h可用性低于80% 14 | expr: sum_over_time(probe_success{job="blackbox_exporter"}[1h])/count_over_time(probe_success{job="blackbox_exporter"}[1h]) * 100 < 80 15 | for: 3m 16 | labels: 17 | alertype: domain 18 | severity: warning 19 | annotations: 20 | description: "**{{ $labels.env }}_{{ $labels.name }}({{ $labels.project }}):站点1h可用性:{{ $value | humanize }}%** \n > {{ $labels.instance }}" 21 | 22 | - alert: 站点状态异常 23 | expr: (probe_success{job="blackbox_exporter"} == 0 and probe_http_status_code > 499) or probe_http_status_code == 0 24 | for: 1m 25 | labels: 26 | alertype: domain 27 | severity: warning 28 | annotations: 29 | description: "**{{ $labels.env }}_{{ $labels.name }}({{ $labels.project }}):站点状态异常:{{ $value }}** \n > {{ $labels.instance }}" 30 | 31 | - alert: 站点耗时过高 32 | expr: probe_duration_seconds > 0.5 33 | for: 2m 34 | labels: 35 | alertype: domain 36 | severity: warning 37 | annotations: 38 | description: "**{{ $labels.env }}_{{ $labels.name }}({{ $labels.project }}):当前站点耗时:{{ $value | humanize }}s** \n > {{ $labels.instance }}" 39 | 40 | - alert: SSL证书有效期 41 | expr: (probe_ssl_earliest_cert_expiry-time()) / 3600 / 24 < 15 42 | for: 2m 43 | labels: 44 | alertype: domain 45 | severity: warning 46 | annotations: 47 | description: "**{{ $labels.env }}_{{ $labels.name }}({{ $labels.project }}):证书有效期剩余{{ $value | humanize }}天** \n > {{ $labels.instance }}" 48 | 49 | - alert: 采集状态 50 | expr: up{job=~"blackbox_exporter|blackbox"} == 0 51 | for: 3m 52 | labels: 53 | alertype: itself 54 | severity: Critical 55 | annotations: 56 | description: "**{{ $labels.job }}:异常** \n > {{ $labels.module }}-{{ $labels.name }}-{{ $labels.instance }}" 
-------------------------------------------------------------------------------- /prometheus/rules/prod/idrac-status.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: 物理机iDrac状态-告警 3 | rules: 4 | - alert: 节点存活--杭州IT机房 5 | expr: globalSystemStatus{job="iDrac_SNMP"} != 3 6 | for: 1m 7 | labels: 8 | alertype: dell 9 | severity: Critical 10 | annotations: 11 | description: "**物理机【{{ $labels.instance }}】状态异常** \n > 状态值 = {{ $value }}" 12 | 13 | - alert: 内存状态--杭州IT机房 14 | expr: memoryDeviceStatus{job="iDrac_SNMP"} != 3 15 | for: 1m 16 | labels: 17 | alertype: dell 18 | severity: Critical 19 | annotations: 20 | description: "**物理机内存第【{{$labels.memoryDeviceIndex}}】根故障** \n > 状态值 = {{ $value }}" 21 | 22 | - alert: CPU状态--杭州IT机房 23 | expr: processorDeviceStatus{job="iDrac_SNMP"} != 3 24 | for: 1m 25 | labels: 26 | alertype: dell 27 | severity: Critical 28 | annotations: 29 | description: "**物理机CPU第【{{$labels.processorDeviceIndex}}】块故障** \n > 状态值 = {{ $value }}" 30 | 31 | - alert: 虚拟磁盘状态--杭州IT机房 32 | expr: virtualDiskState{job="iDrac_SNMP"} != 2 33 | for: 1m 34 | labels: 35 | alertype: dell 36 | severity: warning 37 | annotations: 38 | description: "**物理机虚拟磁盘第【{{$labels.virtualDiskNumber}}】块故障** \n > 状态值 = {{ $value }}" 39 | 40 | - alert: 电源状态--杭州IT机房 41 | expr: systemPowerState{job="iDrac_SNMP"} != 4 42 | for: 1m 43 | labels: 44 | alertype: dell 45 | severity: Critical 46 | annotations: 47 | description: "**物理机【{{ $labels.instance }}】电源故障** \n > 状态值 = {{ $value }}" 48 | 49 | - alert: 网卡状态--杭州IT机房 50 | expr: networkDeviceStatus{job="iDrac_SNMP"} != 3 51 | for: 1m 52 | labels: 53 | alertype: dell 54 | severity: Critical 55 | annotations: 56 | description: "**物理机网卡第【{{$labels.networkDeviceIndex}}】块故障** \n > 状态值 = {{ $value }}" 57 | 58 | - alert: 存储状态--杭州IT机房 59 | expr: globalStorageStatus{job="iDrac_SNMP"} != 3 60 | for: 1m 61 | labels: 62 | alertype: dell 63 | severity: warning 64 | annotations: 65 | description: 
"**物理机【{{ $labels.instance }}】存储状态异常** \n > 状态值 = {{ $value }}" 66 | 67 | - alert: 采集状态 68 | expr: up{job=~"iDrac_SNMP"} == 0 69 | for: 3m 70 | labels: 71 | alertype: itself 72 | severity: Critical 73 | annotations: 74 | description: "**{{ $labels.job }}:异常** \n > {{ $labels.brand }}-{{ $labels.module }}-{{ $labels.instance }}" -------------------------------------------------------------------------------- /prometheus/rules/prod/node-exporter.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: node_usage_record_rules 3 | interval: 1m 4 | rules: 5 | - record: cpu:usage:rate1m 6 | expr: (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[1m])) by (instance,vendor,account,group,name)) * 100 7 | - record: mem:usage:rate1m 8 | expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 9 | 10 | - name: Linux服务器状态-告警 11 | rules: 12 | - alert: VM内存使用率 13 | expr: 100 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90 14 | for: 5m 15 | labels: 16 | alertype: system 17 | severity: warning 18 | annotations: 19 | description: "**{{ $labels.name }}:内存使用率{{ $value | humanize }}%** \n > {{ $labels.group }}-{{ $labels.instance }}" 20 | 21 | - alert: VM_CPU使用率 22 | expr: 100 - (avg by(instance,name,group,account) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90 23 | for: 5m 24 | labels: 25 | alertype: system 26 | severity: warning 27 | annotations: 28 | description: "**{{ $labels.name }}:CPU使用率{{ $value | humanize }}%** \n > {{ $labels.group }}-{{ $labels.instance }}" 29 | 30 | - alert: VM系统负载 31 | expr: node_load5 / on (instance,name,group,account) sum(count(node_cpu_seconds_total{mode='system'}) by (cpu,instance,name,group,account)) by(instance,name,group,account) > 1.7 32 | for: 10m 33 | labels: 34 | alertype: system 35 | severity: warning 36 | annotations: 37 | description: "**{{ $labels.name }}:系统负载{{ $value | humanize }}倍** \n > {{ $labels.group }}-{{ 
$labels.instance }}" 38 | 39 | - alert: VM磁盘使用率 40 | expr: | 41 | 100 - (node_filesystem_avail_bytes/node_filesystem_size_bytes{fstype=~"ext.?|xfs",mountpoint!~".*pods.*|/var/lib/docker/devicemapper/mnt/.*"} * 100) > 85 42 | for: 5m 43 | labels: 44 | alertype: system 45 | severity: warning 46 | annotations: 47 | description: "**{{ $labels.name }}_{{ $labels.mountpoint }}:磁盘使用率{{ $value | humanize }}%** \n > {{ $labels.group }}-{{ $labels.instance }}" 48 | 49 | - alert: VM主机重启 50 | expr: node_time_seconds - node_boot_time_seconds < 600 51 | for: 1m 52 | labels: 53 | alertype: system 54 | severity: warning 55 | annotations: 56 | description: "**{{ $labels.name }}:主机重启** \n > {{ $labels.group }}-{{ $labels.instance }}" 57 | 58 | - alert: VM文件系统只读 59 | expr: node_filesystem_readonly == 1 60 | for: 1m 61 | labels: 62 | alertype: system 63 | severity: warning 64 | annotations: 65 | description: "**{{ $labels.name }}-{{ $labels.mountpoint }}:文件系统只读** \n > {{ $labels.group }}-{{ $labels.instance }}" 66 | 67 | - alert: K8S节点POD磁盘使用率 68 | expr: 100 - (node_filesystem_avail_bytes/node_filesystem_size_bytes{mountpoint=~"/var/lib/docker/devicemapper/mnt/.*"} * 100) > 85 69 | for: 5m 70 | labels: 71 | alertype: system 72 | severity: warning 73 | annotations: 74 | description: "**{{ $labels.name }}_{{ $labels.mountpoint }}:磁盘使用率{{ $value | humanize }}%** \n > {{ $labels.group }}-{{ $labels.instance }}" 75 | 76 | - alert: NFS磁盘使用率 77 | expr: 100 - (node_filesystem_avail_bytes/node_filesystem_size_bytes{fstype="nfs"} * 100) > 90 78 | for: 5m 79 | labels: 80 | alertype: system 81 | severity: warning 82 | annotations: 83 | description: "**{{ $labels.name }}_{{ $labels.mountpoint }}:磁盘使用率{{ $value | humanize }}%** \n > {{ $labels.group }}-{{ $labels.instance }}" 84 | 85 | - alert: VM磁盘读写容量 86 | expr: (irate(node_disk_read_bytes_total[5m]) ) /1024 /1024 > 80 or (irate(node_disk_written_bytes_total[5m]) ) /1024 /1024 > 80 87 | for: 8m 88 | labels: 89 | alertype: disk 90 | severity: 
warning 91 | annotations: 92 | description: "**{{ $labels.name }}_{{ $labels.device }}:当前IO为{{ $value | humanize }}MB/s** \n > {{ $labels.group }}-{{ $labels.instance }}" 93 | 94 | - alert: VM网络流入(下载)数据过多 95 | expr: sum by(device,instance, name, group, account) (irate(node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo.*|cni.*'}[5m])) / 1024 / 1024 > 70 96 | for: 5m 97 | labels: 98 | alertype: network 99 | severity: warning 100 | annotations: 101 | description: "**{{ $labels.name }}:流入数据为{{ $value | humanize }}MB/s** \n > {{ $labels.group }}-{{ $labels.instance }}" 102 | 103 | - alert: VM网络流出(上传)数据过多 104 | expr: sum by(device,instance, name, group, account) (irate(node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo.*|cni.*'}[5m])) / 1024 / 1024 > 70 105 | for: 5m 106 | labels: 107 | alertype: network 108 | severity: warning 109 | annotations: 110 | description: "**{{ $labels.name }}:流出数据为{{ $value | humanize }}MB/s** \n > {{ $labels.group }}-{{ $labels.instance }}" 111 | 112 | - name: Exporter服务状态-告警 113 | rules: 114 | - alert: Exporter状态 115 | expr: up{job=~"windows_exporter|node_exporter"} == 0 116 | for: 3m 117 | labels: 118 | alertype: itself 119 | severity: Critical 120 | annotations: 121 | description: "**{{ $labels.job }}:异常** \n > {{ $labels.group }}-{{ $labels.name }}-{{ $labels.instance }}" -------------------------------------------------------------------------------- /prometheus/rules/prod/sangfor-ad-status.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: 出口链路服务状态-告警 3 | rules: 4 | - alert: CMCC-Office下行出口带宽 5 | expr: (adLinkBitOut{adLinkName="CMCC-Office"} / 1000000) > 480 6 | for: 15m 7 | labels: 8 | alertype: network 9 | severity: High 10 | annotations: 11 | description: "**{{ $labels.adLinkName }}:下行带宽(15分钟内持续)使用超限** \n > 当前带宽 = {{ $value }}Mbps \n {{ $labels.adLinkIfName }}-{{ $labels.instance }}" 12 | 13 | - alert: 
CTCC-Office下行出口带宽 14 | expr: (adLinkBitOut{adLinkName="CTCC-Office"} / 1000000) > 280 15 | for: 10m 16 | labels: 17 | alertype: network 18 | severity: High 19 | annotations: 20 | description: "**{{ $labels.adLinkName }}:下行带宽(10分钟内持续)使用超限** \n > 当前带宽 = {{ $value }}Mbps \n {{ $labels.adLinkIfName }}-{{ $labels.instance }}" 21 | 22 | - alert: CTCC-Server下行出口带宽 23 | expr: (adLinkBitOut{adLinkName="CTCC-Server"} / 1000000) > 95 24 | for: 10m 25 | labels: 26 | alertype: network 27 | severity: High 28 | annotations: 29 | description: "**{{ $labels.adLinkName }}:下行带宽(10分钟内持续)使用超限** \n > 当前带宽 = {{ $value }}Mbps \n {{ $labels.adLinkIfName }}-{{ $labels.instance }}" 30 | 31 | - alert: CMCC-Office上行出口带宽 32 | expr: (adLinkBitIn{adLinkName="CMCC-Office"} / 1000000) > 480 33 | for: 15m 34 | labels: 35 | alertype: network 36 | severity: warning 37 | annotations: 38 | description: "**{{ $labels.adLinkName }}:上行带宽(15分钟内持续)使用超限** \n > 当前带宽 = {{ $value }}Mbps \n {{ $labels.adLinkName }}-{{ $labels.instance }}" 39 | 40 | - alert: CTCC-Office上行出口带宽 41 | expr: (adLinkBitIn{adLinkName="CTCC-Office"} / 1000000) > 280 42 | for: 10m 43 | labels: 44 | alertype: network 45 | severity: warning 46 | annotations: 47 | description: "**{{ $labels.adLinkName }}:上行带宽(10分钟内持续)使用超限** \n > 当前带宽 = {{ $value }}Mbps \n {{ $labels.adLinkName }}-{{ $labels.instance }}" 48 | 49 | - alert: CTCC-Server上行出口带宽 50 | expr: (adLinkBitIn{adLinkName="CTCC-Server"} / 1000000) > 95 51 | for: 10m 52 | labels: 53 | alertype: network 54 | severity: warning 55 | annotations: 56 | description: "**{{ $labels.adLinkName }}:上行带宽(10分钟内持续)使用超限** \n > 当前带宽 = {{ $value }}Mbps \n {{ $labels.adLinkName }}-{{ $labels.instance }}" 57 | 58 | - alert: 出口链路状态-故障 59 | expr: adLinkStatus == 0 60 | for: 0m 61 | labels: 62 | alertype: network 63 | severity: Critical 64 | annotations: 65 | description: "**{{ $labels.adLinkName }}:出口链路离线** \n > 故障值 = {{ $value }} \n {{ $labels.adLinkName }}-{{ $labels.instance }}" 66 | 67 | - alert: 出口链路状态-繁忙 
68 | expr: adLinkStatus == 2 69 | for: 2m 70 | labels: 71 | alertype: network 72 | severity: High 73 | annotations: 74 | description: "**{{ $labels.adLinkName }}:出口链路(2分钟内持续)繁忙** \n > 故障值 = {{ $value }} \n {{ $labels.adLinkName }}-{{ $labels.instance }}" -------------------------------------------------------------------------------- /prometheus/rules/prod/switch-status.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: 杭州华为交换机信息-告警 3 | rules: 4 | - alert: 设备板卡温度告警 5 | expr: 0 < hwEntityTemperature >= hwEntityTemperatureThreshold 6 | for: 1m 7 | labels: 8 | alertype: switch 9 | severity: Critical 10 | annotations: 11 | description: "**设备板卡温度高门限** \n > 状态值 = {{ $value }}%" 12 | 13 | - alert: 交换机设备风扇状态 14 | expr: hwEntityFanSpeed == 0 15 | for: 1m 16 | labels: 17 | alertype: switch 18 | severity: Critical 19 | annotations: 20 | description: "**{{ $labels.hostname }} - 设备风扇转速为 {{ $value }}% ** \n > 风扇状态值:{{ $labels.hwEntityFanState }} - 风扇在位状态值:{{ $labels.hwEntityFanPresent }}" 21 | 22 | - alert: CPU使用率超限 23 | expr: 0 < hwEntityCpuUsage{job=~"huawei_sw"} >= 60 24 | for: 5m 25 | labels: 26 | alertype: switch 27 | severity: Critical 28 | annotations: 29 | description: "**{{ $labels.hostname }} - 交换机CPU使用率超限** \n > 当前使用率 = {{ $value }}%" 30 | 31 | - alert: 内存使用率超限 32 | expr: 0 < hwEntityMemUsage{job=~"huawei_sw"} >= 85 33 | for: 10m 34 | labels: 35 | alertype: switch 36 | severity: Critical 37 | annotations: 38 | description: "**{{ $labels.hostname }} - 交换机内存使用率超限** \n > 当前使用率 = {{ $value }}%" 39 | 40 | - alert: 核心交换机CSS集群状态 41 | expr: hwCssMemberConfigEnable{job=~"huawei_sw"} != 1 42 | for: 0m 43 | labels: 44 | alertype: switch 45 | severity: Critical 46 | annotations: 47 | description: "**{{$labels.hostname}} CSS集群状态异常** \n > 当前状态值 = {{ $value }}" -------------------------------------------------------------------------------- /prometheus/rules/prod/windows-status.yml: 
-------------------------------------------------------------------------------- 1 | groups: 2 | - name: Windows服务器状态-告警 3 | rules: 4 | - alert: Windows Server 收集器不成功 5 | expr: windows_exporter_collector_success == 0 6 | for: 0m 7 | labels: 8 | alertype: windows 9 | severity: High 10 | annotations: 11 | description: "**Collector {{ $labels.collector }} was not successful** \n > 状态值 = {{ $value }}%" 12 | 13 | - alert: Windows 远程桌面状态不正常 14 | expr: windows_service_status{exported_name="termservice", status="ok"} != 1 15 | for: 1m 16 | labels: 17 | alertype: windows 18 | severity: High 19 | annotations: 20 | description: "**远程桌面服务状态异常** \n > 状态值 = {{ $value }}" 21 | 22 | - alert: Windows服务器CPU使用率超过 90% 23 | expr: 100 - (avg by (instance) (rate(windows_cpu_time_total{mode="idle"}[2m])) * 100) > 90 24 | for: 0m 25 | labels: 26 | alertype: windows 27 | severity: warning 28 | annotations: 29 | description: "**CPU使用率超过 90%** \n > 使用率 = {{ $value }}%" 30 | 31 | - alert: Windows服务器内存使用率超过 90% 32 | expr: 100 - ((windows_os_physical_memory_free_bytes / windows_cs_physical_memory_bytes) * 100) > 90 33 | for: 2m 34 | labels: 35 | alertype: windows 36 | severity: warning 37 | annotations: 38 | description: "**内存使用率超过 90%** \n > 使用率 = {{ $value }}%" 39 | 40 | - alert: Windows服务器磁盘使用率超过 90% 41 | expr: 100.0 - 100 * ((windows_logical_disk_free_bytes / 1024 / 1024 ) / (windows_logical_disk_size_bytes / 1024 / 1024)) > 90 42 | for: 2m 43 | labels: 44 | alertype: windows 45 | severity: warning 46 | annotations: 47 | description: "**磁盘使用率超过90%** \n > 使用率 = {{ $value }}% \n volume = {{ $labels.volume }}" -------------------------------------------------------------------------------- /prometheus/rules/vm/alerts-vmalert.yml: -------------------------------------------------------------------------------- 1 | # File contains default list of alerts for vmalert service. 
2 | # The alerts below are just recommendations and may require some updates 3 | # and threshold calibration according to every specific setup. 4 | groups: 5 | # Alerts group for vmalert assumes that Grafana dashboard 6 | # https://grafana.com/grafana/dashboards/14950/ is installed. 7 | # Pls update the `dashboard` annotation according to your setup. 8 | - name: vmalert 9 | interval: 30s 10 | rules: 11 | - alert: ConfigurationReloadFailure 12 | expr: vmalert_config_last_reload_successful != 1 13 | labels: 14 | severity: warning 15 | annotations: 16 | summary: "Configuration reload failed for vmalert instance {{ $labels.instance }}" 17 | description: "Configuration hot-reload failed for vmalert on instance {{ $labels.instance }}. 18 | Check vmalert's logs for detailed error message." 19 | 20 | - alert: AlertingRulesError 21 | expr: sum(increase(vmalert_alerting_rules_errors_total[5m])) without(alertname, id) > 0 22 | for: 5m 23 | labels: 24 | severity: warning 25 | annotations: 26 | dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=13&var-instance={{ $labels.instance }}&var-group={{ $labels.group }}" 27 | summary: "Alerting rules are failing for vmalert instance {{ $labels.instance }}" 28 | description: "Alerting rules execution is failing for group \"{{ $labels.group }}\". 29 | Check vmalert's logs for detailed error message." 30 | 31 | - alert: RecordingRulesError 32 | expr: sum(increase(vmalert_recording_rules_errors_total[5m])) without(recording, id) > 0 33 | for: 5m 34 | labels: 35 | severity: warning 36 | annotations: 37 | dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=30&var-instance={{ $labels.instance }}&var-group={{ $labels.group }}" 38 | summary: "Recording rules are failing for vmalert instance {{ $labels.instance }}" 39 | description: "Recording rules execution is failing for group \"{{ $labels.group }}\". 40 | Check vmalert's logs for detailed error message." 
41 | 42 | - alert: RecordingRulesNoData 43 | expr: sum(vmalert_recording_rules_last_evaluation_samples) without(id) < 1 44 | for: 30m 45 | labels: 46 | severity: info 47 | annotations: 48 | dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=33&var-group={{ $labels.group }}" 49 | summary: "Recording rule {{ $labels.recording }} ({{ $labels.group }}) produces no data" 50 | description: "Recording rule \"{{ $labels.recording }}\" from group \"{{ $labels.group }}\" 51 | produces 0 samples over the last 30min. It might be caused by a misconfiguration 52 | or incorrect query expression." 53 | 54 | - alert: TooManyMissedIterations 55 | expr: increase(vmalert_iteration_missed_total[5m]) > 0 56 | for: 15m 57 | labels: 58 | severity: warning 59 | annotations: 60 | summary: "vmalert instance {{ $labels.instance }} is missing rules evaluations" 61 | description: "vmalert instance {{ $labels.instance }} is missing rules evaluations for group \"{{ $labels.group }}\". 62 | The group evaluation time takes longer than the configured evaluation interval. This may result in missed 63 | alerting notifications or recording rules samples. Try increasing evaluation interval or concurrency of 64 | group \"{{ $labels.group }}\". See https://docs.victoriametrics.com/vmalert/#groups. 65 | If rule expressions are taking longer than expected, please see https://docs.victoriametrics.com/troubleshooting/#slow-queries." 66 | 67 | - alert: RemoteWriteErrors 68 | expr: increase(vmalert_remotewrite_errors_total[5m]) > 0 69 | for: 15m 70 | labels: 71 | severity: warning 72 | annotations: 73 | summary: "vmalert instance {{ $labels.instance }} is failing to push metrics to remote write URL" 74 | description: "vmalert instance {{ $labels.instance }} is failing to push metrics generated via alerting 75 | or recording rules to the configured remote write URL. Check vmalert's logs for detailed error message." 
76 | 77 | - alert: AlertmanagerErrors 78 | expr: increase(vmalert_alerts_send_errors_total[5m]) > 0 79 | for: 15m 80 | labels: 81 | severity: warning 82 | annotations: 83 | summary: "vmalert instance {{ $labels.instance }} is failing to send notifications to Alertmanager" 84 | description: "vmalert instance {{ $labels.instance }} is failing to send alert notifications to \"{{ $labels.addr }}\". 85 | Check vmalert's logs for detailed error message." -------------------------------------------------------------------------------- /prometheus/rules/vm/alerts-vmauth.yml: -------------------------------------------------------------------------------- 1 | # File contains default list of alerts for vmauth service. 2 | # The alerts below are just recommendations and may require some updates 3 | # and threshold calibration according to every specific setup. 4 | groups: 5 | - name: vmauth 6 | interval: 30s 7 | rules: 8 | - alert: ConcurrentRequestsLimitReached 9 | expr: sum(increase(vmauth_concurrent_requests_limit_reached_total[1m])) by (instance) > 0 10 | for: 3m 11 | labels: 12 | severity: warning 13 | annotations: 14 | summary: "vmauth ({{ $labels.instance }}) reached concurrent requests limit" 15 | description: "Possible solutions: increase the limit with flag: -maxConcurrentRequests, 16 | deploy additional vmauth replicas, check requests latency at backend service. 17 | See more details at https://docs.victoriametrics.com/vmauth/#concurrency-limiting" 18 | - alert: UserConcurrentRequestsLimitReached 19 | expr: sum(increase(vmauth_user_concurrent_requests_limit_reached_total[1m])) by (username) > 0 20 | for: 3m 21 | labels: 22 | severity: warning 23 | annotations: 24 | summary: "vmauth has reached concurrent requests limit for username {{ $labels.username }}" 25 | description: "Possible solutions: increase limit with flag: -maxConcurrentPerUserRequests, 26 | deploy additional vmauth replicas, check requests latency at backend service." 
-------------------------------------------------------------------------------- /victoriametrics/README.md: -------------------------------------------------------------------------------- 1 | ## VictoriaMetrics 生态组件部署方案 2 | 3 | 这个目录主要记录 VictoriaMetrics 生态组件的部署方案和脚本,可以作为测试环境和生产环境中部署参考。 -------------------------------------------------------------------------------- /victoriametrics/binary/PrometheusAlert/README.md: -------------------------------------------------------------------------------- 1 | ## 二进制部署单节点 PrometheusAlert 脚本 2 | 3 | 安装完成后,关于启动参数和配置文件说明: 4 | 5 | PrometheusAlert 二进制文件放置在新创建目录:`/opt/PrometheusAlert` 目录中 6 | 7 | PrometheusAlert 的配置参数文件在:`/opt/PrometheusAlert/conf/app.conf` 文件中,如果需要开启飞书、钉钉、企业微信等 webhook 配置可直接修该文件。 8 | 9 | 二进制文件启动都使用 systemd 管理进程,可直接执行下面的命令查看prometheus进程状态: 10 | 11 | - 状态:sudo systemctl status prometheusalert.service 12 | - 停止:sudo systemctl stop prometheusalert.service 13 | - 启动:sudo systemctl start prometheusalert.service 14 | - 重启:sudo systemctl restart prometheusalert.service 15 | - 开机自启:sudo systemctl enable prometheusalert.service 16 | 17 | 更多关于 PrometheusAlert 的教程请查看官方文档:[PrometheusAlert文档](https://feiyu563.gitbook.io/) -------------------------------------------------------------------------------- /victoriametrics/binary/PrometheusAlert/install-prometheusalert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # 函数:安装依赖工具 5 | install_dependencies() { 6 | if [ "$OS" == "ubuntu" ] || [ "$OS" == "debian" ]; then 7 | apt-get update && apt-get install -y curl wget unzip net-tools 8 | elif [ "$OS" == "centos" ] || [ "$OS" == "rocky" ]; then 9 | dnf update -y && dnf install -y curl wget unzip net-tools 10 | else 11 | echo "Unsupported operating system." 12 | exit 1 13 | fi 14 | } 15 | 16 | # 函数:设置系统服务和用户 17 | setup_system() { 18 | # 创建 PrometheusAlert 安装目录 19 | mkdir -p /opt/PrometheusAlert 20 | 21 | # 检查 prometheusalert 组是否存在,不存在则创建 22 | if ! 
getent group prometheusalert > /dev/null 2>&1; then 23 | groupadd --system prometheusalert 24 | fi 25 | 26 | # 检查 prometheusalert 用户是否存在,不存在则创建 27 | if ! id -u prometheusalert > /dev/null 2>&1; then 28 | useradd --system --home-dir /opt/PrometheusAlert --no-create-home --gid prometheusalert prometheusalert 29 | fi 30 | 31 | chown -R prometheusalert:prometheusalert /opt/PrometheusAlert 32 | } 33 | 34 | # 确定操作系统类型 35 | OS="unknown" 36 | if [ -f /etc/os-release ]; then 37 | . /etc/os-release 38 | OS=$ID 39 | fi 40 | 41 | # 安装依赖工具 42 | install_dependencies 43 | 44 | # 设置系统服务和用户 45 | setup_system 46 | 47 | # 获取 PrometheusAlert 最新版本 48 | PA_VERSION=$(curl -s "https://api.github.com/repos/feiyu563/PrometheusAlert/releases/latest" | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/' | sed 's/^v//') 49 | 50 | # 下载 PrometheusAlert 二进制文件 51 | echo "Downloading PrometheusAlert v${PA_VERSION}..." 52 | # 下载失败时提示用户科学上网(set -e 模式下 wget 失败会立即退出,事后检查 $? 永远不会执行,故改用 if ! 捕获) 53 | if ! wget https://github.com/feiyu563/PrometheusAlert/releases/download/v${PA_VERSION}/linux.zip -O /tmp/prometheusalert.zip; then 54 | echo "Download failed or too slow. Consider using a VPN or proxy to download faster." 55 | exit 1 56 | fi 57 | 58 | 59 | 60 | # 解压到 /tmp 61 | unzip /tmp/prometheusalert.zip -d /tmp 62 | 63 | # 拷贝 linux 目录中的所有文件到 /opt/PrometheusAlert 64 | cp -r /tmp/linux/* /opt/PrometheusAlert/ 65 | 66 | # 添加执行权限 67 | chmod +x /opt/PrometheusAlert/PrometheusAlert 68 | 69 | # 删除临时文件 70 | rm -rf /tmp/prometheusalert.zip /tmp/linux 71 | 72 | # 检查并加载配置文件 73 | if [ -f /opt/PrometheusAlert/conf/app.conf ]; then 74 | echo "Configuration file found in /opt/PrometheusAlert/conf/app.conf" 75 | else 76 | echo "Configuration file not found in /opt/PrometheusAlert/conf/. Please check."
77 | exit 1 78 | fi 79 | 80 | # 确保配置文件权限正确 81 | chown -R prometheusalert:prometheusalert /opt/PrometheusAlert 82 | 83 | # 创建 systemd 单元文件 84 | cat > /etc/systemd/system/prometheusalert.service < /dev/null 2>&1; then 25 | groupadd --system alertmanager 26 | fi 27 | 28 | # 检查 alertmanager 用户是否存在,不存在则创建 29 | if ! id -u alertmanager > /dev/null 2>&1; then 30 | useradd --system --home-dir /var/lib/alertmanager --no-create-home --gid alertmanager alertmanager 31 | fi 32 | 33 | chown -R alertmanager:alertmanager /var/lib/alertmanager 34 | } 35 | 36 | # 确定操作系统类型 37 | OS="unknown" 38 | if [ -f /etc/os-release ]; then 39 | . /etc/os-release 40 | OS=$ID 41 | fi 42 | 43 | # 安装依赖工具 44 | install_dependencies 45 | 46 | # 设置系统服务和用户 47 | setup_system 48 | 49 | # 获取 Alertmanager 最新版本 50 | AM_VERSION=$(curl -s "https://api.github.com/repos/prometheus/alertmanager/releases/latest" | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/' | sed 's/^v//') 51 | 52 | # 下载并解压 Alertmanager 53 | wget "https://github.com/prometheus/alertmanager/releases/download/v${AM_VERSION}/alertmanager-${AM_VERSION}.linux-amd64.tar.gz" -O /tmp/alertmanager.tar.gz 54 | 55 | # 解压文件 56 | tar -xzvf /tmp/alertmanager.tar.gz -C /tmp 57 | 58 | # 复制解压的 alertmanager.yml 文件到 /etc/alertmanager 59 | cp /tmp/alertmanager-${AM_VERSION}.linux-amd64/alertmanager.yml /etc/alertmanager/ 60 | 61 | # 移动可执行文件到 /usr/bin 62 | mv /tmp/alertmanager-${AM_VERSION}.linux-amd64/alertmanager /usr/bin/ 63 | mv /tmp/alertmanager-${AM_VERSION}.linux-amd64/amtool /usr/bin/ 64 | 65 | # 清理临时文件 66 | rm -rf /tmp/alertmanager-${AM_VERSION}.linux-amd64 67 | rm /tmp/alertmanager.tar.gz 68 | 69 | # 确保配置文件权限正确 70 | chown -R alertmanager:alertmanager /etc/alertmanager 71 | 72 | # 创建 systemd 单元文件 73 | cat > /etc/systemd/system/alertmanager.service < /etc/alertmanager/alertmanager.conf < /etc/systemd/system/blackbox_exporter.service <&2; exit 1; } 22 | # 下载最新版本的Categraf并解压到指定目录 23 | wget -qO- "$latest_url" | tar xvz --strip-components=1 24 | echo "Categraf 
deployed successfully in $categraf_dir." 25 | # 复制 categraf.service 到 /etc/systemd/system/ 并启动服务及设置开机自启动 26 | if [ -f "${categraf_dir}/conf/categraf.service" ]; then 27 | mv "${categraf_dir}/conf/categraf.service" /etc/systemd/system/ 28 | systemctl daemon-reload 29 | systemctl start categraf 30 | systemctl enable categraf 31 | echo "Categraf service is started and enabled on boot." 32 | else 33 | echo "The categraf.service file does not exist. Please check the installation." 34 | fi 35 | else 36 | echo "Categraf is already deployed in $categraf_dir. Checking for updates..." 37 | # 获取当前部署的Categraf版本 38 | current_version=$("$categraf_dir/categraf" --version | awk '{print $3}') 39 | # 检查是否是最新版本 40 | if [ "$current_version" != "$latest_version" ]; then 41 | echo "Updating Categraf from version $current_version to $latest_version..." 42 | # 使用categraf --update命令更新到最新版本 43 | cd $categraf_dir 44 | ./categraf --update_url $latest_url --update 45 | echo "Categraf updated successfully to version $latest_version in $categraf_dir." 46 | else 47 | echo "Categraf is already up to date in $categraf_dir." 48 | fi 49 | fi -------------------------------------------------------------------------------- /victoriametrics/binary/categraf/install-categraf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 定义Categraf安装目录 4 | categraf_dir="/opt/categraf" 5 | # 获取最新版本号 6 | latest_version=$(curl -s https://api.github.com/repos/flashcatcloud/categraf/releases/latest | grep "tag_name" | cut -d '"' -f 4) 7 | # 下载最新版本的Categraf链接 8 | # latest_url="https://github.com/flashcatcloud/categraf/releases/download/$latest_version/categraf-$latest_version-linux-amd64.tar.gz" 9 | latest_url="https://download.flashcat.cloud/categraf-$latest_version-linux-amd64.tar.gz" 10 | # 定义下载文件的名称 11 | categraf_archive="categraf-$latest_version-linux-amd64.tar.gz" 12 | 13 | # 检查Categraf是否已经部署 14 | if [ ! 
-d "$categraf_dir" ]; then 15 | echo "Categraf is not deployed. Downloading and deploying latest version..." 16 | 17 | # 创建目标目录 18 | mkdir -p "$categraf_dir" || { echo "Error: Failed to create directory $categraf_dir." >&2; exit 1; } 19 | 20 | # 下载文件到安装目录 21 | echo "Downloading Categraf $latest_version..." 22 | wget --show-progress "$latest_url" -O "$categraf_dir/$categraf_archive" || { echo "Error: Failed to download Categraf." >&2; exit 1; } 23 | 24 | # 切换到安装目录 25 | cd "$categraf_dir" || { echo "Error: Failed to change to directory $categraf_dir." >&2; exit 1; } 26 | 27 | # 解压下载的文件 28 | echo "Extracting Categraf..." 29 | tar -xzf "$categraf_archive" --strip-components=1 || { echo "Error: Failed to extract Categraf." >&2; exit 1; } 30 | 31 | # 清理下载的压缩文件 32 | rm "$categraf_archive" 33 | 34 | # 使用新的 --install 命令安装服务 35 | echo "Installing Categraf as a service..." 36 | sudo ./categraf --install || { echo "Error: Failed to install Categraf service." >&2; exit 1; } 37 | 38 | # 启动并设置 Categraf 服务为开机启动 39 | sudo systemctl start categraf 40 | sudo systemctl enable categraf 41 | echo "Categraf service is started and enabled on boot." 42 | 43 | else 44 | echo "Categraf is already deployed. Checking for updates..." 45 | 46 | # 获取当前部署的Categraf版本 47 | current_version=$("$categraf_dir/categraf" --version | awk '{print $3}') 48 | 49 | # 检查是否是最新版本 50 | if [ "$current_version" != "$latest_version" ]; then 51 | echo "Updating Categraf from version $current_version to $latest_version..." 52 | 53 | # 下载新的版本 54 | wget --show-progress "$latest_url" -O "$categraf_dir/$categraf_archive" || { echo "Error: Failed to download new version." >&2; exit 1; } 55 | 56 | # 切换到安装目录 57 | cd "$categraf_dir" || { echo "Error: Failed to change to directory $categraf_dir." >&2; exit 1; } 58 | 59 | # 解压并更新 60 | tar -xzf "$categraf_archive" --strip-components=1 || { echo "Error: Failed to extract new version." 
>&2; exit 1; } 61 | 62 | # 清理下载的压缩文件 63 | rm "$categraf_archive" 64 | 65 | # 安装新版本 66 | sudo ./categraf --install || { echo "Error: Failed to update Categraf service." >&2; exit 1; } 67 | 68 | echo "Categraf updated successfully." 69 | else 70 | echo "Categraf is already up to date." 71 | fi 72 | fi -------------------------------------------------------------------------------- /victoriametrics/binary/categraf/update-config.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 更新后的路径 4 | BASE_DIR="/opt/categraf" 5 | 6 | # 配置文件路径 7 | CONFIG_FILE="$BASE_DIR/conf/config.toml" 8 | NVIDIA_SMI_CONFIG_FILE="$BASE_DIR/conf/input.nvidia_smi/nvidia_smi.toml" 9 | EXPORTER_CONFIG_FILE="$BASE_DIR/conf/input.dcgm/exporter.toml" 10 | EXEC_CONFIG_FILE="$BASE_DIR/conf/input.exec/exec.toml" 11 | 12 | # 检查配置文件是否存在 13 | if [ ! -f "$CONFIG_FILE" ]; then 14 | echo "配置文件 $CONFIG_FILE 不存在。" 15 | exit 1 16 | fi 17 | 18 | if [ ! -f "$NVIDIA_SMI_CONFIG_FILE" ]; then 19 | echo "配置文件 $NVIDIA_SMI_CONFIG_FILE 不存在。" 20 | exit 1 21 | fi 22 | 23 | if [ ! -f "$EXPORTER_CONFIG_FILE" ]; then 24 | echo "配置文件 $EXPORTER_CONFIG_FILE 不存在。" 25 | exit 1 26 | fi 27 | 28 | if [ ! -f "$EXEC_CONFIG_FILE" ]; then 29 | echo "配置文件 $EXEC_CONFIG_FILE 不存在。" 30 | exit 1 31 | fi 32 | 33 | # 检查是否安装 dcgmi 命令 34 | if ! command -v dcgmi &> /dev/null; then 35 | echo "系统中未安装 dcgmi 命令,开始安装..." 36 | 37 | # 删除旧的 apt-key 38 | sudo apt-key del 7fa2af80 39 | 40 | # 获取系统版本信息 41 | distribution=$(. 
/etc/os-release;echo $ID$VERSION_ID | sed -e 's/\.//g') 42 | 43 | # 下载并安装 CUDA keyring 44 | wget https://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_64/cuda-keyring_1.1-1_all.deb 45 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 46 | 47 | # 更新 apt 源 48 | sudo apt-get update 49 | 50 | # 安装 datacenter-gpu-manager 51 | sudo apt-get install -y datacenter-gpu-manager 52 | 53 | # 启用和重启 nvidia-dcgm 服务 54 | sudo systemctl --now enable nvidia-dcgm 55 | sudo systemctl --now restart nvidia-dcgm 56 | 57 | echo "dcgmi 命令安装完成。" 58 | fi 59 | 60 | # 更新 config.toml 文件中的参数 61 | sed -i 's|^file_name = "stdout"|file_name = "'"$BASE_DIR"'/logs/categraf.log"|' "$CONFIG_FILE" 62 | sed -i 's|^\(url = \)".*prometheus/v1/write"$|\1"http://10.6.212.9:17000/prometheus/v1/write"|' "$CONFIG_FILE" 63 | sed -i 's|^\(url = \)".*v1/n9e/heartbeat"$|\1"http://10.6.212.9:17000/v1/n9e/heartbeat"|' "$CONFIG_FILE" 64 | 65 | echo "配置文件已更新:$CONFIG_FILE" 66 | 67 | # 更新 nvidia_smi.toml 文件中的参数 68 | sed -i 's|^nvidia_smi_command = ""|nvidia_smi_command = "nvidia-smi"|' "$NVIDIA_SMI_CONFIG_FILE" 69 | sed -i 's|^# interval = 15|interval = 15|' "$NVIDIA_SMI_CONFIG_FILE" 70 | 71 | echo "配置文件已更新:$NVIDIA_SMI_CONFIG_FILE" 72 | 73 | # 更新 exporter.toml 文件中的参数 74 | sed -i 's|^#\[\[instances\]\]|\[\[instances\]\]|' "$EXPORTER_CONFIG_FILE" 75 | sed -i 's|^# collectors = "conf/input.dcgm/default-counters.csv"|collectors = "conf/input.dcgm/dcp-metrics-included.csv"|' "$EXPORTER_CONFIG_FILE" 76 | 77 | echo "配置文件已更新:$EXPORTER_CONFIG_FILE" 78 | 79 | # 更新 exec.toml 文件中的参数 80 | sed -i 's|^# interval = 15|interval = 15|' "$EXEC_CONFIG_FILE" 81 | sed -i 's|^\(commands = \[\)|\1\n "'"$BASE_DIR"'/scripts/*.py"|' "$EXEC_CONFIG_FILE" 82 | sed -i 's|^# data_format = "influx"|data_format = "prometheus"|' "$EXEC_CONFIG_FILE" 83 | 84 | echo "配置文件已更新:$EXEC_CONFIG_FILE" 85 | 86 | # # 新建日志目录和脚本目录 87 | # LOGS_DIR="$BASE_DIR/logs" 88 | # SCRIPTS_DIR="$BASE_DIR/scripts" 89 | 90 | # mkdir -pv "$LOGS_DIR" "$SCRIPTS_DIR" 91 | 92 
| # echo "目录已创建:$LOGS_DIR 和 $SCRIPTS_DIR" 93 | 94 | # 重启 Categraf 服务 95 | echo "正在重启 Categraf 服务..." 96 | sudo systemctl restart categraf 97 | 98 | # 检查服务状态 99 | echo "检查 Categraf 服务状态..." 100 | sudo systemctl status categraf --no-pager 101 | 102 | echo "Categraf 配置和重启完成。" 103 | -------------------------------------------------------------------------------- /victoriametrics/binary/grafana/README.md: -------------------------------------------------------------------------------- 1 | ## 本地部署安装 Grafana 教程 2 | 3 | ### Ubuntu 4 | 5 | APT软件库安装 6 | 7 | ```bash 8 | sudo apt-get install -y apt-transport-https software-properties-common wget 9 | 10 | sudo mkdir -p /etc/apt/keyrings/ 11 | wget -q -O - https://apt.grafana.com/gpg.key | gpg --dearmor | sudo tee /etc/apt/keyrings/grafana.gpg > /dev/null 12 | 13 | echo "deb [signed-by=/etc/apt/keyrings/grafana.gpg] https://apt.grafana.com stable main" | sudo tee -a /etc/apt/sources.list.d/grafana.list 14 | 15 | sudo apt-get update 16 | 17 | sudo apt list --all-versions grafana-enterprise 18 | 19 | sudo apt-get install grafana-enterprise 20 | ``` 21 | 22 | ### CentOS 23 | 24 | ```bash 25 | wget -q -O gpg.key https://rpm.grafana.com/gpg.key 26 | sudo rpm --import gpg.key 27 | 28 | sudo vim /etc/yum.repos.d/grafana.repo 29 | 30 | [grafana] 31 | name=grafana 32 | baseurl=https://rpm.grafana.com 33 | repo_gpgcheck=1 34 | enabled=1 35 | gpgcheck=1 36 | gpgkey=https://rpm.grafana.com/gpg.key 37 | sslverify=1 38 | sslcacert=/etc/pki/tls/certs/ca-bundle.crt 39 | 40 | # 开源版 41 | sudo dnf install grafana 42 | # 企业版 43 | sudo dnf install grafana-enterprise 44 | ``` 45 | 46 | 如果下载很慢可寻找国内镜像源进行替换下载 -------------------------------------------------------------------------------- /victoriametrics/binary/network_exporter/README.md: -------------------------------------------------------------------------------- 1 | ## 二进制部署单节点 network_exporter 脚本 2 | 3 | 安装完成后,关于启动参数和配置文件说明: 4 | 5 | network_exporter 二进制文件放置在新创建目录:`/opt/network_exporter` 目录中 6 | 
7 | network_exporter 的配置参数文件在:`/opt/network_exporter/network_exporter.yml` 文件中,如果需要配置探测目标,以及自定义 network_exporter 目标,可直接修该文件。 8 | 9 | 二进制文件启动都使用 systemd 管理进程,可直接执行下面的命令查看prometheus进程状态: 10 | 11 | - 状态:sudo systemctl status network_exporter.service 12 | - 停止:sudo systemctl stop network_exporter.service 13 | - 启动:sudo systemctl start network_exporter.service 14 | - 重启:sudo systemctl restart network_exporter.service 15 | - 开机自启:sudo systemctl enable network_exporter.service 16 | 17 | 更多关于 network_exporter 的教程请查看官方文档:[network_exporter文档](https://github.com/syepes/network_exporter) -------------------------------------------------------------------------------- /victoriametrics/binary/network_exporter/install-network.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # 函数:安装依赖工具 5 | install_dependencies() { 6 | if [ "$OS" == "ubuntu" ] || [ "$OS" == "debian" ]; then 7 | apt-get update && apt-get install -y curl wget tar net-tools 8 | elif [ "$OS" == "centos" ] || [ "$OS" == "rocky" ]; then 9 | dnf update -y && dnf install -y curl wget tar net-tools 10 | else 11 | echo "Unsupported operating system." 12 | exit 1 13 | fi 14 | } 15 | 16 | # 函数:设置系统服务和目录 17 | setup_system() { 18 | # 创建 /opt/network_exporter 目录 19 | mkdir -p /opt/network_exporter 20 | } 21 | 22 | # 函数:获取最新版本 23 | get_latest_version() { 24 | curl -s "https://api.github.com/repos/syepes/network_exporter/releases/latest" | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/' | sed 's/^v//' 25 | } 26 | 27 | # 确定操作系统类型 28 | OS="unknown" 29 | if [ -f /etc/os-release ]; then 30 | . /etc/os-release 31 | OS=$ID 32 | fi 33 | 34 | # 安装依赖工具 35 | install_dependencies 36 | 37 | # 设置系统服务和目录 38 | setup_system 39 | 40 | # 获取 network_exporter 最新版本 41 | NE_VERSION=$(get_latest_version) 42 | echo "Downloading network_exporter v${NE_VERSION}..." 
43 | 44 | # 下载 network_exporter 二进制文件 45 | wget https://github.com/syepes/network_exporter/releases/download/${NE_VERSION}/network_exporter_${NE_VERSION}.Linux_x86_64.tar.gz -O /tmp/network_exporter.tar.gz 46 | 47 | # 解压并仅保留需要的文件 48 | tar -xzvf /tmp/network_exporter.tar.gz -C /opt/network_exporter/ 49 | 50 | # 添加执行权限 51 | chmod +x /opt/network_exporter/network_exporter 52 | 53 | # 删除临时文件 54 | rm -rf /tmp/network_exporter.tar.gz /tmp/network_exporter_${NE_VERSION}.Linux_x86_64 55 | 56 | # 创建 systemd 单元文件 57 | cat > /etc/systemd/system/network_exporter.service < /dev/null 2>&1; then 20 | groupadd --system node_exporter 21 | fi 22 | 23 | # 检查node_exporter用户是否存在,不存在则创建 24 | if ! id -u node_exporter > /dev/null 2>&1; then 25 | useradd --system --no-create-home --shell /sbin/nologin --gid node_exporter node_exporter 26 | fi 27 | 28 | # 创建textfile_collector目录并设置权限 29 | mkdir -p /var/lib/node_exporter/textfile_collector 30 | chown -R node_exporter:node_exporter /var/lib/node_exporter 31 | } 32 | 33 | # 函数:确定操作系统类型 34 | setup_config_path() { 35 | if [ -f /etc/os-release ]; then 36 | . /etc/os-release 37 | if [[ "$ID" == "ubuntu" ]]; then 38 | CONFIG_PATH="/etc/default/node_exporter" 39 | elif [[ "$ID" == "centos" ]] || [[ "$ID" == "rocky" ]]; then 40 | CONFIG_PATH="/etc/sysconfig/node_exporter" 41 | else 42 | echo "Unsupported operating system." 43 | exit 1 44 | fi 45 | else 46 | echo "Cannot detect the operating system." 47 | exit 1 48 | fi 49 | } 50 | 51 | # 确定操作系统类型 52 | OS="unknown" 53 | if [ -f /etc/os-release ]; then 54 | . 
/etc/os-release 55 | OS=$ID 56 | fi 57 | 58 | # 安装依赖工具 59 | install_dependencies 60 | 61 | # 设置系统服务和用户 62 | setup_system 63 | 64 | # 获取 node_exporter 最新版本 65 | NE_VERSION=$(curl -s https://api.github.com/repos/prometheus/node_exporter/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/' | sed 's/^v//') 66 | 67 | # 下载并安装 node_exporter 68 | wget https://github.com/prometheus/node_exporter/releases/download/v${NE_VERSION}/node_exporter-${NE_VERSION}.linux-amd64.tar.gz -O /tmp/node_exporter.tar.gz 69 | tar -xzvf /tmp/node_exporter.tar.gz -C /tmp 70 | mv /tmp/node_exporter-${NE_VERSION}.linux-amd64/node_exporter /usr/sbin/ 71 | chmod +x /usr/sbin/node_exporter 72 | 73 | # 写入配置文件 74 | setup_config_path 75 | cat > "$CONFIG_PATH" < /etc/systemd/system/node_exporter.service < /etc/systemd/system/node_exporter.socket < /dev/null 2>&1; then 25 | groupadd --system prometheus 26 | fi 27 | 28 | # 检查Prometheus用户是否存在,不存在则创建 29 | if ! id -u prometheus > /dev/null 2>&1; then 30 | useradd --system --home-dir /var/lib/prometheus --no-create-home --gid prometheus prometheus 31 | fi 32 | 33 | chown -R prometheus:prometheus /var/lib/prometheus 34 | } 35 | 36 | # 确定操作系统类型 37 | OS="unknown" 38 | if [ -f /etc/os-release ]; then 39 | . 
/etc/os-release 40 | OS=$ID 41 | fi 42 | 43 | # 安装依赖工具 44 | install_dependencies 45 | 46 | # 设置系统服务和用户 47 | setup_system 48 | 49 | # 获取Prometheus最新版本 50 | PROM_VERSION=$(curl -s "https://api.github.com/repos/prometheus/prometheus/tags" | grep '"name":' | head -n 1 | awk -F '"' '{print $4}' | sed 's/^v//') 51 | 52 | # 下载并安装Prometheus 53 | wget https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/prometheus-${PROM_VERSION}.linux-amd64.tar.gz -O /tmp/prometheus.tar.gz 54 | 55 | # 解压Prometheus文件 56 | tar -xzvf /tmp/prometheus.tar.gz -C /tmp 57 | 58 | # 移动Prometheus可执行文件到/usr/bin目录 59 | mv /tmp/prometheus-${PROM_VERSION}.linux-amd64/prometheus /usr/bin/ 60 | mv /tmp/prometheus-${PROM_VERSION}.linux-amd64/promtool /usr/bin/ 61 | chmod +x /usr/bin/prometheus /usr/bin/promtool 62 | 63 | # 将解压后的 prometheus.yml 配置文件复制到 /etc/prometheus/single 目录 64 | cp /tmp/prometheus-${PROM_VERSION}.linux-amd64/prometheus.yml /etc/prometheus/single/ 65 | 66 | # 将 consoles 和 console_libraries 目录复制到 /var/lib/prometheus 67 | cp -r /tmp/prometheus-${PROM_VERSION}.linux-amd64/consoles /var/lib/prometheus/ 68 | cp -r /tmp/prometheus-${PROM_VERSION}.linux-amd64/console_libraries /var/lib/prometheus/ 69 | 70 | # 清理临时文件 71 | rm -rf /tmp/prometheus-${PROM_VERSION}.linux-amd64 72 | rm /tmp/prometheus.tar.gz 73 | 74 | # 设置systemd服务 75 | cat> /etc/systemd/system/prometheus.service < /etc/prometheus/single/prometheus.conf < /dev/null 2>&1; then 25 | groupadd --system victoriametrics 26 | fi 27 | 28 | # 检查victoriametrics用户是否存在,不存在则创建 29 | if ! id -u victoriametrics > /dev/null 2>&1; then 30 | useradd --system --home-dir /var/lib/victoria-metrics-data --no-create-home --gid victoriametrics victoriametrics 31 | fi 32 | 33 | chown -R victoriametrics:victoriametrics /var/lib/victoria-metrics-data 34 | } 35 | 36 | # 确定操作系统类型 37 | OS="unknown" 38 | if [ -f /etc/os-release ]; then 39 | . 
/etc/os-release 40 | OS=$ID 41 | fi 42 | 43 | # 安装依赖工具 44 | install_dependencies 45 | 46 | # 设置系统服务和用户 47 | setup_system 48 | 49 | # 获取VictoriaMetrics最新版本 50 | VM_VERSION=$(curl -s "https://api.github.com/repos/VictoriaMetrics/VictoriaMetrics/tags" | grep '"name":' | head -n 1 | awk -F '"' '{print $4}') 51 | 52 | # 下载并安装VictoriaMetrics 53 | wget https://github.com/VictoriaMetrics/VictoriaMetrics/releases/download/${VM_VERSION}/victoria-metrics-linux-amd64-${VM_VERSION}.tar.gz -O /tmp/victoria-metrics.tar.gz 54 | 55 | tar -xzvf /tmp/victoria-metrics.tar.gz -C /tmp 56 | mv /tmp/victoria-metrics-prod /usr/bin/ 57 | chmod +x /usr/bin/victoria-metrics-prod 58 | 59 | # 清理 /tmp 目录中的压缩文件和解压后的文件 60 | rm -rf /tmp/victoria-metrics.tar.gz /tmp/victoria-metrics-prod* 61 | 62 | cat> /etc/systemd/system/victoria-metrics.service < /etc/victoriametrics/single/vmsingle.conf < /dev/null 2>&1; then 25 | groupadd --system victoriametrics 26 | fi 27 | 28 | # 检查victoriametrics用户是否存在,不存在则创建 29 | if ! id -u victoriametrics > /dev/null 2>&1; then 30 | useradd --system --home-dir /var/lib/vmagent-remotewrite-data --no-create-home --gid victoriametrics victoriametrics 31 | fi 32 | 33 | chown -R victoriametrics:victoriametrics /var/lib/vmagent-remotewrite-data 34 | } 35 | 36 | # 确定操作系统类型 37 | OS="unknown" 38 | if [ -f /etc/os-release ]; then 39 | . 
/etc/os-release 40 | OS=$ID 41 | fi 42 | 43 | # 安装依赖工具 44 | install_dependencies 45 | 46 | # 设置系统服务和用户 47 | setup_system 48 | 49 | # 获取vmagent最新版本 50 | VM_VERSION=$(curl -s "https://api.github.com/repos/VictoriaMetrics/VictoriaMetrics/tags" | grep '"name":' | head -n 1 | awk -F '"' '{print $4}') 51 | 52 | # 下载并安装vmagent 53 | wget https://github.com/VictoriaMetrics/VictoriaMetrics/releases/download/${VM_VERSION}/vmutils-linux-amd64-${VM_VERSION}.tar.gz -O /tmp/vmutils.tar.gz 54 | 55 | cd /tmp && tar -xzvf /tmp/vmutils.tar.gz vmagent-prod 56 | mv /tmp/vmagent-prod /usr/bin 57 | chmod +x /usr/bin/vmagent-prod 58 | 59 | # 清理 /tmp 目录中的压缩文件和解压后的临时文件 60 | rm -rf /tmp/vmutils.tar.gz /tmp/vmagent-prod* 61 | 62 | cat> /etc/systemd/system/vmagent.service < /etc/victoriametrics/vmagent/vmagent.conf < /etc/victoriametrics/vmagent/scrape.yml < /dev/null 2>&1; then 23 | groupadd --system victoriametrics 24 | fi 25 | 26 | # 检查victoriametrics用户是否存在,不存在则创建 27 | if ! id -u victoriametrics > /dev/null 2>&1; then 28 | useradd --system --home-dir /var/lib/victoriametrics --no-create-home --gid victoriametrics victoriametrics 29 | fi 30 | } 31 | 32 | # 确定操作系统类型 33 | OS="unknown" 34 | if [ -f /etc/os-release ]; then 35 | . 
/etc/os-release 36 | OS=$ID 37 | fi 38 | 39 | # 安装依赖工具 40 | install_dependencies 41 | 42 | # 设置系统服务和用户 43 | setup_system 44 | 45 | # 获取vmalert最新版本 46 | VM_VERSION=$(curl -s "https://api.github.com/repos/VictoriaMetrics/VictoriaMetrics/tags" | grep '"name":' | head -n 1 | awk -F '"' '{print $4}') 47 | 48 | # 下载并安装vmalert 49 | wget https://github.com/VictoriaMetrics/VictoriaMetrics/releases/download/${VM_VERSION}/vmutils-linux-amd64-${VM_VERSION}.tar.gz -O /tmp/vmutils.tar.gz 50 | 51 | cd /tmp && tar -xzvf /tmp/vmutils.tar.gz vmalert-prod 52 | mv /tmp/vmalert-prod /usr/bin 53 | chmod +x /usr/bin/vmalert-prod 54 | 55 | # 清理 /tmp 目录中的压缩文件和解压后的临时文件 56 | rm -rf /tmp/vmutils.tar.gz /tmp/vmalert-prod* 57 | 58 | cat> /etc/systemd/system/vmalert.service < /etc/victoriametrics/vmalert/vmalert.conf < /dev/null 2>&1; then 23 | groupadd --system victoriametrics 24 | fi 25 | 26 | # 检查victoriametrics用户是否存在,不存在则创建 27 | if ! id -u victoriametrics > /dev/null 2>&1; then 28 | useradd --system --home-dir /var/lib/victoriametrics --no-create-home --gid victoriametrics victoriametrics 29 | fi 30 | } 31 | 32 | # 确定操作系统类型 33 | OS="unknown" 34 | if [ -f /etc/os-release ]; then 35 | . 
/etc/os-release 36 | OS=$ID 37 | fi 38 | 39 | # 安装依赖工具 40 | install_dependencies 41 | 42 | # 设置系统服务和用户 43 | setup_system 44 | 45 | # 获取vmauth最新版本 46 | VM_VERSION=$(curl -s "https://api.github.com/repos/VictoriaMetrics/VictoriaMetrics/tags" | grep '"name":' | head -n 1 | awk -F '"' '{print $4}') 47 | 48 | # 下载并安装vmauth 49 | wget https://github.com/VictoriaMetrics/VictoriaMetrics/releases/download/${VM_VERSION}/vmutils-linux-amd64-${VM_VERSION}.tar.gz -O /tmp/vmutils.tar.gz 50 | 51 | cd /tmp && tar -xzvf /tmp/vmutils.tar.gz vmauth-prod 52 | mv /tmp/vmauth-prod /usr/bin 53 | chmod +x /usr/bin/vmauth-prod 54 | 55 | cat> /etc/systemd/system/vmauth.service < /etc/victoriametrics/vmauth/vmauth.conf < /etc/victoriametrics/vmauth/config.yml < /dev/null 2>&1; then 23 | groupadd --system victoriametrics 24 | fi 25 | 26 | # 检查victoriametrics用户是否存在,不存在则创建 27 | if ! id -u victoriametrics > /dev/null 2>&1; then 28 | useradd --system --home-dir /var/lib/victoriametrics --no-create-home --gid victoriametrics victoriametrics 29 | fi 30 | } 31 | 32 | # 确定操作系统类型 33 | OS="unknown" 34 | if [ -f /etc/os-release ]; then 35 | . /etc/os-release 36 | OS=$ID 37 | fi 38 | 39 | # 安装依赖工具 40 | install_dependencies 41 | 42 | # 设置系统服务和用户 43 | setup_system 44 | 45 | # 获取VictoriaMetrics集群最新版本 46 | VM_VERSION=$(curl -s "https://api.github.com/repos/VictoriaMetrics/VictoriaMetrics/tags" | grep '"name":' | head -n 1 | awk -F '"' '{print $4}') 47 | # 下载并安装VictoriaMetrics集群 48 | wget https://github.com/VictoriaMetrics/VictoriaMetrics/releases/download/${VM_VERSION}/victoria-metrics-linux-amd64-${VM_VERSION}-cluster.tar.gz -O /tmp/vmcluster.tar.gz 49 | 50 | cd /tmp && tar -xzvf /tmp/vmcluster.tar.gz vminsert-prod 51 | mv /tmp/vminsert-prod /usr/bin 52 | chmod +x /usr/bin/vminsert-prod 53 | 54 | cat> /etc/systemd/system/vminsert.service < /etc/victoriametrics/vminsert/vminsert.conf < /dev/null 2>&1; then 23 | groupadd --system victoriametrics 24 | fi 25 | 26 | # 检查victoriametrics用户是否存在,不存在则创建 27 | if ! 
id -u victoriametrics > /dev/null 2>&1; then 28 | useradd --system --home-dir /var/lib/victoriametrics --no-create-home --gid victoriametrics victoriametrics 29 | fi 30 | } 31 | 32 | # 确定操作系统类型 33 | OS="unknown" 34 | if [ -f /etc/os-release ]; then 35 | . /etc/os-release 36 | OS=$ID 37 | fi 38 | 39 | # 安装依赖工具 40 | install_dependencies 41 | 42 | # 设置系统服务和用户 43 | setup_system 44 | 45 | # 获取VictoriaMetrics集群最新版本 46 | VM_VERSION=$(curl -s "https://api.github.com/repos/VictoriaMetrics/VictoriaMetrics/tags" | grep '"name":' | head -n 1 | awk -F '"' '{print $4}') 47 | # 下载并安装VictoriaMetrics集群 48 | wget https://github.com/VictoriaMetrics/VictoriaMetrics/releases/download/${VM_VERSION}/victoria-metrics-linux-amd64-${VM_VERSION}-cluster.tar.gz -O /tmp/vmcluster.tar.gz 49 | 50 | cd /tmp && tar -xzvf /tmp/vmcluster.tar.gz vmselect-prod 51 | mv /tmp/vmselect-prod /usr/bin 52 | chmod +x /usr/bin/vmselect-prod 53 | 54 | cat> /etc/systemd/system/vmselect.service < /etc/victoriametrics/vmselect/vmselect.conf < /dev/null 2>&1; then 25 | groupadd --system victoriametrics 26 | fi 27 | 28 | # 检查victoriametrics用户是否存在,不存在则创建 29 | if ! id -u victoriametrics > /dev/null 2>&1; then 30 | useradd --system --home-dir /var/lib/victoriametrics --no-create-home --gid victoriametrics victoriametrics 31 | fi 32 | 33 | chown -R victoriametrics:victoriametrics /var/lib/victoria-metrics-cluster-data 34 | } 35 | 36 | # 确定操作系统类型 37 | OS="unknown" 38 | if [ -f /etc/os-release ]; then 39 | . 
/etc/os-release 40 | OS=$ID 41 | fi 42 | 43 | # 安装依赖工具 44 | install_dependencies 45 | 46 | # 设置系统服务和用户 47 | setup_system 48 | 49 | # 获取VictoriaMetrics集群最新版本 50 | VM_VERSION=$(curl -s "https://api.github.com/repos/VictoriaMetrics/VictoriaMetrics/tags" | grep '"name":' | head -n 1 | awk -F '"' '{print $4}') 51 | # 下载并安装VictoriaMetrics集群 52 | wget https://github.com/VictoriaMetrics/VictoriaMetrics/releases/download/${VM_VERSION}/victoria-metrics-linux-amd64-${VM_VERSION}-cluster.tar.gz -O /tmp/vmcluster.tar.gz 53 | 54 | cd /tmp && tar -xzvf /tmp/vmcluster.tar.gz vmstorage-prod 55 | mv /tmp/vmstorage-prod /usr/bin 56 | chmod +x /usr/bin/vmstorage-prod 57 | 58 | cat> /etc/systemd/system/vmstorage.service < /etc/victoriametrics/vmstorage/vmstorage.conf < 2 11 | labels: 12 | severity: critical 13 | annotations: 14 | summary: "{{ $labels.job }} too many restarts (instance {{ $labels.instance }})" 15 | description: "Job {{ $labels.job }} (instance {{ $labels.instance }}) has restarted more than twice in the last 15 minutes. 16 | It might be crashlooping." 17 | 18 | - alert: ServiceDown 19 | expr: up{job=~".*(victoriametrics|vmselect|vminsert|vmstorage|vmagent|vmalert|vmsingle|vmalertmanager|vmauth).*"} == 0 20 | for: 2m 21 | labels: 22 | severity: critical 23 | annotations: 24 | summary: "Service {{ $labels.job }} is down on {{ $labels.instance }}" 25 | description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes." 26 | 27 | - alert: ProcessNearFDLimits 28 | expr: (process_max_fds - process_open_fds) < 100 29 | for: 5m 30 | labels: 31 | severity: critical 32 | annotations: 33 | summary: "Number of free file descriptors is less than 100 for \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") for the last 5m" 34 | description: "Exhausting OS file descriptors limit can cause severe degradation of the process.Consider to increase the limit as fast as possible." 
35 | 36 | - alert: TooHighMemoryUsage 37 | expr: (min_over_time(process_resident_memory_anon_bytes[10m]) / vm_available_memory_bytes) > 0.8 38 | for: 5m 39 | labels: 40 | severity: critical 41 | annotations: 42 | summary: "It is more than 80% of memory used by \"{{ $labels.job }}\"(\"{{ $labels.instance }}\")" 43 | description: "Too high memory usage may result into multiple issues such as OOMs or degraded performance. 44 | Consider to either increase available memory or decrease the load on the process." 45 | 46 | - alert: TooHighCPUUsage 47 | expr: rate(process_cpu_seconds_total[5m]) / process_cpu_cores_available > 0.9 48 | for: 5m 49 | labels: 50 | severity: critical 51 | annotations: 52 | summary: "More than 90% of CPU is used by \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") during the last 5m" 53 | description: "Too high CPU usage may be a sign of insufficient resources and make process unstable. 54 | Consider to either increase available CPU resources or decrease the load on the process." 55 | 56 | - alert: TooManyLogs 57 | expr: sum(increase(vm_log_messages_total{level="error"}[5m])) without (app_version, location) > 0 58 | for: 15m 59 | labels: 60 | severity: warning 61 | annotations: 62 | summary: "Too many logs printed for job \"{{ $labels.job }}\" ({{ $labels.instance }})" 63 | description: "Logging rate for job \"{{ $labels.job }}\" ({{ $labels.instance }}) is {{ $value }} for last 15m.Worth to check logs for specific error messages." 
64 | 65 | - alert: TooManyTSIDMisses 66 | expr: rate(vm_missing_tsids_for_metric_id_total[5m]) > 0 67 | for: 10m 68 | labels: 69 | severity: critical 70 | annotations: 71 | summary: "Too many TSID misses for job \"{{ $labels.job }}\" ({{ $labels.instance }})" 72 | description: "The rate of TSID misses during query lookups is too high for \"{{ $labels.job }}\" ({{ $labels.instance }}).Make sure you're running VictoriaMetrics of v1.85.3 or higher.Related issue https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3502" 73 | 74 | - alert: ConcurrentInsertsHitTheLimit 75 | expr: avg_over_time(vm_concurrent_insert_current[1m]) >= vm_concurrent_insert_capacity 76 | for: 15m 77 | labels: 78 | severity: warning 79 | annotations: 80 | summary: "{{ $labels.job }} on instance {{ $labels.instance }} is constantly hitting concurrent inserts limit" 81 | description: "The limit of concurrent inserts on instance {{ $labels.instance }} depends on the number of CPUs.\n 82 | Usually, when component constantly hits the limit it is likely the component is overloaded and requires more CPU. 83 | In some cases for components like vmagent or vminsert the alert might trigger if there are too many clients 84 | making write attempts. If vmagent's or vminsert's CPU usage and network saturation are at normal level, then 85 | it might be worth adjusting `-maxConcurrentInserts` cmd-line flag." -------------------------------------------------------------------------------- /victoriametrics/deploy-cluster/vmalert/alerts-vmalert.yml: -------------------------------------------------------------------------------- 1 | # File contains default list of alerts for vmalert service. 2 | # The alerts below are just recommendations and may require some updates 3 | # and threshold calibration according to every specific setup. 4 | groups: 5 | # Alerts group for vmalert assumes that Grafana dashboard 6 | # https://grafana.com/grafana/dashboards/14950/ is installed. 
7 | # Pls update the `dashboard` annotation according to your setup. 8 | - name: vmalert 9 | interval: 30s 10 | rules: 11 | - alert: ConfigurationReloadFailure 12 | expr: vmalert_config_last_reload_successful != 1 13 | labels: 14 | severity: warning 15 | annotations: 16 | summary: "Configuration reload failed for vmalert instance {{ $labels.instance }}" 17 | description: "Configuration hot-reload failed for vmalert on instance {{ $labels.instance }}. 18 | Check vmalert's logs for detailed error message." 19 | 20 | - alert: AlertingRulesError 21 | expr: sum(increase(vmalert_alerting_rules_errors_total[5m])) without(alertname, id) > 0 22 | for: 5m 23 | labels: 24 | severity: warning 25 | annotations: 26 | dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=13&var-instance={{ $labels.instance }}&var-group={{ $labels.group }}" 27 | summary: "Alerting rules are failing for vmalert instance {{ $labels.instance }}" 28 | description: "Alerting rules execution is failing for group \"{{ $labels.group }}\". 29 | Check vmalert's logs for detailed error message." 30 | 31 | - alert: RecordingRulesError 32 | expr: sum(increase(vmalert_recording_rules_errors_total[5m])) without(recording, id) > 0 33 | for: 5m 34 | labels: 35 | severity: warning 36 | annotations: 37 | dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=30&var-instance={{ $labels.instance }}&var-group={{ $labels.group }}" 38 | summary: "Recording rules are failing for vmalert instance {{ $labels.instance }}" 39 | description: "Recording rules execution is failing for group \"{{ $labels.group }}\". 40 | Check vmalert's logs for detailed error message." 
41 | 42 | - alert: RecordingRulesNoData 43 | expr: sum(vmalert_recording_rules_last_evaluation_samples) without(recording, id) < 1 44 | for: 30m 45 | labels: 46 | severity: info 47 | annotations: 48 | dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=33&var-group={{ $labels.group }}" 49 | summary: "Recording rule {{ $labels.recording }} ({{ $labels.group }}) produces no data" 50 | description: "Recording rule \"{{ $labels.recording }}\" from group \"{{ $labels.group }}\" 51 | produces 0 samples over the last 30min. It might be caused by a misconfiguration 52 | or incorrect query expression." 53 | 54 | - alert: TooManyMissedIterations 55 | expr: increase(vmalert_iteration_missed_total[5m]) > 0 56 | for: 15m 57 | labels: 58 | severity: warning 59 | annotations: 60 | summary: "vmalert instance {{ $labels.instance }} is missing rules evaluations" 61 | description: "vmalert instance {{ $labels.instance }} is missing rules evaluations for group \"{{ $labels.group }}\". 62 | The group evaluation time takes longer than the configured evaluation interval. This may result in missed 63 | alerting notifications or recording rules samples. Try increasing evaluation interval or concurrency of 64 | group \"{{ $labels.group }}\". See https://docs.victoriametrics.com/vmalert/#groups. 65 | If rule expressions are taking longer than expected, please see https://docs.victoriametrics.com/troubleshooting/#slow-queries." 66 | 67 | - alert: RemoteWriteErrors 68 | expr: increase(vmalert_remotewrite_errors_total[5m]) > 0 69 | for: 15m 70 | labels: 71 | severity: warning 72 | annotations: 73 | summary: "vmalert instance {{ $labels.instance }} is failing to push metrics to remote write URL" 74 | description: "vmalert instance {{ $labels.instance }} is failing to push metrics generated via alerting 75 | or recording rules to the configured remote write URL. Check vmalert's logs for detailed error message." 
76 | 77 | - alert: AlertmanagerErrors 78 | expr: increase(vmalert_alerts_send_errors_total[5m]) > 0 79 | for: 15m 80 | labels: 81 | severity: warning 82 | annotations: 83 | summary: "vmalert instance {{ $labels.instance }} is failing to send notifications to Alertmanager" 84 | description: "vmalert instance {{ $labels.instance }} is failing to send alert notifications to \"{{ $labels.addr }}\". 85 | Check vmalert's logs for detailed error message." -------------------------------------------------------------------------------- /victoriametrics/deploy-cluster/vmauth/auth-cluster.yml: -------------------------------------------------------------------------------- 1 | # balance load among vmselects 2 | # see https://docs.victoriametrics.com/vmauth/#load-balancing 3 | unauthorized_user: 4 | # 数据传入负载 5 | url_map: 6 | - src_paths: 7 | - "/insert/.+" 8 | url_prefix: 9 | # - "http://vminsert-1:8480/insert/0/prometheus" 10 | - "http://vminsert-1:8480/" 11 | - "http://vminsert-2:8480/" 12 | - "http://vminsert-3:8480/" 13 | - src_paths: 14 | - "/select/.+" 15 | url_prefix: 16 | - "http://vmselect-1:8481/" 17 | - "http://vmselect-2:8481/" 18 | - "http://vmselect-3:8481/" 19 | retry_status_codes: [500, 502, 503] 20 | load_balancing_policy: first_available -------------------------------------------------------------------------------- /victoriametrics/deploy-n9e/compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | mysql: 3 | image: "mysql:8" 4 | container_name: mysql 5 | hostname: mysql 6 | restart: always 7 | environment: 8 | TZ: Asia/Shanghai 9 | MYSQL_ROOT_PASSWORD: 1234 10 | volumes: 11 | - mysqldata:/var/lib/mysql/ 12 | - ./initsql:/docker-entrypoint-initdb.d/ 13 | - ./mysql/my.cnf:/etc/my.cnf 14 | networks: 15 | - nightingale 16 | ports: 17 | - "3306:3306" 18 | 19 | redis: 20 | image: "redis:6.2" 21 | container_name: redis 22 | hostname: redis 23 | restart: always 24 | environment: 25 | TZ: Asia/Shanghai 26 | 
networks: 27 | - nightingale 28 | ports: 29 | - "6379:6379" 30 | 31 | victoriametrics: 32 | image: victoriametrics/victoria-metrics:v1.100.1 33 | container_name: victoriametrics 34 | hostname: victoriametrics 35 | restart: always 36 | environment: 37 | TZ: Asia/Shanghai 38 | ports: 39 | - "8428:8428" 40 | networks: 41 | - nightingale 42 | command: 43 | - "--loggerTimezone=Asia/Shanghai" 44 | 45 | nightingale: 46 | image: flashcatcloud/nightingale:latest 47 | container_name: nightingale 48 | hostname: nightingale 49 | restart: always 50 | environment: 51 | GIN_MODE: release 52 | TZ: Asia/Shanghai 53 | WAIT_HOSTS: mysql:3306, redis:6379 54 | volumes: 55 | - ./nightingale:/app/etc 56 | networks: 57 | - nightingale 58 | ports: 59 | - "17000:17000" 60 | - "20090:20090" 61 | depends_on: 62 | - mysql 63 | - redis 64 | - victoriametrics 65 | command: > 66 | sh -c "/app/n9e" 67 | 68 | # 使用VictoriaMetrics作为数据源配置的Grafana实例 69 | grafana: 70 | container_name: grafana 71 | hostname: grafana 72 | image: grafana/grafana:10.4.1 73 | depends_on: 74 | - "victoriametrics" 75 | ports: 76 | - 3000:3000 77 | volumes: 78 | - grafanadata:/var/lib/grafana 79 | - ./provisioning/datasources/prometheus-datasource:/etc/grafana/provisioning/datasources 80 | - ./provisioning/dashboards:/etc/grafana/provisioning/dashboards 81 | - ./dashboards/victoriametrics.json:/var/lib/grafana/dashboards/vm.json 82 | networks: 83 | - nightingale 84 | restart: always 85 | 86 | volumes: 87 | mysqldata: {} 88 | grafanadata: {} 89 | 90 | networks: 91 | nightingale: 92 | driver: bridge -------------------------------------------------------------------------------- /victoriametrics/deploy-n9e/initsql/c-init.sql: -------------------------------------------------------------------------------- 1 | -- MySQL 8 removed `GRANT ... IDENTIFIED BY`; create the account first, then grant. 2 | CREATE USER IF NOT EXISTS 'root'@'127.0.0.1' IDENTIFIED BY '1234'; 3 | GRANT ALL ON *.* TO 'root'@'127.0.0.1'; 4 | GRANT ALL ON *.* TO 'root'@'localhost'; 5 | GRANT ALL ON *.* TO 'root'@'%'; 
-------------------------------------------------------------------------------- /victoriametrics/deploy-n9e/mysql/my.cnf: -------------------------------------------------------------------------------- 1 | [mysqld] 2 | pid-file = /var/run/mysqld/mysqld.pid 3 | socket = /var/run/mysqld/mysqld.sock 4 | datadir = /var/lib/mysql 5 | bind-address = 0.0.0.0 -------------------------------------------------------------------------------- /victoriametrics/deploy-n9e/nightingale/script/notify.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: UTF-8 -*- 3 | import sys 4 | import json 5 | 6 | class Sender(object): 7 | @classmethod 8 | def send_email(cls, payload): 9 | # already done in go code 10 | pass 11 | 12 | @classmethod 13 | def send_wecom(cls, payload): 14 | # already done in go code 15 | pass 16 | 17 | @classmethod 18 | def send_dingtalk(cls, payload): 19 | # already done in go code 20 | pass 21 | 22 | @classmethod 23 | def send_feishu(cls, payload): 24 | # already done in go code 25 | pass 26 | 27 | @classmethod 28 | def send_mm(cls, payload): 29 | # already done in go code 30 | pass 31 | 32 | @classmethod 33 | def send_sms(cls, payload): 34 | users = payload.get('event').get("notify_users_obj") 35 | phones = {} 36 | for u in users: 37 | if u.get("phone"): 38 | phones[u.get("phone")] = 1 39 | if phones: 40 | print("send_sms not implemented, phones: {}".format(phones.keys())) 41 | 42 | @classmethod 43 | def send_voice(cls, payload): 44 | users = payload.get('event').get("notify_users_obj") 45 | phones = {} 46 | for u in users: 47 | if u.get("phone"): 48 | phones[u.get("phone")] = 1 49 | if phones: 50 | print("send_voice not implemented, phones: {}".format(phones.keys())) 51 | 52 | def main(): 53 | payload = json.load(sys.stdin) 54 | with open(".payload", 'w') as f: 55 | f.write(json.dumps(payload, indent=4)) 56 | for ch in payload.get('event').get('notify_channels'): 57 | send_func_name = 
"send_{}".format(ch.strip()) 58 | if not hasattr(Sender, send_func_name): 59 | print("function: {} not found", send_func_name) 60 | continue 61 | send_func = getattr(Sender, send_func_name) 62 | send_func(payload) 63 | 64 | def hello(): 65 | print("hello nightingale") 66 | 67 | if __name__ == "__main__": 68 | if len(sys.argv) == 1: 69 | main() 70 | elif sys.argv[1] == "hello": 71 | hello() 72 | else: 73 | print("I am confused") -------------------------------------------------------------------------------- /victoriametrics/deploy-n9e/nightingale/script/notify_feishu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | import sys 4 | import json 5 | import requests 6 | 7 | class Sender(object): 8 | @classmethod 9 | def send_email(cls, payload): 10 | # already done in go code 11 | pass 12 | 13 | @classmethod 14 | def send_wecom(cls, payload): 15 | # already done in go code 16 | pass 17 | 18 | @classmethod 19 | def send_dingtalk(cls, payload): 20 | # already done in go code 21 | pass 22 | 23 | @classmethod 24 | def send_ifeishu(cls, payload): 25 | users = payload.get('event').get("notify_users_obj") 26 | tokens = {} 27 | phones = {} 28 | 29 | for u in users: 30 | if u.get("phone"): 31 | phones[u.get("phone")] = 1 32 | 33 | contacts = u.get("contacts") 34 | if contacts.get("feishu_robot_token", ""): 35 | tokens[contacts.get("feishu_robot_token", "")] = 1 36 | 37 | headers = { 38 | "Content-Type": "application/json;charset=utf-8", 39 | "Host": "open.feishu.cn" 40 | } 41 | 42 | for t in tokens: 43 | url = "https://open.feishu.cn/open-apis/bot/v2/hook/{}".format(t) 44 | body = { 45 | "msg_type": "text", 46 | "content": { 47 | "text": payload.get('tpls').get("feishu", "feishu not found") 48 | }, 49 | "at": { 50 | "atMobiles": list(phones.keys()), 51 | "isAtAll": False 52 | } 53 | } 54 | 55 | response = requests.post(url, headers=headers, data=json.dumps(body)) 56 | 
print(f"notify_ifeishu: token={t} status_code={response.status_code} response_text={response.text}") 57 | 58 | @classmethod 59 | def send_mm(cls, payload): 60 | # already done in go code 61 | pass 62 | 63 | @classmethod 64 | def send_sms(cls, payload): 65 | pass 66 | 67 | @classmethod 68 | def send_voice(cls, payload): 69 | pass 70 | 71 | def main(): 72 | payload = json.load(sys.stdin) 73 | with open(".payload", 'w') as f: 74 | f.write(json.dumps(payload, indent=4)) 75 | for ch in payload.get('event').get('notify_channels'): 76 | send_func_name = "send_{}".format(ch.strip()) 77 | if not hasattr(Sender, send_func_name): 78 | print("function: {} not found", send_func_name) 79 | continue 80 | send_func = getattr(Sender, send_func_name) 81 | send_func(payload) 82 | 83 | def hello(): 84 | print("hello nightingale") 85 | 86 | if __name__ == "__main__": 87 | if len(sys.argv) == 1: 88 | main() 89 | elif sys.argv[1] == "hello": 90 | hello() 91 | else: 92 | print("I am confused") -------------------------------------------------------------------------------- /victoriametrics/deploy-n9e/provisioning/dashboards/dashboard.yml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | providers: 4 | - name: Prometheus 5 | orgId: 1 6 | folder: '' 7 | type: file 8 | options: 9 | path: /var/lib/grafana/dashboards 10 | -------------------------------------------------------------------------------- /victoriametrics/deploy-n9e/provisioning/datasources/prometheus-datasource/prometheus-datasource.yml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | datasources: 4 | - name: VictoriaMetrics 5 | type: prometheus 6 | access: proxy 7 | url: http://victoriametrics:8428 8 | isDefault: true 9 | jsonData: 10 | prometheusType: Prometheus 11 | prometheusVersion: 2.24.0 12 | -------------------------------------------------------------------------------- 
/victoriametrics/deploy/docker-prometheus/README.md: -------------------------------------------------------------------------------- 1 | ## docker compose 部署单节点Prometheus 2 | 3 | 使用 Grafana + Prometheus + alertmanager 组合。 4 | 5 | 使用前请修改Grafana配置文件中密码,当前admin密码为admin 6 | 7 | 使用外部配置文件挂载到容器内部 8 | 9 | 文件结构: 10 | 11 | ```bash 12 | docker-prometheus/ 13 | ├── alertmanager 14 | │   └── config.yml 15 | ├── docker-compose.yml 16 | ├── grafana 17 | │   ├── config.monitoring 18 | │   └── provisioning 19 | └── prometheus 20 | ├── alert.yml 21 | └── prometheus.yml 22 | ``` -------------------------------------------------------------------------------- /victoriametrics/deploy/docker-prometheus/alertmanager/alertmanager.yml: -------------------------------------------------------------------------------- 1 | global: 2 | # 阿里邮箱 3 | smtp_smarthost: 'smtp.qiye.aliyun.com:465' 4 | # 发邮件的邮箱 5 | smtp_from: 'your-email@example.com' 6 | # 发邮件的邮箱用户名,也就是你的邮箱      7 | smtp_auth_username: 'your-email@example.com' 8 | # 发邮件的邮箱密码 9 | smtp_auth_password: 'your-password' 10 | # 进行tls验证 11 | smtp_require_tls: true 12 | 13 | route: 14 | group_by: ['alertname'] 15 | group_wait: 10s 16 | group_interval: 10s 17 | repeat_interval: 10m 18 | receiver: live-monitoring 19 | 20 | receivers: 21 | - name: 'live-monitoring' 22 | # 收邮件的邮箱 23 | email_configs: 24 | - to: 'your-email@example.com' -------------------------------------------------------------------------------- /victoriametrics/deploy/docker-prometheus/docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | # prometheus 部署 3 | prometheus: 4 | container_name: prometheus 5 | image: prom/prometheus:latest 6 | restart: always 7 | volumes: 8 | - /etc/localtime:/etc/localtime:ro 9 | - $PWD/prometheus/:/etc/prometheus/ 10 | - prometheus_data:/prometheus 11 | command: 12 | - '--config.file=/etc/prometheus/prometheus.yml' 13 | - '--storage.tsdb.path=/prometheus' 14 | - 
'--web.console.libraries=/usr/share/prometheus/console_libraries' 15 | - '--web.console.templates=/usr/share/prometheus/consoles' 16 | networks: 17 | - monitoring 18 | expose: 19 | - '9090' 20 | ports: 21 | - 9090:9090 22 | 23 | # alertmanager 部署 24 | alertmanager: 25 | container_name: alertmanager 26 | image: prom/alertmanager:latest 27 | restart: always 28 | volumes: 29 | - /etc/localtime:/etc/localtime:ro 30 | - $PWD/alertmanager/:/etc/alertmanager/ 31 | command: 32 | - '--config.file=/etc/alertmanager/alertmanager.yml' 33 | - '--storage.path=/alertmanager' 34 | networks: 35 | - monitoring 36 | expose: 37 | - '9093' 38 | ports: 39 | - 9093:9093 40 | 41 | # grafana 部署 42 | grafana: 43 | container_name: grafana 44 | image: grafana/grafana:latest 45 | restart: always 46 | volumes: 47 | - /etc/localtime:/etc/localtime:ro 48 | - grafana_data:/var/lib/grafana 49 | - $PWD/grafana/provisioning/datasources/prometheus-datasource:/etc/grafana/provisioning/datasources 50 | - $PWD/grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards 51 | networks: 52 | - monitoring 53 | ports: 54 | - 3000:3000 55 | depends_on: 56 | - prometheus 57 | 58 | volumes: 59 | prometheus_data: {} 60 | grafana_data: {} 61 | 62 | networks: 63 | monitoring: 64 | driver: bridge -------------------------------------------------------------------------------- /victoriametrics/deploy/docker-prometheus/grafana/provisioning/dashboards/dashboard.yml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | providers: 4 | - name: Prometheus 5 | orgId: 1 6 | folder: '' 7 | type: file 8 | options: 9 | path: /var/lib/grafana/dashboards 10 | -------------------------------------------------------------------------------- /victoriametrics/deploy/docker-prometheus/grafana/provisioning/datasources/prometheus-datasource/prometheus-datasource.yml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | 
datasources: 4 | - name: Prometheus 5 | type: prometheus 6 | access: proxy 7 | url: http://prometheus:9090 8 | isDefault: true 9 | jsonData: 10 | prometheusType: Prometheus 11 | prometheusVersion: 2.24.0 -------------------------------------------------------------------------------- /victoriametrics/deploy/docker-prometheus/prometheus/alert.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: example 3 | rules: 4 | 5 | # Alert for any instance that is unreachable for >2 minutes. 6 | - alert: service_down 7 | expr: up == 0 8 | for: 2m 9 | labels: 10 | severity: warning 11 | annotations: 12 | summary: "Instance {{ $labels.instance }} down" 13 | description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes." -------------------------------------------------------------------------------- /victoriametrics/deploy/docker-prometheus/prometheus/prometheus.yml: -------------------------------------------------------------------------------- 1 | # my global config 2 | global: 3 | scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute. 4 | evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute. 5 | # scrape_timeout is set to the global default (10s). 6 | 7 | # Alertmanager configuration 8 | alerting: 9 | alertmanagers: 10 | - static_configs: 11 | - targets: ['alertmanager:9093'] 12 | # - alertmanager:9093 13 | 14 | # Load rules once and periodically evaluate them according to the global 'evaluation_interval'. 15 | rule_files: 16 | - "alert.yml" 17 | # - "first_rules.yml" 18 | # - "second_rules.yml" 19 | 20 | # A scrape configuration containing exactly one endpoint to scrape: 21 | # Here it's Prometheus itself. 22 | scrape_configs: 23 | # The job name is added as a label `job=` to any timeseries scraped from this config. 
24 | - job_name: 'prometheus' 25 | # Override the global default and scrape targets from this job every 5 seconds. 26 | scrape_interval: 5s 27 | static_configs: 28 | - targets: ['localhost:9090'] -------------------------------------------------------------------------------- /victoriametrics/deploy/victoriametrics/README.md: -------------------------------------------------------------------------------- 1 | ## docker compose 部署单节点VictoriaMetrics -------------------------------------------------------------------------------- /victoriametrics/deploy/victoriametrics/alert/alerts-health.yml: -------------------------------------------------------------------------------- 1 | # File contains default list of alerts for various VM components. 2 | # The following alerts are recommended for use for any VM installation. 3 | # The alerts below are just recommendations and may require some updates 4 | # and threshold calibration according to every specific setup. 5 | groups: 6 | - name: vm-health 7 | # note the `job` filter and update accordingly to your setup 8 | rules: 9 | - alert: TooManyRestarts 10 | expr: changes(process_start_time_seconds{job=~".*(victoriametrics|vmselect|vminsert|vmstorage|vmagent|vmalert|vmsingle|vmalertmanager|vmauth).*"}[15m]) > 2 11 | labels: 12 | severity: critical 13 | annotations: 14 | summary: "{{ $labels.job }} too many restarts (instance {{ $labels.instance }})" 15 | description: "Job {{ $labels.job }} (instance {{ $labels.instance }}) has restarted more than twice in the last 15 minutes. 16 | It might be crashlooping."
26 | 27 | - alert: ProcessNearFDLimits 28 | expr: (process_max_fds - process_open_fds) < 100 29 | for: 5m 30 | labels: 31 | severity: critical 32 | annotations: 33 | summary: "Number of free file descriptors is less than 100 for \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") for the last 5m" 34 | description: "Exhausting OS file descriptors limit can cause severe degradation of the process.Consider to increase the limit as fast as possible." 35 | 36 | - alert: TooHighMemoryUsage 37 | expr: (min_over_time(process_resident_memory_anon_bytes[10m]) / vm_available_memory_bytes) > 0.8 38 | for: 5m 39 | labels: 40 | severity: critical 41 | annotations: 42 | summary: "It is more than 80% of memory used by \"{{ $labels.job }}\"(\"{{ $labels.instance }}\")" 43 | description: "Too high memory usage may result into multiple issues such as OOMs or degraded performance. 44 | Consider to either increase available memory or decrease the load on the process." 45 | 46 | - alert: TooHighCPUUsage 47 | expr: rate(process_cpu_seconds_total[5m]) / process_cpu_cores_available > 0.9 48 | for: 5m 49 | labels: 50 | severity: critical 51 | annotations: 52 | summary: "More than 90% of CPU is used by \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") during the last 5m" 53 | description: "Too high CPU usage may be a sign of insufficient resources and make process unstable. 54 | Consider to either increase available CPU resources or decrease the load on the process." 55 | 56 | - alert: TooManyLogs 57 | expr: sum(increase(vm_log_messages_total{level="error"}[5m])) without (app_version, location) > 0 58 | for: 15m 59 | labels: 60 | severity: warning 61 | annotations: 62 | summary: "Too many logs printed for job \"{{ $labels.job }}\" ({{ $labels.instance }})" 63 | description: "Logging rate for job \"{{ $labels.job }}\" ({{ $labels.instance }}) is {{ $value }} for last 15m.Worth to check logs for specific error messages." 
64 | 65 | - alert: TooManyTSIDMisses 66 | expr: rate(vm_missing_tsids_for_metric_id_total[5m]) > 0 67 | for: 10m 68 | labels: 69 | severity: critical 70 | annotations: 71 | summary: "Too many TSID misses for job \"{{ $labels.job }}\" ({{ $labels.instance }})" 72 | description: "The rate of TSID misses during query lookups is too high for \"{{ $labels.job }}\" ({{ $labels.instance }}).Make sure you're running VictoriaMetrics of v1.85.3 or higher.Related issue https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3502" 73 | 74 | - alert: ConcurrentInsertsHitTheLimit 75 | expr: avg_over_time(vm_concurrent_insert_current[1m]) >= vm_concurrent_insert_capacity 76 | for: 15m 77 | labels: 78 | severity: warning 79 | annotations: 80 | summary: "{{ $labels.job }} on instance {{ $labels.instance }} is constantly hitting concurrent inserts limit" 81 | description: "The limit of concurrent inserts on instance {{ $labels.instance }} depends on the number of CPUs.\n 82 | Usually, when component constantly hits the limit it is likely the component is overloaded and requires more CPU. 83 | In some cases for components like vmagent or vminsert the alert might trigger if there are too many clients 84 | making write attempts. If vmagent's or vminsert's CPU usage and network saturation are at normal level, then 85 | it might be worth adjusting `-maxConcurrentInserts` cmd-line flag." -------------------------------------------------------------------------------- /victoriametrics/deploy/victoriametrics/alert/alerts-vmalert.yml: -------------------------------------------------------------------------------- 1 | # File contains default list of alerts for vmalert service. 2 | # The alerts below are just recommendations and may require some updates 3 | # and threshold calibration according to every specific setup. 4 | groups: 5 | # Alerts group for vmalert assumes that Grafana dashboard 6 | # https://grafana.com/grafana/dashboards/14950/ is installed. 
7 | # Pls update the `dashboard` annotation according to your setup. 8 | - name: vmalert 9 | interval: 30s 10 | rules: 11 | - alert: ConfigurationReloadFailure 12 | expr: vmalert_config_last_reload_successful != 1 13 | labels: 14 | severity: warning 15 | annotations: 16 | summary: "Configuration reload failed for vmalert instance {{ $labels.instance }}" 17 | description: "Configuration hot-reload failed for vmalert on instance {{ $labels.instance }}. 18 | Check vmalert's logs for detailed error message." 19 | 20 | - alert: AlertingRulesError 21 | expr: sum(increase(vmalert_alerting_rules_errors_total[5m])) without(alertname, id) > 0 22 | for: 5m 23 | labels: 24 | severity: warning 25 | annotations: 26 | dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=13&var-instance={{ $labels.instance }}&var-group={{ $labels.group }}" 27 | summary: "Alerting rules are failing for vmalert instance {{ $labels.instance }}" 28 | description: "Alerting rules execution is failing for group \"{{ $labels.group }}\". 29 | Check vmalert's logs for detailed error message." 30 | 31 | - alert: RecordingRulesError 32 | expr: sum(increase(vmalert_recording_rules_errors_total[5m])) without(recording, id) > 0 33 | for: 5m 34 | labels: 35 | severity: warning 36 | annotations: 37 | dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=30&var-instance={{ $labels.instance }}&var-group={{ $labels.group }}" 38 | summary: "Recording rules are failing for vmalert instance {{ $labels.instance }}" 39 | description: "Recording rules execution is failing for group \"{{ $labels.group }}\". 40 | Check vmalert's logs for detailed error message." 
41 | 42 | - alert: RecordingRulesNoData 43 | expr: sum(vmalert_recording_rules_last_evaluation_samples) without(recording, id) < 1 44 | for: 30m 45 | labels: 46 | severity: info 47 | annotations: 48 | dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=33&var-group={{ $labels.group }}" 49 | summary: "Recording rule {{ $labels.recording }} ({{ $labels.group }}) produces no data" 50 | description: "Recording rule \"{{ $labels.recording }}\" from group \"{{ $labels.group }}\" 51 | produces 0 samples over the last 30min. It might be caused by a misconfiguration 52 | or incorrect query expression." 53 | 54 | - alert: TooManyMissedIterations 55 | expr: increase(vmalert_iteration_missed_total[5m]) > 0 56 | for: 15m 57 | labels: 58 | severity: warning 59 | annotations: 60 | summary: "vmalert instance {{ $labels.instance }} is missing rules evaluations" 61 | description: "vmalert instance {{ $labels.instance }} is missing rules evaluations for group \"{{ $labels.group }}\". 62 | The group evaluation time takes longer than the configured evaluation interval. This may result in missed 63 | alerting notifications or recording rules samples. Try increasing evaluation interval or concurrency of 64 | group \"{{ $labels.group }}\". See https://docs.victoriametrics.com/vmalert.html#groups. 65 | If rule expressions are taking longer than expected, please see https://docs.victoriametrics.com/Troubleshooting.html#slow-queries." 
76 | 77 | - alert: AlertmanagerErrors 78 | expr: increase(vmalert_alerts_send_errors_total[5m]) > 0 79 | for: 15m 80 | labels: 81 | severity: warning 82 | annotations: 83 | summary: "vmalert instance {{ $labels.instance }} is failing to send notifications to Alertmanager" 84 | description: "vmalert instance {{ $labels.instance }} is failing to send alert notifications to \"{{ $labels.addr }}\". 85 | Check vmalert's logs for detailed error message." -------------------------------------------------------------------------------- /victoriametrics/deploy/victoriametrics/alertmanager/alertmanager.yml: -------------------------------------------------------------------------------- 1 | route: 2 | receiver: blackhole 3 | 4 | receivers: 5 | - name: blackhole -------------------------------------------------------------------------------- /victoriametrics/deploy/victoriametrics/docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | # vmagent 前置代理 3 | # --promscrape.config 参数文件中定义了需要抓取的目标 4 | # --remoteWrite.url 把抓取到的数据转储到时序数据库 5 | vmagent: 6 | container_name: vmagent 7 | image: victoriametrics/vmagent:latest 8 | depends_on: 9 | - "victoriametrics" 10 | ports: 11 | - 8429:8429 12 | volumes: 13 | - vmagentdata:/vmagentdata 14 | - ./scrape/prometheus.yml:/etc/prometheus/prometheus.yml 15 | command: 16 | - "--promscrape.config=/etc/prometheus/prometheus.yml" 17 | - "--remoteWrite.url=http://victoriametrics:8428/api/v1/write" 18 | networks: 19 | - vm_net 20 | restart: always 21 | 22 | # VictoriaMetrics实例,一个负责存储指标和处理读请求的单一进程 23 | victoriametrics: 24 | container_name: victoriametrics 25 | image: victoriametrics/victoria-metrics:stable 26 | ports: 27 | - 8428:8428 28 | - 8089:8089 29 | - 8089:8089/udp 30 | - 2003:2003 31 | - 2003:2003/udp 32 | - 4242:4242 33 | volumes: 34 | - vmdata:/storage 35 | command: 36 | - "--storageDataPath=/storage" 37 | - "--graphiteListenAddr=:2003" 38 | - "--opentsdbListenAddr=:4242" 39 | - 
"--httpListenAddr=:8428" 40 | - "--influxListenAddr=:8089" 41 | - "--vmalert.proxyURL=http://vmalert:8880" 42 | networks: 43 | - vm_net 44 | restart: always 45 | 46 | # 使用VictoriaMetrics作为数据源配置的Grafana实例 47 | grafana: 48 | container_name: grafana 49 | image: grafana/grafana:latest 50 | depends_on: 51 | - "victoriametrics" 52 | ports: 53 | - 3000:3000 54 | volumes: 55 | - grafanadata:/var/lib/grafana 56 | - ./provisioning/datasources/prometheus-datasource:/etc/grafana/provisioning/datasources 57 | - ./provisioning/dashboards:/etc/grafana/provisioning/dashboards 58 | - ./dashboards/victoriametrics.json:/var/lib/grafana/dashboards/vm.json 59 | - ./dashboards/vmagent.json:/var/lib/grafana/dashboards/vmagent.json 60 | - ./dashboards/vmalert.json:/var/lib/grafana/dashboards/vmalert.json 61 | networks: 62 | - vm_net 63 | restart: always 64 | 65 | # vmalert执行警报和记录规则 66 | vmalert: 67 | container_name: vmalert 68 | image: victoriametrics/vmalert:stable 69 | depends_on: 70 | - "victoriametrics" 71 | - "alertmanager" 72 | ports: 73 | - 8880:8880 74 | volumes: 75 | - ./alert/alerts.yml:/etc/alerts/alerts.yml 76 | - ./alert/alerts-health.yml:/etc/alerts/alerts-health.yml 77 | - ./alert/alerts-vmagent.yml:/etc/alerts/alerts-vmagent.yml 78 | - ./alert/alerts-vmalert.yml:/etc/alerts/alerts-vmalert.yml 79 | command: 80 | - "--datasource.url=http://victoriametrics:8428/" 81 | - "--remoteRead.url=http://victoriametrics:8428/" 82 | - "--remoteWrite.url=http://victoriametrics:8428/" 83 | - "--notifier.url=http://alertmanager:9093/" 84 | - "--rule=/etc/alerts/*.yml" 85 | # 在Grafana中显示警报的来源 86 | - "--external.url=http://127.0.0.1:3000" # 容器外的Grafana 87 | # 在复制粘贴这行时,请注意在 $expr 中使用 $$ 进行转义 88 | - '--external.alert.source=explore?orgId=1&left={"datasource":"VictoriaMetrics","queries":[{"expr":{{$$expr|jsonEscape|queryEscape}},"refId":"A"}],"range":{"from":"now-1h","to":"now"}}' 89 | networks: 90 | - vm_net 91 | restart: always 92 | 93 | # Alertmanager 接收来自 vmalert 的警报通知 94 | # 并根据 
--config.file 分发它们 95 | alertmanager: 96 | container_name: alertmanager 97 | image: prom/alertmanager:latest 98 | volumes: 99 | - ./alertmanager/alertmanager.yml:/config/alertmanager.yml 100 | command: 101 | - "--config.file=/config/alertmanager.yml" 102 | ports: 103 | - 9093:9093 104 | networks: 105 | - vm_net 106 | restart: always 107 | 108 | volumes: 109 | vmagentdata: {} 110 | vmdata: {} 111 | grafanadata: {} 112 | networks: 113 | vm_net: -------------------------------------------------------------------------------- /victoriametrics/deploy/victoriametrics/provisioning/dashboards/dashboard.yml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | providers: 4 | - name: Prometheus 5 | orgId: 1 6 | folder: '' 7 | type: file 8 | options: 9 | path: /var/lib/grafana/dashboards 10 | -------------------------------------------------------------------------------- /victoriametrics/deploy/victoriametrics/provisioning/datasources/prometheus-datasource/prometheus-datasource.yml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | datasources: 4 | - name: VictoriaMetrics 5 | type: prometheus 6 | access: proxy 7 | url: http://victoriametrics:8428 8 | isDefault: true 9 | jsonData: 10 | prometheusType: Prometheus 11 | prometheusVersion: 2.24.0 12 | 13 | - name: VictoriaMetrics - cluster 14 | type: prometheus 15 | access: proxy 16 | url: http://vmauth:8427/select/0/prometheus 17 | isDefault: false 18 | jsonData: 19 | prometheusType: Prometheus 20 | prometheusVersion: 2.24.0 21 | -------------------------------------------------------------------------------- /victoriametrics/deploy/victoriametrics/provisioning/datasources/victoriametrics-datasource/victoriametrics-datasource.yml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | # List of data sources to insert/update depending on what's 4 | # available in the 
database. 5 | datasources: 6 | # Name of the VictoriaMetrics datasource 7 | # displayed in Grafana panels and queries. 8 | - name: VictoriaMetrics 9 | # Sets the data source type. 10 | type: victoriametrics-datasource 11 | # Sets the access mode, either 12 | # proxy or direct (Server or Browser in the UI). 13 | # Some data sources are incompatible with any setting 14 | # but proxy (Server). 15 | access: proxy 16 | # Sets default URL of the single node version of VictoriaMetrics 17 | url: http://victoriametrics:8428 18 | # Sets the pre-selected datasource for new panels. 19 | # You can set only one default data source per organization. 20 | isDefault: true 21 | 22 | # Name of the VictoriaMetrics datasource 23 | # displayed in Grafana panels and queries. 24 | - name: VictoriaMetrics - cluster 25 | # Sets the data source type. 26 | type: victoriametrics-datasource 27 | # Sets the access mode, either 28 | # proxy or direct (Server or Browser in the UI). 29 | # Some data sources are incompatible with any setting 30 | # but proxy (Server). 31 | access: proxy 32 | # Sets default URL of the cluster version of VictoriaMetrics 33 | url: http://vmauth:8427/select/0/prometheus 34 | # Sets the pre-selected datasource for new panels. 35 | # You can set only one default data source per organization. 
36 | isDefault: false 37 | -------------------------------------------------------------------------------- /victoriametrics/deploy/victoriametrics/scrape/prometheus.yml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 10s 3 | 4 | scrape_configs: 5 | - job_name: 'vmagent' 6 | static_configs: 7 | - targets: ['vmagent:8429'] 8 | - job_name: 'victoriametrics' 9 | static_configs: 10 | - targets: ['victoriametrics:8428'] 11 | # - job_name: 'vmalert' 12 | # static_configs: 13 | # - targets: ['vmalert:8880'] -------------------------------------------------------------------------------- /victoriametrics/deploy/victoriametrics/single-victoriametrics.yml: -------------------------------------------------------------------------------- 1 | services: 2 | # 指标采集 3 | # 它从 "--promscrape.config" 中定义的目标中抓取数据 4 | # 并将它们转发到 "--remoteWrite.url" 5 | vmagent: 6 | container_name: vmagent 7 | image: victoriametrics/vmagent:v1.100.0 8 | depends_on: 9 | - "victoriametrics" 10 | ports: 11 | - 8429:8429 12 | volumes: 13 | - vmagentdata:/vmagentdata 14 | - ./scrape/prometheus.yml:/etc/prometheus/prometheus.yml 15 | command: 16 | - "--promscrape.config=/etc/prometheus/prometheus.yml" 17 | - "--remoteWrite.url=http://victoriametrics:8428/api/v1/write" 18 | networks: 19 | - vm_net 20 | restart: always 21 | 22 | # VictoriaMetrics实例,一个负责存储指标和处理读请求的单一进程 23 | victoriametrics: 24 | container_name: victoriametrics 25 | image: victoriametrics/victoria-metrics:v1.100.0 26 | ports: 27 | - 8428:8428 28 | - 8089:8089 29 | - 8089:8089/udp 30 | - 2003:2003 31 | - 2003:2003/udp 32 | - 4242:4242 33 | volumes: 34 | - vmdata:/storage 35 | command: 36 | - "--storageDataPath=/storage" 37 | - "--graphiteListenAddr=:2003" 38 | - "--opentsdbListenAddr=:4242" 39 | - "--httpListenAddr=:8428" 40 | - "--influxListenAddr=:8089" 41 | networks: 42 | - vm_net 43 | restart: always 44 | 45 | # 使用VictoriaMetrics作为数据源配置的Grafana实例 46 | grafana: 47 | 
container_name: grafana 48 | image: grafana/grafana:10.4.1 49 | depends_on: 50 | - "victoriametrics" 51 | ports: 52 | - 3000:3000 53 | volumes: 54 | - grafanadata:/var/lib/grafana 55 | - ./provisioning/datasources/prometheus-datasource:/etc/grafana/provisioning/datasources 56 | - ./provisioning/dashboards:/etc/grafana/provisioning/dashboards 57 | - ./dashboards/victoriametrics.json:/var/lib/grafana/dashboards/vm.json 58 | - ./dashboards/vmagent.json:/var/lib/grafana/dashboards/vmagent.json 59 | networks: 60 | - vm_net 61 | restart: always 62 | 63 | volumes: 64 | vmagentdata: {} 65 | vmdata: {} 66 | grafanadata: {} 67 | networks: 68 | vm_net: -------------------------------------------------------------------------------- /victoriametrics/promxy/alert/alerts-health.yml: -------------------------------------------------------------------------------- 1 | # File contains default list of alerts for various VM components. 2 | # The following alerts are recommended for use for any VM installation. 3 | # The alerts below are just recommendations and may require some updates 4 | # and threshold calibration according to every specific setup. 5 | groups: 6 | - name: vm-health 7 | # note the `job` filter and update accordingly to your setup 8 | rules: 9 | - alert: TooManyRestarts 10 | expr: changes(process_start_time_seconds{job=~".*(victoriametrics|vmselect|vminsert|vmstorage|vmagent|vmalert|vmsingle|vmalertmanager|vmauth).*"}[15m]) > 2 11 | labels: 12 | severity: critical 13 | annotations: 14 | summary: "{{ $labels.job }} too many restarts (instance {{ $labels.instance }})" 15 | description: "Job {{ $labels.job }} (instance {{ $labels.instance }}) has restarted more than twice in the last 15 minutes. 16 | It might be crashlooping." 
17 | 18 | - alert: ServiceDown 19 | expr: up{job=~".*(victoriametrics|vmselect|vminsert|vmstorage|vmagent|vmalert|vmsingle|vmalertmanager|vmauth).*"} == 0 20 | for: 2m 21 | labels: 22 | severity: critical 23 | annotations: 24 | summary: "Service {{ $labels.job }} is down on {{ $labels.instance }}" 25 | description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes." 26 | 27 | - alert: ProcessNearFDLimits 28 | expr: (process_max_fds - process_open_fds) < 100 29 | for: 5m 30 | labels: 31 | severity: critical 32 | annotations: 33 | summary: "Number of free file descriptors is less than 100 for \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") for the last 5m" 34 | description: "Exhausting OS file descriptors limit can cause severe degradation of the process.Consider to increase the limit as fast as possible." 35 | 36 | - alert: TooHighMemoryUsage 37 | expr: (min_over_time(process_resident_memory_anon_bytes[10m]) / vm_available_memory_bytes) > 0.8 38 | for: 5m 39 | labels: 40 | severity: critical 41 | annotations: 42 | summary: "It is more than 80% of memory used by \"{{ $labels.job }}\"(\"{{ $labels.instance }}\")" 43 | description: "Too high memory usage may result into multiple issues such as OOMs or degraded performance. 44 | Consider to either increase available memory or decrease the load on the process." 45 | 46 | - alert: TooHighCPUUsage 47 | expr: rate(process_cpu_seconds_total[5m]) / process_cpu_cores_available > 0.9 48 | for: 5m 49 | labels: 50 | severity: critical 51 | annotations: 52 | summary: "More than 90% of CPU is used by \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") during the last 5m" 53 | description: "Too high CPU usage may be a sign of insufficient resources and make process unstable. 54 | Consider to either increase available CPU resources or decrease the load on the process." 
55 | 56 | - alert: TooManyLogs 57 | expr: sum(increase(vm_log_messages_total{level="error"}[5m])) without (app_version, location) > 0 58 | for: 15m 59 | labels: 60 | severity: warning 61 | annotations: 62 | summary: "Too many logs printed for job \"{{ $labels.job }}\" ({{ $labels.instance }})" 63 | description: "Logging rate for job \"{{ $labels.job }}\" ({{ $labels.instance }}) is {{ $value }} for last 15m.Worth to check logs for specific error messages." 64 | 65 | - alert: TooManyTSIDMisses 66 | expr: rate(vm_missing_tsids_for_metric_id_total[5m]) > 0 67 | for: 10m 68 | labels: 69 | severity: critical 70 | annotations: 71 | summary: "Too many TSID misses for job \"{{ $labels.job }}\" ({{ $labels.instance }})" 72 | description: "The rate of TSID misses during query lookups is too high for \"{{ $labels.job }}\" ({{ $labels.instance }}).Make sure you're running VictoriaMetrics of v1.85.3 or higher.Related issue https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3502" 73 | 74 | - alert: ConcurrentInsertsHitTheLimit 75 | expr: avg_over_time(vm_concurrent_insert_current[1m]) >= vm_concurrent_insert_capacity 76 | for: 15m 77 | labels: 78 | severity: warning 79 | annotations: 80 | summary: "{{ $labels.job }} on instance {{ $labels.instance }} is constantly hitting concurrent inserts limit" 81 | description: "The limit of concurrent inserts on instance {{ $labels.instance }} depends on the number of CPUs.\n 82 | Usually, when component constantly hits the limit it is likely the component is overloaded and requires more CPU. 83 | In some cases for components like vmagent or vminsert the alert might trigger if there are too many clients 84 | making write attempts. If vmagent's or vminsert's CPU usage and network saturation are at normal level, then 85 | it might be worth adjusting `-maxConcurrentInserts` cmd-line flag." 
-------------------------------------------------------------------------------- /victoriametrics/promxy/alert/alerts-vmalert.yml: -------------------------------------------------------------------------------- 1 | # File contains default list of alerts for vmalert service. 2 | # The alerts below are just recommendations and may require some updates 3 | # and threshold calibration according to every specific setup. 4 | groups: 5 | # Alerts group for vmalert assumes that Grafana dashboard 6 | # https://grafana.com/grafana/dashboards/14950/ is installed. 7 | # Pls update the `dashboard` annotation according to your setup. 8 | - name: vmalert 9 | interval: 30s 10 | rules: 11 | - alert: ConfigurationReloadFailure 12 | expr: vmalert_config_last_reload_successful != 1 13 | labels: 14 | severity: warning 15 | annotations: 16 | summary: "Configuration reload failed for vmalert instance {{ $labels.instance }}" 17 | description: "Configuration hot-reload failed for vmalert on instance {{ $labels.instance }}. 18 | Check vmalert's logs for detailed error message." 19 | 20 | - alert: AlertingRulesError 21 | expr: sum(increase(vmalert_alerting_rules_errors_total[5m])) without(alertname, id) > 0 22 | for: 5m 23 | labels: 24 | severity: warning 25 | annotations: 26 | dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=13&var-instance={{ $labels.instance }}&var-group={{ $labels.group }}" 27 | summary: "Alerting rules are failing for vmalert instance {{ $labels.instance }}" 28 | description: "Alerting rules execution is failing for group \"{{ $labels.group }}\". 29 | Check vmalert's logs for detailed error message." 
30 | 31 | - alert: RecordingRulesError 32 | expr: sum(increase(vmalert_recording_rules_errors_total[5m])) without(recording, id) > 0 33 | for: 5m 34 | labels: 35 | severity: warning 36 | annotations: 37 | dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=30&var-instance={{ $labels.instance }}&var-group={{ $labels.group }}" 38 | summary: "Recording rules are failing for vmalert instance {{ $labels.instance }}" 39 | description: "Recording rules execution is failing for group \"{{ $labels.group }}\". 40 | Check vmalert's logs for detailed error message." 41 | 42 | - alert: RecordingRulesNoData 43 | expr: sum(vmalert_recording_rules_last_evaluation_samples) without(recording, id) < 1 44 | for: 30m 45 | labels: 46 | severity: info 47 | annotations: 48 | dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=33&var-group={{ $labels.group }}" 49 | summary: "Recording rule {{ $labels.recording }} ({{ $labels.group }}) produces no data" 50 | description: "Recording rule \"{{ $labels.recording }}\" from group \"{{ $labels.group }}\" 51 | produces 0 samples over the last 30min. It might be caused by a misconfiguration 52 | or incorrect query expression." 53 | 54 | - alert: TooManyMissedIterations 55 | expr: increase(vmalert_iteration_missed_total[5m]) > 0 56 | for: 15m 57 | labels: 58 | severity: warning 59 | annotations: 60 | summary: "vmalert instance {{ $labels.instance }} is missing rules evaluations" 61 | description: "vmalert instance {{ $labels.instance }} is missing rules evaluations for group \"{{ $labels.group }}\". 62 | The group evaluation time takes longer than the configured evaluation interval. This may result in missed 63 | alerting notifications or recording rules samples. Try increasing evaluation interval or concurrency of 64 | group \"{{ $labels.group }}\". See https://docs.victoriametrics.com/vmalert.html#groups. 65 | If rule expressions are taking longer than expected, please see https://docs.victoriametrics.com/Troubleshooting.html#slow-queries." 
66 | 67 | - alert: RemoteWriteErrors 68 | expr: increase(vmalert_remotewrite_errors_total[5m]) > 0 69 | for: 15m 70 | labels: 71 | severity: warning 72 | annotations: 73 | summary: "vmalert instance {{ $labels.instance }} is failing to push metrics to remote write URL" 74 | description: "vmalert instance {{ $labels.instance }} is failing to push metrics generated via alerting 75 | or recording rules to the configured remote write URL. Check vmalert's logs for detailed error message." 76 | 77 | - alert: AlertmanagerErrors 78 | expr: increase(vmalert_alerts_send_errors_total[5m]) > 0 79 | for: 15m 80 | labels: 81 | severity: warning 82 | annotations: 83 | summary: "vmalert instance {{ $labels.instance }} is failing to send notifications to Alertmanager" 84 | description: "vmalert instance {{ $labels.instance }} is failing to send alert notifications to \"{{ $labels.addr }}\". 85 | Check vmalert's logs for detailed error message." -------------------------------------------------------------------------------- /victoriametrics/promxy/alertmanager/alertmanager.yml: -------------------------------------------------------------------------------- 1 | route: 2 | receiver: blackhole 3 | 4 | receivers: 5 | - name: blackhole -------------------------------------------------------------------------------- /victoriametrics/promxy/cmd/promxy/config.yaml: -------------------------------------------------------------------------------- 1 | ## 2 | ## 常规的 Prometheus 配置 3 | ## 4 | global: 5 | # 评估间隔 6 | evaluation_interval: 5s 7 | # 外部标签 8 | external_labels: 9 | source: promxy 10 | 11 | # 规则文件指定一组通配符。所有匹配的文件中都会读取规则和警报。 12 | rule_files: 13 | - "*rule" 14 | 15 | # Alerting 指定与 Alertmanager 相关的设置。 16 | alerting: 17 | alertmanagers: 18 | - scheme: http 19 | static_configs: 20 | - targets: 21 | - "127.0.0.1:12345" 22 | 23 | # remote_write 配置用于 promxy 作为其本地 Appender 使用,意味着 promxy 将发送所有“写入”(而不是导出)的指标到这里。 24 | # 这些包括:记录规则、警报规则上的指标等。 25 | remote_write: 26 | - url: 
http://localhost:8083/receive 27 | 28 | ## 29 | ### Promxy 配置 30 | ## 31 | promxy: 32 | server_groups: 33 | # 所有上游 Prometheus 服务发现机制都使用相同的标记,全部在 https://github.com/prometheus/prometheus/blob/master/discovery/config/config.go#L33 中定义 34 | - static_configs: 35 | - targets: 36 | - localhost:9090 37 | # 要添加到从此 server_group 检索到的指标的标签 38 | labels: 39 | sg: localhost_9090 40 | # 在 server_group 中的主机之间合并时间序列值的反亲和性 41 | anti_affinity: 10s 42 | # 等待服务器响应头的时间,单位毫秒 43 | timeout: 5s 44 | # 控制是否使用 remote_read 还是 prom API 用于获取远程 RAW 数据(例如矩阵选择器) 45 | # 注意,某些 Prometheus 实现(例如 VictoriaMetrics)不支持 remote_read。 46 | remote_read: true 47 | # 配置发送远程读取请求的路径。默认为 "api/v1/read" 48 | remote_read_path: api/v1/read 49 | # path_prefix 定义要添加到此 servergroup 中所有查询的前缀 50 | # 这可以使用 __path_prefix__ 进行重标记 51 | path_prefix: /example/prefix 52 | # query_params 将以下查询参数映射添加到下游请求。 53 | # 最初的用例是将 `nocache=1` 添加到 VictoriaMetrics 下游 54 | query_params: 55 | nocache: 1 56 | # 配置用于请求的协议方案。默认为 http 57 | scheme: http 58 | # promxy 与 server_groups 中的主机通信时的 HTTP 客户端选项 59 | http_client: 60 | # 连接下游的等待时间,默认为 200 毫秒。 61 | dial_timeout: 1s 62 | tls_config: 63 | insecure_skip_verify: true 64 | 65 | # relative_time_range 定义相对于当前时间的时间范围,此 server_group 包含该范围内的数据。 66 | # 这是完全可选的,start/end 也都是可选的 67 | # 例如,如果此 servergroup 仅包含最近的 3 小时数据 68 | # "start" 将为 -3h,而 end 将被省略 69 | relative_time_range: 70 | start: -3h 71 | end: -1h 72 | truncate: false 73 | 74 | # 在合并样本流时,将优先考虑给定时间戳的最大值 75 | prefer_max: false 76 | 77 | # absolute_time_range 定义此 server_group 包含的绝对时间范围。 78 | # 这是完全可选的,start/end 也都是可选的 79 | # 例如,如果 servergroup 已被弃用且不再接收数据 80 | # 您可以设置其具有数据的特定时间。 81 | absolute_time_range: 82 | start: '2009-10-10T23:00:00Z' 83 | end: '2009-10-11T23:00:00Z' 84 | truncate: true 85 | 86 | # 可以有任意数量的其他 server_groups 87 | - static_configs: 88 | - targets: 89 | - localhost:9091 90 | labels: 91 | sg: localhost_9091 92 | anti_affinity: 10s 93 | scheme: http 94 | http_client: 95 | tls_config: 96 | insecure_skip_verify: true 97 | # ignore_error 
将使给定安全组的响应“可选” 98 | # 这意味着如果此 servergroup 返回错误而其他 servergroup 不返回,则总体查询仍然可以成功 99 | ignore_error: true 100 | -------------------------------------------------------------------------------- /victoriametrics/promxy/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | # 指标采集 3 | # 它从 "--promscrape.config" 中定义的目标中抓取数据 4 | # 并将它们转发到 "--remoteWrite.url" 5 | vmagent: 6 | container_name: vmagent 7 | image: victoriametrics/vmagent:v1.100.0 8 | depends_on: 9 | - "victoriametrics" 10 | ports: 11 | - 8429:8429 12 | volumes: 13 | - vmagentdata:/vmagentdata 14 | - ./scrape/prometheus.yml:/etc/prometheus/prometheus.yml 15 | command: 16 | - "--promscrape.config=/etc/prometheus/prometheus.yml" 17 | - "--remoteWrite.url=http://victoriametrics:8428/api/v1/write" 18 | networks: 19 | - vm_net 20 | restart: always 21 | 22 | # VictoriaMetrics实例,一个负责存储指标和处理读请求的单一进程 23 | victoriametrics: 24 | container_name: victoriametrics 25 | image: victoriametrics/victoria-metrics:v1.100.0 26 | ports: 27 | - 8428:8428 28 | - 8089:8089 29 | - 8089:8089/udp 30 | - 2003:2003 31 | - 2003:2003/udp 32 | - 4242:4242 33 | volumes: 34 | - vmdata:/storage 35 | command: 36 | - "--storageDataPath=/storage" 37 | - "--graphiteListenAddr=:2003" 38 | - "--opentsdbListenAddr=:4242" 39 | - "--httpListenAddr=:8428" 40 | - "--influxListenAddr=:8089" 41 | networks: 42 | - vm_net 43 | restart: always 44 | 45 | promxy: 46 | container_name: promxy 47 | image: quay.io/jacksontj/promxy 48 | hostname: promxy 49 | ports: 50 | - "8082:8082" 51 | volumes: 52 | - promxydata:/var/log 53 | - ./cmd/promxy/config.yaml:/etc/promxy/config.yaml 54 | command: 55 | - --config=/etc/promxy/config.yaml 56 | - --log-level=info 57 | - --web.enable-lifecycle 58 | networks: 59 | - vm_net 60 | 61 | # 使用VictoriaMetrics作为数据源配置的Grafana实例 62 | grafana: 63 | container_name: grafana 64 | image: grafana/grafana:10.4.1 65 | depends_on: 66 | - "victoriametrics" 67 | ports: 68 | - 3000:3000 69 
| volumes: 70 | - grafanadata:/var/lib/grafana 71 | - ./provisioning/datasources/prometheus-datasource:/etc/grafana/provisioning/datasources 72 | - ./provisioning/dashboards:/etc/grafana/provisioning/dashboards 73 | - ./dashboards/victoriametrics.json:/var/lib/grafana/dashboards/vm.json 74 | - ./dashboards/vmagent.json:/var/lib/grafana/dashboards/vmagent.json 75 | networks: 76 | - vm_net 77 | restart: always 78 | 79 | volumes: 80 | vmagentdata: {} 81 | vmdata: {} 82 | promxydata: {} 83 | grafanadata: {} 84 | networks: 85 | vm_net: -------------------------------------------------------------------------------- /victoriametrics/promxy/install-promxy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # 函数:安装依赖工具 5 | install_dependencies() { 6 | if [ "$OS" == "ubuntu" ] || [ "$OS" == "debian" ]; then 7 | apt-get update && apt-get install -y curl wget net-tools jq 8 | elif [ "$OS" == "centos" ]; then 9 | # -y keeps yum non-interactive; without it, 'set -e' aborts (or the script hangs) at the confirmation prompt 10 | yum -y update && yum install -y curl wget net-tools jq 11 | else 12 | echo "Unsupported operating system." 13 | exit 1 14 | fi 15 | } 16 | 17 | # 函数:设置系统服务和用户 18 | setup_system() { 19 | # 创建promxy配置文件目录 20 | mkdir -p /etc/promxy 21 | # 创建promxy数据保存目录 22 | mkdir -p /var/lib/promxy 23 | 24 | # 检查promxy组是否存在,不存在则创建 25 | if ! getent group promxy > /dev/null 2>&1; then 26 | groupadd --system promxy 27 | fi 28 | 29 | # 检查promxy用户是否存在,不存在则创建 30 | if ! id -u promxy > /dev/null 2>&1; then 31 | useradd --system --home-dir /var/lib/promxy --no-create-home --gid promxy promxy 32 | fi 33 | 34 | chown -R promxy:promxy /var/lib/promxy 35 | } 36 | 37 | # 确定操作系统类型 38 | OS="unknown" 39 | if [ -f /etc/os-release ]; then 40 | . 
/etc/os-release 40 | OS=$ID 41 | fi 42 | 43 | # 安装依赖工具 44 | install_dependencies 45 | 46 | # 设置系统服务和用户 47 | setup_system 48 | 49 | # 获取Promxy最新版本 50 | PROMXY_VERSION=$(curl -s "https://api.github.com/repos/jacksontj/promxy/tags" | jq -r '.[0].name') 51 | 52 | # 下载并安装Promxy 53 | wget https://github.com/jacksontj/promxy/releases/download/${PROMXY_VERSION}/promxy-${PROMXY_VERSION}-linux-amd64 -O /usr/local/promxy 54 | chmod +x /usr/local/promxy 55 | 56 | cat> /etc/systemd/system/promxy.service < /etc/promxy/config.yaml <