├── .DS_Store
├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.md
│   │   ├── custom.md
│   │   └── feature_request.md
│   ├── pull-request-template.md
│   └── workflows
│       ├── docker-image.yml
│       ├── follow.yml
│       ├── go-binary-release.yml
│       └── reademe-contributors.yml
├── .idea
│   ├── .gitignore
│   ├── PromAI.iml
│   └── vcs.xml
├── Dockerfile
├── PromAI.exe
├── README.md
├── config
│   └── config.yaml
├── deploy
│   └── deployment.yaml
├── go.mod
├── go.sum
├── images
│   ├── image.png
│   ├── image2.png
│   ├── status.png
│   └── 资源概览.png
├── main.go
├── outputs
│   └── readme.md
├── pkg
│   ├── config
│   │   └── config.go
│   ├── metrics
│   │   └── collector.go
│   ├── notify
│   │   └── notify.go
│   ├── prometheus
│   │   ├── client.go
│   │   └── prometheus.go
│   ├── report
│   │   ├── cleanup.go
│   │   └── generator.go
│   ├── status
│   │   └── status.go
│   └── utils
│       └── utils.go
├── reports
│   ├── .DS_Store
│   ├── inspection_report_20241227_123648.html
│   └── inspection_report_20241231_201838.html
└── templates
    ├── report.html
    └── status.html
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kubehan/PromAI/f94cf0c0c2ca84d1ebe168599afe6633f6509253/.DS_Store
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 |
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 |
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 |
26 | **Desktop (please complete the following information):**
27 | - OS: [e.g. iOS]
28 | - Browser [e.g. chrome, safari]
29 | - Version [e.g. 22]
30 |
31 | **Smartphone (please complete the following information):**
32 | - Device: [e.g. iPhone6]
33 | - OS: [e.g. iOS8.1]
34 | - Browser [e.g. stock browser, safari]
35 | - Version [e.g. 22]
36 |
37 | **Additional context**
38 | Add any other context about the problem here.
39 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/custom.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Custom issue template
3 | about: Describe this issue template's purpose here.
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 |
11 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/.github/pull-request-template.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | **在提出此拉取请求时,我确认了以下几点(请勾选复选框):**
4 |
5 | - [ ] 我已阅读并理解[贡献者指南]()。
6 | - [ ] 我已检查没有与此请求重复的拉取请求。
7 | - [ ] 我已经考虑过,并确认这份提交对其他人很有价值。
8 | - [ ] 我接受此提交可能不会被使用,并根据维护人员的意愿关闭拉取请求。
9 |
10 | **填写 PR 内容:**
11 |
12 | -
13 |
--------------------------------------------------------------------------------
/.github/workflows/docker-image.yml:
--------------------------------------------------------------------------------
1 | # This is a basic workflow to help you get started with Actions
2 | name: build docker image
3 | # Controls when the action will run.
4 | on:
5 |   push:
6 |     branches:
7 |       - main
8 |   # Allows you to run this workflow manually from the Actions tab
9 |   workflow_dispatch:
10 |     inputs:
11 |       logLevel:
12 |         description: "Log level"
13 |         required: true
14 |         default: "warning"
15 |       tags:
16 |         description: "Test scenario tags"
17 |
18 | jobs:
19 |   buildx:
20 |     runs-on: ubuntu-latest
21 |     steps:
22 |       - name: Checkout
23 |         uses: actions/checkout@v2
24 |
25 |       - name: Get current date
26 |         id: date
27 |         # Publish the timestamp as the step output "today" through the
28 |         # $GITHUB_OUTPUT file; the legacy "::set-output" command is deprecated.
29 |         run: echo "today=$(date +'%Y-%m-%d_%H-%M')" >> "$GITHUB_OUTPUT"
30 |
31 |       - name: Set up QEMU
32 |         uses: docker/setup-qemu-action@v1
33 |
34 |       - name: Set up Docker Buildx
35 |         id: buildx
36 |         uses: docker/setup-buildx-action@v1
37 |
38 |       - name: Available platforms
39 |         run: echo ${{ steps.buildx.outputs.platforms }}
40 |
41 |       - name: Login to DockerHub
42 |         uses: docker/login-action@v1
43 |         with:
44 |           username: ${{ secrets.DOCKERHUB_USERNAME }}
45 |           password: ${{ secrets.DOCKERHUB_TOKEN }}
46 |
47 |       - name: Build and push
48 |         uses: docker/build-push-action@v2
49 |         with:
50 |           context: .
51 |           file: ./Dockerfile
52 |           # Target architectures; the "Available platforms" step prints every supported one
53 |           platforms: linux/amd64,linux/arm64/v8
54 |           # Push whenever the triggering event is not a pull request
55 |           push: ${{ github.event_name != 'pull_request' }}
56 |           # Apply multiple tags to the pushed manifest
57 |           tags: |
58 |             kubehan/promai:${{ github.ref_name }}-${{ steps.date.outputs.today }}
59 |             kubehan/promai:latest
--------------------------------------------------------------------------------
/.github/workflows/follow.yml:
--------------------------------------------------------------------------------
1 | name: Get Top Followers
2 | on:
3 |   push:
4 |     branches:
5 |       # The other workflows in this repository trigger on "main"; the
6 |       # previous "master" filter matched no branch they use.
7 |       - main
8 |   schedule:
9 |     # Also run once a day at 20:00 (GitHub evaluates cron schedules in UTC)
10 |     - cron: "0 20 * * *"
11 | jobs:
12 |   github_followers_job:
13 |     runs-on: ubuntu-latest
14 |     name: A job to display github followers in your profile
15 |     steps:
16 |       - uses: actions/checkout@v3
17 |
18 |       - name: use github-follower-action to update README.md
19 |         id: github-follower
20 |         uses: JieDing/github-followers@main
21 |         env:
22 |           login: ${{ github.repository_owner }}
23 |           pat: ${{ secrets.ACCESS_TOKEN }}
24 |       - name: Commit changes
25 |         # "git diff-index --quiet HEAD" succeeds when nothing changed, so the
26 |         # commit only happens when the working tree differs from HEAD.
27 |         run: |
28 |           git config --local user.email "kubehan@163.com"
29 |           git config --local user.name "Kubehan"
30 |           git add -A
31 |           git diff-index --quiet HEAD || git commit -m "Update GitHub followers"
32 |       - name: Pull changes
33 |         # Rebase onto the remote branch before pushing
34 |         run: git pull -r
35 |       - name: Push changes
36 |         uses: ad-m/github-push-action@master
37 |         with:
38 |           github_token: ${{ secrets.ACCESS_TOKEN }}
39 |           branch: ${{ github.ref }}
--------------------------------------------------------------------------------
/.github/workflows/go-binary-release.yml:
--------------------------------------------------------------------------------
1 | name: build-go-binary
2 |
3 | on:
4 |   release:
5 |     # "published" alone fires once per release (including drafts that are
6 |     # published later); listing "created" as well ran the build twice for
7 |     # releases that are created already published.
8 |     types: [published]
9 |
10 | jobs:
11 |   build-go-binary:
12 |     runs-on: ubuntu-latest
13 |     strategy:
14 |       matrix:
15 |         goos: [linux, windows, darwin]  # operating systems to build for
16 |         goarch: [amd64, arm64]  # architectures to build for
17 |         exclude:  # OS/architecture combinations to skip
18 |           - goarch: arm64
19 |             goos: windows
20 |     steps:
21 |       - uses: actions/checkout@v3
22 |       - uses: wangyoucao577/go-release-action@v1.30
23 |         with:
24 |           github_token: ${{ secrets.GITHUB_TOKEN }}  # built-in token, used to attach files to the Release
25 |           goos: ${{ matrix.goos }}
26 |           goarch: ${{ matrix.goarch }}
27 |           # Quoted so YAML keeps it a string (an unquoted 1.20 would become 1.2)
28 |           goversion: "1.23"  # Go toolchain version used for the build
29 |           binary_name: "PromAI"  # name of the produced binary
30 |           extra_files: README.md config outputs reports templates  # extra files to bundle
31 |
--------------------------------------------------------------------------------
/.github/workflows/reademe-contributors.yml:
--------------------------------------------------------------------------------
1 | # Runs contributors-readme-action whenever the main branch receives a push.
2 | name: Generate a list of contributors
3 |
4 | on:
5 |   push:
6 |     branches: [main]
7 |
8 | jobs:
9 |   contrib-readme-en-job:
10 |     name: A job to automate contrib in readme
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |       - name: Contribute List
14 |         uses: akhilmhdh/contributors-readme-action@v2.3.4
15 |         env:
16 |           GITHUB_TOKEN: ${{ secrets.ACCESS_TOKEN }}
17 |
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Editor-based HTTP client requests
5 | /httpRequests/
6 | # Datasource local storage ignored files
7 | /dataSources/
8 | /dataSources.local.xml
9 | /.idea/
--------------------------------------------------------------------------------
/.idea/PromAI.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # Build stage: compile the PromAI binary with the Go toolchain.
2 | FROM docker.io/library/golang:1.23.4-alpine3.20 AS builder
3 |
4 | WORKDIR /build
5 | # Copy the module manifests first so the dependency download layer stays
6 | # cached until go.mod/go.sum change, instead of being redone on every edit.
7 | COPY go.mod go.sum ./
8 | RUN go env -w GO111MODULE=on && go mod download
9 | COPY . .
10 | # Name the output explicitly; the runtime stage copies /build/PromAI.
11 | RUN go build -o PromAI . && ls -la /build
12 |
13 | # Runtime stage: Alpine image holding the binary and the directories it ships with.
14 | FROM docker.io/alpine:3.21.0
15 | # Image metadata labels
16 | LABEL version="1.0" \
17 |       description="Prometheus Automated Inspection" \
18 |       maintainer="Kubehan"
19 | WORKDIR /app
20 | COPY --from=builder /build/PromAI /app/
21 | COPY --from=builder /build/config /app/config/
22 | COPY --from=builder /build/outputs /app/outputs/
23 | COPY --from=builder /build/reports /app/reports/
24 | COPY --from=builder /build/templates /app/templates/
25 | EXPOSE 8091
26 | # Run the application
27 | CMD ["./PromAI", "-port", "8091"]
--------------------------------------------------------------------------------
/PromAI.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kubehan/PromAI/f94cf0c0c2ca84d1ebe168599afe6633f6509253/PromAI.exe
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Prometheus 监控报告生成器
2 |
3 | > Prometheus Automated Inspection
4 |
5 | ## 项目简介
6 |
7 | 这是一个基于 Prometheus 的监控报告自动生成工具,可以自动收集、分析指标数据并生成可视化的 HTML 报告。该工具旨在简化监控数据的收集和展示过程,帮助运维人员快速了解系统状态。
8 |
9 | ## 报告样式
10 | ### 获取报告
11 | http://localhost:8091/getreport
12 |
13 | [报告样式](reports/inspection_report_20241227_123648.html)
14 | 
15 |
16 | 
17 |
18 | ## 服务健康看板
19 | ### 获取服务健康看板
20 | http://localhost:8091/status
21 |
22 | 
23 |
24 |
25 | ## 功能特点
26 |
27 | - 支持多种指标类型的监控(基础资源、Kubernetes、应用服务等)
28 | - 自动计算指标状态和告警级别(正常、警告、严重)
29 | - 生成包含数据表格和图表的 HTML 报告
30 | - 支持自定义指标阈值和标签别名
31 | - 灵活的配置文件系统
32 | - 支持多维度数据分析和展示
33 | - 自动计算关键统计指标(最大值、最小值、平均值等)
34 | - 美观的可视化界面,支持响应式布局
35 |
36 | ## 系统要求
37 |
38 | - Go 1.22 或更高版本
39 | - 可访问的 Prometheus 服务器
40 | - 现代浏览器(支持 HTML5 和 JavaScript)
41 | - 至少 512MB 可用内存
42 | - 50MB 可用磁盘空间
43 |
44 | ## 配置说明
45 |
46 | 配置文件采用 YAML 格式,主要包含以下几个部分:
47 |
48 | ### Prometheus 配置
49 |
50 | 在 `config/config.yaml` 中配置 Prometheus 服务器地址和监控指标。
51 |
52 | ```yaml
53 | prometheus_url: "http://prometheus.k8s.kubehan.cn"
54 |
55 | metric_types:
56 | - type: "基础资源使用情况"
57 | metrics:
58 | - name: "CPU使用率"
59 | description: "节点CPU使用率统计"
60 | query: "100 - (avg by(instance) (irate(node_cpu_seconds_total{mode='idle'}[5m])) * 100)"
61 | trend_query: "100 - (avg by(instance) (irate(node_cpu_seconds_total{mode='idle'}[5m])) * 100)[6h:5m]"
62 | threshold: 80
63 | unit: "%"
64 | labels:
65 | instance: "节点"
66 | # 其他指标...
67 | ```
68 |
69 | ### 指标说明
70 |
71 | 每个指标可以配置以下内容:
72 |
73 | - `name`: 指标名称
74 | - `description`: 指标描述
75 | - `query`: 用于表格显示的即时查询
76 | - `trend_query`: 用于图表显示的趋势查询
77 | - `threshold`: 指标阈值
78 | - `unit`: 指标单位
79 | - `labels`: 标签别名
80 | - `threshold_type`: 阈值比较方式: "greater", "less", "equal", "greater_equal", "less_equal"
81 |
82 | ```txt
83 | greater: 表示值必须大于阈值才被视为 "critical" 状态。
84 | greater_equal: 表示值必须大于或等于阈值才被视为 "critical" 状态。
85 | less: 表示值必须小于阈值才被视为 "normal" 状态。
86 | less_equal: 表示值必须小于或等于阈值才被视为 "normal" 状态。
87 | equal: 表示值必须等于阈值才被视为 "normal" 状态。
88 | ```
89 |
90 | ## 快速开始
91 |
92 | ### 源码编译
93 |
94 | 1. 克隆仓库:
95 |
96 | ```bash
97 | git clone https://github.com/kubehan/PromAI.git
98 | cd PromAI
99 | ```
100 | 2. 安装依赖:
101 |
102 | ```bash
103 | go mod download
104 | ```
105 | 3. 修改配置文件:
106 |
107 | ```bash
108 | vi config/config.yaml
109 | # 编辑 config.yaml 设置 Prometheus 服务器地址和监控指标
110 | ```
111 | 4. 构建并运行:
112 |
113 | ```bash
114 | go build -o PromAI main.go
115 | ./PromAI -config config/config.yaml
116 | ```
117 | 5. 查看报告:
118 | 生成的报告将保存在 `reports` 目录下。
119 |
120 | ### Docker 部署
121 |
122 | ```bash
123 | docker run -d --name PromAI -p 8091:8091 kubehan/promai:latest
124 | ```
125 |
126 | ### Kubernetes 部署
127 |
128 | ```bash
129 | kubectl apply -f deploy/deployment.yaml
130 | ```
131 |
132 | ## 使用示例
133 |
134 | 在配置文件中添加所需的监控指标后,运行程序将生成 HTML 报告。报告中将包含各个指标的当前状态、历史趋势图表以及详细的表格数据。
135 |
136 | 1. 修改配置文件中的Prometheus地址为自己的地址
137 | 2. 修改配置文件中的指标
138 | 3. 运行程序 默认运行在8091端口,通过访问http://localhost:8091/getreport 查看报告
139 |
140 | ```bash
141 | go build -o PromAI main.go
142 | ./PromAI -config config/config.yaml
143 | ```
144 |
145 | # Prometheus Automated Inspection 未来新功能规划列表
146 |
147 | 1. 多数据源支持
148 | 2. 自定义仪表板
149 | 3. 历史数据存储
150 | 4. 智能告警
151 | 5. API 接口
152 | 6. 用户角色和权限管理
153 | 7. 数据导出功能
154 | 8. 集成 CI/CD 流程
155 | 9. 可视化组件库
156 | 10. 多语言支持
157 | 11. 移动端支持
158 | 12. 社区和插件支持
159 | 13. 性能优化
160 | 14. 用户反馈和建议收集
161 | 15. xxx
162 |
163 | ## 贡献
164 |
165 | 欢迎任何形式的贡献!请提交问题、建议或拉取请求。
166 |
167 | ## 贡献者
168 |
169 |
170 |
208 |
209 |
210 | ## 许可证
211 |
212 | 该项目采用 MIT 许可证,详细信息请查看 LICENSE 文件。
213 |
--------------------------------------------------------------------------------
/config/config.yaml:
--------------------------------------------------------------------------------
1 | prometheus_url: "http://prometheus-k8s.kubesphere-monitoring-system.svc.cluster.local:9090"
2 |
3 |
4 | project_name: "测试项目巡检报告"
5 |
6 |
7 | # Scheduled job: runs every day at 09:30 and 17:30
8 |
9 | cron_schedule: "30 9,17 * * *"
10 |
11 |
12 | # Report cleanup
13 |
14 | report_cleanup:
15 | enabled: true
16 | max_age: 7 # keep the reports from the last 7 days
17 | cron_schedule: "0 0 * * *" # if empty, cleanup follows the schedule above, i.e. it runs when a report is generated
18 |
19 | # DingTalk and email notification settings
20 |
21 | notifications:
22 | dingtalk:
23 | enabled: true
24 | webhook: "https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxxxxxxxxxxxxxxxxxx" # put your own webhook here
25 | secret: "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" # the signing secret of the DingTalk robot
26 | report_url: "http://192.168.5.125:41174" # either ip+port or a domain name; for a k8s deployment a domain name is recommended, otherwise expose the svc as a NodePort and use ip+port here
27 | email:
28 | enabled: true
29 | smtp_host: "smtp.exmail.qq.com" # Tencent Exmail is used here; change it to your own SMTP host
30 | smtp_port: 465
31 | username: "demo@demo.cn" # your own mailbox account
32 | password: "xxxxxxxxxxxxxxxxxxxx" # the authorization code goes here
33 | from: "demo@demo.cn"
34 | to:
35 | - "demo@demo.cn"
36 | report_url: "https://promai.lichengjun.top" # either ip+port or a domain name; for a k8s deployment a domain name is recommended, otherwise expose the svc as a NodePort and use ip+port here; when deployed in k8s you have to write the ingress yourself
37 |
38 | metric_types:
39 | - type: "基础资源使用情况"
40 | metrics:
41 | - name: "CPU使用率"
42 | description: "节点CPU使用率统计"
43 | query: "100 - (avg by(instance) (irate(node_cpu_seconds_total{mode='idle'}[5m])) * 100)"
44 | threshold: 80
45 | threshold_type: "greater"
46 | unit: "%"
47 | labels:
48 | instance: "节点"
49 |
50 | - name: "内存使用率"
51 | description: "节点内存使用率统计"
52 | query: "100 - ((node_memory_MemAvailable_bytes * 100) / node_memory_MemTotal_bytes)"
53 | threshold: 85
54 | threshold_type: "greater"
55 | unit: "%"
56 | labels:
57 | instance: "节点"
58 |
59 | - name: "磁盘使用率"
60 | description: "节点磁盘使用率统计"
61 | query: >-
62 | (((100 -((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes))
63 | and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint!~"/run.*|/var.*|/boot.*|/tmp.*"}== 0)
64 | + on(instance) group_left(node_uname_info) node_uname_info)
65 | * on(instance) group_left(nodename) node_uname_info
66 | threshold: 80
67 | threshold_type: "greater"
68 | unit: "%"
69 | labels:
70 | instance: "节点"
71 | mountpoint: "挂载点"
72 | device: "磁盘"
73 | nodename: "节点名称"
74 |
75 |
76 | - name: "固定机器内存使用率"
77 | description: "固定机器内存使用率统计"
78 | query: >-
79 | 100 - ((node_memory_MemAvailable_bytes{instance="172.16.5.132:9100"} * 100) / node_memory_MemTotal_bytes{instance="172.16.5.132:9100"})
80 | threshold: 16.84
81 | threshold_type: "greater"
82 | unit: "%"
83 | labels:
84 | instance: "节点"
85 |
86 |
87 | # - type: "PaaS平台巡检"
88 | # metrics:
89 | # - name: "K8s集群关键服务"
90 | # description: "K8s集群关键服务状态统计"
91 | # query: "key_pod_status"
92 | # threshold: 1
93 | # threshold_type: "equal"
94 | # unit: ""
95 | # labels:
96 | # component: "服务名称"
97 | # namespace: "命名空间"
98 | # # describe: "服务描述"
99 | # hostname: "主机名称"
100 | # owner: "负责人"
101 | # instance: "节点"
102 |
103 | - type: "kubernetes集群监控状态"
104 | metrics:
105 | - name: "K8s集群巡检"
106 | description: "K8s集群巡检"
107 | query: "k8s_cluster_auto_check"
108 | threshold: 1
109 | threshold_type: "equal"
110 | unit: ""
111 | labels:
112 | component: "服务名称"
113 | hostname: "主机名称"
114 | owner: "负责人"
115 |
116 | - name: "自定义监控脚本执行情况"
117 | description: "script-exporter监控脚本执行情况"
118 | query: "script_success"
119 | threshold: 1
120 | threshold_type: "equal"
121 | unit: ""
122 | labels:
123 | instance: "宿主机器"
124 | script: "脚本名称"
125 |
126 | - name: "Pod运行状态"
127 | description: "集群Pod运行状态统计"
128 | query: "sum by (namespace, pod) (kube_pod_status_phase{phase='Running',namespace='kube-system'})"
129 | threshold: 1
130 | threshold_type: "equal"
131 | unit: ""
132 | labels:
133 | namespace: "命名空间"
134 | pod: "Pod名称"
135 |
136 | - name: "节点就绪状态"
137 | description: "K8s节点就绪状态检查"
138 | query: "kube_node_status_condition{condition='Ready',status!='true'}"
139 | threshold: 0
140 | threshold_type: "equal"
141 | unit: ""
142 | labels:
143 | node: "节点"
144 | condition: "状态类型"
145 |
146 | - name: "PVC使用率"
147 | description: "持久化存储使用率"
148 | query: >-
149 | 100 * (1 - kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes)
150 | threshold: 90
151 | threshold_type: "greater"
152 | unit: "%"
153 | labels:
154 | namespace: "命名空间"
155 | persistentvolumeclaim: "PVC名称"
156 |
--------------------------------------------------------------------------------
/deploy/deployment.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Namespace  # namespace that the ConfigMap, Deployment and Service in this file live in
3 | metadata:
4 |   name: promai
5 |   labels:
6 |     kubernetes.io/metadata.name: promai
7 |
8 | ---
9 | # ConfigMap "config": carries config.yaml as a string; the promai Deployment in this file mounts it read-only at /app/config
10 | kind: ConfigMap
11 | apiVersion: v1
12 | metadata:
13 | name: config
14 | namespace: promai
15 | annotations:
16 | kubesphere.io/creator: admin
17 | data:
18 | config.yaml: >-
19 | prometheus_url:
20 | "http://prometheus-k8s.kubesphere-monitoring-system.svc.cluster.local:9090"
21 |
22 |
23 | project_name: "测试项目巡检报告"
24 |
25 |
26 | # 定时任务:每天9点半和17半执行
27 |
28 | cron_schedule: "30 9,17 * * *"
29 |
30 |
31 | # 报告清理
32 |
33 | report_cleanup:
34 | enabled: true
35 | max_age: 7 # 保留最近7天的报告
36 | cron_schedule: "0 0 * * *" # 如果为空,则执行执行上面定时任务,即生成报告时清理
37 |
38 | # 配置发送钉钉和邮件
39 |
40 | notifications:
41 | dingtalk:
42 | enabled: true
43 | webhook: "https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxxxxxxxxxxxxxxxxxx" # 这里填写自己的webhook
44 | secret: "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" # 这里填写的是加钉钉机器人加签的secret
45 | report_url: "http://192.168.5.125:41174" # 这里可以填写ip+端口,也可以填写域名,如果是k8s里部署,推荐采用域名的方式,如果不行可以将 svc 以nodeport方式暴露出来,这里就可以使用ip+端口方式
46 | email:
47 | enabled: true
48 | smtp_host: "smtp.exmail.qq.com" # 我这里用的是腾讯企业邮箱,需要改成自己的
49 | smtp_port: 465
50 | username: "demo@demo.cn" # 填写自己的邮箱账号
51 | password: "xxxxxxxxxxxxxxxxxxxx" # 这里填写的是授权码
52 | from: "demo@demo.cn"
53 | to:
54 | - "demo@demo.cn"
55 | report_url: "https://promai.lichengjun.top" # 这里可以填写ip+端口,也可以填写域名,如果是k8s里部署,推荐采用域名的方式,如果不行可以将 svc 以nodeport方式暴露出来,这里就可以使用ip+端口方式,如果是部署在k8s里,ingress 的需要自己去编写
56 |
57 | metric_types:
58 | - type: "基础资源使用情况"
59 | metrics:
60 | - name: "CPU使用率"
61 | description: "节点CPU使用率统计"
62 | query: "100 - (avg by(instance) (irate(node_cpu_seconds_total{mode='idle'}[5m])) * 100)"
63 | threshold: 80
64 | threshold_type: "greater"
65 | unit: "%"
66 | labels:
67 | instance: "节点"
68 |
69 | - name: "内存使用率"
70 | description: "节点内存使用率统计"
71 | query: "100 - ((node_memory_MemAvailable_bytes * 100) / node_memory_MemTotal_bytes)"
72 | threshold: 85
73 | threshold_type: "greater"
74 | unit: "%"
75 | labels:
76 | instance: "节点"
77 |
78 | - name: "磁盘使用率"
79 | description: "节点磁盘使用率统计"
80 | query: >-
81 | (((100 -((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes))
82 | and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint!~"/run.*|/var.*|/boot.*|/tmp.*"}== 0)
83 | + on(instance) group_left(node_uname_info) node_uname_info)
84 | * on(instance) group_left(nodename) node_uname_info
85 | threshold: 80
86 | threshold_type: "greater"
87 | unit: "%"
88 | labels:
89 | instance: "节点"
90 | mountpoint: "挂载点"
91 | device: "磁盘"
92 | nodename: "节点名称"
93 |
94 |
95 | - name: "固定机器内存使用率"
96 | description: "固定机器内存使用率统计"
97 | query: >-
98 | 100 - ((node_memory_MemAvailable_bytes{instance="172.16.5.132:9100"} * 100) / node_memory_MemTotal_bytes{instance="172.16.5.132:9100"})
99 | threshold: 16.84
100 | threshold_type: "greater"
101 | unit: "%"
102 | labels:
103 | instance: "节点"
104 |
105 |
106 | # - type: "PaaS平台巡检"
107 | # metrics:
108 | # - name: "K8s集群关键服务"
109 | # description: "K8s集群关键服务状态统计"
110 | # query: "key_pod_status"
111 | # threshold: 1
112 | # threshold_type: "equal"
113 | # unit: ""
114 | # labels:
115 | # component: "服务名称"
116 | # namespace: "命名空间"
117 | # # describe: "服务描述"
118 | # hostname: "主机名称"
119 | # owner: "负责人"
120 | # instance: "节点"
121 |
122 | - type: "kubernetes集群监控状态"
123 | metrics:
124 | - name: "K8s集群巡检"
125 | description: "K8s集群巡检"
126 | query: "k8s_cluster_auto_check"
127 | threshold: 1
128 | threshold_type: "equal"
129 | unit: ""
130 | labels:
131 | component: "服务名称"
132 | hostname: "主机名称"
133 | owner: "负责人"
134 |
135 | - name: "自定义监控脚本执行情况"
136 | description: "script-exporter监控脚本执行情况"
137 | query: "script_success"
138 | threshold: 1
139 | threshold_type: "equal"
140 | unit: ""
141 | labels:
142 | instance: "宿主机器"
143 | script: "脚本名称"
144 |
145 | - name: "Pod运行状态"
146 | description: "集群Pod运行状态统计"
147 | query: "sum by (namespace, pod) (kube_pod_status_phase{phase='Running',namespace='kube-system'})"
148 | threshold: 1
149 | threshold_type: "equal"
150 | unit: ""
151 | labels:
152 | namespace: "命名空间"
153 | pod: "Pod名称"
154 |
155 | - name: "节点就绪状态"
156 | description: "K8s节点就绪状态检查"
157 | query: "kube_node_status_condition{condition='Ready',status!='true'}"
158 | threshold: 0
159 | threshold_type: "equal"
160 | unit: ""
161 | labels:
162 | node: "节点"
163 | condition: "状态类型"
164 |
165 | - name: "PVC使用率"
166 | description: "持久化存储使用率"
167 | query: >-
168 | 100 * (1 - kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes)
169 | threshold: 90
170 | threshold_type: "greater"
171 | unit: "%"
172 | labels:
173 | namespace: "命名空间"
174 | persistentvolumeclaim: "PVC名称"
176 |
177 | ---
178 | # Deployment "promai": one replica of kubehan/promai listening on 8091, with the "config" ConfigMap mounted at /app/config
179 | kind: Deployment
180 | apiVersion: apps/v1
181 | metadata:
182 |   name: promai
183 |   namespace: promai
184 |   labels:
185 |     app: promai
186 | spec:
187 |   replicas: 1
188 |   selector:
189 |     matchLabels:
190 |       app: promai
191 |   template:
192 |     metadata:
193 |       labels:
194 |         app: promai
195 |     spec:
196 |       volumes:
197 |         - name: volume-kv418v
198 |           configMap:
199 |             name: config
200 |             defaultMode: 420  # decimal form of octal 0644
201 |         - name: host-time
202 |           hostPath:
203 |             path: /etc/localtime
204 |             type: ''
205 |       containers:
206 |         - name: promai
207 |           image: 'kubehan/promai:latest'
208 |           ports:
209 |             - name: tcp-8091
210 |               containerPort: 8091
211 |               protocol: TCP
212 |           env:
213 |             - name: PROMETHEUS_URL
214 |               value: >-
215 |                 http://prometheus-k8s.kubesphere-monitoring-system.svc.cluster.local:9090
216 |           resources:
217 |             limits:
218 |               cpu: 20m
219 |               memory: 50Mi
220 |             requests:
221 |               cpu: 5m
222 |               memory: 15Mi
223 |           volumeMounts:
224 |             - name: volume-kv418v
225 |               readOnly: true
226 |               mountPath: /app/config
227 |             - name: host-time
228 |               readOnly: true
229 |               mountPath: /etc/localtime
230 |           livenessProbe:
231 |             tcpSocket:
232 |               port: 8091
233 |             initialDelaySeconds: 15
234 |             timeoutSeconds: 30
235 |             periodSeconds: 10
236 |             successThreshold: 1
237 |             failureThreshold: 8
238 |           readinessProbe:
239 |             tcpSocket:
240 |               port: 8091
241 |             initialDelaySeconds: 15
242 |             timeoutSeconds: 30
243 |             periodSeconds: 10
244 |             successThreshold: 1
245 |             failureThreshold: 8
246 |           terminationMessagePath: /dev/termination-log
247 |           terminationMessagePolicy: File
248 |           imagePullPolicy: Always  # the image uses the mutable :latest tag; IfNotPresent would keep a stale cached copy
249 |       restartPolicy: Always
250 |       terminationGracePeriodSeconds: 30
251 |       dnsPolicy: ClusterFirst
252 |       securityContext: {}
253 |       schedulerName: default-scheduler
254 |   strategy:
255 |     type: RollingUpdate
256 |     rollingUpdate:
257 |       maxUnavailable: 25%
258 |       maxSurge: 25%
259 |   revisionHistoryLimit: 10
260 |   progressDeadlineSeconds: 600
261 |
262 |
263 |
264 | ---
265 | # Service "promai": NodePort service mapping port 80 to container port 8091 of pods labelled app=promai
266 | apiVersion: v1
267 | kind: Service
268 | metadata:
269 |   name: promai
270 |   namespace: promai
271 |   labels:
272 |     app: promai
273 | spec:
274 |   type: NodePort
275 |   selector:
276 |     app: promai
277 |   ports:
278 |     - name: http-80
279 |       protocol: TCP
280 |       port: 80
281 |       targetPort: 8091
282 |   sessionAffinity: None
283 |   ipFamilies:
284 |     - IPv4
285 |   ipFamilyPolicy: SingleStack
286 |   internalTrafficPolicy: Cluster
287 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module PromAI
2 |
3 | go 1.22.3
4 |
5 | require (
6 | github.com/jordan-wright/email v4.0.1-0.20210109023952-943e75fe5223+incompatible
7 | github.com/prometheus/client_golang v1.20.5
8 | github.com/prometheus/common v0.61.0
9 | github.com/robfig/cron/v3 v3.0.1
10 | gopkg.in/yaml.v2 v2.4.0
11 | )
12 |
13 | require (
14 | github.com/json-iterator/go v1.1.12 // indirect
15 | github.com/kr/text v0.2.0 // indirect
16 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
17 | github.com/modern-go/reflect2 v1.0.2 // indirect
18 | github.com/prometheus/client_model v0.6.1 // indirect
19 | google.golang.org/protobuf v1.35.2 // indirect
20 | )
21 |
--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
2 | github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
3 | github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
4 | github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
5 | github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
6 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
7 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
8 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
9 | github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
10 | github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
11 | github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
12 | github.com/jordan-wright/email v4.0.1-0.20210109023952-943e75fe5223+incompatible h1:jdpOPRN1zP63Td1hDQbZW73xKmzDvZHzVdNYxhnTMDA=
13 | github.com/jordan-wright/email v4.0.1-0.20210109023952-943e75fe5223+incompatible/go.mod h1:1c7szIrayyPPB/987hsnvNzLushdWf4o/79s3P08L8A=
14 | github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA=
15 | github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4=
16 | github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
17 | github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
18 | github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
19 | github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
20 | github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
21 | github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
22 | github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
23 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
24 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
25 | github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
26 | github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
27 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
28 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
29 | github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU=
30 | github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
31 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
32 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
33 | github.com/prometheus/client_golang v1.20.5 h1:cxppBPuYhUnsO6yo/aoRol4L7q7UFfdm+bR9r+8l63Y=
34 | github.com/prometheus/client_golang v1.20.5/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE=
35 | github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E=
36 | github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY=
37 | github.com/prometheus/common v0.61.0 h1:3gv/GThfX0cV2lpO7gkTUwZru38mxevy90Bj8YFSRQQ=
38 | github.com/prometheus/common v0.61.0/go.mod h1:zr29OCN/2BsJRaFwG8QOBr41D6kkchKbpeNH7pAjb/s=
39 | github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc=
40 | github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
41 | github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs=
42 | github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro=
43 | github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
44 | github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
45 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
46 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
47 | github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
48 | github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
49 | golang.org/x/net v0.32.0 h1:ZqPmj8Kzc+Y6e0+skZsuACbx+wzMgo5MQsJh9Qd6aYI=
50 | golang.org/x/net v0.32.0/go.mod h1:CwU0IoeOlnQQWJ6ioyFrfRuomB8GKF6KbYXZVyeXNfs=
51 | golang.org/x/oauth2 v0.24.0 h1:KTBBxWqUa0ykRPLtV69rRto9TLXcqYkeswu48x/gvNE=
52 | golang.org/x/oauth2 v0.24.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI=
53 | golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA=
54 | golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
55 | golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo=
56 | golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
57 | google.golang.org/protobuf v1.35.2 h1:8Ar7bF+apOIoThw1EdZl0p1oWvMqTHmpA2fRTyZO8io=
58 | google.golang.org/protobuf v1.35.2/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE=
59 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
60 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
61 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
62 | gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
63 | gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
64 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
65 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
66 |
--------------------------------------------------------------------------------
/images/image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kubehan/PromAI/f94cf0c0c2ca84d1ebe168599afe6633f6509253/images/image.png
--------------------------------------------------------------------------------
/images/image2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kubehan/PromAI/f94cf0c0c2ca84d1ebe168599afe6633f6509253/images/image2.png
--------------------------------------------------------------------------------
/images/status.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kubehan/PromAI/f94cf0c0c2ca84d1ebe168599afe6633f6509253/images/status.png
--------------------------------------------------------------------------------
/images/资源概览.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kubehan/PromAI/f94cf0c0c2ca84d1ebe168599afe6633f6509253/images/资源概览.png
--------------------------------------------------------------------------------
/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "flag"
5 | "fmt"
6 | "html/template"
7 | "log"
8 | "net/http"
9 | "os"
10 | "time"
11 |
12 | "PromAI/pkg/config"
13 | "PromAI/pkg/metrics"
14 | "PromAI/pkg/prometheus"
15 | "PromAI/pkg/report"
16 | "PromAI/pkg/status"
17 | "PromAI/pkg/notify"
18 | "PromAI/pkg/utils"
19 |
20 | "github.com/robfig/cron/v3"
21 | "gopkg.in/yaml.v2"
22 | )
23 |
24 | // loadConfig 加载配置文件
25 | func loadConfig(path string) (*config.Config, error) {
26 | data, err := os.ReadFile(path) // 读取配置文件
27 | if err != nil {
28 | return nil, fmt.Errorf("reading config file: %w", err)
29 | }
30 |
31 | var config config.Config // 定义配置结构体
32 | if err := yaml.Unmarshal(data, &config); err != nil {
33 | return nil, fmt.Errorf("parsing config file: %w", err)
34 | } // 解析配置文件
35 | // 从环境变量中获取 PrometheusURL
36 | if envPrometheusURL := os.Getenv("PROMETHEUS_URL"); envPrometheusURL != "" {
37 | log.Printf("使用环境变量中的 Prometheus URL: %s", envPrometheusURL)
38 | config.PrometheusURL = envPrometheusURL
39 | } else {
40 | log.Printf("使用配置文件中的 Prometheus URL: %s", config.PrometheusURL)
41 | }
42 | return &config, nil // 返回配置结构体
43 | }
44 |
45 | // setup 初始化应用程序
46 | func setup(configPath string) (*prometheus.Client, *config.Config, error) {
47 | config, err := loadConfig(configPath)
48 | if err != nil {
49 | return nil, nil, fmt.Errorf("loading config: %w", err)
50 | }
51 |
52 | client, err := prometheus.NewClient(config.PrometheusURL)
53 | if err != nil {
54 | return nil, nil, fmt.Errorf("initializing Prometheus client: %w", err)
55 | }
56 |
57 | return client, config, nil
58 | }
59 |
60 | func main() {
61 | configPath := flag.String("config", "config/config.yaml", "Path to configuration file")
62 | port := flag.String("port", "8091", "Port to run the HTTP server on")
63 | flag.Parse()
64 |
65 | utils.SetGlobalPort(*port)
66 |
67 | client, config, err := setup(*configPath)
68 | if err != nil {
69 | log.Fatalf("Error setting up: %v", err)
70 | }
71 |
72 | collector := metrics.NewCollector(client.API, config)
73 |
74 | // 设置定时任务
75 | if config.CronSchedule != "" {
76 | c := cron.New()
77 | _, err := c.AddFunc(config.CronSchedule, func() {
78 | data, err := collector.CollectMetrics()
79 | if err != nil {
80 | log.Printf("定时任务收集指标失败: %v", err)
81 | return
82 | }
83 |
84 | reportFilePath, err := report.GenerateReport(*data)
85 | if err != nil {
86 | log.Printf("定时任务生成报告失败: %v", err)
87 | return
88 | }
89 | log.Printf("定时任务成功生成报告: %s", reportFilePath)
90 |
91 | if config.Notifications.Dingtalk.Enabled {
92 | log.Printf("发送钉钉消息")
93 | if err := notify.SendDingtalk(config.Notifications.Dingtalk, reportFilePath); err != nil {
94 | log.Printf("发送钉钉消息失败: %v", err)
95 | }
96 | }
97 |
98 | if config.Notifications.Email.Enabled {
99 | log.Printf("发送邮件")
100 | notify.SendEmail(config.Notifications.Email, reportFilePath)
101 | }
102 |
103 |
104 | })
105 |
106 | if err != nil {
107 | log.Printf("设置定时任务失败: %v", err)
108 | } else {
109 | c.Start()
110 | log.Printf("已启动定时任务,执行计划: %s", config.CronSchedule)
111 | }
112 | } else {
113 | log.Printf("未配置定时任务,请手动触发生成报告")
114 | }
115 | if config.ReportCleanup.Enabled {
116 | // 确定使用哪个计划
117 | cleanupSchedule := config.ReportCleanup.CronSchedule
118 | if cleanupSchedule == "" {
119 | cleanupSchedule = config.CronSchedule
120 | }
121 |
122 | if cleanupSchedule != "" {
123 | c := cron.New()
124 | _, err := c.AddFunc(cleanupSchedule, func() {
125 | if err := report.CleanupReports(config.ReportCleanup.MaxAge); err != nil {
126 | log.Printf("报告清理失败: %v", err)
127 | return
128 | }
129 | log.Printf("报告清理成功")
130 | })
131 |
132 | if err != nil {
133 | log.Printf("设置清理定时任务失败: %v", err)
134 | } else {
135 | c.Start()
136 | log.Printf("已启动清理定时任务,执行计划: %s", cleanupSchedule)
137 | }
138 | } else {
139 | log.Printf("未配置任何定时任务计划,请手动清理报告")
140 | }
141 | }
142 |
143 | // 设置路由处理器
144 | setupRoutes(collector, config)
145 |
146 | // 启动服务器
147 | log.Printf("Starting server on port: %s with config: %s", *port, *configPath)
148 | log.Printf("Prometheus URL: %s", config.PrometheusURL)
149 | log.Printf("获取报告地址: http://localhost:%s/getreport", *port)
150 | log.Printf("健康看板地址: http://localhost:%s/status", *port)
151 | if err := http.ListenAndServe(":"+*port, nil); err != nil {
152 | log.Fatalf("Error starting HTTP server: %v", err)
153 | }
154 | }
155 |
156 | // setupRoutes 设置 HTTP 路由
157 | func setupRoutes(collector *metrics.Collector, config *config.Config) {
158 | // 设置报告生成路由
159 | http.HandleFunc("/getreport", makeReportHandler(collector))
160 |
161 | // 设置静态文件服务
162 | http.Handle("/reports/", http.StripPrefix("/reports/", http.FileServer(http.Dir("reports"))))
163 |
164 | // 设置状态页面路由
165 | http.HandleFunc("/status", makeStatusHandler(collector.Client, config))
166 |
167 | }
168 |
169 | // makeReportHandler 创建报告处理器
170 | func makeReportHandler(collector *metrics.Collector) http.HandlerFunc {
171 | return func(w http.ResponseWriter, r *http.Request) {
172 | data, err := collector.CollectMetrics()
173 | if err != nil {
174 | http.Error(w, "Failed to collect metrics", http.StatusInternalServerError)
175 | log.Printf("Error collecting metrics: %v", err)
176 | return
177 | }
178 |
179 | reportFilePath, err := report.GenerateReport(*data)
180 | if err != nil {
181 | http.Error(w, "Failed to generate report", http.StatusInternalServerError)
182 | log.Printf("Error generating report: %v", err)
183 | return
184 | }
185 |
186 | http.Redirect(w, r, "/"+reportFilePath, http.StatusSeeOther)
187 | }
188 | }
189 |
190 | // makeStatusHandler 创建状态页面处理器
191 | func makeStatusHandler(client metrics.PrometheusAPI, config *config.Config) http.HandlerFunc {
192 | return func(w http.ResponseWriter, r *http.Request) {
193 | data, err := status.CollectMetricStatus(client, config)
194 | if err != nil {
195 | http.Error(w, "Failed to collect status data", http.StatusInternalServerError)
196 | log.Printf("Error collecting status data: %v", err)
197 | return
198 | }
199 |
200 | // 创建模板函数映射
201 | funcMap := template.FuncMap{
202 | "now": time.Now,
203 | "date": func(format string, t time.Time) string {
204 | return t.Format(format)
205 | },
206 | }
207 |
208 | tmpl := template.New("status.html").Funcs(funcMap)
209 | tmpl, err = tmpl.ParseFiles("templates/status.html")
210 | if err != nil {
211 | http.Error(w, "Failed to parse template", http.StatusInternalServerError)
212 | log.Printf("Error parsing template: %v", err)
213 | return
214 | }
215 |
216 | if err := tmpl.Execute(w, data); err != nil {
217 | http.Error(w, "Failed to render template", http.StatusInternalServerError)
218 | log.Printf("Error rendering template: %v", err)
219 | return
220 | }
221 | }
222 | }
223 |
--------------------------------------------------------------------------------
/outputs/readme.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kubehan/PromAI/f94cf0c0c2ca84d1ebe168599afe6633f6509253/outputs/readme.md
--------------------------------------------------------------------------------
/pkg/config/config.go:
--------------------------------------------------------------------------------
1 | package config
2 |
3 | import "PromAI/pkg/notify"
4 |
5 | type Config struct {
6 | PrometheusURL string `yaml:"prometheus_url"`
7 | MetricTypes []MetricType `yaml:"metric_types"`
8 | ProjectName string `yaml:"project_name"`
9 | CronSchedule string `yaml:"cron_schedule"`
10 | ReportCleanup struct {
11 | Enabled bool `yaml:"enabled"`
12 | MaxAge int `yaml:"max_age"`
13 | CronSchedule string `yaml:"cron_schedule"`
14 | } `yaml:"report_cleanup"`
15 | Notifications struct {
16 | Dingtalk notify.DingtalkConfig `yaml:"dingtalk"`
17 | Email notify.EmailConfig `yaml:"email"`
18 | } `yaml:"notifications"`
19 | Port string `yaml:"port"`
20 | }
21 |
22 | type MetricType struct {
23 | Type string `yaml:"type"`
24 | Metrics []MetricConfig `yaml:"metrics"`
25 | }
26 |
// MetricConfig describes one inspected metric: the query to run and how its
// result is evaluated and labelled.
type MetricConfig struct {
	Name        string `yaml:"name"`
	Description string `yaml:"description"`
	// Query is the expression sent to Prometheus.
	Query string `yaml:"query"`
	// Threshold is the value the query result is compared against.
	Threshold float64 `yaml:"threshold"`
	Unit      string  `yaml:"unit"`
	// Labels maps a Prometheus label name to the alias used for display.
	Labels map[string]string `yaml:"labels"`
	// ThresholdType selects the comparison applied against Threshold.
	ThresholdType string `yaml:"threshold_type"`
}
36 |
--------------------------------------------------------------------------------
/pkg/metrics/collector.go:
--------------------------------------------------------------------------------
1 | package metrics
2 |
import (
	"context"
	"fmt"
	"html/template"
	"log"
	"sort"
	"time"

	v1 "github.com/prometheus/client_golang/api/prometheus/v1"
	"github.com/prometheus/common/model"

	"PromAI/pkg/config"
	"PromAI/pkg/report"
)
16 |
17 | // Collector 处理指标收集
18 | type Collector struct {
19 | Client PrometheusAPI
20 | config *config.Config
21 | }
22 |
23 | type PrometheusAPI interface {
24 | Query(ctx context.Context, query string, ts time.Time, opts ...v1.Option) (model.Value, v1.Warnings, error)
25 | QueryRange(ctx context.Context, query string, r v1.Range, opts ...v1.Option) (model.Value, v1.Warnings, error)
26 | }
27 |
28 | // NewCollector 创建新的收集器
29 | func NewCollector(client PrometheusAPI, config *config.Config) *Collector {
30 | return &Collector{
31 | Client: client,
32 | config: config,
33 | }
34 | }
35 |
36 | // CollectMetrics 收集指标数据
37 | func (c *Collector) CollectMetrics() (*report.ReportData, error) {
38 | ctx := context.Background()
39 |
40 | data := &report.ReportData{
41 | Timestamp: time.Now(),
42 | MetricGroups: make(map[string]*report.MetricGroup),
43 | ChartData: make(map[string]template.JS),
44 | Project: c.config.ProjectName,
45 | }
46 |
47 | for _, metricType := range c.config.MetricTypes {
48 | group := &report.MetricGroup{
49 | Type: metricType.Type,
50 | MetricsByName: make(map[string][]report.MetricData),
51 | }
52 | data.MetricGroups[metricType.Type] = group
53 |
54 | for _, metric := range metricType.Metrics {
55 | result, _, err := c.Client.Query(ctx, metric.Query, time.Now())
56 | if err != nil {
57 | log.Printf("警告: 查询指标 %s 失败: %v", metric.Name, err)
58 | continue
59 | }
60 | log.Printf("指标 [%s] 查询结果: %+v", metric.Name, result)
61 |
62 | switch v := result.(type) {
63 | case model.Vector:
64 | metrics := make([]report.MetricData, 0, len(v))
65 | for _, sample := range v {
66 | log.Printf("指标 [%s] 原始数据: %+v, 值: %+v", metric.Name, sample.Metric, sample.Value)
67 |
68 | availableLabels := make(map[string]string)
69 | for labelName, labelValue := range sample.Metric {
70 | availableLabels[string(labelName)] = string(labelValue)
71 | }
72 |
73 | labels := make([]report.LabelData, 0, len(metric.Labels))
74 | for configLabel, configAlias := range metric.Labels {
75 | labelValue := "-"
76 | if rawValue, exists := availableLabels[configLabel]; exists && rawValue != "" {
77 | labelValue = rawValue
78 | } else {
79 | log.Printf("警告: 指标 [%s] 标签 [%s] 缺失或为空", metric.Name, configLabel)
80 | }
81 |
82 | labels = append(labels, report.LabelData{
83 | Name: configLabel,
84 | Alias: configAlias,
85 | Value: labelValue,
86 | })
87 | }
88 |
89 | if !validateLabels(labels) {
90 | log.Printf("警告: 指标 [%s] 标签数据不完整,跳过该条记录", metric.Name)
91 | continue
92 | }
93 |
94 | metricData := report.MetricData{
95 | Name: metric.Name,
96 | Description: metric.Description,
97 | Value: float64(sample.Value),
98 | Threshold: metric.Threshold,
99 | Unit: metric.Unit,
100 | Status: getStatus(float64(sample.Value), metric.Threshold, metric.ThresholdType),
101 | StatusText: report.GetStatusText(getStatus(float64(sample.Value), metric.Threshold, metric.ThresholdType)),
102 | Timestamp: time.Now(),
103 | Labels: labels,
104 | }
105 |
106 | if err := validateMetricData(metricData, metric.Labels); err != nil {
107 | log.Printf("警告: 指标 [%s] 数据验证失败: %v", metric.Name, err)
108 | continue
109 | }
110 |
111 | metrics = append(metrics, metricData)
112 | }
113 | group.MetricsByName[metric.Name] = metrics
114 | }
115 | }
116 | }
117 | return data, nil
118 | }
119 |
120 | // validateMetricData 验证指标数据的完整性
121 | func validateMetricData(data report.MetricData, configLabels map[string]string) error {
122 | if len(data.Labels) != len(configLabels) {
123 | return fmt.Errorf("标签数量不匹配: 期望 %d, 实际 %d",
124 | len(configLabels), len(data.Labels))
125 | }
126 |
127 | labelMap := make(map[string]bool)
128 | for _, label := range data.Labels {
129 | if _, exists := configLabels[label.Name]; !exists {
130 | return fmt.Errorf("发现未配置的标签: %s", label.Name)
131 | }
132 | if label.Value == "" || label.Value == "-" {
133 | return fmt.Errorf("标签 %s 值为空", label.Name)
134 | }
135 | labelMap[label.Name] = true
136 | }
137 |
138 | return nil
139 | }
140 |
// getStatus classifies value against threshold as "normal", "warning" or
// "critical". An empty thresholdType is treated as "greater".
//
//   - "greater" / "greater_equal": critical once value exceeds (or reaches)
//     the threshold, warning from 80% of the threshold, otherwise normal.
//   - "less" / "less_equal": normal while value is below (or at) the
//     threshold, warning up to 120% of it, critical beyond that. The last step
//     used to fall through to "normal", so a value far past the threshold was
//     reported as healthier than one just past it.
//   - "equal": normal only when value equals the threshold, otherwise critical.
//   - any other type: normal.
func getStatus(value, threshold float64, thresholdType string) string {
	switch thresholdType {
	case "", "greater":
		if value > threshold {
			return "critical"
		}
		if value >= threshold*0.8 {
			return "warning"
		}
		return "normal"
	case "greater_equal":
		if value >= threshold {
			return "critical"
		}
		if value >= threshold*0.8 {
			return "warning"
		}
		return "normal"
	case "less":
		if value < threshold {
			return "normal"
		}
		if value <= threshold*1.2 {
			return "warning"
		}
		return "critical"
	case "less_equal":
		if value <= threshold {
			return "normal"
		}
		if value <= threshold*1.2 {
			return "warning"
		}
		return "critical"
	case "equal":
		if value == threshold {
			return "normal"
		}
		return "critical"
	default:
		return "normal"
	}
}
181 |
182 | // validateLabels 验证标签数据的完整性
183 | func validateLabels(labels []report.LabelData) bool {
184 | for _, label := range labels {
185 | if label.Value == "" || label.Value == "-" {
186 | return false
187 | }
188 | }
189 | return true
190 | }
191 |
--------------------------------------------------------------------------------
/pkg/notify/notify.go:
--------------------------------------------------------------------------------
1 | package notify
2 |
3 | import (
4 | // "PromAI/pkg/utils"
5 | "bytes"
6 | "crypto/hmac"
7 | "crypto/sha256"
8 | "crypto/tls"
9 | "encoding/base64"
10 | "encoding/json"
11 | "fmt"
12 | "io"
13 | "log"
14 | "mime/multipart"
15 | "net/http"
16 | "net/smtp"
17 | "net/url"
18 | "os"
19 | "path/filepath"
20 | "time"
21 |
22 | "github.com/jordan-wright/email"
23 | )
24 |
// DingtalkConfig holds the settings of the DingTalk notification channel.
type DingtalkConfig struct {
	// Enabled switches the channel on or off.
	Enabled bool `yaml:"enabled"`
	// Webhook is the robot webhook URL the message is posted to.
	Webhook string `yaml:"webhook"`
	// Secret is the key used to sign each request.
	Secret string `yaml:"secret"`
	// ReportURL is the base URL used to build the link to a report.
	ReportURL string `yaml:"report_url"`
}
31 |
// EmailConfig holds the settings of the e-mail notification channel.
type EmailConfig struct {
	// Enabled switches the channel on or off.
	Enabled bool `yaml:"enabled"`

	// SMTP server address and credentials.
	SMTPHost string `yaml:"smtp_host"`
	SMTPPort int    `yaml:"smtp_port"`
	Username string `yaml:"username"`
	Password string `yaml:"password"`

	// Sender and recipients of the message.
	From string   `yaml:"from"`
	To   []string `yaml:"to"`

	// ReportURL is the base URL used to build the link to a report.
	ReportURL string `yaml:"report_url"`
}
42 |
43 | // config/config.yaml 中 dingtalk 配置
44 | // notifications:
45 | // dingtalk:
46 | // enabled: true
47 | // webhook: "https://oapi.dingtalk.com/robot/send?access_token=29f727c8c973e5fb8d8339968d059393a4b4bb0bdcd667d592996035a8c0e135"
48 | // secret: "SEC75fd20834b42064b86c1aa97930738befeb2fe214044649397752212c5894848"
49 |
50 | // SendDingtalk 发送钉钉通知
51 | func SendDingtalk(config DingtalkConfig, reportPath string) error {
52 | if !config.Enabled {
53 | log.Printf("钉钉通知未启用")
54 | return nil
55 | }
56 | log.Printf("开始发送钉钉通知...")
57 | // 计算时间戳和签名
58 | timestamp := time.Now().UnixMilli()
59 | sign := calculateDingtalkSign(timestamp, config.Secret)
60 | webhook := fmt.Sprintf("%s×tamp=%d&sign=%s", config.Webhook, timestamp, sign)
61 |
62 | log.Printf("准备发送请求到 webhook: %s", webhook)
63 | // 创建multipart表单
64 | body := &bytes.Buffer{}
65 | writer := multipart.NewWriter(body)
66 |
67 | // 添加文件
68 | file, err := os.Open(reportPath)
69 | if err != nil {
70 | log.Printf("打开文件失败: %v", err)
71 | return fmt.Errorf("打开文件失败: %v", err)
72 | }
73 | defer file.Close()
74 |
75 | part, err := writer.CreateFormFile("file", filepath.Base(reportPath))
76 | if err != nil {
77 | log.Printf("创建表单文件失败: %v", err)
78 | return fmt.Errorf("创建表单文件失败: %v", err)
79 | }
80 |
81 | fileContent, err := os.ReadFile(reportPath)
82 | if err != nil {
83 | log.Printf("读取文件失败: %v", err)
84 | return fmt.Errorf("读取文件失败: %v", err)
85 | }
86 | part.Write(fileContent)
87 |
88 | // 正确生成报告的访问链接
89 | reportFileName := filepath.Base(reportPath)
90 | reportLink := fmt.Sprintf("%s/reports/%s", config.ReportURL, reportFileName)
91 |
92 | // 添加消息内容
93 | messageContent := map[string]interface{}{
94 | "msgtype": "markdown",
95 | "markdown": map[string]string{
96 | "title": "巡检报告",
97 | "text": fmt.Sprintf("## 🔍 巡检报告已生成\n\n"+
98 | "### ⏰ 生成时间\n"+
99 | "> %s\n\n"+
100 | "### 📄 报告详情\n"+
101 | "- **文件名**:`%s`\n"+
102 | "- **访问链接**:[点击查看报告](%s)\n\n"+
103 | "---\n"+
104 | "💡 请登录环境查看完整报告内容",
105 | time.Now().Format("2006-01-02 15:04:05"),
106 | reportFileName,
107 | reportLink),
108 | },
109 | }
110 |
111 | jsonData, err := json.Marshal(messageContent)
112 | if err != nil {
113 | log.Printf("JSON编码失败: %v", err)
114 | return fmt.Errorf("JSON编码失败: %v", err)
115 | }
116 |
117 | // 发送请求
118 | req, err := http.NewRequest("POST", webhook, bytes.NewBuffer(jsonData))
119 | if err != nil {
120 | log.Printf("创建请求失败: %v", err)
121 | return fmt.Errorf("创建请求失败: %v", err)
122 | }
123 | req.Header.Set("Content-Type", "application/json")
124 |
125 | client := &http.Client{}
126 | resp, err := client.Do(req)
127 | if err != nil {
128 | log.Printf("发送请求失败: %v", err)
129 | return fmt.Errorf("发送请求失败: %v", err)
130 | }
131 | defer resp.Body.Close()
132 | respBody, _ := io.ReadAll(resp.Body)
133 | log.Printf("钉钉响应状态码: %d, 响应内容: %s", resp.StatusCode, string(respBody))
134 |
135 | if resp.StatusCode != http.StatusOK {
136 | return fmt.Errorf("钉钉发送失败,状态码: %d", resp.StatusCode)
137 | }
138 |
139 | log.Printf("钉钉通知发送成功")
140 | return nil
141 | }
142 |
143 | // SendEmail 发送邮件通知
144 | func SendEmail(config EmailConfig, reportPath string) error {
145 | if !config.Enabled {
146 | log.Printf("邮件通知未启用")
147 | return nil
148 | }
149 |
150 | log.Printf("开始发送邮件通知...")
151 | log.Printf("SMTP服务器: %s:%d", config.SMTPHost, config.SMTPPort)
152 | log.Printf("发件人: %s", config.From)
153 | log.Printf("收件人: %v", config.To)
154 |
155 | e := email.NewEmail()
156 | e.From = config.From
157 | e.To = config.To
158 | e.Subject = "巡检报告"
159 |
160 | // 正确生成报告的访问链接
161 | reportFileName := filepath.Base(reportPath)
162 | reportLink := fmt.Sprintf("%s/reports/%s", config.ReportURL, reportFileName)
163 |
164 | // 添加更丰富的邮件内容
165 | e.HTML = []byte(fmt.Sprintf(`
166 | 🔍 巡检报告已生成
167 | 生成时间:%s
168 | 报告文件:%s
169 | 在线查看:点击查看报告
170 | 请登录环境查看完整报告内容!
171 | `,
172 | time.Now().Format("2006-01-02 15:04:05"),
173 | reportFileName,
174 | reportLink))
175 |
176 | // 添加附件
177 | if _, err := e.AttachFile(reportPath); err != nil {
178 | log.Printf("添加附件失败: %v", err)
179 | return fmt.Errorf("添加附件失败: %v", err)
180 | }
181 |
182 | // 发送邮件(使用TLS)
183 | addr := fmt.Sprintf("%s:%d", config.SMTPHost, config.SMTPPort)
184 | auth := smtp.PlainAuth("", config.Username, config.Password, config.SMTPHost)
185 |
186 | tlsConfig := &tls.Config{
187 | InsecureSkipVerify: true,
188 | ServerName: config.SMTPHost,
189 | }
190 |
191 | log.Printf("正在发送邮件...")
192 | if err := e.SendWithTLS(addr, auth, tlsConfig); err != nil {
193 | log.Printf("发送邮件失败: %v", err)
194 | log.Printf("SMTP配置信息:")
195 | log.Printf("- 服务器: %s", config.SMTPHost)
196 | log.Printf("- 端口: %d", config.SMTPPort)
197 | log.Printf("- 用户名: %s", config.Username)
198 | return fmt.Errorf("发送邮件失败: %v", err)
199 | }
200 |
201 | log.Printf("邮件发送成功")
202 | return nil
203 | }
204 |
// calculateDingtalkSign returns the request signature: the HMAC-SHA256 of
// "<timestamp>\n<secret>" keyed with secret, base64-encoded and then
// query-escaped so it can be placed directly in a URL.
func calculateDingtalkSign(timestamp int64, secret string) string {
	mac := hmac.New(sha256.New, []byte(secret))
	// A hash is an io.Writer, so the payload can be formatted straight into it.
	fmt.Fprintf(mac, "%d\n%s", timestamp, secret)
	encoded := base64.StdEncoding.EncodeToString(mac.Sum(nil))
	return url.QueryEscape(encoded)
}
212 |
--------------------------------------------------------------------------------
/pkg/prometheus/client.go:
--------------------------------------------------------------------------------
1 | package prometheus
2 |
3 | import (
4 | "fmt"
5 |
6 | "github.com/prometheus/client_golang/api"
7 | v1 "github.com/prometheus/client_golang/api/prometheus/v1"
8 | )
9 |
10 | // Client 封装 Prometheus 客户端
11 | type Client struct {
12 | API v1.API
13 | }
14 |
15 | // NewClient 创建新的 Prometheus 客户端
16 | func NewClient(url string) (*Client, error) {
17 | client, err := api.NewClient(api.Config{
18 | Address: url,
19 | })
20 | if err != nil {
21 | return nil, fmt.Errorf("creating prometheus client: %w", err)
22 | }
23 |
24 | return &Client{
25 | API: v1.NewAPI(client),
26 | }, nil
27 | }
28 |
--------------------------------------------------------------------------------
/pkg/prometheus/prometheus.go:
--------------------------------------------------------------------------------
1 | package prometheus
2 |
3 | import (
4 | "io"
5 | "net/http"
6 | )
7 |
// FetchData issues an HTTP GET for url and returns the whole response body.
// The status code is not inspected; only transport and read errors are
// returned.
func FetchData(url string) ([]byte, error) {
	response, getErr := http.Get(url)
	if getErr != nil {
		return nil, getErr
	}
	defer response.Body.Close()

	payload, readErr := io.ReadAll(response.Body)
	return payload, readErr
}
16 |
--------------------------------------------------------------------------------
/pkg/report/cleanup.go:
--------------------------------------------------------------------------------
1 | package report
2 |
3 | import (
4 | "log"
5 | "os"
6 | "path/filepath"
7 | "time"
8 | )
9 |
10 | // CleanupReports 清理旧报告
11 | func CleanupReports(maxAge int) error {
12 | reportsDir := "reports"
13 | now := time.Now()
14 |
15 | // 遍历报告目录
16 | return filepath.Walk(reportsDir, func(path string, info os.FileInfo, err error) error {
17 | if err != nil {
18 | return err
19 | }
20 |
21 | // 跳过目录本身
22 | if path == reportsDir {
23 | return nil
24 | }
25 |
26 | // 检查文件年龄
27 | if info.ModTime().Add(time.Duration(maxAge) * 24 * time.Hour).Before(now) {
28 | if err := os.Remove(path); err != nil {
29 | log.Printf("删除报告文件失败 %s: %v", path, err)
30 | return err
31 | }
32 | log.Printf("已删除过期报告: %s", path)
33 | }
34 |
35 | return nil
36 | })
37 | }
--------------------------------------------------------------------------------
/pkg/report/generator.go:
--------------------------------------------------------------------------------
1 | package report
2 |
3 | import (
4 | "encoding/json"
5 | "fmt"
6 | "html/template"
7 | "log"
8 | "math"
9 | "os"
10 | "sort"
11 | "time"
12 | )
13 |
// LabelData is one label of a metric sample as shown in a report.
type LabelData struct {
	// Name is the original label name.
	Name string
	// Alias is the name shown in place of Name.
	Alias string
	// Value is the label's value.
	Value string
}
// GroupStats summarises the samples of one metric group.
type GroupStats struct {
	// Value range and mean of the group's samples.
	MaxValue float64
	MinValue float64
	Average  float64

	// AlertCount is the number of samples in warning or critical state.
	AlertCount int
	// CriticalCount is the number of samples in critical state.
	CriticalCount int
	// WarningCount is the number of samples in warning state.
	WarningCount int
	// TotalCount is the total number of samples.
	TotalCount int
}
28 | type MetricData struct {
29 | Instance string
30 | Name string
31 | Description string
32 | Value float64
33 | Threshold float64
34 | Unit string
35 | Status string
36 | StatusText string
37 | Timestamp time.Time
38 | Labels []LabelData // 改用结构化的标签数据
39 | }
40 |
41 | type MetricGroup struct {
42 | Type string
43 | MetricsByName map[string][]MetricData
44 | Stats GroupStats // 替换原来的 Average
45 | }
46 | type ReportData struct {
47 | Timestamp time.Time
48 | MetricGroups map[string]*MetricGroup
49 | ChartData map[string]template.JS
50 | Project string
51 | }
52 |
// GetStatusText returns the display text for a status: "严重" for "critical",
// "警告" for "warning" and "正常" for anything else.
func GetStatusText(status string) string {
	if status == "critical" {
		return "严重"
	}
	if status == "warning" {
		return "警告"
	}
	return "正常"
}
63 |
64 | func GenerateReport(data ReportData) (string, error) {
65 | // 计算每个组的统计信息
66 | for _, group := range data.MetricGroups {
67 | stats := GroupStats{
68 | MinValue: math.MaxFloat64,
69 | }
70 |
71 | for _, metrics := range group.MetricsByName {
72 | for _, metric := range metrics {
73 | // 更新最大最小值
74 | stats.MaxValue = math.Max(stats.MaxValue, metric.Value)
75 | stats.MinValue = math.Min(stats.MinValue, metric.Value)
76 | stats.TotalCount++
77 |
78 | // 累加值用于计算平均值
79 | // stats.Average += metric.Value
80 |
81 | // 统计告警数量
82 | switch metric.Status {
83 | case "warning":
84 | stats.WarningCount++
85 | stats.AlertCount++
86 | case "critical":
87 | stats.CriticalCount++
88 | stats.AlertCount++
89 | }
90 | }
91 | }
92 |
93 | // 计算平均值 平均值无意义,先暂时取消
94 | // if stats.TotalCount > 0 {
95 | // stats.Average = stats.Average / float64(stats.TotalCount)
96 | // }
97 | group.Stats = stats
98 | }
99 |
100 | // 处理图表数据
101 | allLabels := make(map[string]bool) // 用于存储所有唯一的标签值
102 | chartData := make(map[string][]float64) // 用于存储图表数据
103 | // 收集所有唯一的标签值和准备图表数据
104 | labelValuesByMetric := make(map[string]map[string]bool) // 按指标存储唯一标签值
105 |
106 | // 第一次遍历收集每个指标的唯一标签值
107 | for _, group := range data.MetricGroups {
108 | for metricName, metrics := range group.MetricsByName {
109 | metricKey := fmt.Sprintf("%s_%s", group.Type, metricName)
110 | labelValuesByMetric[metricKey] = make(map[string]bool)
111 | // log.Println("指标组:", group.Type, "指标:", metricName, "指标键:", metricKey)
112 | for _, metric := range metrics {
113 | for _, label := range metric.Labels {
114 | labelValuesByMetric[metricKey][label.Value] = true
115 | // log.Println("指标组:", group.Type, "指标:", metricName, "指标键:", metricKey, "标签值:", label.Value)
116 | allLabels[label.Value] = true
117 |
118 | }
119 | }
120 | }
121 | }
122 |
123 | // 第二次遍历按标签值顺序生成图表数据
124 | for _, group := range data.MetricGroups {
125 | for metricName, metrics := range group.MetricsByName {
126 | metricKey := fmt.Sprintf("%s_%s", group.Type, metricName)
127 | metricValues := make(map[string]float64)
128 | // log.Println("指标类型:", group.Type, "指标名称:", metricName, "指标Key:", metricKey)
129 |
130 | // 初始化所有标签值对应的指标值为0
131 | for labelValue := range labelValuesByMetric[metricKey] {
132 |
133 | metricValues[labelValue] = 0
134 |
135 | log.Println("标签值:", labelValue, "指标值:", metricValues[labelValue])
136 | }
137 |
138 | // 填充实际的指标值
139 | for _, metric := range metrics {
140 | if len(metric.Labels) > 0 {
141 | metricValues[metric.Labels[0].Value] = metric.Value
142 | }
143 | }
144 |
145 | // 按标签值顺序添加到图表数据
146 | chartData[metricKey] = make([]float64, 0)
147 | for labelValue := range labelValuesByMetric[metricKey] {
148 | chartData[metricKey] = append(chartData[metricKey], metricValues[labelValue])
149 | }
150 | // log.Println("图表数据:", metricKey, "图表数据值:", chartData[metricKey])
151 | }
152 | }
153 |
154 | // 转换标签为数组并排序
155 | labels := make([]string, 0, len(allLabels))
156 | for label := range allLabels {
157 | labels = append(labels, label)
158 | }
159 | sort.Strings(labels)
160 |
161 | // 转换为JSON
162 | labelsJSON, _ := json.Marshal(labels)
163 | data.ChartData["labels"] = template.JS(labelsJSON)
164 | // log.Println("标签:", labels)
165 | // 为每个指标生成图表数据
166 | for key, values := range chartData {
167 | valuesJSON, _ := json.Marshal(values)
168 | data.ChartData[key] = template.JS(valuesJSON)
169 | }
170 |
171 | // 生成报告
172 | tmpl, err := template.ParseFiles("templates/report.html")
173 | if err != nil {
174 | return "", fmt.Errorf("parsing template: %w", err)
175 | }
176 |
177 | // 创建输出文件
178 | filename := fmt.Sprintf("reports/inspection_report_%s.html", time.Now().Format("20060102_150405"))
179 | file, err := os.Create(filename)
180 | if err != nil {
181 | return "", fmt.Errorf("creating output file: %w", err)
182 | }
183 | defer file.Close()
184 |
185 | // 执行模板
186 | if err := tmpl.Execute(file, data); err != nil {
187 | return "", fmt.Errorf("executing template: %w", err)
188 | }
189 |
190 | // log.Println("Report generated successfully:", filename)
191 | log.Printf("项目[%s]报告生成成功: %s", data.Project, filename)
192 |
193 | return filename, nil // 添加返回语句
194 | }
195 |
--------------------------------------------------------------------------------
/pkg/status/status.go:
--------------------------------------------------------------------------------
1 | package status
2 |
3 | import (
4 | "context"
5 | "log"
6 | "time"
7 |
8 | "PromAI/pkg/config"
9 | "PromAI/pkg/metrics"
10 |
11 | v1 "github.com/prometheus/client_golang/api/prometheus/v1"
12 | "github.com/prometheus/common/model"
13 | )
14 |
15 | // 添加配置相关的类型定义
16 | type Config struct {
17 | PrometheusURL string `yaml:"prometheus_url"`
18 | MetricTypes []MetricType `yaml:"metric_types"`
19 | }
20 |
21 | type MetricType struct {
22 | Type string `yaml:"type"`
23 | Metrics []MetricConfig `yaml:"metrics"`
24 | }
25 |
// MetricConfig describes one metric: the query to run and how its result is
// evaluated and labelled.
type MetricConfig struct {
	Name        string `yaml:"name"`
	Description string `yaml:"description"`
	// Query is the expression sent to Prometheus.
	Query string `yaml:"query"`
	// Threshold is the value the query result is compared against.
	Threshold float64 `yaml:"threshold"`
	Unit      string  `yaml:"unit"`
	// Labels maps a label name to its display alias.
	Labels map[string]string `yaml:"labels"`
	// ThresholdType selects the comparison applied against Threshold.
	ThresholdType string `yaml:"threshold_type"`
}
35 |
// StatusSummary aggregates the status counters shown on the status page.
type StatusSummary struct {
	// Normal, Warning and Abnormal count status entries by state.
	Normal   int
	Warning  int
	Abnormal int

	// TotalMetrics is the total number of metrics.
	TotalMetrics int
	// TypeCounts is the number of metrics per metric type.
	TypeCounts map[string]int
}
43 |
// MetricStatus is the per-day status history of one metric.
type MetricStatus struct {
	Name string
	// DailyStatus maps a date to that day's status
	// ("normal", "warning" or "abnormal").
	DailyStatus map[string]string

	// Threshold, Unit and ThresholdType are copied from the metric's
	// configuration.
	Threshold     float64
	Unit          string
	ThresholdType string
}
51 |
52 | type StatusData struct {
53 | Summary StatusSummary
54 | Metrics []MetricStatus
55 | Dates []string
56 | }
57 |
58 | func GenerateStatusData(days int) (*StatusData, error) {
59 | data := &StatusData{
60 | Summary: StatusSummary{
61 | TypeCounts: make(map[string]int), // 初始化类型计数map
62 | },
63 | Metrics: []MetricStatus{},
64 | Dates: make([]string, days),
65 | }
66 |
67 | // 生成最近n天的日期
68 | now := time.Now()
69 | for i := 0; i < days; i++ {
70 | date := now.AddDate(0, 0, -i)
71 | data.Dates[days-1-i] = date.Format("01-02") // MM-DD格式
72 | }
73 |
74 | return data, nil
75 | }
76 |
77 | func CollectMetricStatus(client metrics.PrometheusAPI, config *config.Config) (*StatusData, error) {
78 | data, err := GenerateStatusData(7) // 显示最近7天的数据
79 | if err != nil {
80 | log.Printf("生成状态数据失败: %v", err)
81 | return nil, err
82 | }
83 |
84 | log.Printf("开始收集指标状态数据,时间范围: %v", data.Dates)
85 |
86 | // 遍历所有指标类型
87 | for _, metricType := range config.MetricTypes {
88 | log.Printf("处理指标类型: %s", metricType.Type)
89 |
90 | // 统计每种类型的指标数量
91 | data.Summary.TypeCounts[metricType.Type] = len(metricType.Metrics)
92 | // 累加总指标数
93 | data.Summary.TotalMetrics += len(metricType.Metrics)
94 |
95 | // 遍历每个指标
96 | for _, metric := range metricType.Metrics {
97 | log.Printf("处理指标: %s (阈值: %v %s, 阈值类型: %s)",
98 | metric.Name, metric.Threshold, metric.Unit, metric.ThresholdType)
99 |
100 | metricStatus := MetricStatus{
101 | Name: metric.Name,
102 | DailyStatus: make(map[string]string),
103 | Threshold: metric.Threshold,
104 | Unit: metric.Unit,
105 | ThresholdType: metric.ThresholdType,
106 | }
107 |
108 | // 查询每天的状态
109 | for _, date := range data.Dates {
110 | status, err := queryMetricStatus(client, metric, date)
111 | if err != nil {
112 | log.Printf("查询指标 [%s] 在 %s 的状态失败: %v", metric.Name, date, err)
113 | metricStatus.DailyStatus[date] = "abnormal"
114 | data.Summary.Abnormal++
115 | } else {
116 | metricStatus.DailyStatus[date] = status
117 | switch status {
118 | case "normal":
119 | log.Printf("指标 [%s] 在 %s 状态正常", metric.Name, date)
120 | data.Summary.Normal++
121 | case "warning":
122 | log.Printf("指标 [%s] 在 %s 状态警告", metric.Name, date)
123 | data.Summary.Warning++
124 | case "abnormal":
125 | log.Printf("指标 [%s] 在 %s 状态异常", metric.Name, date)
126 | data.Summary.Abnormal++
127 | }
128 | }
129 | }
130 |
131 | data.Metrics = append(data.Metrics, metricStatus)
132 | }
133 | }
134 |
135 | log.Printf("状态数据收集完成. 总指标数: %d, 正常: %d, 警告: %d, 异常: %d",
136 | data.Summary.TotalMetrics, data.Summary.Normal, data.Summary.Warning, data.Summary.Abnormal)
137 |
138 | // 打印每种类型的指标数量
139 | for typeName, count := range data.Summary.TypeCounts {
140 | log.Printf("指标类型 [%s] 包含 %d 个指标", typeName, count)
141 | }
142 |
143 | return data, nil
144 | }
145 |
146 | func queryMetricStatus(client metrics.PrometheusAPI, metric config.MetricConfig, date string) (string, error) {
147 | ctx := context.Background()
148 |
149 | dateTime, err := time.Parse("01-02", date)
150 | if err != nil {
151 | return "abnormal", err
152 | }
153 |
154 | // 设置查询时间范围为那一天的0点到23:59:59
155 | startTime := time.Date(time.Now().Year(), dateTime.Month(), dateTime.Day(), 0, 0, 0, 0, time.Local)
156 | endTime := startTime.Add(24 * time.Hour).Add(-time.Second)
157 |
158 | log.Printf(`
159 | 查询指标: [%s]
160 | 时间范围: %s 到 %s
161 | PromQL: %s
162 | 调试步骤:
163 | 1. 打开 Prometheus UI
164 | 2. 粘贴查询: %s
165 | 3. 设置时间范围为: %s 到 %s
166 | -------------------`,
167 | metric.Name,
168 | startTime.Format("2006-01-02 15:04:05"),
169 | endTime.Format("2006-01-02 15:04:05"),
170 | metric.Query,
171 | metric.Query,
172 | startTime.Format("2006-01-02 15:04:05"),
173 | endTime.Format("2006-01-02 15:04:05"))
174 |
175 | // 直接使用原始查询语句
176 | result, _, err := client.QueryRange(ctx, metric.Query, v1.Range{
177 | Start: startTime,
178 | End: endTime,
179 | Step: time.Hour, // 每小时一个采样点
180 | })
181 |
182 | if err != nil {
183 | log.Printf("执行查询失败 [%s]: %v", metric.Query, err)
184 | return "abnormal", err
185 | }
186 |
187 | switch v := result.(type) {
188 | case model.Matrix:
189 | if len(v) == 0 {
190 | log.Printf("指标 [%s] 查询结果为空", metric.Name)
191 | return "abnormal", nil
192 | }
193 |
194 | log.Printf("指标 [%s] 返回 %d 个时间序列", metric.Name, len(v))
195 |
196 | maxValue := float64(0)
197 | // 遍历每个时间序列
198 | for _, series := range v {
199 | // 遍历每个采样点,找出最大值
200 | for _, sample := range series.Values {
201 | value := float64(sample.Value)
202 | if value > maxValue {
203 | maxValue = value
204 | }
205 | log.Printf("指标 [%s] 时间: %v, 值: %v",
206 | metric.Name,
207 | sample.Timestamp.Time().Format("15:04:05"),
208 | value)
209 | }
210 | }
211 |
212 | // 使用最大值进行阈值判断
213 | status := checkThreshold(maxValue, metric.Threshold, metric.ThresholdType)
214 | log.Printf("指标 [%s] 最大值: %v, 阈值: %v, 阈值类型: %s, 状态: %s",
215 | metric.Name,
216 | maxValue,
217 | metric.Threshold,
218 | metric.ThresholdType,
219 | status)
220 |
221 | return status, nil
222 |
223 | default:
224 | log.Printf("指标 [%s] 返回了意外的结果类型: %T", metric.Name, result)
225 | return "abnormal", nil
226 | }
227 | }
228 |
// checkThreshold classifies value against threshold according to
// thresholdType and returns "normal", "warning" or "abnormal".
//
// Recognised types:
//   - "greater" (also used for "" and any unknown type): abnormal when
//     value > threshold, warning when value > 90% of threshold.
//   - "greater_equal": same as "greater" with inclusive comparisons.
//   - "less": abnormal when value < threshold, warning when
//     value < threshold/0.9.
//   - "less_equal": same as "less" with inclusive comparisons.
//   - "equal": normal only when value == threshold, otherwise abnormal.
//   - "not_equal": normal only when value != threshold, otherwise abnormal.
func checkThreshold(value, threshold float64, thresholdType string) string {
	const (
		normal   = "normal"
		warning  = "warning"
		abnormal = "abnormal"

		// The warning band begins at 90% of the way to the threshold.
		warningFactor = 0.9
	)

	// breached: the alerting condition itself holds.
	// nearLimit: the value lies inside the warning band.
	var breached, nearLimit bool

	switch thresholdType {
	case "greater_equal":
		breached = value >= threshold
		nearLimit = value >= threshold*warningFactor
	case "less":
		// e.g. available nodes < 3 raises an alert
		breached = value < threshold
		nearLimit = value < threshold/warningFactor
	case "less_equal":
		breached = value <= threshold
		nearLimit = value <= threshold/warningFactor
	case "equal":
		// Exact-match types have no warning band.
		if value == threshold {
			return normal
		}
		return abnormal
	case "not_equal":
		if value != threshold {
			return normal
		}
		return abnormal
	default:
		// "greater", the empty string and unknown types all alert when the
		// value exceeds the threshold, e.g. CPU usage > 80%.
		breached = value > threshold
		nearLimit = value > threshold*warningFactor
	}

	switch {
	case breached:
		return abnormal
	case nearLimit:
		return warning
	default:
		return normal
	}
}
295 |
--------------------------------------------------------------------------------
/pkg/utils/utils.go:
--------------------------------------------------------------------------------
1 | package utils
2 |
3 | import (
4 | "net"
5 | )
6 |
// globalPort holds the value most recently passed to SetGlobalPort; it is the
// empty string until SetGlobalPort has been called.
var globalPort string
8 |
// GetLocalIPs returns the textual form of every non-loopback IPv4 address
// assigned to a network interface that is up.
//
// It returns nil when the interface list cannot be read. Interfaces whose
// addresses cannot be read are skipped.
func GetLocalIPs() []string {
	ifaces, err := net.Interfaces()
	if err != nil {
		return nil
	}

	var found []string
	for i := range ifaces {
		// Ignore interfaces that are administratively down.
		if ifaces[i].Flags&net.FlagUp == 0 {
			continue
		}

		addrs, err := ifaces[i].Addrs()
		if err != nil {
			continue
		}

		for _, addr := range addrs {
			ipNet, isIPNet := addr.(*net.IPNet)
			if !isIPNet {
				continue
			}
			// Keep only non-loopback IPv4 addresses.
			ip := ipNet.IP
			if ip.IsLoopback() || ip.To4() == nil {
				continue
			}
			found = append(found, ip.String())
		}
	}
	return found
}
39 |
40 | func SetGlobalPort(port string) {
41 | globalPort = port
42 | }
43 |
44 | func GetGlobalPort() string {
45 | return globalPort
46 | }
47 |
48 |
49 |
--------------------------------------------------------------------------------
/reports/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kubehan/PromAI/f94cf0c0c2ca84d1ebe168599afe6633f6509253/reports/.DS_Store
--------------------------------------------------------------------------------
/templates/report.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {{.Project}}
6 |
7 |
154 |
155 |
156 |
157 |
158 |
{{.Project}}
159 |
生成时间: {{.Timestamp.Format "2006-01-02 15:04:05"}}
160 |
161 |
162 |
163 | {{range $type, $group := .MetricGroups}}
164 |
165 |
{{$type}}
166 |
167 |
168 |
最大值
169 |
{{printf "%.2f" $group.Stats.MaxValue}}
170 |
171 |
172 |
最小值
173 |
{{printf "%.2f" $group.Stats.MinValue}}
174 |
175 |
179 |
180 |
告警数
181 |
182 | {{$group.Stats.AlertCount}}/{{$group.Stats.TotalCount}}
183 |
184 |
185 |
186 |
告警详情
187 |
188 | 严重:{{$group.Stats.CriticalCount}}
189 | 警告:{{$group.Stats.WarningCount}}
190 |
191 |
192 |
193 |
194 | {{end}}
195 |
196 |
197 |
198 |
199 |
资源使用概览
200 |
201 |
202 |
203 |
204 |
205 |
206 | {{range $type, $group := .MetricGroups}}
207 |
208 |
{{$type}} 监控指标
209 | {{range $metricName, $metrics := $group.MetricsByName}}
210 |
{{$metricName}}
211 | {{if eq (len $metrics) 0 }}
212 |
未查询到数据
213 | {{end}}
214 | {{if gt (len $metrics) 0}}
215 |
216 |
217 | 指标名称 |
218 | {{$headerLabels := (index $metrics 0).Labels}}
219 | {{range $headerLabels}}
220 | {{.Alias}} |
221 | {{end}}
222 | 值 |
223 | 状态 |
224 | 检测时间 |
225 |
226 | {{range $metric := $metrics}}
227 |
228 | {{.Name}} |
229 | {{range $headerLabels}}
230 | {{$labelName := .Name}}
231 | {{range $metricLabel := $metric.Labels}}
232 | {{if eq $metricLabel.Name $labelName}}
233 |
234 | {{$metricLabel.Value}}
235 | |
236 | {{end}}
237 | {{end}}
238 | {{end}}
239 | {{printf "%.2f" $metric.Value}}{{$metric.Unit}} |
240 |
241 | {{if eq .Status "normal"}}正常
242 | {{else if eq .Status "warning"}}警告
243 | {{else if eq .Status "critical"}}严重
244 | {{else}}{{.Status}}
245 | {{end}}
246 | |
247 | {{.Timestamp.Format "2006-01-02 15:04:05"}} |
248 |
249 | {{end}}
250 |
251 | {{end}}
252 | {{end}}
253 |
254 | {{end}}
255 |
256 |
257 |
386 |
387 |
--------------------------------------------------------------------------------
/templates/status.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | 服务健康看板
5 |
6 |
7 |
217 |
218 |
219 |
220 |
226 |
227 |
228 |
229 |
总指标数
230 |
{{.Summary.TotalMetrics}}
231 |
232 |
233 |
正常服务
234 |
{{.Summary.Normal}}
235 |
236 |
237 |
异常服务
238 |
{{.Summary.Abnormal}}
239 |
240 |
241 |
警告服务
242 |
{{.Summary.Warning}}
243 |
244 |
245 |
246 |
247 |
指标类型统计
248 |
249 | {{range $type, $count := .Summary.TypeCounts}}
250 |
251 |
{{$type}}
252 |
{{$count}}
253 |
254 | {{end}}
255 |
256 |
257 |
258 |
259 |
260 |
261 |
262 | 指标信息 |
263 | {{range $date := .Dates}}
264 |
265 | {{end}}
266 |
267 |
268 |
269 | {{range $metric := .Metrics}}
270 |
271 |
272 | {{$metric.Name}}
273 |
274 | 阈值: {{$metric.Threshold}}{{$metric.Unit}}
275 | {{if eq $metric.ThresholdType "greater"}}
276 | (>报警)
277 | {{else if eq $metric.ThresholdType "greater_equal"}}
278 | (>=报警)
279 | {{else if eq $metric.ThresholdType "less"}}
280 | (<报警)
281 | {{else if eq $metric.ThresholdType "less_equal"}}
282 | (<=报警)
283 | {{else if eq $metric.ThresholdType "equal"}}
284 | (=正常)
285 | {{else if eq $metric.ThresholdType "not_equal"}}
286 | (!=正常)
287 | {{end}}
288 |
289 | |
290 | {{range $date := $.Dates}}
291 |
292 |
293 | {{if eq (index $metric.DailyStatus $date) "normal"}}
294 | ✓
295 | {{else if eq (index $metric.DailyStatus $date) "warning"}}
296 | ⚠
297 | {{else}}
298 | ✗
299 | {{end}}
300 |
301 | |
302 | {{end}}
303 |
304 | {{end}}
305 |
306 |
307 |
308 |
309 |
310 |
311 |
--------------------------------------------------------------------------------