├── .DS_Store ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── custom.md │ └── feature_request.md ├── pull-request-template.md └── workflows │ ├── docker-image.yml │ ├── follow.yml │ ├── go-binary-release.yml │ └── reademe-contributors.yml ├── .idea ├── .gitignore ├── PromAI.iml └── vcs.xml ├── Dockerfile ├── PromAI.exe ├── README.md ├── config └── config.yaml ├── deploy └── deployment.yaml ├── go.mod ├── go.sum ├── images ├── image.png ├── image2.png ├── status.png └── 资源概览.png ├── main.go ├── outputs └── readme.md ├── pkg ├── config │ └── config.go ├── metrics │ └── collector.go ├── notify │ └── notify.go ├── prometheus │ ├── client.go │ └── prometheus.go ├── report │ ├── cleanup.go │ └── generator.go ├── status │ └── status.go └── utils │ └── utils.go ├── reports ├── .DS_Store ├── inspection_report_20241227_123648.html └── inspection_report_20241231_201838.html └── templates ├── report.html └── status.html /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kubehan/PromAI/f94cf0c0c2ca84d1ebe168599afe6633f6509253/.DS_Store -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. 
iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/custom.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Custom issue template 3 | about: Describe this issue template's purpose here. 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.github/pull-request-template.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | **在提出此拉取请求时,我确认了以下几点(请勾选复选框):** 4 | 5 | - [ ] 我已阅读并理解[贡献者指南]()。 6 | - [ ] 我已检查没有与此请求重复的拉取请求。 7 | - [ ] 我已经考虑过,并确认这份提交对其他人很有价值。 8 | - [ ] 我接受此提交可能不会被使用,并根据维护人员的意愿关闭拉取请求。 9 | 10 | **填写 PR 内容:** 11 | 12 | - 13 | -------------------------------------------------------------------------------- /.github/workflows/docker-image.yml: -------------------------------------------------------------------------------- 1 | # This is a basic workflow to help you get started with Actions 2 | name: build docker image 3 | # Controls when the action will run. 4 | on: 5 | push: 6 | branches: 7 | - main 8 | # Allows you to run this workflow manually from the Actions tab 9 | # 可以手动触发 10 | workflow_dispatch: 11 | inputs: 12 | logLevel: 13 | description: "Log level" 14 | required: true 15 | default: "warning" 16 | tags: 17 | description: "Test scenario tags" 18 | 19 | jobs: 20 | buildx: 21 | runs-on: ubuntu-latest 22 | steps: 23 | - name: Checkout 24 | uses: actions/checkout@v2 25 | 26 | - name: Get current date 27 | id: date 28 | run: echo "::set-output name=today::$(date +'%Y-%m-%d_%H-%M')" 29 | 30 | - name: Set up QEMU 31 | uses: docker/setup-qemu-action@v1 32 | 33 | - name: Set up Docker Buildx 34 | id: buildx 35 | uses: docker/setup-buildx-action@v1 36 | 37 | - name: Available platforms 38 | run: echo ${{ steps.buildx.outputs.platforms }} 39 | 40 | - name: Login to DockerHub 41 | uses: docker/login-action@v1 42 | with: 43 | username: ${{ secrets.DOCKERHUB_USERNAME }} 44 | password: ${{ secrets.DOCKERHUB_TOKEN }} 45 | 46 | - name: Build and push 47 | uses: docker/build-push-action@v2 48 | with: 49 | context: . 
50 | file: ./Dockerfile 51 | # 所需要的体系结构,可以在 Available platforms 步骤中获取所有的可用架构 52 | platforms: linux/amd64,linux/arm64/v8 53 | # 镜像推送时间 54 | push: ${{ github.event_name != 'pull_request' }} 55 | # 给清单打上多个标签 56 | tags: | 57 | kubehan/promai:${{ github.ref_name }}-${{ steps.date.outputs.today }} 58 | kubehan/promai:latest -------------------------------------------------------------------------------- /.github/workflows/follow.yml: -------------------------------------------------------------------------------- 1 | name: Get Top Followers 2 | on: 3 | push: 4 | branches: 5 | - master 6 | schedule: 7 | - cron: "0 20 * * *" 8 | jobs: 9 | github_followers_job: 10 | runs-on: ubuntu-latest 11 | name: A job to display github followers in your profile 12 | steps: 13 | - uses: actions/checkout@v3 14 | 15 | - name: use github-follower-action to update README.md 16 | id: github-follower 17 | uses: JieDing/github-followers@main 18 | env: 19 | login: ${{ github.repository_owner }} 20 | pat: ${{ secrets.ACCESS_TOKEN }} 21 | - name: Commit changes 22 | run: | 23 | git config --local user.email "kubehan@163.com" 24 | git config --local user.name "Kubehan" 25 | git add -A 26 | git diff-index --quiet HEAD || git commit -m "Update GitHub followers" 27 | - name: Pull changes 28 | run: git pull -r 29 | - name: Push changes 30 | uses: ad-m/github-push-action@master 31 | with: 32 | github_token: ${{ secrets.ACCESS_TOKEN }} 33 | branch: ${{ github.ref }} -------------------------------------------------------------------------------- /.github/workflows/go-binary-release.yml: -------------------------------------------------------------------------------- 1 | name: build-go-binary 2 | 3 | on: 4 | release: 5 | types: [created, published] # 表示在创建新的 Release 时触发 6 | 7 | jobs: 8 | build-go-binary: 9 | runs-on: ubuntu-latest 10 | strategy: 11 | matrix: 12 | goos: [linux, windows, darwin] # 需要打包的系统 13 | goarch: [amd64, arm64] # 需要打包的架构 14 | exclude: # 排除某些平台和架构 15 | - goarch: arm64 16 | goos: windows 
17 | steps: 18 | - uses: actions/checkout@v3 19 | - uses: wangyoucao577/go-release-action@v1.30 20 | with: 21 | github_token: ${{ secrets.GITHUB_TOKEN }} # 一个默认的变量,用来实现往 Release 中添加文件 22 | goos: ${{ matrix.goos }} 23 | goarch: ${{ matrix.goarch }} 24 | goversion: 1.23 # 可以指定编译使用的 Golang 版本 25 | binary_name: "PromAI" # 可以指定二进制文件的名称 26 | extra_files: README.md config outputs reports templates # 需要包含的额外文件 27 | -------------------------------------------------------------------------------- /.github/workflows/reademe-contributors.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - main 5 | 6 | name: Generate a list of contributors 7 | 8 | jobs: 9 | contrib-readme-en-job: 10 | runs-on: ubuntu-latest 11 | name: A job to automate contrib in readme 12 | steps: 13 | - name: Contribute List 14 | uses: akhilmhdh/contributors-readme-action@v2.3.4 15 | env: 16 | GITHUB_TOKEN: ${{ secrets.ACCESS_TOKEN }} 17 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # 默认忽略的文件 2 | /shelf/ 3 | /workspace.xml 4 | # 基于编辑器的 HTTP 客户端请求 5 | /httpRequests/ 6 | # Datasource local storage ignored files 7 | /dataSources/ 8 | /dataSources.local.xml 9 | /.idea/ -------------------------------------------------------------------------------- /.idea/PromAI.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.io/library/golang:1.23.4-alpine3.20 AS builder 2 | 3 | 
WORKDIR /build 4 | COPY . . 5 | RUN go env -w GO111MODULE=on && go mod download && go build && ls -la /build 6 | 7 | FROM docker.io/alpine:3.21.0 8 | # 添加标识信息 9 | LABEL version="1.0" \ 10 | description="Prometheus Automated Inspection" \ 11 | maintainer="Kubehan" 12 | WORKDIR /app 13 | COPY --from=builder /build/PromAI /app/ 14 | COPY --from=builder /build/config /app/config/ 15 | COPY --from=builder /build/outputs /app/outputs/ 16 | COPY --from=builder /build/reports /app/reports/ 17 | COPY --from=builder /build/templates /app/templates/ 18 | EXPOSE 8091 19 | # 运行应用程序 20 | CMD ["./PromAI", "-port", "8091"] -------------------------------------------------------------------------------- /PromAI.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kubehan/PromAI/f94cf0c0c2ca84d1ebe168599afe6633f6509253/PromAI.exe -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Prometheus 监控报告生成器 2 | 3 | > Prometheus Automated Inspection 4 | 5 | ## 项目简介 6 | 7 | 这是一个基于 Prometheus 的监控报告自动生成工具,可以自动收集、分析指标数据并生成可视化的 HTML 报告。该工具旨在简化监控数据的收集和展示过程,帮助运维人员快速了解系统状态。 8 | 9 | ## 报告样式 10 | ### 获取报告 11 | http://localhost:8091/getreport 12 | 13 | [报告样式](reports/inspection_report_20241231_201838.html) 14 | ![report](images/资源概览.png) 15 | 16 | ![report](images/image2.png) 17 | 18 | ## 服务健康看板 19 | ### 获取服务健康看板 20 | http://localhost:8091/status 21 | 22 | ![status](images/status.png) 23 | 24 | 25 | ## 功能特点 26 | 27 | - 支持多种指标类型的监控(基础资源、Kubernetes、应用服务等) 28 | - 自动计算指标状态和告警级别(正常、警告、严重) 29 | - 生成包含数据表格和图表的 HTML 报告 30 | - 支持自定义指标阈值和标签别名 31 | - 灵活的配置文件系统 32 | - 支持多维度数据分析和展示 33 | - 自动计算关键统计指标(最大值、最小值、平均值等) 34 | - 美观的可视化界面,支持响应式布局 35 | 36 | ## 系统要求 37 | 38 | - Go 1.22 或更高版本 39 | - 可访问的 Prometheus 服务器 40 | - 现代浏览器(支持 HTML5 和 JavaScript) 41 | - 至少 512MB 可用内存 42 | - 50MB 可用磁盘空间 43 | 44 | ## 配置说明 45 | 46 | 配置文件采用 YAML 
格式,主要包含以下几个部分: 47 | 48 | ### Prometheus 配置 49 | 50 | 在 `config/config.yaml` 中配置 Prometheus 服务器地址和监控指标。 51 | 52 | ```yaml 53 | prometheus_url: "http://prometheus.k8s.kubehan.cn" 54 | 55 | metric_types: 56 | - type: "基础资源使用情况" 57 | metrics: 58 | - name: "CPU使用率" 59 | description: "节点CPU使用率统计" 60 | query: "100 - (avg by(instance) (irate(node_cpu_seconds_total{mode='idle'}[5m])) * 100)" 61 | trend_query: "100 - (avg by(instance) (irate(node_cpu_seconds_total{mode='idle'}[5m])) * 100)[6h:5m]" 62 | threshold: 80 63 | unit: "%" 64 | labels: 65 | instance: "节点" 66 | # 其他指标... 67 | ``` 68 | 69 | ### 指标说明 70 | 71 | 每个指标可以配置以下内容: 72 | 73 | - `name`: 指标名称 74 | - `description`: 指标描述 75 | - `query`: 用于表格显示的即时查询 76 | - `trend_query`: 用于图表显示的趋势查询 77 | - `threshold`: 指标阈值 78 | - `unit`: 指标单位 79 | - `labels`: 标签别名 80 | - `threshold_type`: 阈值比较方式: "greater", "less", "equal", "greater_equal", "less_equal" 81 | 82 | ```txt 83 | greater: 表示值必须大于阈值才被视为 "critical" 状态。 84 | greater_equal: 表示值必须大于或等于阈值才被视为 "critical" 状态。 85 | less: 表示值必须小于阈值才被视为 "normal" 状态。 86 | less_equal: 表示值必须小于或等于阈值才被视为 "normal" 状态。 87 | equal: 表示值必须等于阈值才被视为 "normal" 状态。 88 | ``` 89 | 90 | ## 快速开始 91 | 92 | ### 源码编译 93 | 94 | 1. 克隆仓库: 95 | 96 | ```bash 97 | git clone https://github.com/kubehan/PromAI.git 98 | cd PromAI 99 | ``` 100 | 2. 安装依赖: 101 | 102 | ```bash 103 | go mod download 104 | ``` 105 | 3. 修改配置文件: 106 | 107 | ```bash 108 | cp config/config.yaml config/config.yaml.bak 109 | # 编辑 config/config.yaml 设置 Prometheus 服务器地址和监控指标 110 | ``` 111 | 4. 构建并运行: 112 | 113 | ```bash 114 | go build -o PromAI main.go 115 | ./PromAI -config config/config.yaml 116 | ``` 117 | 5. 
查看报告: 118 | 生成的报告将保存在 `reports` 目录下。 119 | 120 | ### Docker 部署 121 | 122 | ```bash 123 | docker run -d --name PromAI -p 8091:8091 kubehan/promai:latest 124 | ``` 125 | 126 | ### Kubernetes 部署 127 | 128 | ```bash 129 | kubectl apply -f deploy/deployment.yaml 130 | ``` 131 | 132 | ## 使用示例 133 | 134 | 在配置文件中添加所需的监控指标后,运行程序将生成 HTML 报告。报告中将包含各个指标的当前状态、历史趋势图表以及详细的表格数据。 135 | 136 | 1. 修改配置文件中的Prometheus地址为自己的地址 137 | 2. 修改配置文件中的指标 138 | 3. 运行程序 默认运行在8091端口,通过访问http://localhost:8091/getreport 查看报告 139 | 140 | ```bash 141 | go build -o PromAI main.go 142 | ./PromAI -config config/config.yaml 143 | ``` 144 | 145 | # Prometheus Automated Inspection 未来新功能规划列表 146 | 147 | 1. 多数据源支持 148 | 2. 自定义仪表板 149 | 3. 历史数据存储 150 | 4. 智能告警 151 | 5. API 接口 152 | 6. 用户角色和权限管理 153 | 7. 数据导出功能 154 | 8. 集成 CI/CD 流程 155 | 9. 可视化组件库 156 | 10. 多语言支持 157 | 11. 移动端支持 158 | 12. 社区和插件支持 159 | 13. 性能优化 160 | 14. 用户反馈和建议收集 161 | 15. xxx 162 | 163 | ## 贡献 164 | 165 | 欢迎任何形式的贡献!请提交问题、建议或拉取请求。 166 | 167 | ## 贡献者 168 | 169 | 170 | 171 | 172 | 179 | 186 | 193 | 200 | 207 |
173 | 174 | kubehan 175 |
176 | Kubehan 177 |
178 |
180 | 181 | junlintianxiazhifulinzhongguo 182 |
183 | Junlintianxiazhifulinzhongguo 184 |
185 |
187 | 188 | liushiju 189 |
190 | Shiju Liu 191 |
192 |
194 | 195 | wevsmy 196 |
197 | Wilson_wu 198 |
199 |
201 | 202 | liaofan-0710 203 |
204 | 了凡 205 |
206 |
208 | 209 | 210 | ## 许可证 211 | 212 | 该项目采用 MIT 许可证,详细信息请查看 LICENSE 文件。 213 | -------------------------------------------------------------------------------- /config/config.yaml: -------------------------------------------------------------------------------- 1 | prometheus_url: "http://prometheus-k8s.kubesphere-monitoring-system.svc.cluster.local:9090" 2 | 3 | 4 | project_name: "测试项目巡检报告" 5 | 6 | 7 | # 定时任务:每天9点半和17点半执行 8 | 9 | cron_schedule: "30 9,17 * * *" 10 | 11 | 12 | # 报告清理 13 | 14 | report_cleanup: 15 | enabled: true 16 | max_age: 7 # 保留最近7天的报告 17 | cron_schedule: "0 0 * * *" # 如果为空,则执行上面定时任务,即生成报告时清理 18 | 19 | # 配置发送钉钉和邮件 20 | 21 | notifications: 22 | dingtalk: 23 | enabled: true 24 | webhook: "https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxxxxxxxxxxxxxxxxxx" # 这里填写自己的webhook 25 | secret: "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" # 这里填写的是钉钉机器人加签的secret 26 | report_url: "http://192.168.5.125:41174" # 这里可以填写ip+端口,也可以填写域名,如果是k8s里部署,推荐采用域名的方式,如果不行可以将 svc 以nodeport方式暴露出来,这里就可以使用ip+端口方式 27 | email: 28 | enabled: true 29 | smtp_host: "smtp.exmail.qq.com" # 我这里用的是腾讯企业邮箱,需要改成自己的 30 | smtp_port: 465 31 | username: "demo@demo.cn" # 填写自己的邮箱账号 32 | password: "xxxxxxxxxxxxxxxxxxxx" # 这里填写的是授权码 33 | from: "demo@demo.cn" 34 | to: 35 | - "demo@demo.cn" 36 | report_url: "https://promai.lichengjun.top" # 这里可以填写ip+端口,也可以填写域名,如果是k8s里部署,推荐采用域名的方式,如果不行可以将 svc 以nodeport方式暴露出来,这里就可以使用ip+端口方式,如果是部署在k8s里,ingress 的需要自己去编写 37 | 38 | metric_types: 39 | - type: "基础资源使用情况" 40 | metrics: 41 | - name: "CPU使用率" 42 | description: "节点CPU使用率统计" 43 | query: "100 - (avg by(instance) (irate(node_cpu_seconds_total{mode='idle'}[5m])) * 100)" 44 | threshold: 80 45 | threshold_type: "greater" 46 | unit: "%" 47 | labels: 48 | instance: "节点" 49 | 50 | - name: "内存使用率" 51 | description: "节点内存使用率统计" 52 | query: "100 - ((node_memory_MemAvailable_bytes * 100) / node_memory_MemTotal_bytes)" 53 | threshold: 85 54 | threshold_type: "greater" 55 | unit: "%" 56 | labels: 57 | instance: "节点" 58 | 59 
| - name: "磁盘使用率" 60 | description: "节点磁盘使用率统计" 61 | query: >- 62 | (((100 -((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes)) 63 | and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint!~"/run.*|/var.*|/boot.*|/tmp.*"}== 0) 64 | + on(instance) group_left(node_uname_info) node_uname_info) 65 | * on(instance) group_left(nodename) node_uname_info 66 | threshold: 80 67 | threshold_type: "greater" 68 | unit: "%" 69 | labels: 70 | instance: "节点" 71 | mountpoint: "挂载点" 72 | device: "磁盘" 73 | nodename: "节点名称" 74 | 75 | 76 | - name: "固定机器内存使用率" 77 | description: "固定机器内存使用率统计" 78 | query: >- 79 | 100 - ((node_memory_MemAvailable_bytes{instance="172.16.5.132:9100"} * 100) / node_memory_MemTotal_bytes{instance="172.16.5.132:9100"}) 80 | threshold: 16.84 81 | threshold_type: "greater" 82 | unit: "%" 83 | labels: 84 | instance: "节点" 85 | 86 | 87 | # - type: "PaaS平台巡检" 88 | # metrics: 89 | # - name: "K8s集群关键服务" 90 | # description: "K8s集群关键服务状态统计" 91 | # query: "key_pod_status" 92 | # threshold: 1 93 | # threshold_type: "equal" 94 | # unit: "" 95 | # labels: 96 | # component: "服务名称" 97 | # namespace: "命名空间" 98 | # # describe: "服务描述" 99 | # hostname: "主机名称" 100 | # owner: "负责人" 101 | # instance: "节点" 102 | 103 | - type: "kubernetes集群监控状态" 104 | metrics: 105 | - name: "K8s集群巡检" 106 | description: "K8s集群巡检" 107 | query: "k8s_cluster_auto_check" 108 | threshold: 1 109 | threshold_type: "equal" 110 | unit: "" 111 | labels: 112 | component: "服务名称" 113 | hostname: "主机名称" 114 | owner: "负责人" 115 | 116 | - name: "自定义监控脚本执行情况" 117 | description: "script-exporter监控脚本执行情况" 118 | query: "script_success" 119 | threshold: 1 120 | threshold_type: "equal" 121 | unit: "" 122 | labels: 123 | instance: "宿主机器" 124 | script: "脚本名称" 125 | 126 | - name: "Pod运行状态" 127 | description: "集群Pod运行状态统计" 128 | query: "sum by (namespace, pod) (kube_pod_status_phase{phase='Running',namespace='kube-system'})" 129 | threshold: 1 130 | threshold_type: "equal" 131 | unit: "" 132 | 
labels: 133 | namespace: "命名空间" 134 | pod: "Pod名称" 135 | 136 | - name: "节点就绪状态" 137 | description: "K8s节点就绪状态检查" 138 | query: "kube_node_status_condition{condition='Ready',status!='true'}" 139 | threshold: 0 140 | threshold_type: "equal" 141 | unit: "" 142 | labels: 143 | node: "节点" 144 | condition: "状态类型" 145 | 146 | - name: "PVC使用率" 147 | description: "持久化存储使用率" 148 | query: >- 149 | 100 * (1 - kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) 150 | threshold: 90 151 | threshold_type: "greater" 152 | unit: "%" 153 | labels: 154 | namespace: "命名空间" 155 | persistentvolumeclaim: "PVC名称" 156 | -------------------------------------------------------------------------------- /deploy/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | labels: 5 | kubernetes.io/metadata.name: promai 6 | name: promai 7 | 8 | --- 9 | 10 | kind: ConfigMap 11 | apiVersion: v1 12 | metadata: 13 | name: config 14 | namespace: promai 15 | annotations: 16 | kubesphere.io/creator: admin 17 | data: 18 | config.yaml: >- 19 | prometheus_url: 20 | "http://prometheus-k8s.kubesphere-monitoring-system.svc.cluster.local:9090" 21 | 22 | 23 | project_name: "测试项目巡检报告" 24 | 25 | 26 | # 定时任务:每天9点半和17半执行 27 | 28 | cron_schedule: "30 9,17 * * *" 29 | 30 | 31 | # 报告清理 32 | 33 | report_cleanup: 34 | enabled: true 35 | max_age: 7 # 保留最近7天的报告 36 | cron_schedule: "0 0 * * *" # 如果为空,则执行执行上面定时任务,即生成报告时清理 37 | 38 | # 配置发送钉钉和邮件 39 | 40 | notifications: 41 | dingtalk: 42 | enabled: true 43 | webhook: "https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxxxxxxxxxxxxxxxxxx" # 这里填写自己的webhook 44 | secret: "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" # 这里填写的是加钉钉机器人加签的secret 45 | report_url: "http://192.168.5.125:41174" # 这里可以填写ip+端口,也可以填写域名,如果是k8s里部署,推荐采用域名的方式,如果不行可以将 svc 以nodeport方式暴露出来,这里就可以使用ip+端口方式 46 | email: 47 | enabled: true 48 | smtp_host: "smtp.exmail.qq.com" # 我这里用的是腾讯企业邮箱,需要改成自己的 49 | 
smtp_port: 465 50 | username: "demo@demo.cn" # 填写自己的邮箱账号 51 | password: "xxxxxxxxxxxxxxxxxxxx" # 这里填写的是授权码 52 | from: "demo@demo.cn" 53 | to: 54 | - "demo@demo.cn" 55 | report_url: "https://promai.lichengjun.top" # 这里可以填写ip+端口,也可以填写域名,如果是k8s里部署,推荐采用域名的方式,如果不行可以将 svc 以nodeport方式暴露出来,这里就可以使用ip+端口方式,如果是部署在k8s里,ingress 的需要自己去编写 56 | 57 | metric_types: 58 | - type: "基础资源使用情况" 59 | metrics: 60 | - name: "CPU使用率" 61 | description: "节点CPU使用率统计" 62 | query: "100 - (avg by(instance) (irate(node_cpu_seconds_total{mode='idle'}[5m])) * 100)" 63 | threshold: 80 64 | threshold_type: "greater" 65 | unit: "%" 66 | labels: 67 | instance: "节点" 68 | 69 | - name: "内存使用率" 70 | description: "节点内存使用率统计" 71 | query: "100 - ((node_memory_MemAvailable_bytes * 100) / node_memory_MemTotal_bytes)" 72 | threshold: 85 73 | threshold_type: "greater" 74 | unit: "%" 75 | labels: 76 | instance: "节点" 77 | 78 | - name: "磁盘使用率" 79 | description: "节点磁盘使用率统计" 80 | query: >- 81 | (((100 -((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes)) 82 | and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint!~"/run.*|/var.*|/boot.*|/tmp.*"}== 0) 83 | + on(instance) group_left(node_uname_info) node_uname_info) 84 | * on(instance) group_left(nodename) node_uname_info 85 | threshold: 80 86 | threshold_type: "greater" 87 | unit: "%" 88 | labels: 89 | instance: "节点" 90 | mountpoint: "挂载点" 91 | device: "磁盘" 92 | nodename: "节点名称" 93 | 94 | 95 | - name: "固定机器内存使用率" 96 | description: "固定机器内存使用率统计" 97 | query: >- 98 | 100 - ((node_memory_MemAvailable_bytes{instance="172.16.5.132:9100"} * 100) / node_memory_MemTotal_bytes{instance="172.16.5.132:9100"}) 99 | threshold: 16.84 100 | threshold_type: "greater" 101 | unit: "%" 102 | labels: 103 | instance: "节点" 104 | 105 | 106 | # - type: "PaaS平台巡检" 107 | # metrics: 108 | # - name: "K8s集群关键服务" 109 | # description: "K8s集群关键服务状态统计" 110 | # query: "key_pod_status" 111 | # threshold: 1 112 | # threshold_type: "equal" 113 | # unit: "" 114 | # labels: 115 
| # component: "服务名称" 116 | # namespace: "命名空间" 117 | # # describe: "服务描述" 118 | # hostname: "主机名称" 119 | # owner: "负责人" 120 | # instance: "节点" 121 | 122 | - type: "kubernetes集群监控状态" 123 | metrics: 124 | - name: "K8s集群巡检" 125 | description: "K8s集群巡检" 126 | query: "k8s_cluster_auto_check" 127 | threshold: 1 128 | threshold_type: "equal" 129 | unit: "" 130 | labels: 131 | component: "服务名称" 132 | hostname: "主机名称" 133 | owner: "负责人" 134 | 135 | - name: "自定义监控脚本执行情况" 136 | description: "script-exporter监控脚本执行情况" 137 | query: "script_success" 138 | threshold: 1 139 | threshold_type: "equal" 140 | unit: "" 141 | labels: 142 | instance: "宿主机器" 143 | script: "脚本名称" 144 | 145 | - name: "Pod运行状态" 146 | description: "集群Pod运行状态统计" 147 | query: "sum by (namespace, pod) (kube_pod_status_phase{phase='Running',namespace='kube-system'})" 148 | threshold: 1 149 | threshold_type: "equal" 150 | unit: "" 151 | labels: 152 | namespace: "命名空间" 153 | pod: "Pod名称" 154 | 155 | - name: "节点就绪状态" 156 | description: "K8s节点就绪状态检查" 157 | query: "kube_node_status_condition{condition='Ready',status!='true'}" 158 | threshold: 0 159 | threshold_type: "equal" 160 | unit: "" 161 | labels: 162 | node: "节点" 163 | condition: "状态类型" 164 | 165 | - name: "PVC使用率" 166 | description: "持久化存储使用率" 167 | query: >- 168 | 100 * (1 - kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) 169 | threshold: 90 170 | threshold_type: "greater" 171 | unit: "%" 172 | labels: 173 | namespace: "命名空间" 174 | persistentvolumeclaim: "PVC名称" 175 | 176 | 177 | --- 178 | 179 | kind: Deployment 180 | apiVersion: apps/v1 181 | metadata: 182 | name: promai 183 | namespace: promai 184 | labels: 185 | app: promai 186 | spec: 187 | replicas: 1 188 | selector: 189 | matchLabels: 190 | app: promai 191 | template: 192 | metadata: 193 | labels: 194 | app: promai 195 | spec: 196 | volumes: 197 | - name: volume-kv418v 198 | configMap: 199 | name: config 200 | defaultMode: 420 201 | - name: host-time 202 | hostPath: 203 | 
path: /etc/localtime 204 | type: '' 205 | containers: 206 | - name: promai 207 | image: 'kubehan/promai:latest' 208 | ports: 209 | - name: tcp-8091 210 | containerPort: 8091 211 | protocol: TCP 212 | env: 213 | - name: PROMETHEUS_URL 214 | value: >- 215 | http://prometheus-k8s.kubesphere-monitoring-system.svc.cluster.local:9090 216 | resources: 217 | limits: 218 | cpu: 20m 219 | memory: 50Mi 220 | requests: 221 | cpu: 5m 222 | memory: 15Mi 223 | volumeMounts: 224 | - name: volume-kv418v 225 | readOnly: true 226 | mountPath: /app/config 227 | - name: host-time 228 | readOnly: true 229 | mountPath: /etc/localtime 230 | livenessProbe: 231 | tcpSocket: 232 | port: 8091 233 | initialDelaySeconds: 15 234 | timeoutSeconds: 30 235 | periodSeconds: 10 236 | successThreshold: 1 237 | failureThreshold: 8 238 | readinessProbe: 239 | tcpSocket: 240 | port: 8091 241 | initialDelaySeconds: 15 242 | timeoutSeconds: 30 243 | periodSeconds: 10 244 | successThreshold: 1 245 | failureThreshold: 8 246 | terminationMessagePath: /dev/termination-log 247 | terminationMessagePolicy: File 248 | imagePullPolicy: IfNotPresent 249 | restartPolicy: Always 250 | terminationGracePeriodSeconds: 30 251 | dnsPolicy: ClusterFirst 252 | securityContext: {} 253 | schedulerName: default-scheduler 254 | strategy: 255 | type: RollingUpdate 256 | rollingUpdate: 257 | maxUnavailable: 25% 258 | maxSurge: 25% 259 | revisionHistoryLimit: 10 260 | progressDeadlineSeconds: 600 261 | 262 | 263 | 264 | --- 265 | 266 | kind: Service 267 | apiVersion: v1 268 | metadata: 269 | name: promai 270 | namespace: promai 271 | labels: 272 | app: promai 273 | spec: 274 | ports: 275 | - name: http-80 276 | protocol: TCP 277 | port: 80 278 | targetPort: 8091 279 | selector: 280 | app: promai 281 | type: NodePort 282 | sessionAffinity: None 283 | ipFamilies: 284 | - IPv4 285 | ipFamilyPolicy: SingleStack 286 | internalTrafficPolicy: Cluster 287 | -------------------------------------------------------------------------------- 
/go.mod: -------------------------------------------------------------------------------- 1 | module PromAI 2 | 3 | go 1.22.3 4 | 5 | require ( 6 | github.com/jordan-wright/email v4.0.1-0.20210109023952-943e75fe5223+incompatible 7 | github.com/prometheus/client_golang v1.20.5 8 | github.com/prometheus/common v0.61.0 9 | github.com/robfig/cron/v3 v3.0.1 10 | gopkg.in/yaml.v2 v2.4.0 11 | ) 12 | 13 | require ( 14 | github.com/json-iterator/go v1.1.12 // indirect 15 | github.com/kr/text v0.2.0 // indirect 16 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect 17 | github.com/modern-go/reflect2 v1.0.2 // indirect 18 | github.com/prometheus/client_model v0.6.1 // indirect 19 | google.golang.org/protobuf v1.35.2 // indirect 20 | ) 21 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= 2 | github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= 3 | github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= 4 | github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= 5 | github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= 6 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 7 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 8 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 9 | github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= 10 | github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= 11 | github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= 12 | github.com/jordan-wright/email 
v4.0.1-0.20210109023952-943e75fe5223+incompatible h1:jdpOPRN1zP63Td1hDQbZW73xKmzDvZHzVdNYxhnTMDA= 13 | github.com/jordan-wright/email v4.0.1-0.20210109023952-943e75fe5223+incompatible/go.mod h1:1c7szIrayyPPB/987hsnvNzLushdWf4o/79s3P08L8A= 14 | github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA= 15 | github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= 16 | github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= 17 | github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= 18 | github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= 19 | github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= 20 | github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= 21 | github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= 22 | github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= 23 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= 24 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= 25 | github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= 26 | github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= 27 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= 28 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= 29 | github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU= 30 | github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= 31 | 
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 32 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 33 | github.com/prometheus/client_golang v1.20.5 h1:cxppBPuYhUnsO6yo/aoRol4L7q7UFfdm+bR9r+8l63Y= 34 | github.com/prometheus/client_golang v1.20.5/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= 35 | github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= 36 | github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= 37 | github.com/prometheus/common v0.61.0 h1:3gv/GThfX0cV2lpO7gkTUwZru38mxevy90Bj8YFSRQQ= 38 | github.com/prometheus/common v0.61.0/go.mod h1:zr29OCN/2BsJRaFwG8QOBr41D6kkchKbpeNH7pAjb/s= 39 | github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= 40 | github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= 41 | github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs= 42 | github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro= 43 | github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= 44 | github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= 45 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 46 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= 47 | github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= 48 | github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= 49 | golang.org/x/net v0.32.0 h1:ZqPmj8Kzc+Y6e0+skZsuACbx+wzMgo5MQsJh9Qd6aYI= 50 | golang.org/x/net v0.32.0/go.mod h1:CwU0IoeOlnQQWJ6ioyFrfRuomB8GKF6KbYXZVyeXNfs= 51 | golang.org/x/oauth2 v0.24.0 h1:KTBBxWqUa0ykRPLtV69rRto9TLXcqYkeswu48x/gvNE= 52 | golang.org/x/oauth2 v0.24.0/go.mod 
h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= 53 | golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA= 54 | golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 55 | golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= 56 | golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= 57 | google.golang.org/protobuf v1.35.2 h1:8Ar7bF+apOIoThw1EdZl0p1oWvMqTHmpA2fRTyZO8io= 58 | google.golang.org/protobuf v1.35.2/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= 59 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 60 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= 61 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= 62 | gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= 63 | gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= 64 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 65 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 66 | -------------------------------------------------------------------------------- /images/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kubehan/PromAI/f94cf0c0c2ca84d1ebe168599afe6633f6509253/images/image.png -------------------------------------------------------------------------------- /images/image2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kubehan/PromAI/f94cf0c0c2ca84d1ebe168599afe6633f6509253/images/image2.png -------------------------------------------------------------------------------- /images/status.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/kubehan/PromAI/f94cf0c0c2ca84d1ebe168599afe6633f6509253/images/status.png -------------------------------------------------------------------------------- /images/资源概览.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kubehan/PromAI/f94cf0c0c2ca84d1ebe168599afe6633f6509253/images/资源概览.png -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "html/template" 7 | "log" 8 | "net/http" 9 | "os" 10 | "time" 11 | 12 | "PromAI/pkg/config" 13 | "PromAI/pkg/metrics" 14 | "PromAI/pkg/prometheus" 15 | "PromAI/pkg/report" 16 | "PromAI/pkg/status" 17 | "PromAI/pkg/notify" 18 | "PromAI/pkg/utils" 19 | 20 | "github.com/robfig/cron/v3" 21 | "gopkg.in/yaml.v2" 22 | ) 23 | 24 | // loadConfig 加载配置文件 25 | func loadConfig(path string) (*config.Config, error) { 26 | data, err := os.ReadFile(path) // 读取配置文件 27 | if err != nil { 28 | return nil, fmt.Errorf("reading config file: %w", err) 29 | } 30 | 31 | var config config.Config // 定义配置结构体 32 | if err := yaml.Unmarshal(data, &config); err != nil { 33 | return nil, fmt.Errorf("parsing config file: %w", err) 34 | } // 解析配置文件 35 | // 从环境变量中获取 PrometheusURL 36 | if envPrometheusURL := os.Getenv("PROMETHEUS_URL"); envPrometheusURL != "" { 37 | log.Printf("使用环境变量中的 Prometheus URL: %s", envPrometheusURL) 38 | config.PrometheusURL = envPrometheusURL 39 | } else { 40 | log.Printf("使用配置文件中的 Prometheus URL: %s", config.PrometheusURL) 41 | } 42 | return &config, nil // 返回配置结构体 43 | } 44 | 45 | // setup 初始化应用程序 46 | func setup(configPath string) (*prometheus.Client, *config.Config, error) { 47 | config, err := loadConfig(configPath) 48 | if err != nil { 49 | return nil, nil, fmt.Errorf("loading config: %w", err) 50 | } 51 | 52 | client, err := 
prometheus.NewClient(config.PrometheusURL) 53 | if err != nil { 54 | return nil, nil, fmt.Errorf("initializing Prometheus client: %w", err) 55 | } 56 | 57 | return client, config, nil 58 | } 59 | 60 | func main() { 61 | configPath := flag.String("config", "config/config.yaml", "Path to configuration file") 62 | port := flag.String("port", "8091", "Port to run the HTTP server on") 63 | flag.Parse() 64 | 65 | utils.SetGlobalPort(*port) 66 | 67 | client, config, err := setup(*configPath) 68 | if err != nil { 69 | log.Fatalf("Error setting up: %v", err) 70 | } 71 | 72 | collector := metrics.NewCollector(client.API, config) 73 | 74 | // 设置定时任务 75 | if config.CronSchedule != "" { 76 | c := cron.New() 77 | _, err := c.AddFunc(config.CronSchedule, func() { 78 | data, err := collector.CollectMetrics() 79 | if err != nil { 80 | log.Printf("定时任务收集指标失败: %v", err) 81 | return 82 | } 83 | 84 | reportFilePath, err := report.GenerateReport(*data) 85 | if err != nil { 86 | log.Printf("定时任务生成报告失败: %v", err) 87 | return 88 | } 89 | log.Printf("定时任务成功生成报告: %s", reportFilePath) 90 | 91 | if config.Notifications.Dingtalk.Enabled { 92 | log.Printf("发送钉钉消息") 93 | if err := notify.SendDingtalk(config.Notifications.Dingtalk, reportFilePath); err != nil { 94 | log.Printf("发送钉钉消息失败: %v", err) 95 | } 96 | } 97 | 98 | if config.Notifications.Email.Enabled { 99 | log.Printf("发送邮件") 100 | notify.SendEmail(config.Notifications.Email, reportFilePath) 101 | } 102 | 103 | 104 | }) 105 | 106 | if err != nil { 107 | log.Printf("设置定时任务失败: %v", err) 108 | } else { 109 | c.Start() 110 | log.Printf("已启动定时任务,执行计划: %s", config.CronSchedule) 111 | } 112 | } else { 113 | log.Printf("未配置定时任务,请手动触发生成报告") 114 | } 115 | if config.ReportCleanup.Enabled { 116 | // 确定使用哪个计划 117 | cleanupSchedule := config.ReportCleanup.CronSchedule 118 | if cleanupSchedule == "" { 119 | cleanupSchedule = config.CronSchedule 120 | } 121 | 122 | if cleanupSchedule != "" { 123 | c := cron.New() 124 | _, err := c.AddFunc(cleanupSchedule, 
func() { 125 | if err := report.CleanupReports(config.ReportCleanup.MaxAge); err != nil { 126 | log.Printf("报告清理失败: %v", err) 127 | return 128 | } 129 | log.Printf("报告清理成功") 130 | }) 131 | 132 | if err != nil { 133 | log.Printf("设置清理定时任务失败: %v", err) 134 | } else { 135 | c.Start() 136 | log.Printf("已启动清理定时任务,执行计划: %s", cleanupSchedule) 137 | } 138 | } else { 139 | log.Printf("未配置任何定时任务计划,请手动清理报告") 140 | } 141 | } 142 | 143 | // 设置路由处理器 144 | setupRoutes(collector, config) 145 | 146 | // 启动服务器 147 | log.Printf("Starting server on port: %s with config: %s", *port, *configPath) 148 | log.Printf("Prometheus URL: %s", config.PrometheusURL) 149 | log.Printf("获取报告地址: http://localhost:%s/getreport", *port) 150 | log.Printf("健康看板地址: http://localhost:%s/status", *port) 151 | if err := http.ListenAndServe(":"+*port, nil); err != nil { 152 | log.Fatalf("Error starting HTTP server: %v", err) 153 | } 154 | } 155 | 156 | // setupRoutes 设置 HTTP 路由 157 | func setupRoutes(collector *metrics.Collector, config *config.Config) { 158 | // 设置报告生成路由 159 | http.HandleFunc("/getreport", makeReportHandler(collector)) 160 | 161 | // 设置静态文件服务 162 | http.Handle("/reports/", http.StripPrefix("/reports/", http.FileServer(http.Dir("reports")))) 163 | 164 | // 设置状态页面路由 165 | http.HandleFunc("/status", makeStatusHandler(collector.Client, config)) 166 | 167 | } 168 | 169 | // makeReportHandler 创建报告处理器 170 | func makeReportHandler(collector *metrics.Collector) http.HandlerFunc { 171 | return func(w http.ResponseWriter, r *http.Request) { 172 | data, err := collector.CollectMetrics() 173 | if err != nil { 174 | http.Error(w, "Failed to collect metrics", http.StatusInternalServerError) 175 | log.Printf("Error collecting metrics: %v", err) 176 | return 177 | } 178 | 179 | reportFilePath, err := report.GenerateReport(*data) 180 | if err != nil { 181 | http.Error(w, "Failed to generate report", http.StatusInternalServerError) 182 | log.Printf("Error generating report: %v", err) 183 | return 184 | } 185 | 
186 | http.Redirect(w, r, "/"+reportFilePath, http.StatusSeeOther) 187 | } 188 | } 189 | 190 | // makeStatusHandler 创建状态页面处理器 191 | func makeStatusHandler(client metrics.PrometheusAPI, config *config.Config) http.HandlerFunc { 192 | return func(w http.ResponseWriter, r *http.Request) { 193 | data, err := status.CollectMetricStatus(client, config) 194 | if err != nil { 195 | http.Error(w, "Failed to collect status data", http.StatusInternalServerError) 196 | log.Printf("Error collecting status data: %v", err) 197 | return 198 | } 199 | 200 | // 创建模板函数映射 201 | funcMap := template.FuncMap{ 202 | "now": time.Now, 203 | "date": func(format string, t time.Time) string { 204 | return t.Format(format) 205 | }, 206 | } 207 | 208 | tmpl := template.New("status.html").Funcs(funcMap) 209 | tmpl, err = tmpl.ParseFiles("templates/status.html") 210 | if err != nil { 211 | http.Error(w, "Failed to parse template", http.StatusInternalServerError) 212 | log.Printf("Error parsing template: %v", err) 213 | return 214 | } 215 | 216 | if err := tmpl.Execute(w, data); err != nil { 217 | http.Error(w, "Failed to render template", http.StatusInternalServerError) 218 | log.Printf("Error rendering template: %v", err) 219 | return 220 | } 221 | } 222 | } 223 | -------------------------------------------------------------------------------- /outputs/readme.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kubehan/PromAI/f94cf0c0c2ca84d1ebe168599afe6633f6509253/outputs/readme.md -------------------------------------------------------------------------------- /pkg/config/config.go: -------------------------------------------------------------------------------- 1 | package config 2 | 3 | import "PromAI/pkg/notify" 4 | 5 | type Config struct { 6 | PrometheusURL string `yaml:"prometheus_url"` 7 | MetricTypes []MetricType `yaml:"metric_types"` 8 | ProjectName string `yaml:"project_name"` 9 | CronSchedule string 
`yaml:"cron_schedule"` 10 | ReportCleanup struct { 11 | Enabled bool `yaml:"enabled"` 12 | MaxAge int `yaml:"max_age"` 13 | CronSchedule string `yaml:"cron_schedule"` 14 | } `yaml:"report_cleanup"` 15 | Notifications struct { 16 | Dingtalk notify.DingtalkConfig `yaml:"dingtalk"` 17 | Email notify.EmailConfig `yaml:"email"` 18 | } `yaml:"notifications"` 19 | Port string `yaml:"port"` 20 | } 21 | 22 | type MetricType struct { 23 | Type string `yaml:"type"` 24 | Metrics []MetricConfig `yaml:"metrics"` 25 | } 26 | 27 | type MetricConfig struct { 28 | Name string `yaml:"name"` 29 | Description string `yaml:"description"` 30 | Query string `yaml:"query"` 31 | Threshold float64 `yaml:"threshold"` 32 | Unit string `yaml:"unit"` 33 | Labels map[string]string `yaml:"labels"` 34 | ThresholdType string `yaml:"threshold_type"` 35 | } 36 | -------------------------------------------------------------------------------- /pkg/metrics/collector.go: -------------------------------------------------------------------------------- 1 | package metrics 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "html/template" 7 | "log" 8 | "time" 9 | 10 | v1 "github.com/prometheus/client_golang/api/prometheus/v1" 11 | "github.com/prometheus/common/model" 12 | 13 | "PromAI/pkg/config" 14 | "PromAI/pkg/report" 15 | ) 16 | 17 | // Collector 处理指标收集 18 | type Collector struct { 19 | Client PrometheusAPI 20 | config *config.Config 21 | } 22 | 23 | type PrometheusAPI interface { 24 | Query(ctx context.Context, query string, ts time.Time, opts ...v1.Option) (model.Value, v1.Warnings, error) 25 | QueryRange(ctx context.Context, query string, r v1.Range, opts ...v1.Option) (model.Value, v1.Warnings, error) 26 | } 27 | 28 | // NewCollector 创建新的收集器 29 | func NewCollector(client PrometheusAPI, config *config.Config) *Collector { 30 | return &Collector{ 31 | Client: client, 32 | config: config, 33 | } 34 | } 35 | 36 | // CollectMetrics 收集指标数据 37 | func (c *Collector) CollectMetrics() (*report.ReportData, error) { 
38 | ctx := context.Background() 39 | 40 | data := &report.ReportData{ 41 | Timestamp: time.Now(), 42 | MetricGroups: make(map[string]*report.MetricGroup), 43 | ChartData: make(map[string]template.JS), 44 | Project: c.config.ProjectName, 45 | } 46 | 47 | for _, metricType := range c.config.MetricTypes { 48 | group := &report.MetricGroup{ 49 | Type: metricType.Type, 50 | MetricsByName: make(map[string][]report.MetricData), 51 | } 52 | data.MetricGroups[metricType.Type] = group 53 | 54 | for _, metric := range metricType.Metrics { 55 | result, _, err := c.Client.Query(ctx, metric.Query, time.Now()) 56 | if err != nil { 57 | log.Printf("警告: 查询指标 %s 失败: %v", metric.Name, err) 58 | continue 59 | } 60 | log.Printf("指标 [%s] 查询结果: %+v", metric.Name, result) 61 | 62 | switch v := result.(type) { 63 | case model.Vector: 64 | metrics := make([]report.MetricData, 0, len(v)) 65 | for _, sample := range v { 66 | log.Printf("指标 [%s] 原始数据: %+v, 值: %+v", metric.Name, sample.Metric, sample.Value) 67 | 68 | availableLabels := make(map[string]string) 69 | for labelName, labelValue := range sample.Metric { 70 | availableLabels[string(labelName)] = string(labelValue) 71 | } 72 | 73 | labels := make([]report.LabelData, 0, len(metric.Labels)) 74 | for configLabel, configAlias := range metric.Labels { 75 | labelValue := "-" 76 | if rawValue, exists := availableLabels[configLabel]; exists && rawValue != "" { 77 | labelValue = rawValue 78 | } else { 79 | log.Printf("警告: 指标 [%s] 标签 [%s] 缺失或为空", metric.Name, configLabel) 80 | } 81 | 82 | labels = append(labels, report.LabelData{ 83 | Name: configLabel, 84 | Alias: configAlias, 85 | Value: labelValue, 86 | }) 87 | } 88 | 89 | if !validateLabels(labels) { 90 | log.Printf("警告: 指标 [%s] 标签数据不完整,跳过该条记录", metric.Name) 91 | continue 92 | } 93 | 94 | metricData := report.MetricData{ 95 | Name: metric.Name, 96 | Description: metric.Description, 97 | Value: float64(sample.Value), 98 | Threshold: metric.Threshold, 99 | Unit: metric.Unit, 100 | Status: 
// getStatus classifies value against threshold and returns "normal",
// "warning" or "critical".
//
// thresholdType selects the comparison; an empty string means "greater":
//   - "greater":       critical when value > threshold, warning when value is
//     at least 80% of the threshold.
//   - "greater_equal": critical when value >= threshold, warning when value is
//     at least 80% of the threshold.
//   - "less":          normal when value < threshold, warning up to 120% of
//     the threshold, critical beyond that.
//   - "less_equal":    normal when value <= threshold, warning up to 120% of
//     the threshold, critical beyond that.
//   - "equal":         normal only when value == threshold, otherwise critical.
//
// Any unrecognised thresholdType yields "normal".
func getStatus(value, threshold float64, thresholdType string) string {
	const (
		warnBelowFactor = 0.8 // warning band just under a "greater" threshold
		warnAboveFactor = 1.2 // warning band just over a "less" threshold
	)

	if thresholdType == "" {
		thresholdType = "greater"
	}

	switch thresholdType {
	case "greater":
		if value > threshold {
			return "critical"
		}
		if value >= threshold*warnBelowFactor {
			return "warning"
		}
	case "greater_equal":
		if value >= threshold {
			return "critical"
		}
		if value >= threshold*warnBelowFactor {
			return "warning"
		}
	case "less":
		if value < threshold {
			return "normal"
		}
		if value <= threshold*warnAboveFactor {
			return "warning"
		}
		// Previously fell through to "normal", so a value far above the
		// limit looked healthier than one slightly above it.
		return "critical"
	case "less_equal":
		if value <= threshold {
			return "normal"
		}
		if value <= threshold*warnAboveFactor {
			return "warning"
		}
		// Same fall-through fix as the "less" case.
		return "critical"
	case "equal":
		if value == threshold {
			return "normal"
		}
		return "critical"
	}
	return "normal"
}
reportPath string) error { 52 | if !config.Enabled { 53 | log.Printf("钉钉通知未启用") 54 | return nil 55 | } 56 | log.Printf("开始发送钉钉通知...") 57 | // 计算时间戳和签名 58 | timestamp := time.Now().UnixMilli() 59 | sign := calculateDingtalkSign(timestamp, config.Secret) 60 | webhook := fmt.Sprintf("%s×tamp=%d&sign=%s", config.Webhook, timestamp, sign) 61 | 62 | log.Printf("准备发送请求到 webhook: %s", webhook) 63 | // 创建multipart表单 64 | body := &bytes.Buffer{} 65 | writer := multipart.NewWriter(body) 66 | 67 | // 添加文件 68 | file, err := os.Open(reportPath) 69 | if err != nil { 70 | log.Printf("打开文件失败: %v", err) 71 | return fmt.Errorf("打开文件失败: %v", err) 72 | } 73 | defer file.Close() 74 | 75 | part, err := writer.CreateFormFile("file", filepath.Base(reportPath)) 76 | if err != nil { 77 | log.Printf("创建表单文件失败: %v", err) 78 | return fmt.Errorf("创建表单文件失败: %v", err) 79 | } 80 | 81 | fileContent, err := os.ReadFile(reportPath) 82 | if err != nil { 83 | log.Printf("读取文件失败: %v", err) 84 | return fmt.Errorf("读取文件失败: %v", err) 85 | } 86 | part.Write(fileContent) 87 | 88 | // 正确生成报告的访问链接 89 | reportFileName := filepath.Base(reportPath) 90 | reportLink := fmt.Sprintf("%s/reports/%s", config.ReportURL, reportFileName) 91 | 92 | // 添加消息内容 93 | messageContent := map[string]interface{}{ 94 | "msgtype": "markdown", 95 | "markdown": map[string]string{ 96 | "title": "巡检报告", 97 | "text": fmt.Sprintf("## 🔍 巡检报告已生成\n\n"+ 98 | "### ⏰ 生成时间\n"+ 99 | "> %s\n\n"+ 100 | "### 📄 报告详情\n"+ 101 | "- **文件名**:`%s`\n"+ 102 | "- **访问链接**:[点击查看报告](%s)\n\n"+ 103 | "---\n"+ 104 | "💡 请登录环境查看完整报告内容", 105 | time.Now().Format("2006-01-02 15:04:05"), 106 | reportFileName, 107 | reportLink), 108 | }, 109 | } 110 | 111 | jsonData, err := json.Marshal(messageContent) 112 | if err != nil { 113 | log.Printf("JSON编码失败: %v", err) 114 | return fmt.Errorf("JSON编码失败: %v", err) 115 | } 116 | 117 | // 发送请求 118 | req, err := http.NewRequest("POST", webhook, bytes.NewBuffer(jsonData)) 119 | if err != nil { 120 | log.Printf("创建请求失败: %v", err) 121 | 
return fmt.Errorf("创建请求失败: %v", err) 122 | } 123 | req.Header.Set("Content-Type", "application/json") 124 | 125 | client := &http.Client{} 126 | resp, err := client.Do(req) 127 | if err != nil { 128 | log.Printf("发送请求失败: %v", err) 129 | return fmt.Errorf("发送请求失败: %v", err) 130 | } 131 | defer resp.Body.Close() 132 | respBody, _ := io.ReadAll(resp.Body) 133 | log.Printf("钉钉响应状态码: %d, 响应内容: %s", resp.StatusCode, string(respBody)) 134 | 135 | if resp.StatusCode != http.StatusOK { 136 | return fmt.Errorf("钉钉发送失败,状态码: %d", resp.StatusCode) 137 | } 138 | 139 | log.Printf("钉钉通知发送成功") 140 | return nil 141 | } 142 | 143 | // SendEmail 发送邮件通知 144 | func SendEmail(config EmailConfig, reportPath string) error { 145 | if !config.Enabled { 146 | log.Printf("邮件通知未启用") 147 | return nil 148 | } 149 | 150 | log.Printf("开始发送邮件通知...") 151 | log.Printf("SMTP服务器: %s:%d", config.SMTPHost, config.SMTPPort) 152 | log.Printf("发件人: %s", config.From) 153 | log.Printf("收件人: %v", config.To) 154 | 155 | e := email.NewEmail() 156 | e.From = config.From 157 | e.To = config.To 158 | e.Subject = "巡检报告" 159 | 160 | // 正确生成报告的访问链接 161 | reportFileName := filepath.Base(reportPath) 162 | reportLink := fmt.Sprintf("%s/reports/%s", config.ReportURL, reportFileName) 163 | 164 | // 添加更丰富的邮件内容 165 | e.HTML = []byte(fmt.Sprintf(` 166 |

🔍 巡检报告已生成

167 |

生成时间:%s

168 |

报告文件:%s

169 |

在线查看:点击查看报告

170 |

请登录环境查看完整报告内容!

// calculateDingtalkSign builds the signature for a DingTalk webhook call: the
// text "<timestamp>\n<secret>" is authenticated with HMAC-SHA256 keyed by
// secret, the digest is base64-encoded (standard alphabet) and the result is
// query-escaped so it can be appended to the webhook URL.
func calculateDingtalkSign(timestamp int64, secret string) string {
	mac := hmac.New(sha256.New, []byte(secret))
	// hash.Hash is an io.Writer, so the payload can be formatted directly into it.
	fmt.Fprintf(mac, "%d\n%s", timestamp, secret)

	digest := mac.Sum(nil)
	encoded := base64.StdEncoding.EncodeToString(digest)
	return url.QueryEscape(encoded)
}
prometheus client: %w", err) 22 | } 23 | 24 | return &Client{ 25 | API: v1.NewAPI(client), 26 | }, nil 27 | } 28 | -------------------------------------------------------------------------------- /pkg/prometheus/prometheus.go: -------------------------------------------------------------------------------- 1 | package prometheus 2 | 3 | import ( 4 | "io" 5 | "net/http" 6 | ) 7 | 8 | func FetchData(url string) ([]byte, error) { 9 | resp, err := http.Get(url) 10 | if err != nil { 11 | return nil, err 12 | } 13 | defer resp.Body.Close() 14 | return io.ReadAll(resp.Body) 15 | } 16 | -------------------------------------------------------------------------------- /pkg/report/cleanup.go: -------------------------------------------------------------------------------- 1 | package report 2 | 3 | import ( 4 | "log" 5 | "os" 6 | "path/filepath" 7 | "time" 8 | ) 9 | 10 | // CleanupReports 清理旧报告 11 | func CleanupReports(maxAge int) error { 12 | reportsDir := "reports" 13 | now := time.Now() 14 | 15 | // 遍历报告目录 16 | return filepath.Walk(reportsDir, func(path string, info os.FileInfo, err error) error { 17 | if err != nil { 18 | return err 19 | } 20 | 21 | // 跳过目录本身 22 | if path == reportsDir { 23 | return nil 24 | } 25 | 26 | // 检查文件年龄 27 | if info.ModTime().Add(time.Duration(maxAge) * 24 * time.Hour).Before(now) { 28 | if err := os.Remove(path); err != nil { 29 | log.Printf("删除报告文件失败 %s: %v", path, err) 30 | return err 31 | } 32 | log.Printf("已删除过期报告: %s", path) 33 | } 34 | 35 | return nil 36 | }) 37 | } -------------------------------------------------------------------------------- /pkg/report/generator.go: -------------------------------------------------------------------------------- 1 | package report 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "html/template" 7 | "log" 8 | "math" 9 | "os" 10 | "sort" 11 | "time" 12 | ) 13 | 14 | type LabelData struct { 15 | Name string // 原始标签名 16 | Alias string // 显示的别名 17 | Value string // 标签值 18 | } 19 | type GroupStats struct 
// GetStatusText maps a status code to the label shown in reports:
// "critical" and "warning" have dedicated labels, and every other value is
// rendered with the label used for the normal state.
func GetStatusText(status string) string {
	labels := map[string]string{
		"critical": "严重",
		"warning":  "警告",
	}
	if text, known := labels[status]; known {
		return text
	}
	return "正常"
}
// 收集所有唯一的标签值和准备图表数据 104 | labelValuesByMetric := make(map[string]map[string]bool) // 按指标存储唯一标签值 105 | 106 | // 第一次遍历收集每个指标的唯一标签值 107 | for _, group := range data.MetricGroups { 108 | for metricName, metrics := range group.MetricsByName { 109 | metricKey := fmt.Sprintf("%s_%s", group.Type, metricName) 110 | labelValuesByMetric[metricKey] = make(map[string]bool) 111 | // log.Println("指标组:", group.Type, "指标:", metricName, "指标键:", metricKey) 112 | for _, metric := range metrics { 113 | for _, label := range metric.Labels { 114 | labelValuesByMetric[metricKey][label.Value] = true 115 | // log.Println("指标组:", group.Type, "指标:", metricName, "指标键:", metricKey, "标签值:", label.Value) 116 | allLabels[label.Value] = true 117 | 118 | } 119 | } 120 | } 121 | } 122 | 123 | // 第二次遍历按标签值顺序生成图表数据 124 | for _, group := range data.MetricGroups { 125 | for metricName, metrics := range group.MetricsByName { 126 | metricKey := fmt.Sprintf("%s_%s", group.Type, metricName) 127 | metricValues := make(map[string]float64) 128 | // log.Println("指标类型:", group.Type, "指标名称:", metricName, "指标Key:", metricKey) 129 | 130 | // 初始化所有标签值对应的指标值为0 131 | for labelValue := range labelValuesByMetric[metricKey] { 132 | 133 | metricValues[labelValue] = 0 134 | 135 | log.Println("标签值:", labelValue, "指标值:", metricValues[labelValue]) 136 | } 137 | 138 | // 填充实际的指标值 139 | for _, metric := range metrics { 140 | if len(metric.Labels) > 0 { 141 | metricValues[metric.Labels[0].Value] = metric.Value 142 | } 143 | } 144 | 145 | // 按标签值顺序添加到图表数据 146 | chartData[metricKey] = make([]float64, 0) 147 | for labelValue := range labelValuesByMetric[metricKey] { 148 | chartData[metricKey] = append(chartData[metricKey], metricValues[labelValue]) 149 | } 150 | // log.Println("图表数据:", metricKey, "图表数据值:", chartData[metricKey]) 151 | } 152 | } 153 | 154 | // 转换标签为数组并排序 155 | labels := make([]string, 0, len(allLabels)) 156 | for label := range allLabels { 157 | labels = append(labels, label) 158 | } 159 | sort.Strings(labels) 160 | 
161 | // 转换为JSON 162 | labelsJSON, _ := json.Marshal(labels) 163 | data.ChartData["labels"] = template.JS(labelsJSON) 164 | // log.Println("标签:", labels) 165 | // 为每个指标生成图表数据 166 | for key, values := range chartData { 167 | valuesJSON, _ := json.Marshal(values) 168 | data.ChartData[key] = template.JS(valuesJSON) 169 | } 170 | 171 | // 生成报告 172 | tmpl, err := template.ParseFiles("templates/report.html") 173 | if err != nil { 174 | return "", fmt.Errorf("parsing template: %w", err) 175 | } 176 | 177 | // 创建输出文件 178 | filename := fmt.Sprintf("reports/inspection_report_%s.html", time.Now().Format("20060102_150405")) 179 | file, err := os.Create(filename) 180 | if err != nil { 181 | return "", fmt.Errorf("creating output file: %w", err) 182 | } 183 | defer file.Close() 184 | 185 | // 执行模板 186 | if err := tmpl.Execute(file, data); err != nil { 187 | return "", fmt.Errorf("executing template: %w", err) 188 | } 189 | 190 | // log.Println("Report generated successfully:", filename) 191 | log.Printf("项目[%s]报告生成成功: %s", data.Project, filename) 192 | 193 | return filename, nil // 添加返回语句 194 | } 195 | -------------------------------------------------------------------------------- /pkg/status/status.go: -------------------------------------------------------------------------------- 1 | package status 2 | 3 | import ( 4 | "context" 5 | "log" 6 | "time" 7 | 8 | "PromAI/pkg/config" 9 | "PromAI/pkg/metrics" 10 | 11 | v1 "github.com/prometheus/client_golang/api/prometheus/v1" 12 | "github.com/prometheus/common/model" 13 | ) 14 | 15 | // 添加配置相关的类型定义 16 | type Config struct { 17 | PrometheusURL string `yaml:"prometheus_url"` 18 | MetricTypes []MetricType `yaml:"metric_types"` 19 | } 20 | 21 | type MetricType struct { 22 | Type string `yaml:"type"` 23 | Metrics []MetricConfig `yaml:"metrics"` 24 | } 25 | 26 | type MetricConfig struct { 27 | Name string `yaml:"name"` 28 | Description string `yaml:"description"` 29 | Query string `yaml:"query"` 30 | Threshold float64 `yaml:"threshold"` 
31 | Unit string `yaml:"unit"` 32 | Labels map[string]string `yaml:"labels"` 33 | ThresholdType string `yaml:"threshold_type"` 34 | } 35 | 36 | type StatusSummary struct { 37 | Normal int 38 | Warning int // 新增警告状态计数 39 | Abnormal int 40 | TotalMetrics int // 总指标数 41 | TypeCounts map[string]int // 每种类型的指标数量 42 | } 43 | 44 | type MetricStatus struct { 45 | Name string 46 | DailyStatus map[string]string // key是日期,value是状态("normal"/"warning"/"abnormal") 47 | Threshold float64 48 | Unit string 49 | ThresholdType string 50 | } 51 | 52 | type StatusData struct { 53 | Summary StatusSummary 54 | Metrics []MetricStatus 55 | Dates []string 56 | } 57 | 58 | func GenerateStatusData(days int) (*StatusData, error) { 59 | data := &StatusData{ 60 | Summary: StatusSummary{ 61 | TypeCounts: make(map[string]int), // 初始化类型计数map 62 | }, 63 | Metrics: []MetricStatus{}, 64 | Dates: make([]string, days), 65 | } 66 | 67 | // 生成最近n天的日期 68 | now := time.Now() 69 | for i := 0; i < days; i++ { 70 | date := now.AddDate(0, 0, -i) 71 | data.Dates[days-1-i] = date.Format("01-02") // MM-DD格式 72 | } 73 | 74 | return data, nil 75 | } 76 | 77 | func CollectMetricStatus(client metrics.PrometheusAPI, config *config.Config) (*StatusData, error) { 78 | data, err := GenerateStatusData(7) // 显示最近7天的数据 79 | if err != nil { 80 | log.Printf("生成状态数据失败: %v", err) 81 | return nil, err 82 | } 83 | 84 | log.Printf("开始收集指标状态数据,时间范围: %v", data.Dates) 85 | 86 | // 遍历所有指标类型 87 | for _, metricType := range config.MetricTypes { 88 | log.Printf("处理指标类型: %s", metricType.Type) 89 | 90 | // 统计每种类型的指标数量 91 | data.Summary.TypeCounts[metricType.Type] = len(metricType.Metrics) 92 | // 累加总指标数 93 | data.Summary.TotalMetrics += len(metricType.Metrics) 94 | 95 | // 遍历每个指标 96 | for _, metric := range metricType.Metrics { 97 | log.Printf("处理指标: %s (阈值: %v %s, 阈值类型: %s)", 98 | metric.Name, metric.Threshold, metric.Unit, metric.ThresholdType) 99 | 100 | metricStatus := MetricStatus{ 101 | Name: metric.Name, 102 | DailyStatus: 
make(map[string]string), 103 | Threshold: metric.Threshold, 104 | Unit: metric.Unit, 105 | ThresholdType: metric.ThresholdType, 106 | } 107 | 108 | // 查询每天的状态 109 | for _, date := range data.Dates { 110 | status, err := queryMetricStatus(client, metric, date) 111 | if err != nil { 112 | log.Printf("查询指标 [%s] 在 %s 的状态失败: %v", metric.Name, date, err) 113 | metricStatus.DailyStatus[date] = "abnormal" 114 | data.Summary.Abnormal++ 115 | } else { 116 | metricStatus.DailyStatus[date] = status 117 | switch status { 118 | case "normal": 119 | log.Printf("指标 [%s] 在 %s 状态正常", metric.Name, date) 120 | data.Summary.Normal++ 121 | case "warning": 122 | log.Printf("指标 [%s] 在 %s 状态警告", metric.Name, date) 123 | data.Summary.Warning++ 124 | case "abnormal": 125 | log.Printf("指标 [%s] 在 %s 状态异常", metric.Name, date) 126 | data.Summary.Abnormal++ 127 | } 128 | } 129 | } 130 | 131 | data.Metrics = append(data.Metrics, metricStatus) 132 | } 133 | } 134 | 135 | log.Printf("状态数据收集完成. 总指标数: %d, 正常: %d, 警告: %d, 异常: %d", 136 | data.Summary.TotalMetrics, data.Summary.Normal, data.Summary.Warning, data.Summary.Abnormal) 137 | 138 | // 打印每种类型的指标数量 139 | for typeName, count := range data.Summary.TypeCounts { 140 | log.Printf("指标类型 [%s] 包含 %d 个指标", typeName, count) 141 | } 142 | 143 | return data, nil 144 | } 145 | 146 | func queryMetricStatus(client metrics.PrometheusAPI, metric config.MetricConfig, date string) (string, error) { 147 | ctx := context.Background() 148 | 149 | dateTime, err := time.Parse("01-02", date) 150 | if err != nil { 151 | return "abnormal", err 152 | } 153 | 154 | // 设置查询时间范围为那一天的0点到23:59:59 155 | startTime := time.Date(time.Now().Year(), dateTime.Month(), dateTime.Day(), 0, 0, 0, 0, time.Local) 156 | endTime := startTime.Add(24 * time.Hour).Add(-time.Second) 157 | 158 | log.Printf(` 159 | 查询指标: [%s] 160 | 时间范围: %s 到 %s 161 | PromQL: %s 162 | 调试步骤: 163 | 1. 打开 Prometheus UI 164 | 2. 粘贴查询: %s 165 | 3. 
设置时间范围为: %s 到 %s 166 | -------------------`, 167 | metric.Name, 168 | startTime.Format("2006-01-02 15:04:05"), 169 | endTime.Format("2006-01-02 15:04:05"), 170 | metric.Query, 171 | metric.Query, 172 | startTime.Format("2006-01-02 15:04:05"), 173 | endTime.Format("2006-01-02 15:04:05")) 174 | 175 | // 直接使用原始查询语句 176 | result, _, err := client.QueryRange(ctx, metric.Query, v1.Range{ 177 | Start: startTime, 178 | End: endTime, 179 | Step: time.Hour, // 每小时一个采样点 180 | }) 181 | 182 | if err != nil { 183 | log.Printf("执行查询失败 [%s]: %v", metric.Query, err) 184 | return "abnormal", err 185 | } 186 | 187 | switch v := result.(type) { 188 | case model.Matrix: 189 | if len(v) == 0 { 190 | log.Printf("指标 [%s] 查询结果为空", metric.Name) 191 | return "abnormal", nil 192 | } 193 | 194 | log.Printf("指标 [%s] 返回 %d 个时间序列", metric.Name, len(v)) 195 | 196 | maxValue := float64(0) 197 | // 遍历每个时间序列 198 | for _, series := range v { 199 | // 遍历每个采样点,找出最大值 200 | for _, sample := range series.Values { 201 | value := float64(sample.Value) 202 | if value > maxValue { 203 | maxValue = value 204 | } 205 | log.Printf("指标 [%s] 时间: %v, 值: %v", 206 | metric.Name, 207 | sample.Timestamp.Time().Format("15:04:05"), 208 | value) 209 | } 210 | } 211 | 212 | // 使用最大值进行阈值判断 213 | status := checkThreshold(maxValue, metric.Threshold, metric.ThresholdType) 214 | log.Printf("指标 [%s] 最大值: %v, 阈值: %v, 阈值类型: %s, 状态: %s", 215 | metric.Name, 216 | maxValue, 217 | metric.Threshold, 218 | metric.ThresholdType, 219 | status) 220 | 221 | return status, nil 222 | 223 | default: 224 | log.Printf("指标 [%s] 返回了意外的结果类型: %T", metric.Name, result) 225 | return "abnormal", nil 226 | } 227 | } 228 | 229 | // 根据阈值类型判断状态 230 | func checkThreshold(value, threshold float64, thresholdType string) string { 231 | if thresholdType == "" { 232 | thresholdType = "greater" // 默认值 233 | } 234 | 235 | // 警告阈值为正常阈值的90% 236 | warningFactor := 0.9 237 | 238 | switch thresholdType { 239 | case "greater": 240 | // 当值大于阈值时告警 241 | // 例如:CPU使用率 > 80% 
// checkThreshold classifies value against threshold and returns "normal",
// "warning" or "abnormal".
//
// thresholdType selects the comparison that triggers an alert:
//
//	"greater"       value >  threshold  (also used for "" and unknown types)
//	"greater_equal" value >= threshold
//	"less"          value <  threshold
//	"less_equal"    value <= threshold
//	"equal"         normal only when value == threshold (no warning state)
//	"not_equal"     normal only when value != threshold (no warning state)
//
// For the four ordered comparisons a value that has not crossed the threshold
// but is within the warning band next to it is reported as "warning". The band
// is derived from a factor of 0.9: for thresholds >= 0 it spans
// threshold*0.9 .. threshold on the low side and threshold .. threshold/0.9 on
// the high side. For negative thresholds the two bounds swap roles so the band
// still sits on the approaching side of the threshold.
func checkThreshold(value, threshold float64, thresholdType string) string {
	const warningFactor = 0.9

	// lowerWarn < threshold < upperWarn for any non-zero threshold.
	lowerWarn, upperWarn := threshold*warningFactor, threshold/warningFactor
	if threshold < 0 {
		// Multiplying a negative number by 0.9 moves it towards zero, i.e. up.
		lowerWarn, upperWarn = upperWarn, lowerWarn
	}

	switch thresholdType {
	case "greater_equal":
		if value >= threshold {
			return "abnormal"
		}
		if value >= lowerWarn {
			return "warning"
		}
		return "normal"
	case "less":
		// e.g. available nodes < 3 raises an alert
		if value < threshold {
			return "abnormal"
		}
		if value < upperWarn {
			return "warning"
		}
		return "normal"
	case "less_equal":
		if value <= threshold {
			return "abnormal"
		}
		if value <= upperWarn {
			return "warning"
		}
		return "normal"
	case "equal":
		if value == threshold {
			return "normal"
		}
		return "abnormal"
	case "not_equal":
		if value != threshold {
			return "normal"
		}
		return "abnormal"
	default:
		// "greater", the empty string and any unknown type:
		// e.g. CPU usage > 80% raises an alert
		if value > threshold {
			return "abnormal"
		}
		if value > lowerWarn {
			return "warning"
		}
		return "normal"
	}
}
!ipnet.IP.IsLoopback() && ipnet.IP.To4() != nil { 32 | ips = append(ips, ipnet.IP.String()) 33 | } 34 | } 35 | } 36 | } 37 | return ips 38 | } 39 | 40 | func SetGlobalPort(port string) { 41 | globalPort = port 42 | } 43 | 44 | func GetGlobalPort() string { 45 | return globalPort 46 | } 47 | 48 | 49 | -------------------------------------------------------------------------------- /reports/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kubehan/PromAI/f94cf0c0c2ca84d1ebe168599afe6633f6509253/reports/.DS_Store -------------------------------------------------------------------------------- /templates/report.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {{.Project}} 6 | 7 | 154 | 155 | 156 |
157 | 158 |

{{.Project}}

159 |

生成时间: {{.Timestamp.Format "2006-01-02 15:04:05"}}

160 | 161 | 162 |
163 | {{range $type, $group := .MetricGroups}} 164 |
165 |

{{$type}}

166 |
167 |
168 |
最大值
169 |
{{printf "%.2f" $group.Stats.MaxValue}}
170 |
171 |
172 |
最小值
173 |
{{printf "%.2f" $group.Stats.MinValue}}
174 |
175 | 179 |
180 |
告警数
181 |
182 | {{$group.Stats.AlertCount}}/{{$group.Stats.TotalCount}} 183 |
184 |
185 |
186 |
告警详情
187 |
188 | 严重:{{$group.Stats.CriticalCount}} 189 | 警告:{{$group.Stats.WarningCount}} 190 |
191 |
192 |
193 |
194 | {{end}} 195 |
196 | 197 | 198 |
199 |

资源使用概览

200 |
201 | 202 |
203 |
204 | 205 | 206 | {{range $type, $group := .MetricGroups}} 207 |
208 |

{{$type}} 监控指标

209 | {{range $metricName, $metrics := $group.MetricsByName}} 210 |

  • {{$metricName}}
  • 211 | {{if eq (len $metrics) 0 }} 212 |

    未查询到数据

    213 | {{end}} 214 | {{if gt (len $metrics) 0}} 215 | 216 | 217 | 218 | {{$headerLabels := (index $metrics 0).Labels}} 219 | {{range $headerLabels}} 220 | 221 | {{end}} 222 | 223 | 224 | 225 | 226 | {{range $metric := $metrics}} 227 | 228 | 229 | {{range $headerLabels}} 230 | {{$labelName := .Name}} 231 | {{range $metricLabel := $metric.Labels}} 232 | {{if eq $metricLabel.Name $labelName}} 233 | 236 | {{end}} 237 | {{end}} 238 | {{end}} 239 | 240 | 247 | 248 | 249 | {{end}} 250 |
    指标名称{{.Alias}}状态检测时间
    {{.Name}} 234 | {{$metricLabel.Value}} 235 | {{printf "%.2f" $metric.Value}}{{$metric.Unit}} 241 | {{if eq .Status "normal"}}正常 242 | {{else if eq .Status "warning"}}警告 243 | {{else if eq .Status "critical"}}严重 244 | {{else}}{{.Status}} 245 | {{end}} 246 | {{.Timestamp.Format "2006-01-02 15:04:05"}}
    251 | {{end}} 252 | {{end}} 253 |
    254 | {{end}} 255 |
    256 | 257 | 386 | 387 | -------------------------------------------------------------------------------- /templates/status.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 服务健康看板 5 | 6 | 7 | 217 | 218 | 219 |
    220 |
    221 |

    服务健康看板

    222 |
    223 | 最后更新时间: {{now | date "2006-01-02 15:04:05"}} 224 |
    225 |
    226 | 227 |
    228 |
    229 |

    总指标数

    230 |
    {{.Summary.TotalMetrics}}
    231 |
    232 |
    233 |

    正常服务

    234 |
    {{.Summary.Normal}}
    235 |
    236 |
    237 |

    异常服务

    238 |
    {{.Summary.Abnormal}}
    239 |
    240 |
    241 |

    警告服务

    242 |
    {{.Summary.Warning}}
    243 |
    244 |
    245 | 246 |
    247 |

    指标类型统计

    248 |
    249 | {{range $type, $count := .Summary.TypeCounts}} 250 |
    251 |

    {{$type}}

    252 |
    {{$count}}
    253 |
    254 | {{end}} 255 |
    256 |
    257 | 258 |
    259 | 260 | 261 | 262 | 263 | {{range $date := .Dates}} 264 | 265 | {{end}} 266 | 267 | 268 | 269 | {{range $metric := .Metrics}} 270 | 271 | 290 | {{range $date := $.Dates}} 291 | 302 | {{end}} 303 | 304 | {{end}} 305 | 306 |
    指标信息{{$date}}
    272 |
    {{$metric.Name}}
    273 |
    274 | 阈值: {{$metric.Threshold}}{{$metric.Unit}} 275 | {{if eq $metric.ThresholdType "greater"}} 276 | (>报警) 277 | {{else if eq $metric.ThresholdType "greater_equal"}} 278 | (>=报警) 279 | {{else if eq $metric.ThresholdType "less"}} 280 | (<报警) 281 | {{else if eq $metric.ThresholdType "less_equal"}} 282 | (<=报警) 283 | {{else if eq $metric.ThresholdType "equal"}} 284 | (=正常) 285 | {{else if eq $metric.ThresholdType "not_equal"}} 286 | (!=正常) 287 | {{end}} 288 |
    289 |
    292 | 293 | {{if eq (index $metric.DailyStatus $date) "normal"}} 294 | ✓ 295 | {{else if eq (index $metric.DailyStatus $date) "warning"}} 296 | ⚠ 297 | {{else}} 298 | ✗ 299 | {{end}} 300 | 301 |
    307 |
    308 |
    309 | 310 | 311 | --------------------------------------------------------------------------------