├── .gitignore ├── .nvmrc ├── AUTHOR.md ├── CHANGELOGS.md ├── CHECKLIST.md ├── Introduction.md ├── PLACE_HOLDER.md ├── README.md ├── REFERENCES.md ├── SUMMARY.md ├── TODO.md ├── Vagrantfile ├── alert ├── README.md ├── SUMMARY.md ├── alert-manager-config.md ├── alert-manager-extension-with-webhook.md ├── alert-manager-inhibit.md ├── alert-manager-mute.md ├── alert-manager-route.md ├── alert-manager-use-receiver.md ├── alert-template.md ├── alert-with-slack.md ├── alert-with-smtp.md ├── alert-with-wechat.md ├── install-alert-manager.md ├── prometheus-alert-manager-overview.md ├── prometheus-alert-rule.md ├── prometheus-recoding-rules.md └── static │ ├── add-incomming-webhooks.png │ ├── alertmanager-active-silences.png │ ├── alertmanager-alert.png │ ├── alertmanager-dingtalk-test-result.png │ ├── alertmanager-features.png │ ├── alertmanager-new-slicense.png │ ├── alertmanager-slicense-alerts-result.png │ ├── alertmanager.png │ ├── custom-slack-message.png │ ├── dingding-group-robot.png │ ├── dingtalk-message-test.png │ ├── dingtalk-robot-create-webhook.png │ ├── incomming-webhooks-setting.png │ ├── mail-alert-page.png │ ├── node_cpu_alert_firing.png │ ├── node_cpu_alert_pending.png │ ├── node_cpu_usgae_high.png │ ├── prometheus-alert-artich.png │ ├── prometheus-alert-firing-with-manager.png │ ├── prometheus-ui-alert (1).png │ ├── prometheus-ui-alert.png │ ├── prometheus-ui-rules.png │ ├── slack-channel.png │ ├── slack-create-channel.png │ ├── slack-overview.png │ ├── slack-receiver-message.png │ ├── slack_alert_message.png │ ├── slack_resolved_message.png │ └── wechat-alert-page.png ├── book.json ├── docker-compose.yml ├── draft ├── alert-with-wechat.md ├── share_dashboard.md ├── use-federation-in-operator.md └── use_table_panel.md ├── examples ├── ch1 │ ├── README.md │ ├── Vagrantfile │ ├── install.sh │ ├── node_exporter.service │ ├── prometheus.service │ ├── prometheus.yml │ └── ubuntu-xenial-16.04-cloudimg-console.log ├── ch7 │ ├── Vagrantfile │ ├── install.sh │ ├── install_federation.sh │ ├── node_exporter.service │ ├── prometheus.service │ ├── prometheus_federation.yml │ ├── prometheus_slave.yml │ └── ubuntu-xenial-16.04-cloudimg-console.log ├── cluster │ ├── alertmanager.procfile │ ├── docker-compose.yml │ ├── prometheus.procfile │ ├── send-alert.sh │ └── xxx.log ├── grafana │ ├── host_container_dashboard.json │ └── prometheus_usage.json ├── kubernetes │ ├── README.md │ ├── nginx │ │ ├── nginx-deployment.yml │ │ └── nginx-service.yml │ ├── operator │ │ ├── demo-deploy.yml │ │ ├── demo-service-monitor.yml │ │ ├── prometheus-operator-deploy.yml │ │ ├── prometheus-svc.yml │ │ ├── rbac-setup.yml │ │ └── service-monitor-selector.yml │ └── prometheus │ │ ├── blackbox-exporter-deployment.yml │ │ ├── kubernetes-prometheus-eq1.yml │ │ ├── kubernetes-prometheus.yml │ │ ├── node-exporter-daemonset.yml │ │ ├── prometheus-config-eq1.yml │ │ ├── prometheus-config-eq2.yml │ │ ├── prometheus-config-eq3.yml │ │ ├── prometheus-config-eq4.yml │ │ ├── prometheus-config-sd-example.yml │ │ ├── prometheus-config.yml │ │ ├── prometheus-deployment.yml │ │ ├── prometheus-ingress.yml │ │ └── prometheus-rbac-setup.yml ├── mysql_exporter │ └── docker-compose.yml ├── operator │ ├── README.md │ ├── minikube.sh │ ├── prometheus-k8s.yml │ └── prometheus-operator.yml ├── prometheus-operator │ ├── 00prometheus.yaml │ ├── 01prometheus.yaml │ ├── 02prometheus.yaml │ ├── 03prometheus.yaml │ ├── alertmanager-service.yaml │ ├── alertmanager-setup.yaml │ ├── alertmanager-svc.yaml │ ├── alertmanager.yaml │ ├── 
example-app-monitor.yaml │ ├── example-app.yaml │ ├── example-rule.yaml │ ├── node-exporter-daemonset.yaml │ ├── prometheus-operator.yaml │ ├── prometheus-rbac-setup.yaml │ └── prometheus-svc.yaml └── standalone │ ├── README.md │ ├── alertmanager.yml │ ├── prometheus.procfile │ ├── prometheus.yml │ ├── reload.sh │ └── rules │ └── hoststats-alert.rule ├── exporter ├── README.md ├── SUMMARY.md ├── client_library_java.md ├── commonly-eporter-usage.md ├── custom_app_support_prometheus.md ├── custom_exporter_with_java.md ├── install_blackbox_exporter.md ├── static │ ├── blackbox-whitebox-tower.png │ ├── cadvisor-total-usage.png │ ├── container_fs_reads_bytes_total.png │ ├── container_fs_writes_bytes_total.png │ ├── container_network_receive_bytes_total.png │ ├── container_network_transmit_bytes_total.png │ ├── mysqld_exporter_target_stats.png │ ├── prometheus-exporter.png │ ├── prometheus_client_java_2.png │ ├── prometheus_targetes_with_cadvisor.png │ ├── promql_container_cpu_usage.png │ └── relabel_blackbox_targets.png ├── use-prometheus-monitor-container.md ├── use-promethues-monitor-mysql.md └── what-is-prometheus-exporter.md ├── grafana ├── README.md ├── SUMMARY.md ├── grafana-intro.md ├── grafana-panels.md ├── static │ ├── consoles_index.png │ ├── custom_index_head.png │ ├── dashboard-components.png │ ├── grafana-dashboard-example.png │ ├── grafana-framework.png │ ├── grafana_bucket_demo.png │ ├── grafana_bucket_setting.png │ ├── grafana_dashboard_add_panel.png │ ├── grafana_edit_panel.png │ ├── grafana_format_as_table.png │ ├── grafana_graph_counter_demo_axes.png │ ├── grafana_graph_counter_demo_display_draw.png │ ├── grafana_graph_counter_demo_legend.png │ ├── grafana_graph_counter_demo_legend_sample.png │ ├── grafana_graph_counter_demo_metrics.png │ ├── grafana_graph_counter_demo_metrics_legend.png │ ├── grafana_graph_counter_demo_v2.png │ ├── grafana_graph_panel.png │ ├── grafana_heatmap_axes_setting.png │ ├── grafana_heatmap_editor.png │ ├── grafana_heatmap_metrics_setting.png │ ├── grafana_heatmap_normal_axes.png │ ├── grafana_heatmap_normal_metrics.png │ ├── grafana_heatmap_normal_sample.png │ ├── grafana_heatmap_sample.png │ ├── grafana_panel_general.png │ ├── grafana_prometheus_datasources.png │ ├── grafana_query_editor_inspector.png │ ├── grafana_series_overrides.png │ ├── grafana_series_overrides_demo.png │ ├── grafana_single_stat_edit_options.png │ ├── grafana_single_stat_edit_value_mapping.png │ ├── grafana_single_stat_edit_value_mapping_emoji.png │ ├── grafana_single_stat_sample.png │ ├── grafana_singlestat_sample.png │ ├── grafana_table_panel_cloum_style.png │ ├── grafana_table_panel_example2.png │ ├── grafana_templating_add_variables.png │ ├── grafana_templating_query_result.png │ ├── grafana_templating_query_variables3.png │ ├── grafana_templating_repeat_e2.png │ ├── grafana_templating_repeat_e3 (1).png │ ├── grafana_templating_repeat_e3.png │ ├── grafana_templating_repeat_example1.png │ ├── grafana_templating_repeat_row.png │ ├── grafana_templating_repeat_var.png │ ├── grafana_templating_variables_example.png │ ├── grafana_templating_variables_example1.png │ ├── grafana_templating_variables_filter.png │ ├── grafana_thresholds_demo.png │ ├── graph_prometheus_query_editor.png │ ├── head.png │ ├── prom_graph_timecontrol.png │ └── query_graph.png ├── templating.md ├── use-console-template.md ├── use_graph_panel.md ├── use_heatmap_panel.md └── use_singlestat_panel.md ├── ha ├── READMD.md ├── SUMMARY.md ├── alertmanager-high-availability.md ├── 
prometheus-and-high-availability.md ├── prometheus-local-storage.md ├── prometheus-remote-storage.md ├── scale-prometheus-with-federation.md └── static │ ├── alertmanager-features.png │ ├── alertmanager-gossip-ha.png │ ├── am-gossip.png │ ├── am-ha-status.png │ ├── am-notifi-pipeline.png │ ├── gossip-protoctl.png │ ├── prom-ha-with-am-gossip.png │ ├── prom-ha-with-double-am.png │ ├── prom-ha-with-single-am.png │ ├── prometheus-ha-remote-storage.png │ ├── prometheus-ha-rs-fedreation.png │ ├── prometheus_feradtion.png │ ├── prometheus_feradtion_2.png │ ├── promethues-alertmanager-ha.png │ ├── promethues-ha-01.png │ ├── promethues-remote-storage.png │ ├── promethues-sharding-targets.png │ ├── remote-storage-paths.png │ ├── remote-write-path-2.png │ └── remote_read_path-2.png ├── kubernetes ├── READMD.md ├── SUMMARY.md ├── deploy-prometheus-in-kubernetes.md ├── hap-with-prometheus.md ├── kubernetes-with-minikube.md ├── prometheus-with-kubernetes.md ├── service-discovery-with-kubernetes.md ├── static │ ├── k8s-sd-with-node-with-relabel-1.png │ ├── k8s-sd-with-node-with-relabel-2.png │ ├── k8s-service-endpoints.png │ ├── kubelet_pod_start_latency_microseconds (1).png │ ├── kubelet_pod_start_latency_microseconds.png │ ├── kubelet_pod_start_latency_microseconds_avg (1).png │ ├── kubelet_pod_start_latency_microseconds_avg.png │ ├── kubernetes-apiservers-monitor.png │ ├── kubernetes-app-model.png │ ├── kubernetes-artch-overview (1).png │ ├── kubernetes-artch-overview.png │ ├── kubernetes-dashboard.png │ ├── kubernetes-kubelets-step2.png │ ├── kubernetes-kubelets-step3.png │ ├── kubernetes-prometheus-step1.png │ ├── kubernetes-service-endpoints-sd-targets.png │ ├── kubernetes-service-endpoints-sd.png │ ├── kubernetes_service_endpoints.png │ ├── nginx-home-page.png │ ├── pre-ccm-arch.png │ ├── prometheus-alert-cluster-status.png │ ├── prometheus-alerting-auto2.png │ ├── prometheus-architecture.png │ ├── prometheus-cadvisor-step1.png │ ├── prometheus-cadvisor-step2.png │ ├── prometheus-config-with-servermonitor.png │ ├── prometheus-k8s-sd-example1.png │ ├── prometheus-k8s-sd-example3.png │ ├── prometheus-operator-instance.png │ ├── prometheus-operator-targets.png │ ├── prometheus-pods-sd-ex1.png │ ├── prometheus-rule.png │ └── promethues-api-server-sd.eq1.png ├── use-alertmanager-operator.md ├── use-prometheus-monitor-containers-in-k8s.md ├── use-prometheus-monitor-k8s-cluster-state.md ├── use-prometheus-monitor-k8s-svc-and-ingress-state.md ├── use-prometheus-monitor-kubernetes.md └── use-promethues-monitor-node-in-k8s.md ├── operator ├── README.md ├── SUMMARY.md ├── gs │ ├── alertmanager-inst.yaml │ ├── alertmanager.yaml │ ├── example-app-service-monitor.yaml │ ├── example-app.yaml │ ├── example-rule.yaml │ ├── prometheus-inst-cc.yaml │ ├── prometheus-inst.yaml │ ├── prometheus-rbac.yaml │ └── prometheus.yaml ├── static │ ├── operator-01.png │ ├── prometheus-alert-cluster-status.png │ ├── prometheus-alerting-auto2.png │ ├── prometheus-architecture.png │ └── prometheus-rule.png ├── use-custom-configuration-in-operator.md ├── use-operator-manage-monitor.md ├── use-operator-manage-prometheus.md └── what-is-prometheus-operator.md ├── package.json ├── prometheus ├── prometheus_consul.yml └── prometheus_static.yml ├── promql ├── README.md ├── SUMMARY.md ├── prometheus-aggr-ops.md ├── prometheus-metrics-types.md ├── prometheus-promql-best-praticase.md ├── prometheus-promql-functions.md ├── prometheus-promql-operators-v2.md ├── prometheus-promql-with-http-api.md ├── prometheus-query-language.md ├── static │ 
├── USEMethod.png │ ├── counter-to-rate.png │ ├── histogram_quantile.png │ └── rate_vs_irate.png └── what-is-prometheus-metrics-and-labels.md ├── quickstart ├── README.md ├── SUMMARY.md ├── install-prometheus-server.md ├── prometheus-arch.md ├── prometheus-job-and-instance.md ├── prometheus-quick-start.md ├── promql_quickstart.md ├── static │ ├── add_default_prometheus_datasource.png │ ├── architecture.svg │ ├── first_grafana_dashboard.png │ ├── get_start_with_grafana2.png │ ├── grafana_dashboards.png │ ├── monitor-internal.png │ ├── nagios-platform.png │ ├── nagios-ui.png │ ├── node_cpu_usage_by_cpu_and_mode.png │ ├── node_cpu_usage_by_mode.png │ ├── node_cpu_usage_total.png │ ├── node_exporter_dashboard.png │ ├── node_exporter_home_page.png │ ├── node_exporter_metrics_page.png │ ├── node_node1_graph.png │ ├── prometheus-release-roadmaps.png │ ├── prometheus-ui-graph.png │ ├── prometheus_architecture.png │ ├── prometheus_ui_graph_query.png │ ├── prometheus_ui_targets.png │ ├── prometheus_ui_targets_status.png │ ├── prometheus_ui_up_query.png │ └── pullvspush.png ├── use-grafana-create-dashboard.md ├── use-node-exporter.md └── why-monitor.md ├── rancher └── README.md ├── samples └── client_java_samples │ ├── .gitignore │ ├── build.gradle │ ├── gradle │ └── wrapper │ │ ├── gradle-wrapper.jar │ │ └── gradle-wrapper.properties │ ├── gradlew │ ├── gradlew.bat │ ├── settings.gradle │ └── src │ └── main │ └── java │ └── io │ └── prometheus │ └── client │ └── sample │ ├── CustomExporter.java │ ├── collectors │ ├── YourCustomCollector.java │ └── YourCustomCollector2.java │ ├── metrics │ ├── GaugeExample.java │ ├── HistogramExample.java │ └── SummaryExample.java │ ├── mock │ └── Request.java │ └── push │ └── PushGatewayIntegration.java ├── sd ├── README.md ├── SUMMARY.md ├── service-discovery-with-consul.md ├── service-discovery-with-file.md ├── service-discovery-with-relabel.md ├── static │ ├── alertmanager-slicense-alerts-result.png │ ├── aws_autoscaling.png │ ├── bolg_sd_mutil_cluster.png │ ├── consul_ui_page.png │ ├── prometheus-sd.png │ ├── prometheus_file_target_metadata.png │ ├── pulls_vs_push.png │ ├── relabel_hash_mode.png │ ├── service_ds_with_file.png │ └── when-relabel-work.png └── why-need-service-discovery.md └── sources ├── blackbox-dns-probe.md ├── blackbox-http-probe.md ├── blackbox-icmp-probe.md ├── blackbox-ipvx-probe.md ├── blackbox-tcp-probe.md ├── comparison_with_other.md ├── custom_metrics_with_client_library.md ├── custom_metrics_with_java_sdk.md ├── expose-cluster-level-metrics-with-kube-state-metrics.md ├── prometheus-promql-operators.md ├── prometheus-quick-start-node-exporter.md ├── prometheus-storage-v2.md ├── prometheus-time-series-selectors.md ├── static ├── custom_collector.png ├── host_stats_cpu.png ├── host_stats_mem_used.png ├── host_status_disk_io.png ├── node_cpu.png ├── node_exporter_metrics.png ├── node_exporter_targets.png └── prometheus_client_java_2.png ├── the-advantage-of-prometheus.md ├── use-prometheus-monitor-nginx.md ├── use-prometheus-monitor-rabbitmq.md └── what-is-prometheus.md /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | _book 3 | .vscode 4 | .vagrant 5 | npm-debug.log 6 | **/**/.vagrant 7 | *.pdf 8 | examples/localhost 9 | .DS_Store 10 | **/**/.DS_Store -------------------------------------------------------------------------------- /.nvmrc: -------------------------------------------------------------------------------- 1 | v6.9.1 
-------------------------------------------------------------------------------- /AUTHOR.md: -------------------------------------------------------------------------------- 1 | # 作者介绍 2 | 3 | 郑云龙,全栈工程师,CNCF基金会Certified Kubernetes Administrator。在敏捷和DevOps领域有丰富的实践经验,曾作为敏捷和DevOps技术教练向多家大型企业提供咨询和培训。当前在一家容器创业公司负责CaaS产品研发和设计。 4 | 5 | ## Prometheus操作指南:云原生监控之道 6 | 7 | “You can't fix what you can't see”。 Prometheus被誉为下一代监控系统的首选,是继Kubernetes之后的第一个CNCF基金会顶级项目。本书是一本介绍Prometheus以及周边相关技术的实践书籍。本书主要分为三个部分。第一部分,主要通过一个实际的案例介绍Prometheus是什么、能做什么,以及Prometheus的基础架构,让读者对Prometheus有一个基本的认识。第二部分,本书则重点放在Prometheus的高级实践,包括社区提供的已经实现的Exporter的使用场景以及方法,以及教会读者如何通过Prometheus提供的Client Library根据自身需求创建自定义的Exporter,以及如何使用Prometheus的服务发现能力,提升平台的动态能力。在第二部分的最后我们会介绍如何对Prometheus进行运维管理,根据需求的不同实现Prometheus的扩容,以及如何通过一些其他的开源工具如Prometheus Operator实现对Prometheus的管理。最后,在第三部分,我们将会在Kubernetes下基于Prometheus构建我们的容器云监控平台,并且基于监控数据实现应用的Auto Scaling。 8 | 9 | 在通读这些内容后,相信读者能够对Prometheus有一个全面的认识。 -------------------------------------------------------------------------------- /CHANGELOGS.md: -------------------------------------------------------------------------------- 1 | ## 版本更新历史 2 | 3 | 4月变更内容: 4 | 5 | * 修改Feedback反馈的内容 6 | * 在“初识Prometheus小节”中添加Grafana可视化内容; 7 | * 重构第1章,Prometheus简介小节 8 | * 重构第2章,理解时间序列小节 9 | * 重构第2章,Metrics类型小节 10 | * 重构第2章,PromQL内置函数 11 | * 第4章,添加使用Blackbox进行黑盒监控 12 | * 第7章,Prometheus与服务发现 13 | * 第7章,服务发现与Relabel 14 | * 第7章 基于文件的服务发现 15 | * 第7章 基于Consul的服务发现 16 | * 第8章,初识Kubernetes 17 | * 第8章,部署Prometheus 18 | * 第8章 Kubernetes下的服务发现 19 | * 第8章 监控Kubernetes中的容器 20 | * 第8章 监控Kubernetes集群中的节点 21 | * 第8章 监控Kubernetes集群状态 -------------------------------------------------------------------------------- /CHECKLIST.md: -------------------------------------------------------------------------------- 1 | # 检查清单 2 | 3 | * 标点符号是否使用正确:逗号、顿号、句号。 4 | * 语言表达是否太过口语化。 5 | * 科技类图示,凡是章节号都应该使用阿拉伯数字。 6 | * 是否有明显的错别字。 7 | * 是否区分了"地"和"的"。 8 | * 术语在第一次出现的时候,如果标注了英文,后面就可以直接使用中文,或者直接使用英文。 9 | * 术语的写法要规范,MySQL, CPU。 10 | * 文中的图,应该有图号,图题,并且在文中应该对图进行一点说明。 11 | * 文中的图例,需要考虑印刷效果。 12 | 13 | * 关于术语的表述一定要统一,不能一会儿使用中文,一会儿又使用英文。如果对于标签这个词,要使用英文的话,除了在第一次出现的时候给出对应的中文,后面就应该一直使用label这个词(而且首字母的大小写要一致) 14 | 15 | 常见拼写错误: 16 | 17 | | 源 | 错误 | 18 | |--|---| 19 | | 反映 | 反应| 20 | | 称为 | 成为 | -------------------------------------------------------------------------------- /Introduction.md: -------------------------------------------------------------------------------- 1 | # 全书组织 2 | 3 | 这里假定你已经对Linux系统以及Docker技术有一定的基本认识,也可能使用过像Java,Golang这样的编程语言,在本书中我们不会事无巨细的讲述所有事。 4 | 5 | 第1章,是Prometheus基础的综述,通过一个简单案例(使用Prometheus采集主机的监控数据)来了解Prometheus是什么,能做什么,以及它的架构组成。通过阅读本章希望读者能对Prometheus有一个基本的理解和认识。 6 | 7 | 第2章,读者将会了解到Prometheus的数据模型,以及时间序列模型。同时会学习到如何利用Prometheus的数据查询语言PromQL(Prometheus Query Language)对监控数据进行查询、聚合、计算等。 8 | 9 | 第3章,我们的重点将放在监控告警部分,作为监控系统的重要能力之一,我们希望能够及时的了解系统的变化。这一章中读者将学习如何在Prometheus中自定义告警规则,同时了解如何使用AlertManager对告警进行处理。 10 | 11 | 第4章,介绍Prometheus中一些常用的Exporter的使用场景以及使用方法。之后还会带领读者通过Java和Golang实现自定义的Exporter,同时了解如何在现有应用系统上添加对Prometheus支持,从而实现应用层面的监控对接。 12 | 13 | 从第1章到第4章的部分都是本书的基础性章节,对大部分的研发或者运维人员来说可以快速掌握,并且能够使用Prometheus来完成一些基本的日常任务。余下的章节我们会关注到Prometheus的高级用法部分。 14 | 15 | 第5章,"You can't fix what you can't see"。可视化是监控的核心目标之一,这部分将会基于Grafana这一可视化工具实现监控数据可视化,并且了解Grafana作为一个通用的可视化工具是如何与Prometheus进行配合的。 16 | 17 | 第6章,读者将会了解到如何通过Prometheus的服务发现能力,自动的发现那些需要监控的资源和服务。特别是在云平台或者容器平台中,资源的创建和销毁变得更加频繁,通过服务发现自动地去发现监控目标,能够有效降低Prometheus的运维和管理难度。 18 | 19 | 第7章,在单个节点的情况下Prometheus能够轻松完成对数以百万的监控指标的处理,但是当监控的目标资源以及数据量变得更大的时候,我们如何实现对Prometheus的扩展?这一章节中重点讨论Prometheus高可用方面的能力。 20 | 21
| 第8章,这一章节中我们的另外一位重要成员Kubernetes将会登场,这里我们会带领读者对Kubernetes有一个基本的认识,并且通过Prometheus构建我们的容器云监控系统。并且介绍如何通过Prometheus与Kubernetes结合实现应用程序的弹性伸缩。 22 | -------------------------------------------------------------------------------- /PLACE_HOLDER.md: -------------------------------------------------------------------------------- 1 | 内容正在建设中 -------------------------------------------------------------------------------- /REFERENCES.md: -------------------------------------------------------------------------------- 1 | ## 参考资料 2 | 3 | ### Install & Configuration 4 | 5 | * https://www.digitalocean.com/community/tutorials/how-to-install-prometheus-on-ubuntu-16-04 6 | 7 | ### Storage 8 | 9 | * https://coreos.com/blog/prometheus-2.0-storage-layer-optimization 10 | 11 | ### Kubernetes 12 | 13 | * https://docs.bitnami.com/kubernetes/how-to/configure-autoscaling-custom-metrics/ 14 | * https://github.com/kubernetes/kube-state-metrics 15 | 16 | ### Others 17 | 18 | * https://news.ycombinator.com/item?id=12455045 19 | * https://github.com/digitalocean/vulcan 20 | * https://github.com/coreos/prometheus-operator/blob/master/Documentation/high-availability.md 21 | * https://github.com/katosys/kato/issues/43 22 | * https://www.robustperception.io/tag/tuning/ 23 | * https://www.robustperception.io/how-much-ram-does-my-prometheus-need-for-ingestion/ 24 | * https://jaxenter.com/prometheus-product-devops-mindset-130860.html 25 | * https://www.slideshare.net/brianbrazil/so-you-want-to-write-an-exporter 26 | 27 | ### PromSQL 28 | 29 | * https://www.youtube.com/watch?v=lrfTpnzq3Kw 30 | 31 | ### Exporters: 32 | 33 | * https://blog.csdn.net/zhaowenbo168/article/details/53196063 -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | TODO: 2 | 3 | Relabeling: 4 | * metric_relabel_configs: Metric relabeling is applied to samples as the last step before ingestion 5 | * alert_relabel_configs:Alert relabeling is applied to alerts before they are sent to the Alertmanager -------------------------------------------------------------------------------- /alert/README.md: -------------------------------------------------------------------------------- 1 | # 第3章 Prometheus告警处理 2 | 3 | 本章我们将带领读者探索Prometheus的告警处理机制,在前面的部分中已经介绍了告警能力在Prometheus的架构中被划分为两个部分,在Prometheus Server中定义告警规则以及产生告警,Alertmanager组件则用于处理这些由Prometheus产生的告警。Alertmanager即Prometheus体系中告警的统一处理中心。Alertmanager提供了多种内置第三方告警通知方式,同时还提供了对Webhook通知的支持,通过Webhook用户可以完成对告警更多个性化的扩展。 4 | 5 | 本章主要内容: 6 | 7 | * 在Prometheus中自定义告警规则 8 | * 理解Alertmanager特性 9 | * 基于标签的动态告警处理 10 | * 将告警通知发送到第三方服务 11 | * 如何使用Webhook扩展Alertmanager 12 | * 以及一些其他的性能优化模式 -------------------------------------------------------------------------------- /alert/SUMMARY.md: -------------------------------------------------------------------------------- 1 | # 小结 2 | 3 | 当故障发生时,即时获取到异常结果是大多数用户使用监控系统的最主要的目的之一。通过Prometheus提供的告警以及告警处理能力,通过内置的告警通知能力,能过帮助用户快速实现告警的通知。同时其还提供了简单有效的扩展方式,让用户可以基于Prometheus的告警处理模式实现更多的定制化需求。 -------------------------------------------------------------------------------- /alert/alert-manager-config.md: -------------------------------------------------------------------------------- 1 | # Alertmanager配置概述 2 | 3 | 在上面的部分中已经简单介绍过,在Alertmanager中通过路由(Route)来定义告警的处理方式。路由是一个基于标签匹配的树状匹配结构。根据接收到告警的标签匹配相应的处理方式。这里将详细介绍路由相关的内容。 4 | 5 | Alertmanager主要负责对Prometheus产生的告警进行统一处理,因此在Alertmanager配置中一般会包含以下几个主要部分: 6 | 7 | * 全局配置(global):用于定义一些全局的公共参数,如全局的SMTP配置,Slack配置等内容; 8 | * 
模板(templates):用于定义告警通知时的模板,如HTML模板,邮件模板等; 9 | * 告警路由(route):根据标签匹配,确定当前告警应该如何处理; 10 | * 接收人(receivers):接收人是一个抽象的概念,它可以是一个邮箱也可以是微信,Slack或者Webhook等,接收人一般配合告警路由使用; 11 | * 抑制规则(inhibit_rules):合理设置抑制规则可以减少垃圾告警的产生 12 | 13 | 其完整配置格式如下: 14 | 15 | ``` 16 | global: 17 | [ resolve_timeout: | default = 5m ] 18 | [ smtp_from: ] 19 | [ smtp_smarthost: ] 20 | [ smtp_hello: | default = "localhost" ] 21 | [ smtp_auth_username: ] 22 | [ smtp_auth_password: ] 23 | [ smtp_auth_identity: ] 24 | [ smtp_auth_secret: ] 25 | [ smtp_require_tls: | default = true ] 26 | [ slack_api_url: ] 27 | [ victorops_api_key: ] 28 | [ victorops_api_url: | default = "https://alert.victorops.com/integrations/generic/20131114/alert/" ] 29 | [ pagerduty_url: | default = "https://events.pagerduty.com/v2/enqueue" ] 30 | [ opsgenie_api_key: ] 31 | [ opsgenie_api_url: | default = "https://api.opsgenie.com/" ] 32 | [ hipchat_api_url: | default = "https://api.hipchat.com/" ] 33 | [ hipchat_auth_token: ] 34 | [ wechat_api_url: | default = "https://qyapi.weixin.qq.com/cgi-bin/" ] 35 | [ wechat_api_secret: ] 36 | [ wechat_api_corp_id: ] 37 | [ http_config: ] 38 | 39 | templates: 40 | [ - ... ] 41 | 42 | route: 43 | 44 | receivers: 45 | - ... 46 | 47 | inhibit_rules: 48 | [ - ... ] 49 | ``` 50 | 51 | 在全局配置中需要注意的是`resolve_timeout`,该参数定义了当Alertmanager持续多长时间未接收到告警后标记告警状态为resolved(已解决)。该参数的定义可能会影响到告警恢复通知的接收时间,读者可根据自己的实际场景进行定义,其默认值为5分钟。在接下来的部分,我们将已一些实际的例子解释Alertmanager的其它配置内容。 -------------------------------------------------------------------------------- /alert/alert-manager-inhibit.md: -------------------------------------------------------------------------------- 1 | # 屏蔽告警通知 2 | 3 | Alertmanager提供了方式可以帮助用户控制告警通知的行为,包括预先定义的抑制机制和临时定义的静默规则。 4 | 5 | ## 抑制机制 6 | 7 | Alertmanager的抑制机制可以避免当某种问题告警产生之后用户接收到大量由此问题导致的一系列的其它告警通知。例如当集群不可用时,用户可能只希望接收到一条告警,告诉他这时候集群出现了问题,而不是大量的如集群中的应用异常、中间件服务异常的告警通知。 8 | 9 | 在Alertmanager配置文件中,使用inhibit_rules定义一组告警的抑制规则: 10 | 11 | ``` 12 | inhibit_rules: 13 | [ - ... ] 14 | ``` 15 | 16 | 每一条抑制规则的具体配置如下: 17 | 18 | ``` 19 | target_match: 20 | [ : , ... ] 21 | target_match_re: 22 | [ : , ... ] 23 | 24 | source_match: 25 | [ : , ... ] 26 | source_match_re: 27 | [ : , ... ] 28 | 29 | [ equal: '[' , ... 
']' ] 30 | ``` 31 | 32 | 当已经发送的告警通知匹配到source_match或者source_match_re规则,如果有新的告警满足target_match或者target_match_re定义的匹配规则,并且已发送的告警与新产生的告警中equal定义的标签完全相同,则启动抑制机制,新的告警不会发送。 33 | 34 | 例如,定义如下抑制规则: 35 | 36 | ``` 37 | - source_match: 38 | alertname: NodeDown 39 | severity: critical 40 | target_match: 41 | severity: critical 42 | equal: 43 | - node 44 | ``` 45 | 46 | 例如当集群中的某一个主机节点异常宕机导致告警NodeDown被触发,同时在告警规则中定义了告警级别severity=critical。由于主机异常宕机,该主机上部署的所有服务、中间件会不可用并触发告警。根据抑制规则的定义,如果有新的告警级别为severity=critical,并且告警中标签node的值与NodeDown告警的相同,则说明新的告警是由NodeDown导致的,此时将启动抑制机制,停止向接收器发送通知。 47 | 48 | ## 临时静默 49 | 50 | 除了基于抑制机制可以控制告警通知的行为以外,用户或者管理员还可以直接通过Alertmanager的UI临时屏蔽特定的告警通知。通过定义标签的匹配规则(字符串或者正则表达式),如果新的告警通知满足静默规则的设置,则停止向receiver发送通知。 51 | 52 | 进入Alertmanager UI,点击"New Silence"显示如下内容: 53 | 54 | ![创建静默规则](./static/alertmanager-new-slicense.png) 55 | 56 | 用户可以通过该UI定义新的静默规则的开始时间以及持续时间,通过Matchers部分可以设置多条匹配规则(字符串匹配或者正则匹配)。填写当前静默规则的创建者以及创建原因后,点击"Create"按钮即可。 57 | 58 | 通过"Preview Alerts"可以预览当前匹配规则匹配到的告警信息。静默规则创建成功后,Alertmanager会开始加载该规则并且设置状态为Pending,当规则生效后则进入Active状态。 59 | 60 | ![活动的静默规则](./static/alertmanager-active-silences.png) 61 | 62 | 当静默规则生效以后,从Alertmanager的Alerts页面下用户将不会看到该规则匹配到的告警信息。 63 | 64 | ![告警信息](./static/alertmanager-slicense-alerts-result.png) 65 | 66 | 对于已经生效的规则,用户可以通过手动点击“Expire”按钮使当前规则过期。 67 | -------------------------------------------------------------------------------- /alert/alert-manager-mute.md: -------------------------------------------------------------------------------- 1 | ## 临时屏蔽告警通知 2 | 3 | 除了基于抑制机制可以控制告警通知的行为以外,用户或者管理员还可以直接通过Alertmanager的UI临时屏蔽特定的告警通知。通过定义标签的匹配规则(字符串或者正则表达式),如果新的告警通知满足静默规则的设置,则停止向receiver发送通知。 4 | 5 | 进入Alertmanager UI,点击"New Silence"显示如下内容: 6 | 7 | ![创建静默规则](./static/alertmanager-new-slicense.png) 8 | 9 | 用户可以通过该UI定义新的静默规则的开始时间以及持续时间,通过Matchers部分可以设置多条匹配规则(字符串匹配或者正则匹配)。填写当前静默规则的创建者以及创建原因后,点击"Create"按钮即可。 10 | 11 | 通过"Preview Alerts"可以预览当前匹配规则匹配到的告警信息。静默规则创建成功后,Alertmanager会开始加载该规则并且设置状态为Pending,当规则生效后则进入Active状态。 12 | 13 | ![活动的静默规则](./static/alertmanager-active-silences.png) 14 | 15 | 当静默规则生效以后,从Alertmanager的Alerts页面下用户将不会看到该规则匹配到的告警信息。 16 | 17 | ![告警信息](./static/alertmanager-slicense-alerts-result.png) 18 | 19 | 对于已经生效的规则,用户可以通过手动点击“Expire”按钮使当前规则过期。 -------------------------------------------------------------------------------- /alert/alert-manager-route.md: -------------------------------------------------------------------------------- 1 | # 基于标签的告警路由 2 | 3 | 在Alertmanager的配置中会定义一个基于标签匹配规则的告警路由树,以确定在接收到告警后Alertmanager需要如何对其进行处理: 4 | 5 | ``` 6 | route: 7 | ``` 8 | 9 | 其中route主要定义了告警的路由匹配规则,以及Alertmanager需要将匹配到的告警发送给哪一个receiver,一个最简单的route定义如下所示: 10 | 11 | ``` 12 | route: 13 | group_by: ['alertname'] 14 | receiver: 'web.hook' 15 | receivers: 16 | - name: 'web.hook' 17 | webhook_configs: 18 | - url: 'http://127.0.0.1:5001/' 19 | ``` 20 | 21 | 如上所示:在Alertmanager配置文件中,我们只定义了一个路由,那就意味着所有由Prometheus产生的告警在发送到Alertmanager之后都会通过名为`web.hook`的receiver接收。这里的web.hook定义为一个webhook地址。当然实际场景下,告警处理可不是这么简单的一件事情,对于不同级别的告警,我们可能会有完全不同的处理方式,因此在route中,我们还可以定义更多的子Route,这些Route通过标签匹配确定告警的处理方式,route的完整定义如下: 22 | 23 | ``` 24 | [ receiver: ] 25 | [ group_by: '[' , ... ']' ] 26 | [ continue: | default = false ] 27 | 28 | match: 29 | [ : , ... ] 30 | 31 | match_re: 32 | [ : , ... ] 33 | 34 | [ group_wait: | default = 30s ] 35 | [ group_interval: | default = 5m ] 36 | [ repeat_interval: | default = 4h ] 37 | 38 | routes: 39 | [ - ... 
] 40 | ``` 41 | 42 | ## 路由匹配 43 | 44 | 每一个告警都会从配置文件中顶级的route进入路由树,需要注意的是顶级的route必须匹配所有告警(即不能有任何的匹配设置match和match_re),每一个路由都可以定义自己的接受人以及匹配规则。默认情况下,告警进入到顶级route后会遍历所有的子节点,直到找到最深的匹配route,并将告警发送到该route定义的receiver中。但如果route中设置**continue**的值为false,那么告警在匹配到第一个子节点之后就直接停止。如果**continue**为true,报警则会继续进行后续子节点的匹配。如果当前告警匹配不到任何的子节点,那该告警将会基于当前路由节点的接收器配置方式进行处理。 45 | 46 | 其中告警的匹配有两种方式可以选择。一种方式基于字符串验证,通过设置**match**规则判断当前告警中是否存在标签labelname并且其值等于labelvalue。第二种方式则基于正则表达式,通过设置**match_re**验证当前告警标签的值是否满足正则表达式的内容。 47 | 48 | 如果警报已经成功发送通知, 如果想设置发送告警通知之前要等待时间,则可以通过**repeat_interval**参数进行设置。 49 | 50 | ## 告警分组 51 | 52 | 在之前的部分有讲过,Alertmanager可以对告警通知进行分组,将多条告警合合并为一个通知。这里我们可以使用**group_by**来定义分组规则。基于告警中包含的标签,如果满足**group_by**中定义标签名称,那么这些告警将会合并为一个通知发送给接收器。 53 | 54 | 有的时候为了能够一次性收集和发送更多的相关信息时,可以通过**group_wait**参数设置等待时间,如果在等待时间内,当前group接收到了新的告警,这些告警将会合并为一个通知向receiver发送。 55 | 56 | 而**group_interval**配置,则用于定义相同的Group之间发送告警通知的时间间隔。 57 | 58 | 例如,当使用Prometheus监控多个集群以及部署在集群中的应用和数据库服务,并且定义以下的告警处理路由规则来对集群中的异常进行通知。 59 | 60 | ``` 61 | route: 62 | receiver: 'default-receiver' 63 | group_wait: 30s 64 | group_interval: 5m 65 | repeat_interval: 4h 66 | group_by: [cluster, alertname] 67 | routes: 68 | - receiver: 'database-pager' 69 | group_wait: 10s 70 | match_re: 71 | service: mysql|cassandra 72 | - receiver: 'frontend-pager' 73 | group_by: [product, environment] 74 | match: 75 | team: frontend 76 | ``` 77 | 78 | 默认情况下所有的告警都会发送给集群管理员default-receiver,因此在Alertmanager的配置文件的根路由中,对告警信息按照集群以及告警的名称对告警进行分组。 79 | 80 | 如果告警时来源于数据库服务如MySQL或者Cassandra,此时则需要将告警发送给相应的数据库管理员(database-pager)。这里定义了一个单独子路由,如果告警中包含service标签,并且service为MySQL或者Cassandra,则向database-pager发送告警通知,由于这里没有定义group_by等属性,这些属性的配置信息将从上级路由继承,database-pager将会接收到按cluster和alertname进行分组的告警通知。 81 | 82 | 而某些告警规则可能来源于开发团队的定义,这些告警中通过添加标签team来标示这些告警的创建者。在Alertmanager配置文件的告警路由下,定义单独子路由用于处理这一类的告警通知,如果匹配到告警中包含标签team,并且team的值为frontend,Alertmanager将会按照标签product和environment对告警进行分组。此时如果应用出现异常,开发团队就能清楚的知道哪一个环境(environment)中的哪一个应用程序出现了问题,可以快速对应用进行问题定位。 83 | -------------------------------------------------------------------------------- /alert/alert-manager-use-receiver.md: -------------------------------------------------------------------------------- 1 | # 内置告警接收器Receiver 2 | 3 | 前上一小节已经讲过,在Alertmanager中路由负责对告警信息进行分组匹配,并将像告警接收器发送通知。告警接收器可以通过以下形式进行配置: 4 | 5 | ``` 6 | receivers: 7 | - ... 8 | ``` 9 | 10 | 每一个receiver具有一个全局唯一的名称,并且对应一个或者多个通知方式: 11 | 12 | ``` 13 | name: 14 | email_configs: 15 | [ - , ... ] 16 | hipchat_configs: 17 | [ - , ... ] 18 | pagerduty_configs: 19 | [ - , ... ] 20 | pushover_configs: 21 | [ - , ... ] 22 | slack_configs: 23 | [ - , ... ] 24 | opsgenie_configs: 25 | [ - , ... ] 26 | webhook_configs: 27 | [ - , ... ] 28 | victorops_configs: 29 | [ - , ... 
] 30 | ``` 31 | 32 | 目前官方内置的第三方通知集成包括:邮件、 即时通讯软件(如Slack、Hipchat)、移动应用消息推送(如Pushover)和自动化运维工具(例如:Pagerduty、Opsgenie、Victorops)。Alertmanager的通知方式中还可以支持Webhook,通过这种方式开发者可以实现更多个性化的扩展支持。 33 | 34 | -------------------------------------------------------------------------------- /alert/alert-template.md: -------------------------------------------------------------------------------- 1 | ## 自定义告警模板 2 | 3 | 默认情况下Alertmanager使用了系统自带的默认通知模板,模板源码可以从[https://github.com/prometheus/alertmanager/blob/master/template/default.tmpl](https://github.com/prometheus/alertmanager/blob/master/template/default.tmpl)获得。Alertmanager的通知模板基于[Go的模板系统](http://golang.org/pkg/text/template)。Alertmanager也支持用户定义和使用自己的模板,一般来说有两种方式可以选择。 4 | 5 | 第一种,基于模板字符串。用户可以直接在Alertmanager的配置文件中使用模板字符串,例如: 6 | 7 | ``` 8 | receivers: 9 | - name: 'slack-notifications' 10 | slack_configs: 11 | - channel: '#alerts' 12 | text: 'https://internal.myorg.net/wiki/alerts/{{ .GroupLabels.app }}/{{ .GroupLabels.alertname }}' 13 | ``` 14 | 15 | 第二种方式,自定义可复用的模板文件。例如,可以创建自定义模板文件custom-template.tmpl,如下所示: 16 | 17 | ``` 18 | {{ define "slack.myorg.text" }}https://internal.myorg.net/wiki/alerts/{{ .GroupLabels.app }}/{{ .GroupLabels.alertname }}{{ end}} 19 | ``` 20 | 21 | 通过在Alertmanager的全局设置中定义templates配置来指定自定义模板的访问路径: 22 | 23 | ``` 24 | # Files from which custom notification template definitions are read. 25 | # The last component may use a wildcard matcher, e.g. 'templates/*.tmpl'. 26 | templates: 27 | [ - ... ] 28 | ``` 29 | 30 | 在设置了自定义模板的访问路径后,用户则可以直接在配置中使用该模板: 31 | 32 | ``` 33 | receivers: 34 | - name: 'slack-notifications' 35 | slack_configs: 36 | - channel: '#alerts' 37 | text: '{{ template "slack.myorg.text" . }}' 38 | 39 | templates: 40 | - '/etc/alertmanager/templates/myorg.tmpl' 41 | ``` -------------------------------------------------------------------------------- /alert/alert-with-smtp.md: -------------------------------------------------------------------------------- 1 | # 与SMTP邮件集成 2 | 3 | 邮箱应该是目前企业最常用的告警通知方式,Alertmanager内置了对SMTP协议的支持,因此对于企业用户而言,只需要一些基本的配置即可实现通过邮件的通知。 4 | 5 | 在Alertmanager使用邮箱通知,用户只需要定义好SMTP相关的配置,并且在receiver中定义接收方的邮件地址即可。在Alertmanager中我们可以直接在配置文件的global中定义全局的SMTP配置: 6 | 7 | ``` 8 | global: 9 | [ smtp_from: ] 10 | [ smtp_smarthost: ] 11 | [ smtp_hello: | default = "localhost" ] 12 | [ smtp_auth_username: ] 13 | [ smtp_auth_password: ] 14 | [ smtp_auth_identity: ] 15 | [ smtp_auth_secret: ] 16 | [ smtp_require_tls: | default = true ] 17 | ``` 18 | 19 | 完成全局SMTP之后,我们只需要为receiver配置email_configs用于定义一组接收告警的邮箱地址即可,如下所示: 20 | 21 | ``` 22 | name: 23 | email_configs: 24 | [ - , ... ] 25 | ``` 26 | 27 | 每个email_config中定义相应的接收人邮箱地址,邮件通知模板等信息即可,当然如果当前接收人需要单独的SMTP配置,那直接在email_config中覆盖即可: 28 | 29 | ``` 30 | [ send_resolved: | default = false ] 31 | to: 32 | [ html: | default = '{{ template "email.default.html" . }}' ] 33 | [ headers: { : , ... 
} ] 34 | ``` 35 | 36 | 如果当前收件人需要接收告警恢复的通知的话,在email_config中定义`send_resolved`为true即可。 37 | 38 | 如果所有的邮件配置使用了相同的SMTP配置,则可以直接定义全局的SMTP配置。 39 | 40 | 这里,以Gmail邮箱为例,我们定义了一个全局的SMTP配置,并且通过route将所有告警信息发送到default-receiver中: 41 | 42 | ``` 43 | global: 44 | smtp_smarthost: smtp.gmail.com:587 45 | smtp_from: 46 | smtp_auth_username: 47 | smtp_auth_identity: 48 | smtp_auth_password: 49 | 50 | route: 51 | group_by: ['alertname'] 52 | receiver: 'default-receiver' 53 | 54 | receivers: 55 | - name: default-receiver 56 | email_configs: 57 | - to: 58 | send_resolved: true 59 | ``` 60 | 61 | > 需要注意的是新的Google账号安全规则需要使用“应用专有密码”作为邮箱登录密码 62 | 63 | 这时如果手动拉高主机CPU使用率,使得监控样本数据满足告警触发条件,在SMTP配置正确的情况下,可以接收到如下的告警内容: 64 | 65 | ![告警](./static/mail-alert-page.png) -------------------------------------------------------------------------------- /alert/install-alert-manager.md: -------------------------------------------------------------------------------- 1 | # 部署Alertmanager 2 | 3 | Alertmanager和Prometheus Server一样均采用Golang实现,并且没有第三方依赖。一般来说我们可以通过以下几种方式来部署Alertmanager:二进制包、容器以及源码方式安装。 4 | 5 | ## 使用二进制包部署Alertmanager 6 | 7 | ##### 获取并安装软件包 8 | 9 | Alertmanager最新版本的下载地址可以从Prometheus官方网站[https://prometheus.io/download/](https://prometheus.io/download/)获取。 10 | 11 | ```shell 12 | export VERSION=0.15.2 13 | curl -LO https://github.com/prometheus/alertmanager/releases/download/v$VERSION/alertmanager-$VERSION.darwin-amd64.tar.gz 14 | tar xvf alertmanager-$VERSION.darwin-amd64.tar.gz 15 | ``` 16 | 17 | ##### 创建alertmanager配置文件 18 | 19 | Alertmanager解压后会包含一个默认的alertmanager.yml配置文件,内容如下所示: 20 | 21 | ``` 22 | global: 23 | resolve_timeout: 5m 24 | 25 | route: 26 | group_by: ['alertname'] 27 | group_wait: 10s 28 | group_interval: 10s 29 | repeat_interval: 1h 30 | receiver: 'web.hook' 31 | receivers: 32 | - name: 'web.hook' 33 | webhook_configs: 34 | - url: 'http://127.0.0.1:5001/' 35 | inhibit_rules: 36 | - source_match: 37 | severity: 'critical' 38 | target_match: 39 | severity: 'warning' 40 | equal: ['alertname', 'dev', 'instance'] 41 | ``` 42 | 43 | Alertmanager的配置主要包含两个部分:路由(route)以及接收器(receivers)。所有的告警信息都会从配置中的顶级路由(route)进入路由树,根据路由规则将告警信息发送给相应的接收器。 44 | 45 | 在Alertmanager中可以定义一组接收器,比如可以按照角色(比如系统运维,数据库管理员)来划分多个接收器。接收器可以关联邮件,Slack以及其它方式接收告警信息。 46 | 47 | 当前配置文件中定义了一个默认的接收器web.hook,它通过webhook_configs将告警通知发送到本地的http://127.0.0.1:5001/。关于接收器的详细介绍会在后续章节介绍。 48 | 49 | 在配置文件中使用route定义了顶级的路由,路由是一个基于标签匹配规则的树状结构。所有的告警信息从顶级路由开始,根据标签匹配规则进入到不同的子路由,并且根据子路由设置的接收器发送告警。目前配置文件中只设置了一个顶级路由route并且定义的接收器为web.hook。因此,所有的告警都会发送给web.hook。关于路由的详细内容会在后续进行详细介绍。 50 | 51 | ##### 启动Alertmanager 52 | 53 | Alertmanager会将数据保存到本地,默认的存储路径为`data/`。启动Alertmanager: 54 | 55 | ``` 56 | ./alertmanager 57 | ``` 58 | 59 | 用户也可以在启动Alertmanager时使用参数修改相关配置。`--config.file`用于指定alertmanager配置文件路径,`--storage.path`用于指定数据存储路径。 60 | 61 | ##### 查看运行状态 62 | 63 | Alertmanager启动后可以通过9093端口访问,[http://192.168.33.10:9093](http://192.168.33.10:9093) 64 | 65 | ![Alertmanager页面](./static/alertmanager.png) 66 | 67 | Alert菜单下可以查看Alertmanager接收到的告警内容。Silences菜单下则可以通过UI创建静默规则,这部分我们会在后续部分介绍。进入Status菜单,可以看到当前系统的运行状态以及配置信息。 68 | 69 | ## 关联Prometheus与Alertmanager 70 | 71 | 告警能力在Prometheus的架构中被划分成两个独立的部分。Prometheus负责产生告警,而Alertmanager负责告警产生后的后续处理。因此Alertmanager部署完成后,需要在Prometheus中设置Alertmanager相关的信息。 72 | 73 | 编辑Prometheus配置文件prometheus.yml,并添加以下内容: 74 | 75 | ``` 76 | alerting: 77 | alertmanagers: 78 | - static_configs: 79 | - targets: ['localhost:9093'] 80 | ``` 81 | 82 | 重启Prometheus服务,成功后,可以从[http://192.168.33.10:9090/config](http://192.168.33.10:9090/config)查看alerting配置是否生效。 83 | 84 | 
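除了直接重启Prometheus服务以外,也可以让Prometheus在运行时重新加载配置。下面给出一个简单的操作示例(这里假设Prometheus运行在本机的9090端口,使用HTTP接口重新加载时还需要在启动参数中添加`--web.enable-lifecycle`):

```
# 方式一:向Prometheus进程发送SIGHUP信号,触发配置重新加载
kill -HUP $(pgrep prometheus)

# 方式二:通过HTTP接口触发配置重新加载(需要启动时开启--web.enable-lifecycle)
curl -X POST http://localhost:9090/-/reload
```

如果配置文件存在语法错误,重新加载会失败,Prometheus将继续使用原有的配置。
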
此时,再次尝试手动拉高系统CPU使用率: 85 | 86 | ``` 87 | cat /dev/zero>/dev/null 88 | ``` 89 | 90 | 等待Prometheus告警进入触发状态: 91 | 92 | ![](./static/prometheus-alert-firing-with-manager.png) 93 | 94 | 查看Alertmanager UI,此时可以看到Alertmanager接收到的告警信息。 95 | 96 | ![](./static/alertmanager-alert.png) 97 | 98 | ## 接下来 99 | 100 | 目前为止,我们已经成功安装部署了Alertmanager并且与Prometheus关联,能够正常接收来自Prometheus的告警信息。接下来我们将详细介绍Alertmanager是如何处理这些接收到的告警信息的。 101 | -------------------------------------------------------------------------------- /alert/prometheus-alert-manager-overview.md: -------------------------------------------------------------------------------- 1 | # Prometheus告警简介 2 | 3 | 告警能力在Prometheus的架构中被划分成两个独立的部分。如下所示,通过在Prometheus中定义AlertRule(告警规则),Prometheus会周期性地对告警规则进行计算,如果满足告警触发条件就会向Alertmanager发送告警信息。 4 | 5 | ![Prometheus告警处理](./static/prometheus-alert-artich.png) 6 | 7 | 在Prometheus中一条告警规则主要由以下几部分组成: 8 | * 告警名称:用户需要为告警规则命名,当然对于命名而言,需要能够直接表达出该告警的主要内容 9 | * 告警规则:告警规则实际上主要由PromQL进行定义,其实际意义是当表达式(PromQL)查询结果持续多长时间(Duration)后触发告警 10 | 11 | 在Prometheus中,还可以通过Group(告警组)对一组相关的告警进行统一定义。当然这些定义都是通过YAML文件来统一管理的。 12 | 13 | Alertmanager作为一个独立的组件,负责接收并处理来自Prometheus Server(也可以是其它的客户端程序)的告警信息。Alertmanager可以对这些告警信息进行进一步的处理,比如当接收到大量重复告警时能够消除重复的告警信息,同时对告警信息进行分组并且路由到正确的通知方。Alertmanager内置了对邮件,Slack等多种通知方式的支持,同时还支持与Webhook的集成,以支持更多定制化的场景。例如,目前Alertmanager还不支持钉钉,那用户完全可以通过Webhook与钉钉机器人进行集成,从而通过钉钉接收告警信息。同时Alertmanager还提供了静默和告警抑制机制来对告警通知行为进行优化。 14 | 15 | ## Alertmanager特性 16 | 17 | Alertmanager除了提供基本的告警通知能力以外,还主要提供了如:分组、抑制以及静默等告警特性: 18 | 19 | ![Alertmanager特性](./static/alertmanager-features.png) 20 | 21 | #### 分组 22 | 23 | 分组机制可以将详细的告警信息合并成一个通知。在某些情况下,比如由于系统宕机导致大量的告警被同时触发,在这种情况下分组机制可以将这些被触发的告警合并为一个告警通知,避免一次性接收大量的告警通知,而无法对问题进行快速定位。 24 | 25 | 例如,当集群中有数百个正在运行的服务实例,并且为每一个实例设置了告警规则。假如此时发生了网络故障,可能导致大量的服务实例无法连接到数据库,结果就会有数百个告警被发送到Alertmanager。 26 | 27 | 而作为用户,可能只希望能够在一个通知中就能查看哪些服务实例受到影响。这时可以按照服务所在集群或者告警名称对告警进行分组,而将这些告警内聚在一起成为一个通知。 28 | 29 | 告警分组、告警时间,以及告警的接收方式可以通过Alertmanager的配置文件进行配置。 30 | 31 | #### 抑制 32 | 33 | 抑制是指当某一告警发出后,可以停止重复发送由此告警引发的其它告警的机制。 34 | 35 | 例如,当集群不可访问时触发了一次告警,通过配置Alertmanager可以忽略与该集群有关的其它所有告警。这样可以避免接收到大量与实际问题无关的告警通知。 36 | 37 | 抑制机制同样通过Alertmanager的配置文件进行设置。 38 | 39 | #### 静默 40 | 41 | 静默提供了一个简单的机制可以快速根据标签对告警进行静默处理。如果接收到的告警符合静默的配置,Alertmanager则不会发送告警通知。 42 | 43 | 静默设置需要在Alertmanager的Web页面上进行设置。 44 | 45 | -------------------------------------------------------------------------------- /alert/prometheus-recoding-rules.md: -------------------------------------------------------------------------------- 1 | # 使用Recording Rules优化性能 2 | 3 | 通过PromQL可以实时对Prometheus中采集到的样本数据进行查询,聚合以及其它各种运算操作。而在某些PromQL较为复杂且计算量较大时,直接使用PromQL可能会导致Prometheus响应超时的情况。这时需要一种类似于后台批处理的机制,能够在后台完成这些复杂运算的计算,对于使用者而言只需要查询这些运算结果即可。Prometheus通过Recording Rule规则支持这种后台计算的方式,可以实现对复杂查询的性能优化,提高查询效率。 4 | 5 | ## 定义Recording rules 6 | 7 | 在Prometheus配置文件中,通过rule_files定义recording rule规则文件的访问路径。 8 | 9 | ``` 10 | rule_files: 11 | [ - ... ] 12 | ``` 13 | 14 | 每一个规则文件通过以下格式进行定义: 15 | 16 | ``` 17 | groups: 18 | [ - ] 19 | ``` 20 | 21 | 一个简单的规则文件可能是这个样子的: 22 | 23 | ``` 24 | groups: 25 | - name: example 26 | rules: 27 | - record: job:http_inprogress_requests:sum 28 | expr: sum(http_inprogress_requests) by (job) 29 | ``` 30 | 31 | rule_group的具体配置项如下所示: 32 | 33 | ``` 34 | # The name of the group. Must be unique within a file. 35 | name: 36 | 37 | # How often rules in the group are evaluated. 38 | [ interval: | default = global.evaluation_interval ] 39 | 40 | rules: 41 | [ - ... ] 42 | ``` 43 | 44 | 与告警规则一致,一个group下可以包含多条规则rule。 45 | 46 | ``` 47 | # The name of the time series to output to. 
Must be a valid metric name. 48 | record: 49 | 50 | # The PromQL expression to evaluate. Every evaluation cycle this is 51 | # evaluated at the current time, and the result recorded as a new set of 52 | # time series with the metric name as given by 'record'. 53 | expr: 54 | 55 | # Labels to add or overwrite before storing the result. 56 | labels: 57 | [ : ] 58 | ``` 59 | 60 | 根据规则中的定义,Prometheus会在后台完成expr中定义的PromQL表达式计算,并且将计算结果保存到新的时间序列record中。同时还可以通过labels为这些样本添加额外的标签。 61 | 62 | 这些规则文件的计算频率与告警规则计算频率一致,都通过global.evaluation_interval定义: 63 | 64 | ``` 65 | global: 66 | [ evaluation_interval: | default = 1m ] 67 | ``` -------------------------------------------------------------------------------- /alert/static/add-incomming-webhooks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/alert/static/add-incomming-webhooks.png -------------------------------------------------------------------------------- /alert/static/alertmanager-active-silences.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/alert/static/alertmanager-active-silences.png -------------------------------------------------------------------------------- /alert/static/alertmanager-alert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/alert/static/alertmanager-alert.png -------------------------------------------------------------------------------- /alert/static/alertmanager-dingtalk-test-result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/alert/static/alertmanager-dingtalk-test-result.png -------------------------------------------------------------------------------- /alert/static/alertmanager-features.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/alert/static/alertmanager-features.png -------------------------------------------------------------------------------- /alert/static/alertmanager-new-slicense.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/alert/static/alertmanager-new-slicense.png -------------------------------------------------------------------------------- /alert/static/alertmanager-slicense-alerts-result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/alert/static/alertmanager-slicense-alerts-result.png -------------------------------------------------------------------------------- /alert/static/alertmanager.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/alert/static/alertmanager.png -------------------------------------------------------------------------------- /alert/static/custom-slack-message.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/alert/static/custom-slack-message.png -------------------------------------------------------------------------------- /alert/static/dingding-group-robot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/alert/static/dingding-group-robot.png -------------------------------------------------------------------------------- /alert/static/dingtalk-message-test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/alert/static/dingtalk-message-test.png -------------------------------------------------------------------------------- /alert/static/dingtalk-robot-create-webhook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/alert/static/dingtalk-robot-create-webhook.png -------------------------------------------------------------------------------- /alert/static/incomming-webhooks-setting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/alert/static/incomming-webhooks-setting.png -------------------------------------------------------------------------------- /alert/static/mail-alert-page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/alert/static/mail-alert-page.png -------------------------------------------------------------------------------- /alert/static/node_cpu_alert_firing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/alert/static/node_cpu_alert_firing.png -------------------------------------------------------------------------------- /alert/static/node_cpu_alert_pending.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/alert/static/node_cpu_alert_pending.png -------------------------------------------------------------------------------- /alert/static/node_cpu_usgae_high.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/alert/static/node_cpu_usgae_high.png -------------------------------------------------------------------------------- /alert/static/prometheus-alert-artich.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/alert/static/prometheus-alert-artich.png -------------------------------------------------------------------------------- /alert/static/prometheus-alert-firing-with-manager.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/alert/static/prometheus-alert-firing-with-manager.png -------------------------------------------------------------------------------- /alert/static/prometheus-ui-alert (1).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/alert/static/prometheus-ui-alert (1).png -------------------------------------------------------------------------------- /alert/static/prometheus-ui-alert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/alert/static/prometheus-ui-alert.png -------------------------------------------------------------------------------- /alert/static/prometheus-ui-rules.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/alert/static/prometheus-ui-rules.png -------------------------------------------------------------------------------- /alert/static/slack-channel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/alert/static/slack-channel.png -------------------------------------------------------------------------------- /alert/static/slack-create-channel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/alert/static/slack-create-channel.png -------------------------------------------------------------------------------- /alert/static/slack-overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/alert/static/slack-overview.png -------------------------------------------------------------------------------- /alert/static/slack-receiver-message.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/alert/static/slack-receiver-message.png -------------------------------------------------------------------------------- /alert/static/slack_alert_message.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/alert/static/slack_alert_message.png -------------------------------------------------------------------------------- /alert/static/slack_resolved_message.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/alert/static/slack_resolved_message.png -------------------------------------------------------------------------------- /alert/static/wechat-alert-page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/alert/static/wechat-alert-page.png 
-------------------------------------------------------------------------------- /book.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Prometheus In Action", 3 | "author": "yunlzheng", 4 | "description": "", 5 | "language": "zh-hans", 6 | "links" : { 7 | "sidebar" : { 8 | "Github" : "http://github.com/yunlzheng" 9 | } 10 | }, 11 | "plugins": [ 12 | "multipart", 13 | "image-captions", 14 | "sectionx", 15 | "anchor-navigation", 16 | "splitter", 17 | "anchor-navigation", 18 | "-sharing" 19 | ], 20 | "pluginsConfig": { 21 | "image-captions": { 22 | "caption": "_CAPTION_" 23 | }, 24 | "toc2": { 25 | "addClass": true, 26 | "className": "toc" 27 | }, 28 | "theme-default": { 29 | "showLevel": false 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | services: 3 | consul: 4 | image: consul 5 | ports: 6 | - 8400:8400 7 | - 8500:8500 8 | - 8600:53/udp 9 | command: agent -server -client=0.0.0.0 -dev -node=node0 -bootstrap-expect=1 -data-dir=/tmp/consul 10 | labels: 11 | SERVICE_IGNORE: 'true' 12 | registrator: 13 | image: gliderlabs/registrator 14 | depends_on: 15 | - consul 16 | volumes: 17 | - /var/run:/tmp:rw 18 | command: consul://consul:8500 19 | prometheus: 20 | image: quay.io/prometheus/prometheus 21 | ports: 22 | - 9090:9090 23 | volumes: 24 | # - ./prometheus/prometheus_static.yml:/etc/prometheus/prometheus.yml 25 | - ./prometheus/prometheus_consul.yml:/etc/prometheus/prometheus.yml 26 | grafana: 27 | image: grafana/grafana 28 | ports: 29 | - 3000:3000 30 | volumes: 31 | - ~/grafana/:/var/lib/grafana:rw 32 | node_exporter: 33 | image: quay.io/prometheus/node-exporter 34 | pid: "host" 35 | ports: 36 | - 9100:9100 37 | labels: 38 | SERVICE_TAGS: "development" 39 | SERVICE_NAME: "node_exporter" 40 | SERVICE_TAG_IO_PROMETHEUS_SCRAPED: "true" 41 | docker_exporter: 42 | image: wisecity/docker-metrics-exporter 43 | volumes: 44 | - /var/lib/docker/:/var/lib/docker:ro 45 | - /var/run:/var/run:rw 46 | labels: 47 | SERVICE_TAGS: "production" 48 | # cadvisor: 49 | # image: google/cadvisor:latest 50 | # ports: 51 | # - 8080:8080 52 | # volumes: 53 | # - /:/rootfs:ro 54 | # - /var/run:/var/run:rw 55 | # - /var/lib/docker/:/var/lib/docker:ro 56 | # labels: 57 | # SERVICE_TAGS: "production,scraped" 58 | jenkins: 59 | image: jenkins/jenkins 60 | ports: 61 | - 8080:8080 62 | - 50000:50000 63 | volumes: 64 | - ~/jenkins_home/:/var/jenkins_home/:rw 65 | labels: 66 | SERVICE_NAME: "jenkins" 67 | SERVICE_8080_NAME: "jenkins" 68 | SERVICE_8080_PORT: "8080" 69 | SERVICE_50000_IGNORE: "true" 70 | -------------------------------------------------------------------------------- /draft/alert-with-wechat.md: -------------------------------------------------------------------------------- 1 | # 与微信进行集成 2 | 3 | 在国内,微信已经是最大的即时通讯工具。微信针对企业的应用场景专门退出了面向企业版微信。在这部分,我们将介绍如何将企业微信集成到Alertmanager中。 -------------------------------------------------------------------------------- /draft/share_dashboard.md: -------------------------------------------------------------------------------- 1 | # 共享Dashboard -------------------------------------------------------------------------------- /draft/use-federation-in-operator.md: -------------------------------------------------------------------------------- 1 | # 使用Prometheus构建联邦集群 2 | 3 | 通过本章前几节的内容读者应该对Prometheus Operator有了一个基本的认识。 这部分,我们将介绍如何通过Prometheus 
Operator搭建Prometheus联邦集群。 4 | 5 | references: 6 | * https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md#endpoint 7 | * https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md#prometheusspec 8 | * additionalScrapeConfigs -------------------------------------------------------------------------------- /draft/use_table_panel.md: -------------------------------------------------------------------------------- 1 | # 表格:Table Panel 2 | 3 | 通过表格的形式可以同时显示多条时间序列中的数据,可以方便的查看和比较监控指标的数据。Table Panel是Grafana提供的基础可视化组件之一。 4 | 5 | ![Table Panel示例](./static/grafana_table_panel_example2.png) 6 | 7 | 对于Prometheus采集到的时间序列数据,Table Panel支持直接将PromQL返回的时间序列格式化为表格的形式进行展示,也可以直接展示时间序列并且对样本数据进行统计聚合。 8 | 9 | ## 格式化时间序列 10 | 11 | 如下所示,Table Panel在默认情况下**Format as**配置选项为**Table**。该配置会直接将PromQL查询到的所有样本格式化为Grafana的Table数据结构,并直接展示到表格当中。 12 | 13 | ![Format As Table](./static/grafana_format_as_table.png) 14 | 15 | 其中样本的所有标签都被映射成表格的列,其中名为Value列会显示当前样本的值。默认情况下样本值不带任何的单位,为了让Table Panel能够自动化格式化样本值,可以通过Column Styles为Value定义样本值的格式化方式,如下所示: 16 | 17 | ![Column Styles选项](./static/grafana_table_panel_cloum_style.png) 18 | 19 | ## 使用Table可视化时间序列 20 | 21 | ## 按行显示时间序列 22 | 23 | ## 按列显示时间序列 24 | 25 | ## 对样本数据进行聚合 -------------------------------------------------------------------------------- /examples/ch1/README.md: -------------------------------------------------------------------------------- 1 | # Prometheus Standalone Environment 2 | 3 | ## Components 4 | 5 | * Prometheus Server 6 | * Alertmanager 7 | * Node Exporter -------------------------------------------------------------------------------- /examples/ch1/Vagrantfile: -------------------------------------------------------------------------------- 1 | # -*- mode: ruby -*- 2 | # vi: set ft=ruby : 3 | 4 | Vagrant.configure("2") do |config| 5 | config.vm.box = "ubuntu/xenial64" 6 | config.vm.network "private_network", ip: "192.168.33.10" 7 | config.vm.provision "shell", path: "install.sh" 8 | end 9 | -------------------------------------------------------------------------------- /examples/ch1/install.sh: -------------------------------------------------------------------------------- 1 | ############################ 2 | #### 安装Prometheus ######### 3 | ############################ 4 | # 创建用户 5 | if ! id -u prometheus > /dev/null 2>&1; then 6 | sudo useradd --no-create-home --shell /bin/false prometheus 7 | fi 8 | 9 | # 创建目录 10 | if [ ! -d "/etc/prometheus" ]; then 11 | sudo mkdir /etc/prometheus 12 | sudo chown prometheus:prometheus /etc/prometheus 13 | fi 14 | 15 | if [ ! -d "/var/lib/prometheus" ]; then 16 | sudo mkdir /var/lib/prometheus 17 | sudo chown prometheus:prometheus /var/lib/prometheus 18 | fi 19 | 20 | # 安装prometheus 21 | cd ~ 22 | 23 | if [ ! 
-f prometheus-2.0.0.linux-amd64.tar.gz ]; then 24 | curl -LO https://github.com/prometheus/prometheus/releases/download/v2.0.0/prometheus-2.0.0.linux-amd64.tar.gz 25 | tar xvf prometheus-2.0.0.linux-amd64.tar.gz 26 | sudo cp prometheus-2.0.0.linux-amd64/prometheus /usr/local/bin/ 27 | sudo cp prometheus-2.0.0.linux-amd64/promtool /usr/local/bin/ 28 | 29 | sudo chown prometheus:prometheus /usr/local/bin/prometheus 30 | sudo chown prometheus:prometheus /usr/local/bin/promtool 31 | 32 | sudo cp -r prometheus-2.0.0.linux-amd64/consoles /etc/prometheus 33 | sudo cp -r prometheus-2.0.0.linux-amd64/console_libraries /etc/prometheus 34 | 35 | sudo chown -R prometheus:prometheus /etc/prometheus/consoles 36 | sudo chown -R prometheus:prometheus /etc/prometheus/console_libraries 37 | fi 38 | 39 | # 创建配置文件 40 | cp -f /vagrant/prometheus.yml /etc/prometheus/prometheus.yml 41 | sudo chown prometheus:prometheus /etc/prometheus/prometheus.yml 42 | 43 | # 创建prometheus.service文件 44 | cp -f /vagrant/prometheus.service /etc/systemd/system/prometheus.service 45 | 46 | ############################ 47 | #### 安装NodeExporter ###### 48 | ############################ 49 | 50 | # 创建用户 51 | if ! id -u node_exporter > /dev/null 2>&1; then 52 | sudo useradd --no-create-home --shell /bin/false node_exporter 53 | fi 54 | 55 | # 安装node_exporter 56 | cd ~ 57 | if [ ! -f node_exporter-0.15.1.linux-amd64.tar.gz ]; then 58 | curl -LO https://github.com/prometheus/node_exporter/releases/download/v0.15.1/node_exporter-0.15.1.linux-amd64.tar.gz 59 | tar xvf node_exporter-0.15.1.linux-amd64.tar.gz 60 | 61 | sudo cp node_exporter-0.15.1.linux-amd64/node_exporter /usr/local/bin 62 | sudo chown node_exporter:node_exporter /usr/local/bin/node_exporter 63 | fi 64 | 65 | # 创建service文件 66 | cp -f /vagrant/node_exporter.service /etc/systemd/system/node_exporter.service 67 | 68 | ############################ 69 | #### Alertmanager ###### 70 | ############################ 71 | cd ~ 72 | if [ ! 
-f alertmanager-0.14.0.linux-amd64.tar.gz ]; then 73 | curl -LO https://github.com/prometheus/alertmanager/releases/download/v0.14.0/alertmanager-0.14.0.linux-amd64.tar.gz 74 | tar xvf alertmanager-0.14.0.linux-amd64.tar.gz 75 | fi 76 | 77 | # 启动服务 78 | sudo systemctl daemon-reload 79 | sudo systemctl start node_exporter 80 | sudo systemctl start prometheus 81 | 82 | # 检查状态 83 | sudo systemctl status node_exporter 84 | sudo systemctl status prometheus 85 | 86 | # 设置开机启动 87 | sudo systemctl enable prometheus 88 | sudo systemctl enable node_exporter -------------------------------------------------------------------------------- /examples/ch1/node_exporter.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Node Exporter 3 | Wants=network-online.target 4 | After=network-online.target 5 | 6 | [Service] 7 | User=node_exporter 8 | Group=node_exporter 9 | Type=simple 10 | ExecStart=/usr/local/bin/node_exporter 11 | 12 | [Install] 13 | WantedBy=multi-user.target -------------------------------------------------------------------------------- /examples/ch1/prometheus.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Prometheus 3 | Wants=network-online.target 4 | After=network-online.target 5 | 6 | [Service] 7 | User=prometheus 8 | Group=prometheus 9 | Type=simple 10 | ExecStart=/usr/local/bin/prometheus \ 11 | --config.file /etc/prometheus/prometheus.yml \ 12 | --storage.tsdb.path /var/lib/prometheus/ \ 13 | --web.console.templates=/etc/prometheus/consoles \ 14 | --web.console.libraries=/etc/prometheus/console_libraries 15 | 16 | [Install] 17 | WantedBy=multi-user.target -------------------------------------------------------------------------------- /examples/ch1/prometheus.yml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 15s 3 | rule_files: 4 | - /etc/prometheus/rules/*.rules 5 | scrape_configs: 6 | - job_name: 'prometheus' 7 | scrape_interval: 5s 8 | static_configs: 9 | - targets: ['localhost:9090'] 10 | - job_name: 'node_exporter' 11 | scrape_interval: 5s 12 | static_configs: 13 | - targets: ['localhost:9100'] -------------------------------------------------------------------------------- /examples/ch7/Vagrantfile: -------------------------------------------------------------------------------- 1 | # -*- mode: ruby -*- 2 | # vi: set ft=ruby : 3 | 4 | Vagrant.configure("2") do |config| 5 | 6 | config.vm.define "core" do |core| 7 | core.vm.box = "ubuntu/xenial64" 8 | core.vm.network "private_network", ip: "192.168.77.10" 9 | core.vm.provision "shell", path: "install_federation.sh" 10 | end 11 | 12 | config.vm.define "dc1" do |dc1| 13 | dc1.vm.box = "ubuntu/xenial64" 14 | dc1.vm.network "private_network", ip: "192.168.77.11" 15 | dc1.vm.provision "shell", path: "install.sh" 16 | end 17 | 18 | config.vm.define "dc2" do |dc2| 19 | dc2.vm.box = "ubuntu/xenial64" 20 | dc2.vm.network "private_network", ip: "192.168.77.12" 21 | dc2.vm.provision "shell", path: "install.sh" 22 | end 23 | 24 | end 25 | -------------------------------------------------------------------------------- /examples/ch7/install.sh: -------------------------------------------------------------------------------- 1 | ############################ 2 | #### 安装Prometheus ######### 3 | ############################ 4 | # 创建用户 5 | if ! 
id -u prometheus > /dev/null 2>&1; then 6 | sudo useradd --no-create-home --shell /bin/false prometheus 7 | fi 8 | 9 | # 创建目录 10 | if [ ! -d "/etc/prometheus" ]; then 11 | sudo mkdir /etc/prometheus 12 | sudo chown prometheus:prometheus /etc/prometheus 13 | fi 14 | 15 | if [ ! -d "/var/lib/prometheus" ]; then 16 | sudo mkdir /var/lib/prometheus 17 | sudo chown prometheus:prometheus /var/lib/prometheus 18 | fi 19 | 20 | # 安装prometheus 21 | cd ~ 22 | 23 | if [ ! -f prometheus-2.0.0.linux-amd64.tar.gz ]; then 24 | curl -LO https://github.com/prometheus/prometheus/releases/download/v2.0.0/prometheus-2.0.0.linux-amd64.tar.gz 25 | tar xvf prometheus-2.0.0.linux-amd64.tar.gz 26 | sudo cp prometheus-2.0.0.linux-amd64/prometheus /usr/local/bin/ 27 | sudo cp prometheus-2.0.0.linux-amd64/promtool /usr/local/bin/ 28 | 29 | sudo chown prometheus:prometheus /usr/local/bin/prometheus 30 | sudo chown prometheus:prometheus /usr/local/bin/promtool 31 | 32 | sudo cp -r prometheus-2.0.0.linux-amd64/consoles /etc/prometheus 33 | sudo cp -r prometheus-2.0.0.linux-amd64/console_libraries /etc/prometheus 34 | 35 | sudo chown -R prometheus:prometheus /etc/prometheus/consoles 36 | sudo chown -R prometheus:prometheus /etc/prometheus/console_libraries 37 | fi 38 | 39 | # 创建配置文件 40 | cp -f /vagrant/prometheus.yml /etc/prometheus/prometheus.yml 41 | sudo chown prometheus:prometheus /etc/prometheus/prometheus.yml 42 | 43 | # 创建prometheus.service文件 44 | cp -f /vagrant/prometheus.service /etc/systemd/system/prometheus.service 45 | 46 | ############################ 47 | #### 安装NodeExporter ###### 48 | ############################ 49 | 50 | # 创建用户 51 | if ! id -u node_exporter > /dev/null 2>&1; then 52 | sudo useradd --no-create-home --shell /bin/false node_exporter 53 | fi 54 | 55 | # 安装node_exporter 56 | cd ~ 57 | if [ ! -f node_exporter-0.15.1.linux-amd64.tar.gz ]; then 58 | curl -LO https://github.com/prometheus/node_exporter/releases/download/v0.15.1/node_exporter-0.15.1.linux-amd64.tar.gz 59 | tar xvf node_exporter-0.15.1.linux-amd64.tar.gz 60 | 61 | sudo cp node_exporter-0.15.1.linux-amd64/node_exporter /usr/local/bin 62 | sudo chown node_exporter:node_exporter /usr/local/bin/node_exporter 63 | fi 64 | 65 | # 创建service文件 66 | cp -f /vagrant/node_exporter.service /etc/systemd/system/node_exporter.service 67 | 68 | # 启动服务 69 | sudo systemctl daemon-reload 70 | sudo systemctl start node_exporter 71 | sudo systemctl start prometheus 72 | 73 | # 检查状态 74 | sudo systemctl status node_exporter 75 | sudo systemctl status prometheus 76 | 77 | # 设置开机启动 78 | sudo systemctl enable prometheus 79 | sudo systemctl enable node_exporter -------------------------------------------------------------------------------- /examples/ch7/install_federation.sh: -------------------------------------------------------------------------------- 1 | ############################ 2 | #### 安装Prometheus ######### 3 | ############################ 4 | # 创建用户 5 | if ! id -u prometheus > /dev/null 2>&1; then 6 | sudo useradd --no-create-home --shell /bin/false prometheus 7 | fi 8 | 9 | # 创建目录 10 | if [ ! -d "/etc/prometheus" ]; then 11 | sudo mkdir /etc/prometheus 12 | sudo chown prometheus:prometheus /etc/prometheus 13 | fi 14 | 15 | if [ ! -d "/var/lib/prometheus" ]; then 16 | sudo mkdir /var/lib/prometheus 17 | sudo chown prometheus:prometheus /var/lib/prometheus 18 | fi 19 | 20 | # 安装prometheus 21 | cd ~ 22 | 23 | if [ ! 
-f prometheus-2.0.0.linux-amd64.tar.gz ]; then 24 | curl -LO https://github.com/prometheus/prometheus/releases/download/v2.0.0/prometheus-2.0.0.linux-amd64.tar.gz 25 | tar xvf prometheus-2.0.0.linux-amd64.tar.gz 26 | sudo cp prometheus-2.0.0.linux-amd64/prometheus /usr/local/bin/ 27 | sudo cp prometheus-2.0.0.linux-amd64/promtool /usr/local/bin/ 28 | 29 | sudo chown prometheus:prometheus /usr/local/bin/prometheus 30 | sudo chown prometheus:prometheus /usr/local/bin/promtool 31 | 32 | sudo cp -r prometheus-2.0.0.linux-amd64/consoles /etc/prometheus 33 | sudo cp -r prometheus-2.0.0.linux-amd64/console_libraries /etc/prometheus 34 | 35 | sudo chown -R prometheus:prometheus /etc/prometheus/consoles 36 | sudo chown -R prometheus:prometheus /etc/prometheus/console_libraries 37 | fi 38 | 39 | # 创建配置文件 40 | cp -f /vagrant/prometheus_federation.yml /etc/prometheus/prometheus.yml 41 | sudo chown prometheus:prometheus /etc/prometheus/prometheus.yml 42 | 43 | # 创建prometheus.service文件 44 | cp -f /vagrant/prometheus.service /etc/systemd/system/prometheus.service 45 | 46 | ############################ 47 | #### 安装NodeExporter ###### 48 | ############################ 49 | 50 | # 创建用户 51 | if ! id -u node_exporter > /dev/null 2>&1; then 52 | sudo useradd --no-create-home --shell /bin/false node_exporter 53 | fi 54 | 55 | # 安装node_exporter 56 | cd ~ 57 | if [ ! -f node_exporter-0.15.1.linux-amd64.tar.gz ]; then 58 | curl -LO https://github.com/prometheus/node_exporter/releases/download/v0.15.1/node_exporter-0.15.1.linux-amd64.tar.gz 59 | tar xvf node_exporter-0.15.1.linux-amd64.tar.gz 60 | 61 | sudo cp node_exporter-0.15.1.linux-amd64/node_exporter /usr/local/bin 62 | sudo chown node_exporter:node_exporter /usr/local/bin/node_exporter 63 | fi 64 | 65 | # 创建service文件 66 | cp -f /vagrant/node_exporter.service /etc/systemd/system/node_exporter.service 67 | 68 | # 启动服务 69 | sudo systemctl daemon-reload 70 | sudo systemctl start node_exporter 71 | sudo systemctl start prometheus 72 | 73 | # 检查状态 74 | sudo systemctl status node_exporter 75 | sudo systemctl status prometheus 76 | 77 | # 设置开机启动 78 | sudo systemctl enable prometheus 79 | sudo systemctl enable node_exporter -------------------------------------------------------------------------------- /examples/ch7/node_exporter.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Node Exporter 3 | Wants=network-online.target 4 | After=network-online.target 5 | 6 | [Service] 7 | User=node_exporter 8 | Group=node_exporter 9 | Type=simple 10 | ExecStart=/usr/local/bin/node_exporter 11 | 12 | [Install] 13 | WantedBy=multi-user.target -------------------------------------------------------------------------------- /examples/ch7/prometheus.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Prometheus 3 | Wants=network-online.target 4 | After=network-online.target 5 | 6 | [Service] 7 | User=prometheus 8 | Group=prometheus 9 | Type=simple 10 | ExecStart=/usr/local/bin/prometheus \ 11 | --config.file /etc/prometheus/prometheus.yml \ 12 | --storage.tsdb.path /var/lib/prometheus/ \ 13 | --web.console.templates=/etc/prometheus/consoles \ 14 | --web.console.libraries=/etc/prometheus/console_libraries 15 | 16 | [Install] 17 | WantedBy=multi-user.target -------------------------------------------------------------------------------- /examples/ch7/prometheus_federation.yml: 
-------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 15s 3 | 4 | scrape_configs: 5 | # - job_name: 'prometheus' 6 | # scrape_interval: 5s 7 | # static_configs: 8 | # - targets: ['localhost:9090'] 9 | # - job_name: 'node_exporter' 10 | # scrape_interval: 5s 11 | # static_configs: 12 | # - targets: ['localhost:9100'] 13 | - job_name: 'federate' 14 | scrape_interval: 15s 15 | honor_labels: true 16 | metrics_path: '/federate' 17 | params: 18 | 'match[]': 19 | - '{job="prometheus"}' 20 | - '{__name__=~"job:.*"}' 21 | - '{__name__=~"node.*"}' 22 | static_configs: 23 | - targets: 24 | - '192.168.77.11:9090' 25 | - '192.168.77.12:9090' -------------------------------------------------------------------------------- /examples/ch7/prometheus_slave.yml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 15s 3 | 4 | scrape_configs: 5 | - job_name: 'prometheus' 6 | scrape_interval: 5s 7 | static_configs: 8 | - targets: ['localhost:9090'] 9 | - job_name: 'node_exporter' 10 | scrape_interval: 5s 11 | static_configs: 12 | - targets: ['localhost:9100'] -------------------------------------------------------------------------------- /examples/cluster/alertmanager.procfile: -------------------------------------------------------------------------------- 1 | a1: alertmanager --web.listen-address=":9093" --cluster.listen-address="127.0.0.1:8001" --config.file=/etc/prometheus/alertmanager-ha.yml --storage.path=/data/alertmanager/ --log.level=debug 2 | a2: alertmanager --web.listen-address=":9094" --cluster.listen-address="127.0.0.1:8002" --cluster.peer=127.0.0.1:8001 --config.file=/etc/prometheus/alertmanager-ha.yml --storage.path=/data/alertmanager2/ --log.level=debug 3 | a3: alertmanager --web.listen-address=":9095" --cluster.listen-address="127.0.0.1:8003" --cluster.peer=127.0.0.1:8001 --config.file=/etc/prometheus/alertmanager-ha.yml --storage.path=/data/alertmanager2/ --log.level=debug 4 | 5 | webhook: webhook -------------------------------------------------------------------------------- /examples/cluster/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | services: 3 | nginx: 4 | image: nginx 5 | ports: 6 | - 1080:80 7 | cadvisor: 8 | image: google/cadvisor 9 | ports: 10 | - 8080:8080 11 | volumes: 12 | - /var/lib/docker/:/var/lib/docker:ro 13 | - /var/run:/var/run:rw 14 | - /dev/disk/:/dev/disk:ro -------------------------------------------------------------------------------- /examples/cluster/prometheus.procfile: -------------------------------------------------------------------------------- 1 | p1: prometheus --config.file=/etc/prometheus/prometheus-ha.yml --storage.tsdb.path=/data/prometheus/ --web.listen-address="127.0.0.1:9090" 2 | p2: prometheus --config.file=/etc/prometheus/prometheus-ha.yml --storage.tsdb.path=/data/prometheus2/ --web.listen-address="127.0.0.1:9091" 3 | 4 | node_exporter: node_exporter -web.listen-address="0.0.0.0:9100" -------------------------------------------------------------------------------- /examples/cluster/send-alert.sh: -------------------------------------------------------------------------------- 1 | alerts1='[ 2 | { 3 | "labels": { 4 | "alertname": "DiskRunningFull", 5 | "dev": "sda1", 6 | "instance": "example1" 7 | }, 8 | "annotations": { 9 | "info": "The disk sda1 is running full", 10 | "summary": "please check the instance example1" 11 | } 12 | }, 13 | { 14 | 
"labels": { 15 | "alertname": "DiskRunningFull", 16 | "dev": "sda2", 17 | "instance": "example1" 18 | }, 19 | "annotations": { 20 | "info": "The disk sda2 is running full", 21 | "summary": "please check the instance example1", 22 | "runbook": "the following link http://test-url should be clickable" 23 | } 24 | }, 25 | { 26 | "labels": { 27 | "alertname": "DiskRunningFull", 28 | "dev": "sda1", 29 | "instance": "example2" 30 | }, 31 | "annotations": { 32 | "info": "The disk sda1 is running full", 33 | "summary": "please check the instance example2" 34 | } 35 | }, 36 | { 37 | "labels": { 38 | "alertname": "DiskRunningFull", 39 | "dev": "sdb2", 40 | "instance": "example2" 41 | }, 42 | "annotations": { 43 | "info": "The disk sdb2 is running full", 44 | "summary": "please check the instance example2" 45 | } 46 | }, 47 | { 48 | "labels": { 49 | "alertname": "DiskRunningFull", 50 | "dev": "sda1", 51 | "instance": "example3", 52 | "severity": "critical" 53 | } 54 | }, 55 | { 56 | "labels": { 57 | "alertname": "DiskRunningFull", 58 | "dev": "sda1", 59 | "instance": "example3", 60 | "severity": "warning" 61 | } 62 | } 63 | ]' 64 | 65 | echo $alerts1 66 | 67 | curl -XPOST -d"$alerts1" http://localhost:9093/api/v1/alerts 68 | curl -XPOST -d"$alerts1" http://localhost:9094/api/v1/alerts 69 | curl -XPOST -d"$alerts1" http://localhost:9095/api/v1/alerts 70 | -------------------------------------------------------------------------------- /examples/cluster/xxx.log: -------------------------------------------------------------------------------- 1 | Attaching to remote server dce4, please wait... 2 | -------------------------------------------------------------------------------- /examples/kubernetes/README.md: -------------------------------------------------------------------------------- 1 | 在Kubernetes下安装部署Prometheus 2 | ================== 3 | 4 | 部署Prometheus 5 | 6 | ``` 7 | kubectl create -f prometheus/prometheus-rbac-setup.yml 8 | kubectl create -f prometheus/prometheus-config.yml 9 | kubectl create -f prometheus/prometheus-deployment.yml 10 | kubectl create -f prometheus/prometheus-ingress.yml 11 | ``` 12 | 13 | 部署Exporters 14 | 15 | ``` 16 | kubectl create -f prometheus/node-exporter-daemonset.yml 17 | kubectl create -f prometheus/blackbox-exporter-deployment.yml 18 | ``` 19 | 20 | 部署测试应用 21 | 22 | ``` 23 | kubectl create -f nginx-deployment.yml 24 | kubectl create -f nginx/nginx-service.yml 25 | ``` -------------------------------------------------------------------------------- /examples/kubernetes/nginx/nginx-deployment.yml: -------------------------------------------------------------------------------- 1 | apiVersion: extensions/v1beta1 2 | kind: Deployment 3 | metadata: 4 | name: nginx-deployment 5 | labels: 6 | app: nginx 7 | spec: 8 | replicas: 3 9 | selector: 10 | matchLabels: 11 | app: nginx 12 | template: 13 | metadata: 14 | labels: 15 | app: nginx 16 | spec: 17 | containers: 18 | - name: nginx 19 | image: nginx:1.7.9 20 | ports: 21 | - containerPort: 80 -------------------------------------------------------------------------------- /examples/kubernetes/nginx/nginx-service.yml: -------------------------------------------------------------------------------- 1 | kind: Service 2 | apiVersion: v1 3 | metadata: 4 | annotations: 5 | prometheus.io/probe: 'true' 6 | name: nginx-service 7 | spec: 8 | selector: 9 | app: nginx 10 | ports: 11 | - protocol: TCP 12 | port: 80 13 | targetPort: 80 14 | type: NodePort -------------------------------------------------------------------------------- 
/examples/kubernetes/operator/demo-deploy.yml: -------------------------------------------------------------------------------- 1 | kind: Service 2 | apiVersion: v1 3 | metadata: 4 | name: example-app 5 | labels: 6 | app: example-app 7 | spec: 8 | selector: 9 | app: example-app 10 | ports: 11 | - name: web 12 | port: 8080 13 | --- 14 | apiVersion: extensions/v1beta1 15 | kind: Deployment 16 | metadata: 17 | name: example-app 18 | spec: 19 | replicas: 3 20 | template: 21 | metadata: 22 | labels: 23 | app: example-app 24 | spec: 25 | containers: 26 | - name: example-app 27 | image: fabxc/instrumented_app 28 | ports: 29 | - name: web 30 | containerPort: 8080 -------------------------------------------------------------------------------- /examples/kubernetes/operator/demo-service-monitor.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: ServiceMonitor 3 | metadata: 4 | name: example-app 5 | labels: 6 | team: frontend 7 | spec: 8 | selector: 9 | matchLabels: 10 | app: example-app 11 | endpoints: 12 | - port: web -------------------------------------------------------------------------------- /examples/kubernetes/operator/prometheus-operator-deploy.yml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1beta1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: prometheus-operator 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: ClusterRole 8 | name: prometheus-operator 9 | subjects: 10 | - kind: ServiceAccount 11 | name: prometheus-operator 12 | namespace: default 13 | --- 14 | apiVersion: rbac.authorization.k8s.io/v1beta1 15 | kind: ClusterRole 16 | metadata: 17 | name: prometheus-operator 18 | rules: 19 | - apiGroups: 20 | - extensions 21 | resources: 22 | - thirdpartyresources 23 | verbs: 24 | - "*" 25 | - apiGroups: 26 | - apiextensions.k8s.io 27 | resources: 28 | - customresourcedefinitions 29 | verbs: 30 | - "*" 31 | - apiGroups: 32 | - monitoring.coreos.com 33 | resources: 34 | - alertmanagers 35 | - prometheuses 36 | - prometheuses/finalizers 37 | - servicemonitors 38 | verbs: 39 | - "*" 40 | - apiGroups: 41 | - apps 42 | resources: 43 | - statefulsets 44 | verbs: ["*"] 45 | - apiGroups: [""] 46 | resources: 47 | - configmaps 48 | - secrets 49 | verbs: ["*"] 50 | - apiGroups: [""] 51 | resources: 52 | - pods 53 | verbs: ["list", "delete"] 54 | - apiGroups: [""] 55 | resources: 56 | - services 57 | - endpoints 58 | verbs: ["get", "create", "update"] 59 | - apiGroups: [""] 60 | resources: 61 | - nodes 62 | verbs: ["list", "watch"] 63 | - apiGroups: [""] 64 | resources: 65 | - namespaces 66 | verbs: ["list"] 67 | --- 68 | apiVersion: v1 69 | kind: ServiceAccount 70 | metadata: 71 | name: prometheus-operator 72 | --- 73 | apiVersion: extensions/v1beta1 74 | kind: Deployment 75 | metadata: 76 | labels: 77 | k8s-app: prometheus-operator 78 | name: prometheus-operator 79 | spec: 80 | replicas: 1 81 | template: 82 | metadata: 83 | labels: 84 | k8s-app: prometheus-operator 85 | spec: 86 | containers: 87 | - args: 88 | - --kubelet-service=kube-system/kubelet 89 | - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 90 | image: quay.io/coreos/prometheus-operator:v0.17.0 91 | name: prometheus-operator 92 | ports: 93 | - containerPort: 8080 94 | name: http 95 | resources: 96 | limits: 97 | cpu: 200m 98 | memory: 100Mi 99 | requests: 100 | cpu: 100m 101 | memory: 50Mi 102 | securityContext: 103 | runAsNonRoot: true 
104 | runAsUser: 65534 105 | serviceAccountName: prometheus-operator -------------------------------------------------------------------------------- /examples/kubernetes/operator/prometheus-svc.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: prometheus 5 | spec: 6 | type: NodePort 7 | ports: 8 | - name: web 9 | nodePort: 30900 10 | port: 9090 11 | protocol: TCP 12 | targetPort: web 13 | selector: 14 | prometheus: prometheus -------------------------------------------------------------------------------- /examples/kubernetes/operator/rbac-setup.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: prometheus 5 | --- 6 | apiVersion: rbac.authorization.k8s.io/v1beta1 7 | kind: ClusterRole 8 | metadata: 9 | name: prometheus 10 | rules: 11 | - apiGroups: [""] 12 | resources: 13 | - nodes 14 | - services 15 | - endpoints 16 | - pods 17 | verbs: ["get", "list", "watch"] 18 | - apiGroups: [""] 19 | resources: 20 | - configmaps 21 | verbs: ["get"] 22 | - nonResourceURLs: ["/metrics"] 23 | verbs: ["get"] 24 | --- 25 | apiVersion: rbac.authorization.k8s.io/v1beta1 26 | kind: ClusterRoleBinding 27 | metadata: 28 | name: prometheus 29 | roleRef: 30 | apiGroup: rbac.authorization.k8s.io 31 | kind: ClusterRole 32 | name: prometheus 33 | subjects: 34 | - kind: ServiceAccount 35 | name: prometheus 36 | namespace: default -------------------------------------------------------------------------------- /examples/kubernetes/operator/service-monitor-selector.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: Prometheus 3 | metadata: 4 | name: prometheus 5 | spec: 6 | serviceAccountName: prometheus 7 | serviceMonitorSelector: 8 | matchLabels: 9 | team: frontend 10 | resources: 11 | requests: 12 | memory: 400Mi 13 | -------------------------------------------------------------------------------- /examples/kubernetes/prometheus/blackbox-exporter-deployment.yml: -------------------------------------------------------------------------------- 1 | 2 | apiVersion: v1 3 | kind: Service 4 | metadata: 5 | labels: 6 | app: blackbox-exporter 7 | name: blackbox-exporter 8 | spec: 9 | ports: 10 | - name: blackbox 11 | port: 9115 12 | protocol: TCP 13 | selector: 14 | app: blackbox-exporter 15 | type: ClusterIP 16 | --- 17 | apiVersion: extensions/v1beta1 18 | kind: Deployment 19 | metadata: 20 | labels: 21 | app: blackbox-exporter 22 | name: blackbox-exporter 23 | spec: 24 | replicas: 1 25 | selector: 26 | matchLabels: 27 | app: blackbox-exporter 28 | template: 29 | metadata: 30 | labels: 31 | app: blackbox-exporter 32 | spec: 33 | containers: 34 | - image: prom/blackbox-exporter 35 | imagePullPolicy: IfNotPresent 36 | name: blackbox-exporter -------------------------------------------------------------------------------- /examples/kubernetes/prometheus/kubernetes-prometheus-eq1.yml: -------------------------------------------------------------------------------- 1 | scrape_configs: 2 | 3 | - job_name: 'kubernetes-nodes' 4 | scheme: https 5 | tls_config: 6 | ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt 7 | bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token 8 | kubernetes_sd_configs: 9 | - role: node -------------------------------------------------------------------------------- 
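(补充说明)上面 examples/kubernetes/operator 目录中的各个清单相互配合使用:Operator 先部署,随后由 Prometheus 自定义资源根据 `team: frontend` 标签选择 ServiceMonitor。下面给出一种可能的创建顺序作为示意(命令顺序为补充说明,并非仓库中给出的固定步骤,假设在 default 命名空间中执行):

```
kubectl create -f operator/prometheus-operator-deploy.yml   # 部署Prometheus Operator及其RBAC
kubectl create -f operator/rbac-setup.yml                   # 为Prometheus实例创建ServiceAccount与RBAC
kubectl create -f operator/service-monitor-selector.yml     # 创建Prometheus自定义资源
kubectl create -f operator/prometheus-svc.yml               # 通过NodePort访问Prometheus
kubectl create -f operator/demo-deploy.yml                  # 部署示例应用及其Service
kubectl create -f operator/demo-service-monitor.yml         # 创建匹配team: frontend的ServiceMonitor
```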
/examples/kubernetes/prometheus/node-exporter-daemonset.yml: -------------------------------------------------------------------------------- 1 | apiVersion: extensions/v1beta1 2 | kind: DaemonSet 3 | metadata: 4 | name: node-exporter 5 | spec: 6 | template: 7 | metadata: 8 | annotations: 9 | prometheus.io/scrape: 'true' 10 | prometheus.io/port: '9100' 11 | labels: 12 | app: node-exporter 13 | name: node-exporter 14 | spec: 15 | containers: 16 | - image: prom/node-exporter 17 | imagePullPolicy: IfNotPresent 18 | name: node-exporter 19 | ports: 20 | - containerPort: 9100 21 | hostPort: 9100 22 | name: scrape 23 | hostNetwork: true 24 | hostPID: true -------------------------------------------------------------------------------- /examples/kubernetes/prometheus/prometheus-config-eq1.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | prometheus.yml: |- 4 | global: 5 | scrape_interval: 15s 6 | evaluation_interval: 15s 7 | scrape_configs: 8 | - job_name: 'kubernetes-nodes' 9 | scheme: https 10 | tls_config: 11 | ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt 12 | bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token 13 | kubernetes_sd_configs: 14 | - role: node 15 | kind: ConfigMap 16 | metadata: 17 | name: prometheus-config -------------------------------------------------------------------------------- /examples/kubernetes/prometheus/prometheus-config-eq2.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | prometheus.yml: |- 4 | global: 5 | scrape_interval: 15s 6 | evaluation_interval: 15s 7 | scrape_configs: 8 | 9 | - job_name: 'kubernetes-cadvisor' 10 | scheme: https 11 | tls_config: 12 | ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt 13 | bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token 14 | kubernetes_sd_configs: 15 | - role: node 16 | relabel_configs: 17 | - action: labelmap 18 | regex: __meta_kubernetes_node_label_(.+) 19 | - target_label: __address__ 20 | replacement: kubernetes.default.svc:443 21 | - source_labels: [__meta_kubernetes_node_name] 22 | regex: (.+) 23 | target_label: __metrics_path__ 24 | replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor 25 | kind: ConfigMap 26 | metadata: 27 | name: prometheus-config -------------------------------------------------------------------------------- /examples/kubernetes/prometheus/prometheus-config-eq3.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: prometheus-config 5 | data: 6 | prometheus.yml: |- 7 | global: 8 | scrape_interval: 15s 9 | evaluation_interval: 15s 10 | scrape_configs: 11 | 12 | - job_name: 'kubernetes-cadvisor' 13 | scheme: https 14 | tls_config: 15 | ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt 16 | bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token 17 | kubernetes_sd_configs: 18 | - role: node 19 | relabel_configs: 20 | - source_labels: [__address__] 21 | regex: (.+):(.+) 22 | action: replace 23 | target_label: __address__ 24 | replacement: $1:4194 25 | - action: replace 26 | target_label: __scheme__ 27 | replacement: http 28 | - action: labelmap 29 | regex: __meta_kubernetes_node_label_(.+) 30 | -------------------------------------------------------------------------------- /examples/kubernetes/prometheus/prometheus-config-eq4.yml: 
-------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: prometheus-config 5 | data: 6 | prometheus.yml: |- 7 | global: 8 | scrape_interval: 15s 9 | evaluation_interval: 15s 10 | scrape_configs: 11 | 12 | - job_name: 'kubernetes-cadvisor' 13 | scheme: https 14 | tls_config: 15 | ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt 16 | bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token 17 | kubernetes_sd_configs: 18 | - role: node 19 | relabel_configs: 20 | - action: labelmap 21 | regex: __meta_kubernetes_node_label_(.+) 22 | - target_label: __address__ 23 | replacement: kubernetes.default.svc:443 24 | - source_labels: [__meta_kubernetes_node_name] 25 | regex: (.+) 26 | target_label: __metrics_path__ 27 | replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor -------------------------------------------------------------------------------- /examples/kubernetes/prometheus/prometheus-config-sd-example.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | prometheus.yml: |- 4 | global: 5 | scrape_interval: 15s 6 | evaluation_interval: 15s 7 | scrape_configs: 8 | 9 | - job_name: 'kubernetes-nodes' 10 | tls_config: 11 | ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt 12 | bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token 13 | kubernetes_sd_configs: 14 | - role: node 15 | 16 | - job_name: 'kubernetes-service' 17 | tls_config: 18 | ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt 19 | bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token 20 | kubernetes_sd_configs: 21 | - role: service 22 | 23 | - job_name: 'kubernetes-endpoints' 24 | tls_config: 25 | ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt 26 | bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token 27 | kubernetes_sd_configs: 28 | - role: endpoints 29 | 30 | - job_name: 'kubernetes-ingress' 31 | tls_config: 32 | ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt 33 | bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token 34 | kubernetes_sd_configs: 35 | - role: ingress 36 | 37 | - job_name: 'kubernetes-pods' 38 | tls_config: 39 | ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt 40 | bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token 41 | kubernetes_sd_configs: 42 | - role: pod 43 | 44 | kind: ConfigMap 45 | metadata: 46 | name: prometheus-config -------------------------------------------------------------------------------- /examples/kubernetes/prometheus/prometheus-deployment.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: "Service" 3 | metadata: 4 | name: prometheus 5 | labels: 6 | name: prometheus 7 | spec: 8 | ports: 9 | - name: prometheus 10 | protocol: TCP 11 | port: 9090 12 | targetPort: 9090 13 | selector: 14 | app: prometheus 15 | type: NodePort 16 | --- 17 | apiVersion: extensions/v1beta1 18 | kind: Deployment 19 | metadata: 20 | labels: 21 | name: prometheus 22 | name: prometheus 23 | spec: 24 | replicas: 1 25 | template: 26 | metadata: 27 | labels: 28 | app: prometheus 29 | spec: 30 | serviceAccountName: prometheus 31 | serviceAccount: prometheus 32 | containers: 33 | - name: prometheus 34 | image: prom/prometheus:v2.2.1 35 | command: 36 | - "/bin/prometheus" 37 | args: 38 | - "--config.file=/etc/prometheus/prometheus.yml" 39 | 
ports: 40 | - containerPort: 9090 41 | protocol: TCP 42 | volumeMounts: 43 | - mountPath: "/etc/prometheus" 44 | name: prometheus-config 45 | volumes: 46 | - name: prometheus-config 47 | configMap: 48 | name: prometheus-config -------------------------------------------------------------------------------- /examples/kubernetes/prometheus/prometheus-ingress.yml: -------------------------------------------------------------------------------- 1 | apiVersion: extensions/v1beta1 2 | kind: Ingress 3 | metadata: 4 | name: prometheus 5 | annotations: 6 | prometheus.io/probe: 'true' 7 | nginx.ingress.kubernetes.io/rewrite-target: / 8 | spec: 9 | rules: 10 | - http: 11 | paths: 12 | - path: / 13 | backend: 14 | serviceName: prometheus 15 | servicePort: 9090 16 | -------------------------------------------------------------------------------- /examples/kubernetes/prometheus/prometheus-rbac-setup.yml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1beta1 2 | kind: ClusterRole 3 | metadata: 4 | name: prometheus 5 | rules: 6 | - apiGroups: [""] 7 | resources: 8 | - nodes 9 | - nodes/proxy 10 | - services 11 | - endpoints 12 | - pods 13 | verbs: ["get", "list", "watch"] 14 | - apiGroups: 15 | - extensions 16 | resources: 17 | - ingresses 18 | verbs: ["get", "list", "watch"] 19 | - nonResourceURLs: ["/metrics"] 20 | verbs: ["get"] 21 | --- 22 | apiVersion: v1 23 | kind: ServiceAccount 24 | metadata: 25 | name: prometheus 26 | namespace: default 27 | --- 28 | apiVersion: rbac.authorization.k8s.io/v1beta1 29 | kind: ClusterRoleBinding 30 | metadata: 31 | name: prometheus 32 | roleRef: 33 | apiGroup: rbac.authorization.k8s.io 34 | kind: ClusterRole 35 | name: prometheus 36 | subjects: 37 | - kind: ServiceAccount 38 | name: prometheus 39 | namespace: default -------------------------------------------------------------------------------- /examples/mysql_exporter/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | mysql: 4 | image: mysql:5.7 5 | ports: 6 | - "3306:3306" 7 | environment: 8 | - MYSQL_ROOT_PASSWORD=password 9 | - MYSQL_DATABASE=database 10 | mysqlexporter: 11 | image: prom/mysqld-exporter 12 | ports: 13 | - "9104:9104" 14 | environment: 15 | - DATA_SOURCE_NAME=root:password@(mysql:3306)/database 16 | command: 17 | - --collect.info_schema.query_response_time -------------------------------------------------------------------------------- /examples/operator/README.md: -------------------------------------------------------------------------------- 1 | Use Operator Manage Prometheus 2 | =========== 3 | 4 | ## Prepare Helm 5 | 6 | ``` 7 | brew install kubernetes-helm 8 | helm init 9 | ``` 10 | 11 | ## Kubernetes RBD 12 | 13 | ``` 14 | # minikube vm 15 | docker run -d --net=host -v /etc/ceph:/etc/ceph -e MON_IP=192.168.99.100 -e CEPH_PUBLIC_NETWORK=192.168.99.0/24 ceph/demo ceph 16 | ``` 17 | 18 | ## Install Operator 19 | 20 | ``` 21 | helm repo add coreos https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/ 22 | 23 | helm install coreos/prometheus-operator --name prometheus-operator --namespace monitoring 24 | 25 | helm install coreos/kube-prometheus --name kube-prometheus --set global.rbacEnable=true --namespace monitoring 26 | ``` -------------------------------------------------------------------------------- /examples/operator/minikube.sh: -------------------------------------------------------------------------------- 1 | minikube 
start --disk-size=40g --registry-mirror=https://k2eue1s7.mirror.aliyuncs.com -------------------------------------------------------------------------------- /examples/operator/prometheus-k8s.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: Prometheus 3 | metadata: 4 | name: prometheus-k8s 5 | labels: 6 | prometheus: k8s 7 | spec: 8 | version: v1.3.0 9 | --- 10 | apiVersion: v1 11 | kind: Service 12 | metadata: 13 | name: prometheus-k8s 14 | spec: 15 | type: NodePort 16 | ports: 17 | - name: web 18 | port: 9090 19 | protocol: TCP 20 | targetPort: web 21 | selector: 22 | prometheus: prometheus-k8s 23 | -------------------------------------------------------------------------------- /examples/operator/prometheus-operator.yml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1beta1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: prometheus-operator 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: ClusterRole 8 | name: prometheus-operator 9 | subjects: 10 | - kind: ServiceAccount 11 | name: prometheus-operator 12 | namespace: default 13 | --- 14 | apiVersion: rbac.authorization.k8s.io/v1beta1 15 | kind: ClusterRole 16 | metadata: 17 | name: prometheus-operator 18 | rules: 19 | - apiGroups: 20 | - extensions 21 | resources: 22 | - thirdpartyresources 23 | verbs: 24 | - "*" 25 | - apiGroups: 26 | - apiextensions.k8s.io 27 | resources: 28 | - customresourcedefinitions 29 | verbs: 30 | - "*" 31 | - apiGroups: 32 | - monitoring.coreos.com 33 | resources: 34 | - alertmanagers 35 | - prometheuses 36 | - prometheuses/finalizers 37 | - servicemonitors 38 | verbs: 39 | - "*" 40 | - apiGroups: 41 | - apps 42 | resources: 43 | - statefulsets 44 | verbs: ["*"] 45 | - apiGroups: [""] 46 | resources: 47 | - configmaps 48 | - secrets 49 | verbs: ["*"] 50 | - apiGroups: [""] 51 | resources: 52 | - pods 53 | verbs: ["list", "delete"] 54 | - apiGroups: [""] 55 | resources: 56 | - services 57 | - endpoints 58 | verbs: ["get", "create", "update"] 59 | - apiGroups: [""] 60 | resources: 61 | - nodes 62 | verbs: ["list", "watch"] 63 | - apiGroups: [""] 64 | resources: 65 | - namespaces 66 | verbs: ["list"] 67 | --- 68 | apiVersion: v1 69 | kind: ServiceAccount 70 | metadata: 71 | name: prometheus-operator 72 | --- 73 | apiVersion: extensions/v1beta1 74 | kind: Deployment 75 | metadata: 76 | labels: 77 | k8s-app: prometheus-operator 78 | name: prometheus-operator 79 | spec: 80 | replicas: 1 81 | template: 82 | metadata: 83 | labels: 84 | k8s-app: prometheus-operator 85 | spec: 86 | containers: 87 | - args: 88 | - --kubelet-service=kube-system/kubelet 89 | - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 90 | image: quay.io/coreos/prometheus-operator:v0.17.0 91 | name: prometheus-operator 92 | ports: 93 | - containerPort: 8080 94 | name: http 95 | resources: 96 | limits: 97 | cpu: 200m 98 | memory: 100Mi 99 | requests: 100 | cpu: 100m 101 | memory: 50Mi 102 | securityContext: 103 | runAsNonRoot: true 104 | runAsUser: 65534 105 | serviceAccountName: prometheus-operator -------------------------------------------------------------------------------- /examples/prometheus-operator/00prometheus.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: Prometheus 3 | metadata: 4 | name: prometheus 5 | labels: 6 | prometheus: prometheus 7 | spec: 8 | 
replicas: 2 9 | serviceAccountName: prometheus -------------------------------------------------------------------------------- /examples/prometheus-operator/01prometheus.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: Prometheus 3 | metadata: 4 | name: prometheus 5 | labels: 6 | prometheus: prometheus 7 | spec: 8 | replicas: 2 9 | serviceAccountName: prometheus 10 | serviceMonitorSelector: 11 | matchLabels: 12 | team: frontend -------------------------------------------------------------------------------- /examples/prometheus-operator/02prometheus.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: Prometheus 3 | metadata: 4 | name: prometheus 5 | labels: 6 | prometheus: prometheus 7 | spec: 8 | replicas: 2 9 | serviceAccountName: prometheus 10 | serviceMonitorSelector: 11 | matchLabels: 12 | team: frontend 13 | ruleSelector: 14 | matchLabels: 15 | role: alert-rules 16 | prometheus: example -------------------------------------------------------------------------------- /examples/prometheus-operator/03prometheus.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: Prometheus 3 | metadata: 4 | name: prometheus 5 | labels: 6 | prometheus: prometheus 7 | spec: 8 | replicas: 2 9 | serviceAccountName: prometheus 10 | serviceMonitorSelector: 11 | matchLabels: 12 | team: frontend 13 | alerting: 14 | alertmanagers: 15 | - namespace: default 16 | name: alertmanager-example 17 | port: web 18 | ruleSelector: 19 | matchLabels: 20 | role: alert-rules 21 | prometheus: example -------------------------------------------------------------------------------- /examples/prometheus-operator/alertmanager-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: alertmanager-example 5 | spec: 6 | type: NodePort 7 | ports: 8 | - name: web 9 | nodePort: 30903 10 | port: 9093 11 | protocol: TCP 12 | targetPort: web 13 | selector: 14 | alertmanager: example -------------------------------------------------------------------------------- /examples/prometheus-operator/alertmanager-setup.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: Alertmanager 3 | metadata: 4 | name: example 5 | spec: 6 | replicas: 3 7 | -------------------------------------------------------------------------------- /examples/prometheus-operator/alertmanager-svc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: alertmanager-example 5 | spec: 6 | type: NodePort 7 | ports: 8 | - name: web 9 | nodePort: 30903 10 | port: 9093 11 | protocol: TCP 12 | targetPort: web 13 | selector: 14 | alertmanager: example -------------------------------------------------------------------------------- /examples/prometheus-operator/alertmanager.yaml: -------------------------------------------------------------------------------- 1 | global: 2 | resolve_timeout: 5m 3 | route: 4 | group_by: ['job'] 5 | group_wait: 30s 6 | group_interval: 5m 7 | repeat_interval: 12h 8 | receiver: 'webhook' 9 | receivers: 10 | - name: 'webhook' 11 | webhook_configs: 12 | - url: 'http://alertmanagerwh:30500/' 
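# 补充说明(常见约定,并非本仓库明确给出):上述配置配合 alertmanager-setup.yaml 中名为 example 的
# Alertmanager 自定义资源使用时,Prometheus Operator 通常要求将本文件保存到名为
# alertmanager-<自定义资源名>(即 alertmanager-example)的 Secret 中,键名为 alertmanager.yaml,例如:
#   kubectl create secret generic alertmanager-example --from-file=alertmanager.yaml
# 具体命名约定请以所使用的 Operator 版本文档为准。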
-------------------------------------------------------------------------------- /examples/prometheus-operator/example-app-monitor.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: ServiceMonitor 3 | metadata: 4 | name: example-app 5 | labels: 6 | team: frontend 7 | k8s-app: example-app 8 | spec: 9 | selector: 10 | matchLabels: 11 | app: example-app 12 | endpoints: 13 | - port: web 14 | -------------------------------------------------------------------------------- /examples/prometheus-operator/example-app.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: extensions/v1beta1 2 | kind: Deployment 3 | metadata: 4 | name: example-app 5 | spec: 6 | replicas: 3 7 | template: 8 | metadata: 9 | labels: 10 | app: example-app 11 | spec: 12 | containers: 13 | - name: example-app 14 | image: fabxc/instrumented_app 15 | ports: 16 | - name: web 17 | containerPort: 8080 18 | --- 19 | kind: Service 20 | apiVersion: v1 21 | metadata: 22 | name: example-app 23 | labels: 24 | app: example-app 25 | spec: 26 | selector: 27 | app: example-app 28 | ports: 29 | - name: web 30 | port: 8080 -------------------------------------------------------------------------------- /examples/prometheus-operator/example-rule.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | prometheus: example 6 | role: alert-rules 7 | name: prometheus-example-rules 8 | spec: 9 | groups: 10 | - name: ./example.rules 11 | rules: 12 | - alert: ExampleAlert 13 | expr: vector(1) -------------------------------------------------------------------------------- /examples/prometheus-operator/node-exporter-daemonset.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1beta2 2 | kind: DaemonSet 3 | metadata: 4 | labels: 5 | app: node-exporter 6 | name: node-exporter 7 | spec: 8 | selector: 9 | matchLabels: 10 | app: node-exporter 11 | template: 12 | metadata: 13 | labels: 14 | app: node-exporter 15 | spec: 16 | containers: 17 | - args: 18 | - --web.listen-address=127.0.0.1:9100 19 | - --path.procfs=/host/proc 20 | - --path.sysfs=/host/sys 21 | image: quay.io/prometheus/node-exporter:v0.15.2 22 | name: node-exporter 23 | volumeMounts: 24 | - mountPath: /host/proc 25 | name: proc 26 | readOnly: false 27 | - mountPath: /host/sys 28 | name: sys 29 | readOnly: false 30 | hostNetwork: true 31 | hostPID: true 32 | nodeSelector: 33 | beta.kubernetes.io/os: linux 34 | volumes: 35 | - hostPath: 36 | path: /proc 37 | name: proc 38 | - hostPath: 39 | path: /sys 40 | name: sys 41 | --- 42 | apiVersion: v1 43 | kind: Service 44 | metadata: 45 | labels: 46 | k8s-app: node-exporter 47 | name: node-exporter 48 | spec: 49 | type: ClusterIP 50 | ports: 51 | - name: https 52 | port: 9100 53 | targetPort: https 54 | selector: 55 | app: node-exporter 56 | --- 57 | apiVersion: monitoring.coreos.com/v1 58 | kind: ServiceMonitor 59 | metadata: 60 | labels: 61 | k8s-app: node-exporter 62 | name: node-exporter 63 | spec: 64 | jobLabel: k8s-app 65 | endpoints: 66 | - interval: 30s 67 | port: https 68 | selector: 69 | matchLabels: 70 | k8s-app: node-exporter 71 | 72 | -------------------------------------------------------------------------------- /examples/prometheus-operator/prometheus-rbac-setup.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: prometheus 5 | --- 6 | apiVersion: rbac.authorization.k8s.io/v1beta1 7 | kind: ClusterRole 8 | metadata: 9 | name: prometheus 10 | rules: 11 | - apiGroups: [""] 12 | resources: 13 | - nodes 14 | - services 15 | - endpoints 16 | - pods 17 | verbs: ["get", "list", "watch"] 18 | - apiGroups: [""] 19 | resources: 20 | - configmaps 21 | verbs: ["get"] 22 | - nonResourceURLs: ["/metrics"] 23 | verbs: ["get"] 24 | --- 25 | apiVersion: rbac.authorization.k8s.io/v1beta1 26 | kind: ClusterRoleBinding 27 | metadata: 28 | name: prometheus 29 | roleRef: 30 | apiGroup: rbac.authorization.k8s.io 31 | kind: ClusterRole 32 | name: prometheus 33 | subjects: 34 | - kind: ServiceAccount 35 | name: prometheus 36 | namespace: default -------------------------------------------------------------------------------- /examples/prometheus-operator/prometheus-svc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: prometheus 5 | spec: 6 | ports: 7 | - name: web 8 | port: 9090 9 | targetPort: 9090 10 | protocol: TCP 11 | selector: 12 | prometheus: prometheus 13 | type: ClusterIP -------------------------------------------------------------------------------- /examples/standalone/README.md: -------------------------------------------------------------------------------- 1 | Standalone Prometheus Sample 2 | ======= 3 | 4 | ## Components: 5 | 6 | * Prometheus 7 | * Node Exporter 8 | * cAdvisor 9 | * Grafana 10 | 11 | ## How To Run 12 | 13 | ``` 14 | go get github.com/mattn/goreman 15 | ``` 16 | 17 | ``` 18 | docker volume create grafana-storage 19 | ``` 20 | 21 | ``` 22 | goreman -f prometheus.procfile start 23 | ``` 24 | -------------------------------------------------------------------------------- /examples/standalone/alertmanager.yml: -------------------------------------------------------------------------------- 1 | route: 2 | receiver: 'default-receiver' 3 | receivers: 4 | - name: default-receiver 5 | webhook_configs: 6 | - url: http://localhost:8081/webhook 7 | send_resolved: true -------------------------------------------------------------------------------- /examples/standalone/prometheus.procfile: -------------------------------------------------------------------------------- 1 | prometheus: prometheus --config.file=./prometheus.yml --storage.tsdb.path=/data/prometheus/ --web.listen-address="0.0.0.0:9090" --web.enable-lifecycle --web.enable-admin-api 2 | alertmanager: alertmanager --config.file=./alertmanager.yml --storage.path=/data/alertmanager/ 3 | grafana: docker run -p 3000:3000 -e GF_USERS_DEFAULT_THEME=light -v grafana-storage:/var/lib/grafana grafana/grafana 4 | cadvisor: docker run --volume=/:/rootfs:ro --volume=/var/run:/var/run:rw --volume=/sys:/sys:ro --volume=/var/lib/docker/:/var/lib/docker:ro --publish=8080:8080 google/cadvisor:latest 5 | node_exporter: node_exporter -------------------------------------------------------------------------------- /examples/standalone/prometheus.yml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 15s 3 | scrape_timeout: 10s 4 | evaluation_interval: 15s 5 | rule_files: 6 | - rules/*.rule 7 | alerting: 8 | alertmanagers: 9 | - static_configs: 10 | - targets: ['localhost:9093'] 11 | scrape_configs: 12 | - job_name: 'node' 13 | static_configs: 14 | - 
targets: 15 | - 'localhost:9100' 16 | - 'foo:9100' 17 | - job_name: 'cadvisor' 18 | static_configs: 19 | - targets: 20 | - 'localhost:8080' 21 | - job_name: 'prometheus' 22 | static_configs: 23 | - targets: 24 | - 'localhost:9090' 25 | -------------------------------------------------------------------------------- /examples/standalone/reload.sh: -------------------------------------------------------------------------------- 1 | curl -X POST http://localhost:9090/-/reload -------------------------------------------------------------------------------- /examples/standalone/rules/hoststats-alert.rule: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: hostStatsAlert 3 | rules: 4 | - alert: hostCpuUsageAlert 5 | expr: sum(avg without (cpu)(irate(node_cpu{mode!='idle'}[5m]))) by (instance) > 0.5 6 | for: 1m 7 | labels: 8 | severity: page 9 | annotations: 10 | summary: "Instance {{ $labels.instance }} CPU usage high" 11 | description: "{{ $labels.instance }} CPU usage above 50% (current value: {{ $value }})" 12 | - alert: hostMemUsageAlert 13 | expr: (node_memory_MemTotal - node_memory_MemAvailable)/node_memory_MemTotal > 0.85 14 | for: 1m 15 | labels: 16 | severity: page 17 | annotations: 18 | summary: "Instance {{ $labels.instance }} MEM usage high" 19 | description: "{{ $labels.instance }} MEM usage above 85% (current value: {{ $value }})" -------------------------------------------------------------------------------- /exporter/README.md: -------------------------------------------------------------------------------- 1 | # 第4章 使用Exporter 2 | 3 | 在第1章中为了采集主机的监控样本数据,我们在主机上安装了一个Node Exporter程序,该程序对外暴露了一个用于获取当前监控样本数据的HTTP访问地址。这样的一个程序称为Exporter,Exporter的实例称为一个Target。Prometheus通过轮询的方式定时从这些Target中获取监控样本数据,并且存储在数据库当中。在这一章节当中我们将重点讨论这些用于获取特定目标监控样本数据的程序Exporter。 4 | 5 | 本章的主要内容: 6 | 7 | * 常用Exporter的使用,例如如何监控数据库、消息中间件等 8 | * 如何实现自定义的Exporter程序 9 | * 如何对已有的应用程序扩展Prometheus监控支持 10 | -------------------------------------------------------------------------------- /exporter/SUMMARY.md: -------------------------------------------------------------------------------- 1 | # 小结 2 | 3 | Prometheus负责数据的统一收集,并且提供统一的查询接口PromQL,而所有监控数据的产生则由Exporter来实现,任何能够按照Prometheus标准提供监控样本的程序都可以称为Exporter。Exporter可以是一个单独的、为了采集特定数据而构建的应用程序,也可以直接内置于特定的系统当中。 -------------------------------------------------------------------------------- /exporter/commonly-eporter-usage.md: -------------------------------------------------------------------------------- 1 | # 常用Exporter 2 | 3 | 在第1章中,我们已经初步了解了Node Exporter的使用场景和方法。本小节将会介绍更多常用Exporter的用法,包括如何监控容器运行状态,如何监控和评估MySQL服务的运行状态,以及如何通过Prometheus实现基于网络探测的黑盒监控。 -------------------------------------------------------------------------------- /exporter/custom_exporter_with_java.md: -------------------------------------------------------------------------------- 1 | # 使用Java自定义Exporter 2 | 3 | 本小节将带领读者了解Prometheus提供的client_java的基本用法,并在最后在Spring Boot应用程序中使用client_java,直接在应用程序层面提供对Prometheus的支持。 -------------------------------------------------------------------------------- /exporter/static/blackbox-whitebox-tower.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/exporter/static/blackbox-whitebox-tower.png -------------------------------------------------------------------------------- /exporter/static/cadvisor-total-usage.png:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/exporter/static/cadvisor-total-usage.png -------------------------------------------------------------------------------- /exporter/static/container_fs_reads_bytes_total.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/exporter/static/container_fs_reads_bytes_total.png -------------------------------------------------------------------------------- /exporter/static/container_fs_writes_bytes_total.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/exporter/static/container_fs_writes_bytes_total.png -------------------------------------------------------------------------------- /exporter/static/container_network_receive_bytes_total.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/exporter/static/container_network_receive_bytes_total.png -------------------------------------------------------------------------------- /exporter/static/container_network_transmit_bytes_total.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/exporter/static/container_network_transmit_bytes_total.png -------------------------------------------------------------------------------- /exporter/static/mysqld_exporter_target_stats.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/exporter/static/mysqld_exporter_target_stats.png -------------------------------------------------------------------------------- /exporter/static/prometheus-exporter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/exporter/static/prometheus-exporter.png -------------------------------------------------------------------------------- /exporter/static/prometheus_client_java_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/exporter/static/prometheus_client_java_2.png -------------------------------------------------------------------------------- /exporter/static/prometheus_targetes_with_cadvisor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/exporter/static/prometheus_targetes_with_cadvisor.png -------------------------------------------------------------------------------- /exporter/static/promql_container_cpu_usage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/exporter/static/promql_container_cpu_usage.png 
-------------------------------------------------------------------------------- /exporter/static/relabel_blackbox_targets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/exporter/static/relabel_blackbox_targets.png -------------------------------------------------------------------------------- /grafana/README.md: -------------------------------------------------------------------------------- 1 | # 第5章 可视化一切 2 | 3 | "You can't fix what you can't see"。可视化是监控的核心目标之一,在本章中我们将介绍Prometheus下的可视化技术,例如,如何使用Prometheus自身提供的Console Template能力,以及Grafana这一可视化工具,实现监控数据的可视化。Prometheus UI提供了基本的数据可视化能力,可以帮助用户直接使用PromQL查询数据,并将数据通过可视化图表的方式进行展示。而在实际的应用场景中,不同的人对可视化的需求往往不一样,关注的指标也不一样,因此我们需要有能力构建出不同的可视化报表页面。本章学习的内容主要就是解决以上问题。 4 | 5 | 本章的主要内容: 6 | 7 | * 使用Console Template创建可视化页面 8 | * 使用Grafana创建更精美的数据仪表盘 -------------------------------------------------------------------------------- /grafana/SUMMARY.md: -------------------------------------------------------------------------------- 1 | # 小结 2 | 3 | "You can't fix what you can't see"。可视化是监控的核心目标之一,在本章中我们学习了如何通过Prometheus内置的Console Template实现基本的可视化能力,以及通过更专业的开源工具Grafana实现Prometheus的数据可视化。 -------------------------------------------------------------------------------- /grafana/grafana-intro.md: -------------------------------------------------------------------------------- 1 | # Grafana简介 2 | 3 | Console Template虽然能满足一定的可视化需求,但是也仅仅是对Prometheus基本能力的补充。同时使用上也会有许多问题:首先用户需要学习和了解Go Template模板语言,并且其支持的可视化图表类型也非常有限,最后其管理也有一定的成本。在第1章的"初识Prometheus"中我们已经尝试通过Grafana快速搭建过一个主机监控的Dashboard,在本章中将会带领读者学习如何使用Grafana创建更加精美的可视化报表。 4 | 5 | ## Grafana基本概念 6 | 7 | 首先Grafana是一个通用的可视化工具。'通用'意味着Grafana不仅仅适用于展示Prometheus下的监控数据,也同样适用于一些其他的数据可视化需求。在开始使用Grafana之前,我们首先需要明确一些Grafana下的基本概念,以帮助用户能够快速理解Grafana。 8 | 9 | ### 数据源(Data Source) 10 | 11 | 对于Grafana而言,Prometheus这类为其提供数据的对象均称为数据源(Data Source)。目前,Grafana官方提供了对Graphite, InfluxDB, OpenTSDB, Prometheus, Elasticsearch, CloudWatch的支持。对于Grafana管理员而言,只需要将这些对象以数据源的形式添加到Grafana中,Grafana便可以轻松地实现对这些数据的可视化工作。 12 | 13 | ### 仪表盘(Dashboard) 14 | 15 | 通过数据源定义好可视化的数据来源之后,对于用户而言最重要的事情就是实现数据的可视化。在Grafana中,我们通过Dashboard来组织和管理我们的数据可视化图表: 16 | 17 | ![Grafana Dashboard](./static/dashboard-components.png) 18 | 19 | 如上所示,在一个Dashboard中,最基本的可视化单元为一个**Panel(面板)**,Panel通过趋势图、热力图等形式展示可视化数据。并且在Dashboard中每一个Panel是一个完全独立的部分,通过Panel的**Query Editor(查询编辑器)**我们可以为每一个Panel设置查询的数据源以及数据查询方式。例如,如果以Prometheus作为数据源,那在Query Editor中,我们实际上使用的是PromQL,而Panel则会负责从特定的Prometheus中查询出相应的数据,并且将其可视化。由于每个Panel是完全独立的,因此在一个Dashboard中,往往可能会包含来自多个Data Source的数据。 20 | 21 | Grafana通过插件的形式提供了多种Panel的实现,常用的如:Graph Panel,Heatmap Panel,SingleStat Panel以及Table Panel等。用户还可通过插件安装更多类型的Panel面板。 22 | 23 | 除了Panel以外,在Dashboard页面中,我们还可以定义一个**Row(行)**,来组织和管理一组相关的Panel。 24 | 25 | 除了Panel, Row这些对象以外,Grafana还允许用户为Dashboard定义**Templating variables(模板参数)**,从而实现可以与用户动态交互的Dashboard页面。同时Grafana通过JSON数据结构管理了整个Dashboard的定义,因此这些Dashboard也非常方便进行共享。Grafana还专门为Dashboard提供了一个共享服务:[https://grafana.com/dashboards](https://grafana.com/dashboards),通过该服务用户可以轻松实现Dashboard的共享,同时我们也能快速地从中找到我们希望的Dashboard实现,并导入到自己的Grafana中。 26 | 27 | ### 组织和用户 28 | 29 | 作为一个通用可视化工具,Grafana除了提供灵活的可视化定制能力以外,还提供了面向企业的组织级管理能力。在Grafana中Dashboard属于一个**Organization(组织)**,通过Organization,可以在更大规模上使用Grafana,例如对于一个企业而言,我们可以创建多个Organization,其中**User(用户)**可以属于一个或多个不同的Organization。并且在不同的Organization下,可以为User赋予不同的权限,从而可以有效地根据企业的组织架构定义整个管理模型。 --------------------------------------------------------------------------------
/grafana/grafana-panels.md: -------------------------------------------------------------------------------- 1 | # Grafana与数据可视化 2 | 3 | 在第1章的“初始Prometheus”部分,我们已经带领读者大致了解了Grafana的基本使用方式。对于Grafana而言,Prometheus就是一个用于存储监控样本数据的数据源(Data Source),通过使用PromQL查询特定Prometheus实例中的数据并且在Panel中实现可视化。 4 | 5 | 接下来,我们将带领读者了解如何通过Panel创建精美的可视化图表。 6 | 7 | ## 认识面板(Panel) 8 | 9 | Panel是Grafana中最基本的可视化单元。每一种类型的面板都提供了相应的查询编辑器(Query Editor),让用户可以从不同的数据源(如Prometheus)中查询出相应的监控数据,并且以可视化的方式展现。 10 | 11 | Grafana中所有的面板均以插件的形式进行使用,当前内置了5种类型的面板,分别是:Graph,Singlestat,Heatmap, Dashlist,Table以及Text。 12 | 13 | 其中像Graph这样的面板允许用户可视化任意多个监控指标以及多条时间序列。而Siglestat则必须要求查询结果为单个样本。Dashlist和Text相对比较特殊,它们与特定的数据源无关。 14 | 15 | 通过Grafana UI用户可以在一个Dashboard下添加Panel,点击Dashboard右上角的“Add Panel”按钮,如下所示,将会显示当前系统中所有可使用的Panel类型: 16 | 17 | ![添加Panel](./static/grafana_dashboard_add_panel.png) 18 | 19 | 选择想要创建的面板类型即可。这里以Graph面板为例,创建Panel之后,并切换到编辑模式,就可以进入Panel的配置页面。对于一个Panel而言,一般来说会包含2个主要的配置选项:General(通用设置)、Metrics(度量指标)。其余的配置则根据Panel类型的不同而不同。 20 | 21 | 在通用设置中,除了一些Panel的基本信息以外,最主要的能力就是定义动态Panel的能力,这部分内容会在本章的“模板化Dashboard”小结中详细介绍。 22 | 23 | 对于使用Prometheus作为数据源的用户,最主要的需要了解的就是Metrics设置的使用。在Metric选项中可以定义了Grafana从哪些数据源中查询样本数据。**Data Source**中指定当前查询的数据源,Grafana会加载当前组织中添加的所有数据源。其中还会包含两个特殊的数据源:**Mixed**和**Grafana**。 Mixed用于需要从多个数据源中查询和渲染数据的场景,Grafana则用于需要查询Grafana自身状态时使用。 24 | 25 | 当选中数据源时,Panel会根据当前数据源类型加载不同的Query Editor界面。这里我们主要介绍Prometheus Query Editor,如下所示,当选中的数据源类型为Prometheus时,会显示如下界面: 26 | 27 | ![Query Editor](./static/graph_prometheus_query_editor.png) 28 | 29 | Grafana提供了对PromQL的完整支持,在Query Editor中,可以添加任意个Query,并且使用PromQL表达式从Prometheus中查询相应的样本数据。 30 | 31 | ``` 32 | avg (irate(node_cpu{mode!='idle'}[2m])) without (cpu) 33 | ``` 34 | 35 | 每个PromQL表达式都可能返回多条时间序列。**Legend format**用于控制如何格式化每条时间序列的图例信息。Grafana支持通过模板的方式,根据时间序列的标签动态生成图例名称,例如:使用{{instance}}表示使用当前时间序列中的instance标签的值作为图例名称: 36 | 37 | ``` 38 | {{instance}}-{{mode}} 39 | ``` 40 | 41 | 当查询到的样本数据量非常大时可以导致Grafana渲染图标时出现一些性能问题,通过**Min Step**可以控制Prometheus查询数据时的最小步长(Step),从而减少从Prometheus返回的数据量。 42 | 43 | **Resolution**选项,则可以控制Grafana自身渲染的数据量。例如,如果**Resolution**的值为**1/10**,Grafana会将Prometeus返回的10个样本数据合并成一个点。因此**Resolution**越小可视化的精确性越高,反之,可视化的精度越低。 44 | 45 | **Format as**选项定义如何格式化Prometheus返回的样本数据。这里提供了3个选项:Table,Time Series和Heatmap,分别用于Table面板,Graph面板和Heatmap面板的数据可视化。 46 | 47 | 除此以外,Query Editor还提供了调试相关的功能,点击**Query Inspector**可以展开相关的调试面板: 48 | 49 | ![调试面板](./static/grafana_query_editor_inspector.png) 50 | 51 | 在面板中,可以查看当前Prometheus返回的样本数据,用户也可以提供Mock数据渲染图像。 52 | -------------------------------------------------------------------------------- /grafana/static/consoles_index.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/consoles_index.png -------------------------------------------------------------------------------- /grafana/static/custom_index_head.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/custom_index_head.png -------------------------------------------------------------------------------- /grafana/static/dashboard-components.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/dashboard-components.png 
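补充说明:上文介绍的**Min Step**选项,影响的其实是Grafana向Prometheus发起范围查询时的step参数。Graph面板中的每一个Query最终都会被转换为对Prometheus范围查询接口/api/v1/query_range的HTTP请求,下面给出一个示意性的请求(起止时间戳与step取值均为示例值,实际请求中PromQL表达式需要经过URL编码,此处为便于阅读进行了换行):

```
GET /api/v1/query_range
    ?query=avg (irate(node_cpu{mode!='idle'}[2m])) without (cpu)
    &start=1536027000
    &end=1536030600
    &step=30
```

调大Min Step即约束了step的最小取值,从而减少单次查询返回的样本数量,缓解大数据量下的渲染压力。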
-------------------------------------------------------------------------------- /grafana/static/grafana-dashboard-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana-dashboard-example.png -------------------------------------------------------------------------------- /grafana/static/grafana-framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana-framework.png -------------------------------------------------------------------------------- /grafana/static/grafana_bucket_demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_bucket_demo.png -------------------------------------------------------------------------------- /grafana/static/grafana_bucket_setting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_bucket_setting.png -------------------------------------------------------------------------------- /grafana/static/grafana_dashboard_add_panel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_dashboard_add_panel.png -------------------------------------------------------------------------------- /grafana/static/grafana_edit_panel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_edit_panel.png -------------------------------------------------------------------------------- /grafana/static/grafana_format_as_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_format_as_table.png -------------------------------------------------------------------------------- /grafana/static/grafana_graph_counter_demo_axes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_graph_counter_demo_axes.png -------------------------------------------------------------------------------- /grafana/static/grafana_graph_counter_demo_display_draw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_graph_counter_demo_display_draw.png -------------------------------------------------------------------------------- /grafana/static/grafana_graph_counter_demo_legend.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_graph_counter_demo_legend.png -------------------------------------------------------------------------------- /grafana/static/grafana_graph_counter_demo_legend_sample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_graph_counter_demo_legend_sample.png -------------------------------------------------------------------------------- /grafana/static/grafana_graph_counter_demo_metrics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_graph_counter_demo_metrics.png -------------------------------------------------------------------------------- /grafana/static/grafana_graph_counter_demo_metrics_legend.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_graph_counter_demo_metrics_legend.png -------------------------------------------------------------------------------- /grafana/static/grafana_graph_counter_demo_v2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_graph_counter_demo_v2.png -------------------------------------------------------------------------------- /grafana/static/grafana_graph_panel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_graph_panel.png -------------------------------------------------------------------------------- /grafana/static/grafana_heatmap_axes_setting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_heatmap_axes_setting.png -------------------------------------------------------------------------------- /grafana/static/grafana_heatmap_editor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_heatmap_editor.png -------------------------------------------------------------------------------- /grafana/static/grafana_heatmap_metrics_setting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_heatmap_metrics_setting.png -------------------------------------------------------------------------------- /grafana/static/grafana_heatmap_normal_axes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_heatmap_normal_axes.png -------------------------------------------------------------------------------- 
/grafana/static/grafana_heatmap_normal_metrics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_heatmap_normal_metrics.png -------------------------------------------------------------------------------- /grafana/static/grafana_heatmap_normal_sample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_heatmap_normal_sample.png -------------------------------------------------------------------------------- /grafana/static/grafana_heatmap_sample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_heatmap_sample.png -------------------------------------------------------------------------------- /grafana/static/grafana_panel_general.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_panel_general.png -------------------------------------------------------------------------------- /grafana/static/grafana_prometheus_datasources.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_prometheus_datasources.png -------------------------------------------------------------------------------- /grafana/static/grafana_query_editor_inspector.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_query_editor_inspector.png -------------------------------------------------------------------------------- /grafana/static/grafana_series_overrides.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_series_overrides.png -------------------------------------------------------------------------------- /grafana/static/grafana_series_overrides_demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_series_overrides_demo.png -------------------------------------------------------------------------------- /grafana/static/grafana_single_stat_edit_options.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_single_stat_edit_options.png -------------------------------------------------------------------------------- /grafana/static/grafana_single_stat_edit_value_mapping.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_single_stat_edit_value_mapping.png 
-------------------------------------------------------------------------------- /grafana/static/grafana_single_stat_edit_value_mapping_emoji.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_single_stat_edit_value_mapping_emoji.png -------------------------------------------------------------------------------- /grafana/static/grafana_single_stat_sample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_single_stat_sample.png -------------------------------------------------------------------------------- /grafana/static/grafana_singlestat_sample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_singlestat_sample.png -------------------------------------------------------------------------------- /grafana/static/grafana_table_panel_cloum_style.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_table_panel_cloum_style.png -------------------------------------------------------------------------------- /grafana/static/grafana_table_panel_example2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_table_panel_example2.png -------------------------------------------------------------------------------- /grafana/static/grafana_templating_add_variables.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_templating_add_variables.png -------------------------------------------------------------------------------- /grafana/static/grafana_templating_query_result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_templating_query_result.png -------------------------------------------------------------------------------- /grafana/static/grafana_templating_query_variables3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_templating_query_variables3.png -------------------------------------------------------------------------------- /grafana/static/grafana_templating_repeat_e2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_templating_repeat_e2.png -------------------------------------------------------------------------------- /grafana/static/grafana_templating_repeat_e3 (1).png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_templating_repeat_e3 (1).png -------------------------------------------------------------------------------- /grafana/static/grafana_templating_repeat_e3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_templating_repeat_e3.png -------------------------------------------------------------------------------- /grafana/static/grafana_templating_repeat_example1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_templating_repeat_example1.png -------------------------------------------------------------------------------- /grafana/static/grafana_templating_repeat_row.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_templating_repeat_row.png -------------------------------------------------------------------------------- /grafana/static/grafana_templating_repeat_var.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_templating_repeat_var.png -------------------------------------------------------------------------------- /grafana/static/grafana_templating_variables_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_templating_variables_example.png -------------------------------------------------------------------------------- /grafana/static/grafana_templating_variables_example1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_templating_variables_example1.png -------------------------------------------------------------------------------- /grafana/static/grafana_templating_variables_filter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_templating_variables_filter.png -------------------------------------------------------------------------------- /grafana/static/grafana_thresholds_demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/grafana_thresholds_demo.png -------------------------------------------------------------------------------- /grafana/static/graph_prometheus_query_editor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/graph_prometheus_query_editor.png -------------------------------------------------------------------------------- 
/grafana/static/head.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/head.png -------------------------------------------------------------------------------- /grafana/static/prom_graph_timecontrol.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/prom_graph_timecontrol.png -------------------------------------------------------------------------------- /grafana/static/query_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/grafana/static/query_graph.png -------------------------------------------------------------------------------- /grafana/use_heatmap_panel.md: -------------------------------------------------------------------------------- 1 | # 分布统计:Heatmap面板 2 | 3 | Heatmap是Grafana v4.3版本以后新添加的可视化面板,通过热图可以直观的查看样本的分布情况。在Grafana v5.1版本中Heatmap完善了对Prometheus的支持。这部分,将介绍如何使用Heatmap Panel实现对Prometheus监控指标的可视化。 4 | 5 | ## 使用Heatmap可视化Histogram样本分布情况 6 | 7 | 在上一小节中,我们尝试了使用Graph面板来可视化Histogram类型的监控指标prometheus_tsdb_compaction_duration_bucket。虽然能展示各个Bucket区间内的样本分布,但是无论是以线图还是柱状图的形式展示,都不够直观。对于Histogram类型的监控指标来说,更好的选择是采用Heatmap Panel,如下所示,Heatmap Panel可以自动对Histogram类型的监控指标分布情况进行计划,获取到每个区间范围内的样本个数,并且以颜色的深浅来表示当前区间内样本个数的大小。而图形的高度,则反映出当前时间点,样本分布的离散程度。 8 | 9 | ![Heatmap示例](./static/grafana_heatmap_sample.png) 10 | 11 | 在Grafana中使用Heatmap Panel也非常简单,在Dashboard页面右上角菜单中点击“add panel”按钮,并选择Heatmap Panel即可。 12 | 13 | 如下所示,Heapmap Panel的编辑页面中,主要包含5类配置选项,分别是:General、Metrics、Axes、Display、Time range。 14 | 15 | ![Heapmap Panel编辑页面](./static/grafana_heatmap_editor.png) 16 | 17 | 其中大部分的配置选项与Graph面板基本保持一致,这里就不重复介绍了。 18 | 19 | 当使用Heatmap可视化Histogram类型的监控指标时,需要设置**Format as**选项为**Heatmap**。当使用Heatmap格式化数据后,Grafana会自动根据样本的中的le标签,计算各个Bucket桶内的分布,并且按照Bucket对数据进行重新排序。**Legend format**模板则将会控制Y轴中的显示内容。如下所示: 20 | 21 | ![Mteircs设置](./static/grafana_heatmap_metrics_setting.png) 22 | 23 | 默认情况下,Heatmap Panel会自行对PromQL查询出的数据进行分布情况统计,而在Prometheus中Histogram类型的监控指标其实是已经自带了分布的Bucket信息的,因此为了直接使用这些Bucket信息,我们需要在**Axes选项**中定义数据的Date format需要定义为**Time series buckets**。该选项表示Heatmap Panel不需要自身对数据的分布情况进行计算,直接使用时间序列中返回的Bucket即可。如下所示: 24 | 25 | ![Axes设置](./static/grafana_heatmap_axes_setting.png) 26 | 27 | 通过以上设置,即可实现对Histogram类型监控指标的可视化。 28 | 29 | ## 使用Heatmap可视化其它类型样本分布情况 30 | 31 | 对于非Histogram类型,由于其监控样本中并不包含Bucket相关信息,因此在**Metrics选项中**需要定义**Format as**为**Time series**,如下所示: 32 | 33 | ![Metrics设置](./static/grafana_heatmap_normal_metrics.png) 34 | 35 | 并且通过**Axes选项**中选择**Data format**方式为**Time series**。设置该选项后Heatmap Panel会要求用户提供Bucket分布范围的设置,如下所示: 36 | 37 | ![Axes设置](./static/grafana_heatmap_normal_axes.png) 38 | 39 | 在Y轴(Y Axis)中需要通过Scale定义Bucket桶的分布范围,默认的Bucket范围支持包括:liner(线性分布)、log(base 10)(10的对数)、log(base 32)(32的对数)、log(base 1024)(1024的对数)等。 40 | 41 | 例如,上图中设置的Scale为log(base 2),那么在Bucket范围将2的对数的形式进行分布,即[1,2,4,8,....],如下所示: 42 | 43 | ![Bucket分布情况](./static/grafana_heatmap_normal_sample.png) 44 | 45 | 通过以上设置,Heatmap会自动根据用户定义的Bucket范围对Prometheus中查询到的样本数据进行分布统计。 -------------------------------------------------------------------------------- /grafana/use_singlestat_panel.md: -------------------------------------------------------------------------------- 1 | # 当前状态:SingleStat面板 2 | 3 | SingleStat 
Panel侧重于展示系统的当前状态而非变化趋势。如下所示,在以下场景中特别适用于使用SingleStat: 4 | 5 | * 当前系统中所有服务的运行状态; 6 | * 当前基础设施资源的使用量; 7 | * 当前系统中某些事件发生的次数或者资源数量等。 8 | 9 | 如下所示,是使用SingleStat进行数据可视化的显示效果: 10 | 11 | ![SingleStat Panel示例](./static/grafana_singlestat_sample.png) 12 | 13 | ## 使用SingleStat Panel 14 | 15 | 从Dashboardc创建Singlestat Panel,并进入编辑页面, 如下所示: 16 | 17 | ![SingleStat 编辑页面](./static/grafana_single_stat_sample.png) 18 | 19 | 对于SingleStat Panel而言,其只能处理一条时间序列,否则页面中会提示“Multiple Series Error”错误信息。这里使用如下PromQL查询当前主机负载: 20 | 21 | ``` 22 | node_load1{instance="localhost:9100"} 23 | ``` 24 | 25 | 默认情况下,当前面板中会显示当前时间序列中所有样本的平均值,而实际情况下,我们需要显示的是当前主机当前的负载情况,因此需要通过SingleStat Panel的**Options**选项控制当前面板的显示模式: 26 | 27 | ![SingleStat Option选项](./static/grafana_single_stat_edit_options.png) 28 | 29 | 如上所示,通过Value配置项组可以控制当前面板中显示的值,以及字体大小等。对于主机负载而言,我们希望能够显示当前的最新值,因此修改Stat为**Current**即可。 30 | 31 | 如果希望面板能够根据不同的值显示不同的颜色的话,则可以定义**Thresholds**与**Colors**的映射关系,例如,定义Thresholds的分割区间值为“0,1”,则当Value的值落到不同的范围内时,将显示不同的颜色。 32 | 33 | 如果希望能够显示当前时间序列的样本值变化情况,则可以启用Spark lines配置。启用之后,Singlestat面板中除了会显示当前的最新样本值以外,也会同时将时间序列中的数据以趋势图的形式进行展示。 34 | 35 | 除了通过数字大小反应当前状态以外,在某些场景下我们可能更关心的是这些数字表示的意义。例如,在Prometheus监控服务的健康状态时,在样本数据中会通过0表示不健康,1表示健康。 但是如果直接将0或1显示在面板中,那么可视化效果将缺乏一定的可读性。 36 | 37 | 为了提升数字的可读性,可以在Singlestat Panel中可以通过**Value Mappings**定义值的映射关系。Siglesta支持值映射(value to text)和区间映射(range to text)两种方式。 如下所示: 38 | 39 | ![Singlestat value mappings配置](./static/grafana_single_stat_edit_value_mapping.png) 40 | 41 | 当面板中Value的值在0~0.99范围内则显示为Health,否则显示为Unhealth。这种模式特别适合于展示服务的健康状态。 当然你也可以将Value映射为任意的字符,甚至是直接使用Emoji([http://www.iemoji.com/](http://www.iemoji.com/))表情: 42 | 43 | ![在Singlestat中使用Emoji表情字符](./static/grafana_single_stat_edit_value_mapping_emoji.png) -------------------------------------------------------------------------------- /ha/READMD.md: -------------------------------------------------------------------------------- 1 | # 第6章 集群与高可用 2 | 3 | Prometheus内置了一个基于本地存储的时间序列数据库。在Prometheus设计上,使用本地存储可以降低Prometheus部署和管理的复杂度,同时减少高可用(HA)带来的复杂性。 在默认情况下,用户只需要部署多套Prometheus,采集相同的Targets即可实现基本的HA。同时由于Promethus高效的数据处理能力,单个Prometheus Server基本上能够应对大部分用户监控规模的需求。 4 | 5 | 当然本地存储也带来了一些不好的地方,首先就是数据持久化的问题,特别是在像Kubernetes这样的动态集群环境下,如果Prometheus的实例被重新调度,那所有历史监控数据都会丢失。 其次本地存储也意味着Prometheus不适合保存大量历史数据(一般Prometheus推荐只保留几周或者几个月的数据)。最后本地存储也导致Prometheus无法进行弹性扩展。为了适应这方面的需求,Prometheus提供了remote_write和remote_read的特性,支持将数据存储到远端和从远端读取数据。通过将监控与数据分离,Prometheus能够更好地进行弹性扩展。 6 | 7 | 除了本地存储方面的问题,由于Prometheus基于Pull模型,当有大量的Target需要采样本时,单一Prometheus实例在数据抓取时可能会出现一些性能问题,联邦集群的特性可以让Prometheus将样本采集任务划分到不同的Prometheus实例中,并且通过一个统一的中心节点进行聚合,从而可以使Prometheuse可以根据规模进行扩展。 8 | 9 | 除了讨论Prometheus自身的高可用,Alertmanager作为Prometheus体系中的告警处理中心,本章的最后部分会讨论如何实现Alertmanager的高可用部署。 10 | 11 | 本章的主要内容: 12 | * Prometheus本地存储机制 13 | * Prometheus的远程存储机制 14 | * Prometheus联邦集群 15 | * Prometheus高可用部署架构 16 | * Alertmanager高可用部署架构 17 | -------------------------------------------------------------------------------- /ha/SUMMARY.md: -------------------------------------------------------------------------------- 1 | # 小结 2 | 3 | Prometheus的简单性贯穿于整个Prometheus的使用过程中,无论是单机部署还是集群化部署,简单性一致是Prometheus设计的基本原则。这本章中,我们系统学习了如果实现Prometheus下各个中间的高可用部署方式,同时给出了集中常用的高可用方案,读者可以根据自己的实际需求来选择如何部署自己的Promethues集群。 -------------------------------------------------------------------------------- /ha/scale-prometheus-with-federation.md: -------------------------------------------------------------------------------- 1 | # 联邦集群 2 | 3 | 通过Remote Storage可以分离监控样本采集和数据存储,解决Prometheus的持久化问题。这一部分会重点讨论如何利用联邦集群特性对Prometheus进行扩展,以适应不同监控规模的变化。 4 | 5 | ## 使用联邦集群 6 | 
7 | 对于大部分监控规模而言,我们只需要在每一个数据中心(例如:EC2可用区,Kubernetes集群)安装一个Prometheus Server实例,就可以在各个数据中心处理上千规模的集群。同时将Prometheus Server部署到不同的数据中心可以避免网络配置的复杂性。 8 | 9 | ![联邦集群](./static/prometheus_feradtion.png) 10 | 11 | 如上图所示,在每个数据中心部署单独的Prometheus Server,用于采集当前数据中心的监控数据,并由一个中心的Prometheus Server负责聚合多个数据中心的监控数据。这一特性在Prometheus中称为联邦集群。 12 | 13 | 联邦集群的核心在于每一个Prometheus Server都包含一个用于获取当前实例中监控样本的接口/federate。对于中心Prometheus Server而言,无论是从其他的Prometheus实例还是Exporter实例中获取数据,实际上并没有任何差异。 14 | 15 | ``` 16 | scrape_configs: 17 | - job_name: 'federate' 18 | scrape_interval: 15s 19 | honor_labels: true 20 | metrics_path: '/federate' 21 | params: 22 | 'match[]': 23 | - '{job="prometheus"}' 24 | - '{__name__=~"job:.*"}' 25 | - '{__name__=~"node.*"}' 26 | static_configs: 27 | - targets: 28 | - '192.168.77.11:9090' 29 | - '192.168.77.12:9090' 30 | ``` 31 | 32 | 为了有效地减少不必要的时间序列传输,可以通过params参数指定只获取某些时间序列的样本数据,例如: 33 | 34 | ``` 35 | "http://192.168.77.11:9090/federate?match[]={job%3D"prometheus"}&match[]={__name__%3D~"job%3A.*"}&match[]={__name__%3D~"node.*"}" 36 | ``` 37 | 38 | 通过URL中的match[]参数,我们可以指定需要获取的时间序列。match[]参数必须是一个瞬时向量选择器,例如up或者{job="api-server"}。可以配置多个match[]参数,用于获取多组时间序列的监控数据。 39 | 40 | **honor_labels**设置为true可以确保当采集到的标签与Prometheus本地配置的标签(如job、instance)发生冲突时,保留采集到的原始标签,这对于联邦场景是必要的;如果设置为false,Prometheus会自动将冲突的标签重命名为带“exported_”前缀的形式。 41 | 42 | ## 功能分区 43 | 44 | 联邦集群的特性可以帮助用户根据不同的监控规模对Prometheus部署架构进行调整。例如,如下所示,可以在各个数据中心中部署多个Prometheus Server实例。每一个Prometheus Server实例只负责采集当前数据中心中的一部分任务(Job),例如可以将不同的监控任务分离到不同的Prometheus实例当中,再由中心Prometheus实例进行聚合。 45 | 46 | ![功能分区](./static/prometheus_feradtion_2.png) 47 | 48 | 功能分区,即通过联邦集群的特性在任务级别对Prometheus采集任务进行划分,以支持规模的扩展。 49 | -------------------------------------------------------------------------------- /ha/static/alertmanager-features.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/ha/static/alertmanager-features.png -------------------------------------------------------------------------------- /ha/static/alertmanager-gossip-ha.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/ha/static/alertmanager-gossip-ha.png -------------------------------------------------------------------------------- /ha/static/am-gossip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/ha/static/am-gossip.png -------------------------------------------------------------------------------- /ha/static/am-ha-status.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/ha/static/am-ha-status.png -------------------------------------------------------------------------------- /ha/static/am-notifi-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/ha/static/am-notifi-pipeline.png -------------------------------------------------------------------------------- /ha/static/gossip-protoctl.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/ha/static/gossip-protoctl.png -------------------------------------------------------------------------------- /ha/static/prom-ha-with-am-gossip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/ha/static/prom-ha-with-am-gossip.png -------------------------------------------------------------------------------- /ha/static/prom-ha-with-double-am.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/ha/static/prom-ha-with-double-am.png -------------------------------------------------------------------------------- /ha/static/prom-ha-with-single-am.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/ha/static/prom-ha-with-single-am.png -------------------------------------------------------------------------------- /ha/static/prometheus-ha-remote-storage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/ha/static/prometheus-ha-remote-storage.png -------------------------------------------------------------------------------- /ha/static/prometheus-ha-rs-fedreation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/ha/static/prometheus-ha-rs-fedreation.png -------------------------------------------------------------------------------- /ha/static/prometheus_feradtion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/ha/static/prometheus_feradtion.png -------------------------------------------------------------------------------- /ha/static/prometheus_feradtion_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/ha/static/prometheus_feradtion_2.png -------------------------------------------------------------------------------- /ha/static/promethues-alertmanager-ha.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/ha/static/promethues-alertmanager-ha.png -------------------------------------------------------------------------------- /ha/static/promethues-ha-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/ha/static/promethues-ha-01.png -------------------------------------------------------------------------------- /ha/static/promethues-remote-storage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/ha/static/promethues-remote-storage.png 
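结合前文“功能分区”一节的内容,这里补充一个最简的配置示意,展示如何将不同的采集任务(Job)拆分到两个Prometheus实例中,再由中心实例通过/federate接口聚合。示例中的实例地址、任务名称与目标地址均为假设:

```
# 实例一(192.168.77.11):只负责基础设施相关的采集任务
scrape_configs:
  - job_name: 'node'
    static_configs:
      - targets: ['host-01:9100', 'host-02:9100']

# 实例二(192.168.77.12):只负责业务应用相关的采集任务
scrape_configs:
  - job_name: 'app'
    static_configs:
      - targets: ['app-01:8080', 'app-02:8080']
```

中心Prometheus实例则可以复用前文的federate采集任务,将match[]参数设置为形如'{job=~"node|app"}'的选择器,即可按任务维度聚合两个实例中的监控数据。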
-------------------------------------------------------------------------------- /ha/static/promethues-sharding-targets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/ha/static/promethues-sharding-targets.png -------------------------------------------------------------------------------- /ha/static/remote-storage-paths.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/ha/static/remote-storage-paths.png -------------------------------------------------------------------------------- /ha/static/remote-write-path-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/ha/static/remote-write-path-2.png -------------------------------------------------------------------------------- /ha/static/remote_read_path-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/ha/static/remote_read_path-2.png -------------------------------------------------------------------------------- /kubernetes/READMD.md: -------------------------------------------------------------------------------- 1 | # 第8章 Kubernetes监控实战 2 | 3 | Kubernetes是一款由Google开发的开源的容器编排工具,在Google已经使用超过15年。作为容器领域事实上的标准,Kubernetes可以极大地简化应用的管理和部署复杂度。本章中,我们将介绍Kubernetes的一些基本概念,并且从0开始利用Prometheus构建一个完整的Kubernetes集群监控系统。同时我们还将学习如何通过Prometheus Operator简化在Kubernetes下部署和管理Prometheus的过程。 4 | 5 | 本章的主要内容: 6 | 7 | * 理解Kubernetes的工作机制 8 | * Prometheus在Kubernetes下的服务发现机制 9 | * 监控Kubernetes集群状态 10 | * 监控集群基础设施 11 | * 监控集群应用容器资源使用情况 12 | * 监控用户部署的应用程序 13 | * 对Service和Ingress进行网络探测 14 | * 通过Operator高效管理和部署在Kubernetes集群中的Prometheus -------------------------------------------------------------------------------- /kubernetes/SUMMARY.md: -------------------------------------------------------------------------------- 1 | # 小结 2 | 3 | Kubernetes与Prometheus有着十分相似的历程,均源自Google内部多年的运维经验,并且相继从CNCF基金会正式毕业。它们分别代表了云原生模式下容器编排以及监控的事实标准。 -------------------------------------------------------------------------------- /kubernetes/hap-with-prometheus.md: -------------------------------------------------------------------------------- 1 | # 基于Prometheus的弹性伸缩 2 | 3 | 弹性伸缩(AutoScaling)是指应用可以根据当前的资源使用情况自动水平扩容或者缩容的能力。 -------------------------------------------------------------------------------- /kubernetes/prometheus-with-kubernetes.md: -------------------------------------------------------------------------------- 1 | # Prometheus与Kubernetes 2 | 3 | Kubernetes作为开源的容器编排工具,为用户提供了一个可以统一调度,统一管理的云操作系统,它解决的是用户应用程序如何运行的问题。而一旦在生产环境中大量基于Kubernetes部署和管理应用程序后,作为系统管理员,还需要充分了解应用程序以及Kubernetes集群服务的运行质量,通过对应用以及集群运行状态数据的收集和分析,持续优化和改进,从而提供一个安全可靠的生产运行环境。 这一小节中我们将讨论当使用Kubernetes时的监控策略该如何设计。 4 | 5 | ## Kubernetes架构 6 | 7 | 为了能够更好地理解Kubernetes下的监控体系,我们需要了解Kubernetes的基本架构,如下所示,是Kubernetes的架构示意图: 8 | 9 | ![Kubernetes架构](./static/kubernetes-artch-overview.png) 10 | 11 | Kubernetes的核心组件主要由两部分组成:Master组件和Node组件,其中Master组件提供了集群层面的管理功能,它们负责响应用户请求,处理集群事件,并且对集群资源进行统一的调度和管理。Node组件会运行在集群的所有节点上,它们负责管理和维护节点中运行的Pod,为Kubernetes集群提供运行时环境。 12 | 13 | Master组件主要包括: 14 | 15 | * kube-apiserver:负责对外暴露Kubernetes API; 16 | * etcd:用于存储Kubernetes集群的所有数据; 17 | * kube-scheduler:
负责为新创建的Pod选择可供其运行的节点; 18 | * kube-controller-manager: 包含Node Controller,Deployment Controller,Endpoint Controller等等,通过与apiserver交互使相应的资源达到预期状态。 19 | 20 | Node组件主要包括: 21 | 22 | * kubelet:负责维护和管理节点上Pod的运行状态; 23 | * kube-proxy:负责维护主机上的网络规则以及转发。 24 | * Container Runtime:如Docker,rkt,runc等提供容器运行时环境。 25 | 26 | ## 监控Kubernetes 27 | 28 | 从物理结构上讲Kubernetes主要用于整合和管理底层的基础设施资源,对外提供应用容器的自动化部署和管理能力,这些基础设施可能是物理机、虚拟机、云主机等等。因此,基础资源的使用直接影响当前集群的容量和应用的状态。在这部分,我们需要关注集群中各个节点的主机负载,CPU使用率、内存使用率、存储空间以及网络吞吐等监控指标。 29 | 30 | 从自身架构上讲,kube-apiserver是Kubernetes提供所有服务的入口,无论是外部的客户端还是集群内部的组件都直接与kube-apiserver进行通讯。因此,kube-apiserver的并发和吞吐量直接决定了集群性能的好坏。其次,对于外部用户而言,Kubernetes是否能够快速的完成pod的调度以及启动,是影响其使用体验的关键因素。而这个过程主要由kube-scheduler负责完成调度工作,而kubelet完成pod的创建和启动工作。因此在Kubernetes集群本身我们需要评价其自身的服务质量,主要关注在Kubernetes的API响应时间,以及Pod的启动时间等指标上。 31 | 32 | Kubernetes的最终目标还是需要为业务服务,因此我们还需要能够监控应用容器的资源使用情况。对于内置了对Prometheus支持的应用程序,也要支持从这些应用程序中采集内部的监控指标。最后,结合黑盒监控模式,对集群中部署的服务进行探测,从而当应用发生故障后,能够快速处理和恢复。 33 | 34 | 因此,在不考虑Kubernetes自身组件的情况下,如果要构建一个完整的监控体系,我们应该考虑,以下5个方面: 35 | 36 | * 集群节点状态监控:从集群中各节点的kubelet服务获取节点的基本运行状态; 37 | * 集群节点资源用量监控:通过Daemonset的形式在集群中各个节点部署Node Exporter采集节点的资源使用情况; 38 | * 节点中运行的容器监控:通过各个节点中kubelet内置的cAdvisor中获取个节点中所有容器的运行状态和资源使用情况; 39 | * 从黑盒监控的角度在集群中部署Blackbox Exporter探针服务,检测Service和Ingress的可用性; 40 | * 如果在集群中部署的应用程序本身内置了对Prometheus的监控支持,那么我们还应该找到相应的Pod实例,并从该Pod实例中获取其内部运行状态的监控指标。 41 | -------------------------------------------------------------------------------- /kubernetes/static/k8s-sd-with-node-with-relabel-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/kubernetes/static/k8s-sd-with-node-with-relabel-1.png -------------------------------------------------------------------------------- /kubernetes/static/k8s-sd-with-node-with-relabel-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/kubernetes/static/k8s-sd-with-node-with-relabel-2.png -------------------------------------------------------------------------------- /kubernetes/static/k8s-service-endpoints.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/kubernetes/static/k8s-service-endpoints.png -------------------------------------------------------------------------------- /kubernetes/static/kubelet_pod_start_latency_microseconds (1).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/kubernetes/static/kubelet_pod_start_latency_microseconds (1).png -------------------------------------------------------------------------------- /kubernetes/static/kubelet_pod_start_latency_microseconds.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/kubernetes/static/kubelet_pod_start_latency_microseconds.png -------------------------------------------------------------------------------- /kubernetes/static/kubelet_pod_start_latency_microseconds_avg (1).png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/kubernetes/static/kubelet_pod_start_latency_microseconds_avg (1).png -------------------------------------------------------------------------------- /kubernetes/static/kubelet_pod_start_latency_microseconds_avg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/kubernetes/static/kubelet_pod_start_latency_microseconds_avg.png -------------------------------------------------------------------------------- /kubernetes/static/kubernetes-apiservers-monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/kubernetes/static/kubernetes-apiservers-monitor.png -------------------------------------------------------------------------------- /kubernetes/static/kubernetes-app-model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/kubernetes/static/kubernetes-app-model.png -------------------------------------------------------------------------------- /kubernetes/static/kubernetes-artch-overview (1).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/kubernetes/static/kubernetes-artch-overview (1).png -------------------------------------------------------------------------------- /kubernetes/static/kubernetes-artch-overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/kubernetes/static/kubernetes-artch-overview.png -------------------------------------------------------------------------------- /kubernetes/static/kubernetes-dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/kubernetes/static/kubernetes-dashboard.png -------------------------------------------------------------------------------- /kubernetes/static/kubernetes-kubelets-step2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/kubernetes/static/kubernetes-kubelets-step2.png -------------------------------------------------------------------------------- /kubernetes/static/kubernetes-kubelets-step3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/kubernetes/static/kubernetes-kubelets-step3.png -------------------------------------------------------------------------------- /kubernetes/static/kubernetes-prometheus-step1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/kubernetes/static/kubernetes-prometheus-step1.png -------------------------------------------------------------------------------- 
/kubernetes/static/kubernetes-service-endpoints-sd-targets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/kubernetes/static/kubernetes-service-endpoints-sd-targets.png -------------------------------------------------------------------------------- /kubernetes/static/kubernetes-service-endpoints-sd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/kubernetes/static/kubernetes-service-endpoints-sd.png -------------------------------------------------------------------------------- /kubernetes/static/kubernetes_service_endpoints.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/kubernetes/static/kubernetes_service_endpoints.png -------------------------------------------------------------------------------- /kubernetes/static/nginx-home-page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/kubernetes/static/nginx-home-page.png -------------------------------------------------------------------------------- /kubernetes/static/pre-ccm-arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/kubernetes/static/pre-ccm-arch.png -------------------------------------------------------------------------------- /kubernetes/static/prometheus-alert-cluster-status.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/kubernetes/static/prometheus-alert-cluster-status.png -------------------------------------------------------------------------------- /kubernetes/static/prometheus-alerting-auto2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/kubernetes/static/prometheus-alerting-auto2.png -------------------------------------------------------------------------------- /kubernetes/static/prometheus-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/kubernetes/static/prometheus-architecture.png -------------------------------------------------------------------------------- /kubernetes/static/prometheus-cadvisor-step1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/kubernetes/static/prometheus-cadvisor-step1.png -------------------------------------------------------------------------------- /kubernetes/static/prometheus-cadvisor-step2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/kubernetes/static/prometheus-cadvisor-step2.png 
-------------------------------------------------------------------------------- /kubernetes/static/prometheus-config-with-servermonitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/kubernetes/static/prometheus-config-with-servermonitor.png -------------------------------------------------------------------------------- /kubernetes/static/prometheus-k8s-sd-example1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/kubernetes/static/prometheus-k8s-sd-example1.png -------------------------------------------------------------------------------- /kubernetes/static/prometheus-k8s-sd-example3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/kubernetes/static/prometheus-k8s-sd-example3.png -------------------------------------------------------------------------------- /kubernetes/static/prometheus-operator-instance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/kubernetes/static/prometheus-operator-instance.png -------------------------------------------------------------------------------- /kubernetes/static/prometheus-operator-targets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/kubernetes/static/prometheus-operator-targets.png -------------------------------------------------------------------------------- /kubernetes/static/prometheus-pods-sd-ex1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/kubernetes/static/prometheus-pods-sd-ex1.png -------------------------------------------------------------------------------- /kubernetes/static/prometheus-rule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/kubernetes/static/prometheus-rule.png -------------------------------------------------------------------------------- /kubernetes/static/promethues-api-server-sd.eq1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/kubernetes/static/promethues-api-server-sd.eq1.png -------------------------------------------------------------------------------- /kubernetes/use-alertmanager-operator.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/kubernetes/use-alertmanager-operator.md -------------------------------------------------------------------------------- /kubernetes/use-prometheus-monitor-k8s-cluster-state.md: -------------------------------------------------------------------------------- 1 | # 监控集群状态 2 | 3 | 
当使用Kubernetes管理一个多节点的集群中,除了需要关注集群中部署应用的运行状态和节点的资源使用情况以外,我们还应该关注Kubernetes本身的状态。Kubernetes作为一个中央化的任务调度系统,我们希望它能够相对较快的完成对用户操作的响应。在这一小节中,我们将利用Prometheus监控Kubernetes API的响应时间,从而评估当前集群的运行状态以及性能。 4 | 5 | ## 使用Prometheus采集API Server监控数据 6 | 7 | 在Kubernetes集群中命名空间default中会包含一个名为kubernetes的默认Service: 8 | 9 | ``` 10 | $ kubectl get svc kubernetes 11 | NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE 12 | kubernetes ClusterIP 10.96.0.1 443/TCP 133d 13 | ``` 14 | 15 | 该Service实际指向的是Kubernetes组件apiserver提供的服务: 16 | 17 | ``` 18 | $ kubectl get endpoints kubernetes 19 | NAME ENDPOINTS AGE 20 | kubernetes 10.0.2.15:8443 133d 21 | ``` 22 | 23 | Apiserver组件内置了对Prometheus的支持,因此只要通过CA证书和令牌访问[https://kubernetes.default.svc:443/metrics](https://kubernetes.default.svc:443/metrics)即可获取apiserver组件中记录的所有监控样数据。 24 | 25 | 了解以上基础知识以后,我们只需要对应修改Prometheus的配置文件即可。修改prometheus-config.yml文件,为Pometheus配置文件添加以下内容: 26 | 27 | ``` 28 | - job_name: 'kubernetes-apiservers' 29 | kubernetes_sd_configs: 30 | - role: endpoints 31 | scheme: https 32 | tls_config: 33 | ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt 34 | bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token 35 | relabel_configs: 36 | - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] 37 | action: keep 38 | regex: default;kubernetes;https 39 | - target_label: __address__ 40 | replacement: kubernetes.default.svc:443 41 | ``` 42 | 43 | 这里我们添加了一个新的监控采集任务kubernetes-apiservers,该任务基于endpoints模式获取当前集群中的所有endpoints,并且只保留default命名空间下的服务名称为kubernetes的实例作为监控对象。 由于基于ServiceAccount提供的CA证书中,并不包含Endpoint的地址,因此这里还需要将默认的```__address__```替换为集群内的DNS地址kubernetes.default.svc。 44 | 45 | 基于以上服务发现以及relabel的过程后,Prometheus就能够正常的从apiserver中过去监控样本数据: 46 | 47 | ![Kubernetes APIServer任务采集状态](./static/kubernetes-apiservers-monitor.png) 48 | 49 | ## 评估Kubernetes性能 50 | 51 | 当Prometheus能够从Kubernetes的APIServer中获取监控样本数据后,就可以对当前Kubernetes集群的性能做出评估。无论是Kubernetes的自身组件还是客户端请求都需要经过Kubernetes的apiserver,因此在评估Kubernetes性能时,我们首先需要关注Kubernetes的API响应时间。对于Pod启动时间可以通过指标kubelet_pod_start_latency_microseconds获取。 52 | 53 | 例如,通过以下PromQL获取当前集群99%的Pod启动时间大致在18.40s以内: 54 | 55 | ``` 56 | kubelet_pod_start_latency_microseconds{quantile="0.99"} 57 | ``` 58 | 59 | ![99%的Pod启动时间](./static/kubelet_pod_start_latency_microseconds.png) 60 | 61 | Pod平均启动时间大致为42s左右(包含镜像下载时间): 62 | 63 | ``` 64 | kubelet_pod_start_latency_microseconds_sum / kubelet_pod_start_latency_microseconds_count 65 | ``` 66 | 67 | ![Pod平均启动时间](./static/kubelet_pod_start_latency_microseconds_avg.png) 68 | 69 | 其次,对于用户而言,他们更关注通过容器启动服务所需的时间,因此,第二个关键指标即Pod的启动时间。指标apiserver_request_latencies_summary和apiserver_request_latencies_bucket均可用于统计以下各种类型API响应时间的分布情况: 70 | 71 | |Action|Resources| 72 | |-|-| 73 | |PUT|Pods, Nodes, Deployments, DaemonSets等| 74 | |POST|Pods, Nodes, Deployments, DaemonSets等| 75 | |LIST|Pods, Nodes, Deployments, DaemonSets等| 76 | |GET|Pods, Nodes, Deployments, DaemonSets等| -------------------------------------------------------------------------------- /operator/README.md: -------------------------------------------------------------------------------- 1 | # 第9章 Prometheus Operator 2 | 3 | 本章,我们将介绍如何使用Prometheus Operator简化在Kubernetes下部署和管理Prmetheus的复杂度。 4 | 5 | 本章的主要内容: 6 | 7 | * 为什么需要使用Prometheus Operator 8 | * Prometheus Operator的主要概念 9 | * 如何利用Prometheus Operator自动化运维Prometheus 10 | * 如何使用Prometheus Operator自动化管理监控配置 -------------------------------------------------------------------------------- /operator/SUMMARY.md: 
-------------------------------------------------------------------------------- 1 | # 小结 2 | 3 | 在本章中,我们介绍了在Kubernetes下如何使用Operator来有状态的运维和管理Prometheus以及Alertmanager等组件。 -------------------------------------------------------------------------------- /operator/gs/alertmanager-inst.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: Alertmanager 3 | metadata: 4 | name: inst 5 | namespace: monitoring 6 | spec: 7 | replicas: 3 -------------------------------------------------------------------------------- /operator/gs/alertmanager.yaml: -------------------------------------------------------------------------------- 1 | global: 2 | resolve_timeout: 5m 3 | route: 4 | group_by: ['job'] 5 | group_wait: 30s 6 | group_interval: 5m 7 | repeat_interval: 12h 8 | receiver: 'webhook' 9 | receivers: 10 | - name: 'webhook' 11 | webhook_configs: 12 | - url: 'http://alertmanagerwh:30500/' -------------------------------------------------------------------------------- /operator/gs/example-app-service-monitor.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: ServiceMonitor 3 | metadata: 4 | name: example-app 5 | namespace: monitoring 6 | labels: 7 | team: frontend 8 | spec: 9 | namespaceSelector: 10 | matchNames: 11 | - default 12 | selector: 13 | matchLabels: 14 | app: example-app 15 | endpoints: 16 | - port: web -------------------------------------------------------------------------------- /operator/gs/example-app.yaml: -------------------------------------------------------------------------------- 1 | kind: Service 2 | apiVersion: v1 3 | metadata: 4 | name: example-app 5 | labels: 6 | app: example-app 7 | spec: 8 | selector: 9 | app: example-app 10 | ports: 11 | - name: web 12 | port: 8080 13 | --- 14 | apiVersion: extensions/v1beta1 15 | kind: Deployment 16 | metadata: 17 | name: example-app 18 | spec: 19 | replicas: 3 20 | template: 21 | metadata: 22 | labels: 23 | app: example-app 24 | spec: 25 | containers: 26 | - name: example-app 27 | image: fabxc/instrumented_app 28 | ports: 29 | - name: web 30 | containerPort: 8080 -------------------------------------------------------------------------------- /operator/gs/example-rule.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | prometheus: example 6 | role: alert-rules 7 | name: prometheus-example-rules 8 | spec: 9 | groups: 10 | - name: ./example.rules 11 | rules: 12 | - alert: ExampleAlert 13 | expr: vector(1) -------------------------------------------------------------------------------- /operator/gs/prometheus-inst-cc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: Prometheus 3 | metadata: 4 | name: inst-cc 5 | namespace: monitoring 6 | spec: 7 | serviceAccountName: prometheus 8 | resources: 9 | requests: 10 | memory: 400Mi -------------------------------------------------------------------------------- /operator/gs/prometheus-inst.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: Prometheus 3 | metadata: 4 | name: inst 5 | namespace: monitoring 6 | spec: 7 | serviceAccountName: prometheus 8 | serviceMonitorSelector: 9 | matchLabels: 10 | team: frontend 11 | 
11 |   ruleSelector:
12 |     matchLabels:
13 |       role: alert-rules
14 |       prometheus: example
15 |   alerting:
16 |     alertmanagers:
17 |     - name: alertmanager-example
18 |       namespace: monitoring
19 |       port: web
20 |   resources:
21 |     requests:
22 |       memory: 400Mi
--------------------------------------------------------------------------------
/operator/gs/prometheus-rbac.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ServiceAccount
3 | metadata:
4 |   name: prometheus
5 |   namespace: monitoring
6 | ---
7 | apiVersion: rbac.authorization.k8s.io/v1beta1
8 | kind: ClusterRole
9 | metadata:
10 |   name: prometheus
11 | rules:
12 | - apiGroups: [""]
13 |   resources:
14 |   - nodes
15 |   - services
16 |   - endpoints
17 |   - pods
18 |   verbs: ["get", "list", "watch"]
19 | - apiGroups: [""]
20 |   resources:
21 |   - configmaps
22 |   verbs: ["get"]
23 | - nonResourceURLs: ["/metrics"]
24 |   verbs: ["get"]
25 | ---
26 | apiVersion: rbac.authorization.k8s.io/v1beta1
27 | kind: ClusterRoleBinding
28 | metadata:
29 |   name: prometheus
30 | roleRef:
31 |   apiGroup: rbac.authorization.k8s.io
32 |   kind: ClusterRole
33 |   name: prometheus
34 | subjects:
35 | - kind: ServiceAccount
36 |   name: prometheus
37 |   namespace: monitoring
--------------------------------------------------------------------------------
/operator/gs/prometheus.yaml:
--------------------------------------------------------------------------------
1 | global:
2 |   scrape_interval: 10s
3 |   scrape_timeout: 10s
4 |   evaluation_interval: 10s
--------------------------------------------------------------------------------
/operator/static/operator-01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/operator/static/operator-01.png
--------------------------------------------------------------------------------
/operator/static/prometheus-alert-cluster-status.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/operator/static/prometheus-alert-cluster-status.png
--------------------------------------------------------------------------------
/operator/static/prometheus-alerting-auto2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/operator/static/prometheus-alerting-auto2.png
--------------------------------------------------------------------------------
/operator/static/prometheus-architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/operator/static/prometheus-architecture.png
--------------------------------------------------------------------------------
/operator/static/prometheus-rule.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yunlzheng/prometheus-book/f1c3dff674f9495f2a6793f66c909f773f9bce0d/operator/static/prometheus-rule.png
--------------------------------------------------------------------------------
/operator/use-custom-configuration-in-operator.md:
--------------------------------------------------------------------------------
1 | # Using a custom configuration with Prometheus Operator
2 | 
3 | With Prometheus
Operator, we declaratively create custom resource types such as Prometheus and ServiceMonitor to automate the deployment and management of the Prometheus components and their configuration. In some special situations, however, users may still prefer to manage the Prometheus configuration file by hand rather than have Prometheus Operator generate it. Why? The scrape job configuration managed by Prometheus Operator only covers applications that are deployed and managed inside Kubernetes. If you want Prometheus to also monitor other resources, for example infrastructure or applications running on AWS or another platform, that is outside the scope of what Prometheus Operator can express.
4 | 
5 | To use a custom configuration file with a Prometheus instance created through Prometheus Operator, we have to create a Prometheus instance that contains no definition related to the configuration file contents at all:
6 | 
7 | ```
8 | apiVersion: monitoring.coreos.com/v1
9 | kind: Prometheus
10 | metadata:
11 |   name: inst-cc
12 |   namespace: monitoring
13 | spec:
14 |   serviceAccountName: prometheus
15 |   resources:
16 |     requests:
17 |       memory: 400Mi
18 | ```
19 | 
20 | Save the above content to a file named prometheus-inst-cc.yaml and create it with kubectl:
21 | 
22 | ```
23 | $ kubectl -n monitoring create -f prometheus-inst-cc.yaml
24 | prometheus.monitoring.coreos.com/inst-cc created
25 | ```
26 | 
27 | If we inspect the YAML definition of the newly created Prometheus Pod, we can see that it contains a volume definition like this:
28 | 
29 | ```
30 | volumes:
31 | - name: config
32 |   secret:
33 |     defaultMode: 420
34 |     secretName: prometheus-inst-cc
35 | ```
36 | 
37 | The Prometheus configuration file is in fact stored in a Secret whose name is `prometheus-` followed by the instance name (prometheus-inst-cc in this case). When the Prometheus resource references definitions that affect the configuration file content, such as ServiceMonitor, Prometheus Operator manages this Secret automatically. If the Prometheus definition contains nothing related to configuration, responsibility for managing the Secret falls to the user.
38 | 
39 | By modifying the content of prometheus-inst-cc, the user can therefore supply a custom Prometheus configuration file. As an example, create a prometheus.yaml file with the following content:
40 | 
41 | ```
42 | global:
43 |   scrape_interval: 10s
44 |   scrape_timeout: 10s
45 |   evaluation_interval: 10s
46 | ```
47 | 
48 | Generate the base64-encoded form of the file content:
49 | 
50 | ```
51 | $ cat prometheus.yaml | base64
52 | Z2xvYmFsOgogIHNjcmFwZV9pbnRlcnZhbDogMTBzCiAgc2NyYXBlX3RpbWVvdXQ6IDEwcwogIGV2YWx1YXRpb25faW50ZXJ2YWw6IDEwcw==
53 | ```
54 | 
55 | Then edit the Secret named prometheus-inst-cc as shown below:
56 | 
57 | ```
58 | $ kubectl -n monitoring edit secret prometheus-inst-cc
59 | # other content omitted
60 | data:
61 |   prometheus.yaml: "Z2xvYmFsOgogIHNjcmFwZV9pbnRlcnZhbDogMTBzCiAgc2NyYXBlX3RpbWVvdXQ6IDEwcwogIGV2YWx1YXRpb25faW50ZXJ2YWw6IDEwcw=="
62 | ```
63 | 
64 | Finally, use port-forward to access the new Prometheus instance locally and confirm that the configuration has changed:
65 | 
66 | ```
67 | kubectl -n monitoring port-forward statefulsets/prometheus-inst-cc 9091:9090
68 | ```
69 | 
70 | 
--------------------------------------------------------------------------------
/operator/what-is-prometheus-operator.md:
--------------------------------------------------------------------------------
1 | # What is Prometheus Operator
2 | 
3 | In Chapter 8, in order to manage and deploy Prometheus conveniently on Kubernetes, we used a ConfigMap to manage the Prometheus configuration file. Every time the configuration was upgraded we had to remove the running Pod by hand so that Kubernetes would recreate Prometheus with the latest configuration. As the number of instances grows, deploying and upgrading Prometheus manually like this becomes tedious and inefficient.
4 | 
5 | Prometheus is essentially a typical stateful application, and it comes with its own particular ways of being operated and configured, none of which can be automated with the application management concepts Kubernetes provides natively. To reduce the management complexity of this kind of application, CoreOS pioneered the concept of the Operator, first releasing the Etcd Operator for running and managing Etcd on Kubernetes, and later the Prometheus Operator.
6 | 
7 | ## How Prometheus Operator works
8 | 
9 | Conceptually, an Operator targets the management of one specific application: building on the basic Kubernetes notions of Resource and Controller, it extends the Kubernetes API to help users create, configure and manage complex stateful applications, automating that application's common operational tasks.
10 | 
11 | In Kubernetes we use Deployment, DaemonSet and StatefulSet to manage application workloads, Service and Ingress to manage how applications are accessed, and ConfigMap and Secret to manage application configuration. Creating, updating or deleting any of these resources in the cluster is turned into events (Events), and the Kubernetes Controller Manager listens for these events and triggers the corresponding tasks to satisfy the user's expectations. This approach is called declarative: the user only cares about the desired final state of the application and Kubernetes takes care of the rest, which greatly reduces the complexity of managing application configuration.
12 | 
13 | Besides these native resources, Kubernetes also allows users to add their own Custom Resources, and to extend Kubernetes by implementing custom Controllers for them.
14 | 
15 | The architecture of Prometheus Operator is shown below:
16 | 
17 | ![Prometheus Operator architecture](./static/prometheus-architecture.png)
18 | 
19 | In essence, Prometheus Operator is a set of user-defined CRD resources together with a Controller implementation: the Operator watches these custom resources for changes and, based on their definitions, automatically manages the Prometheus server itself as well as its configuration.
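
Concretely, these custom resource types are registered in the cluster as CustomResourceDefinitions. Once the Operator has been installed (the deployment steps are shown below), they can be listed with kubectl; the exact columns and the set of CRDs depend on the Operator version, but the output will look roughly like this:

```
$ kubectl get crd
NAME                                    ...
alertmanagers.monitoring.coreos.com     ...
prometheuses.monitoring.coreos.com      ...
prometheusrules.monitoring.coreos.com   ...
servicemonitors.monitoring.coreos.com   ...
```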
20 | 
21 | ## What can Prometheus Operator do
22 | 
23 | To understand what Prometheus Operator can do is really to understand which custom Kubernetes resources it provides. Prometheus Operator currently provides the following four kinds of resources:
24 | 
25 | * Prometheus: declaratively creates and manages Prometheus Server instances;
26 | * ServiceMonitor: declaratively manages monitoring (scrape) configuration;
27 | * PrometheusRule: declaratively manages alerting configuration;
28 | * Alertmanager: declaratively creates and manages Alertmanager instances.
29 | 
30 | In short, Prometheus Operator helps users automate the creation and management of Prometheus Server and its associated configuration.
31 | 
32 | ## Deploying Prometheus Operator in a Kubernetes cluster
33 | 
34 | Installing Prometheus Operator on Kubernetes is straightforward. The source code can be obtained from the following address:
35 | 
36 | ```
37 | git clone https://github.com/coreos/prometheus-operator.git
38 | ```
39 | 
40 | Here we create a dedicated namespace, monitoring, for Prometheus Operator:
41 | 
42 | ```
43 | kubectl create namespace monitoring
44 | ```
45 | 
46 | Prometheus Operator needs RBAC authorization, and the default bundle.yaml uses the default namespace, so before installing it, replace every namespace definition in bundle.yaml, changing default to monitoring. Then install the Prometheus Operator Deployment by running the following command:
47 | 
48 | ```
49 | $ kubectl -n monitoring apply -f bundle.yaml
50 | clusterrolebinding.rbac.authorization.k8s.io/prometheus-operator created
51 | clusterrole.rbac.authorization.k8s.io/prometheus-operator created
52 | deployment.apps/prometheus-operator created
53 | serviceaccount/prometheus-operator created
54 | service/prometheus-operator created
55 | ```
56 | 
57 | Prometheus Operator is deployed as a Deployment; to allow it to watch and manage Kubernetes resources, a dedicated ServiceAccount and the corresponding RBAC bindings are created as well.
58 | 
59 | Check the deployment status of Prometheus Operator to make sure it is running properly:
60 | 
61 | ```
62 | $ kubectl -n monitoring get pods
63 | NAME                                   READY     STATUS    RESTARTS   AGE
64 | prometheus-operator-6db8dbb7dd-2hz55   1/1       Running   0          19s
65 | ```
66 | 
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "prometheus-in-action",
3 |   "version": "1.0.0",
4 |   "description": "``` npm install -g gitbook-cli ```",
5 |   "main": "index.js",
6 |   "scripts": {
7 |     "build": "gitbook build",
8 |     "start": "gitbook serve",
9 |     "installPlugins": "gitbook install",
10 |     "pdf": "gitbook pdf"
11 |   },
12 |   "repository": {
13 |     "type": "git",
14 |     "url": "git@gitee.com:moo/prometheus-in-action.git"
15 |   },
16 |   "author": "",
17 |   "license": "MIT",
18 |   "dependencies": {
19 |     "gitbook-cli": "^2.3.2",
20 |     "gitbook-plugin-anchor-navigation": "0.0.1",
21 |     "gitbook-plugin-image-captions": "^3.1.0",
22 |     "gitbook-plugin-multipart": "^0.3.0",
23 |     "gitbook-plugin-recently-updated": "github:jwarby/gitbook-plugin-recently-updated",
24 |     "gitbook-plugin-sectionx": "^3.1.0",
25 |     "gitbook-plugin-splitter": "^0.0.8",
26 |     "svgexport": "^0.3.2"
27 |   }
28 | }
29 | 
--------------------------------------------------------------------------------
/prometheus/prometheus_consul.yml:
--------------------------------------------------------------------------------
1 | global:
2 |   scrape_interval: 5s
3 |   scrape_timeout: 5s
4 |   evaluation_interval: 15s
5 | alerting:
6 |   alertmanagers:
7 |   - static_configs:
8 |     - targets: []
9 |     scheme: http
10 |     timeout: 10s
11 | scrape_configs:
12 | - job_name: node
13 |   metrics_path: /metrics
14 |   scheme: http
15 |   consul_sd_configs:
16 |   - server: consul:8500
17 |     scheme: http
18 |     services:
19 |     - node_exporter
20 |   relabel_configs:
21 |   - source_labels: ["__meta_consul_dc"]
22 |     regex: "(.*)"
23 |     replacement: $1
24 |     action: replace
25 |     target_label: "dc"
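  # (Explanatory note, not part of the original file.) The rule above copies the
  # Consul datacenter meta label __meta_consul_dc into a "dc" label on every
  # target; the rule below keeps only targets whose Consul tags contain
  # "development" and drops all others.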
".*,development,.*" 28 | action: keep 29 | - job_name: jenkins 30 | metrics_path: /prometheus 31 | scheme: http 32 | consul_sd_configs: 33 | - server: consul:8500 34 | scheme: http 35 | services: 36 | - jenkins 37 | -------------------------------------------------------------------------------- /prometheus/prometheus_static.yml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 5s 3 | scrape_timeout: 5s 4 | evaluation_interval: 15s 5 | alerting: 6 | alertmanagers: 7 | - static_configs: 8 | - targets: [] 9 | scheme: http 10 | timeout: 10s 11 | scrape_configs: 12 | - job_name: prometheus 13 | metrics_path: /metrics 14 | scheme: http 15 | static_configs: 16 | - targets: 17 | - localhost:9090 18 | - job_name: node 19 | metrics_path: /metrics 20 | scheme: http 21 | static_configs: 22 | - targets: 23 | - node_exporter:9100 24 | - job_name: docker 25 | metrics_path: /metrics 26 | scheme: http 27 | static_configs: 28 | - targets: 29 | - docker_exporter:9095 30 | - job_name: cadvisor 31 | metrics_path: /metrics 32 | scheme: http 33 | static_configs: 34 | - targets: 35 | - cadvisor:8080 36 | -------------------------------------------------------------------------------- /promql/README.md: -------------------------------------------------------------------------------- 1 | # 第2章: 探索PromQL 2 | 3 | 本章将带领读者探秘Prometheus的自定义查询语言PromQL。通过PromQL用户可以非常方便地对监控样本数据进行统计分析,PromQL支持常见的运算操作符,同时PromQL中还提供了大量的内置函数可以实现对数据的高级处理。当然在学习PromQL之前,用户还需要了解Prometheus的样本数据模型。PromQL作为Prometheus的核心能力除了实现数据的对外查询和展现,同时告警监控也是依赖PromQL实现的。 4 | 5 | 本章的主要内容: 6 | 7 | * Prometheus的数据模型 8 | * Prometheus中监控指标的类型 9 | * 深入PromQL 10 | * 4个黄金指标和USE方法 11 | -------------------------------------------------------------------------------- /promql/SUMMARY.md: -------------------------------------------------------------------------------- 1 | # 小结 2 | 3 | PromQL是Prometheus的标准查询语句,通过强大的数据统计能力,使得将监控指标与实际业务进行关联成为可能。同时通过内置的预测函数,能够帮助用户将传统的面向结果转变为面向预测的方式。从而更有效的为业务和系统的正常运行保驾护航。 -------------------------------------------------------------------------------- /promql/prometheus-aggr-ops.md: -------------------------------------------------------------------------------- 1 | # PromQL聚合操作 2 | 3 | Prometheus还提供了下列内置的聚合操作符,这些操作符作用域瞬时向量。可以将瞬时表达式返回的样本数据进行聚合,形成一个新的时间序列。 4 | 5 | * ```sum``` (求和) 6 | * ```min``` (最小值) 7 | * ```max``` (最大值) 8 | * ```avg``` (平均值) 9 | * ```stddev``` (标准差) 10 | * ```stdvar``` (标准方差) 11 | * ```count``` (计数) 12 | * ```count_values``` (对value进行计数) 13 | * ```bottomk``` (后n条时序) 14 | * ```topk``` (前n条时序) 15 | * ```quantile``` (分位数) 16 | 17 | 使用聚合操作的语法如下: 18 | 19 | ``` 20 | ([parameter,] ) [without|by (