├── .circleci └── config.yml ├── CONTRIBUTING.md ├── DCO ├── LICENSE ├── README.md ├── SECURITY.md ├── build.sh ├── ci-scripts └── package.sh ├── configs ├── alertmanager-templates │ ├── default.tmpl │ └── slack.tmpl ├── grafana │ ├── grafana-net-2-dashboard.json │ ├── grafana-net-737-dashboard.json │ └── prometheus-datasource.json └── prometheus │ ├── prometheus.yaml │ └── rules │ ├── cpu-usage.rules │ ├── instance-availability.rules │ ├── low-disk-space.rules │ └── mem-usage.rules ├── development.md ├── docs ├── grafana_cluster_overview.png ├── grafana_edit_admin.png ├── index.md ├── prometheus_alerts.png └── prometheus_targets.png ├── helm └── prometheus-chart │ ├── .helmignore │ ├── Chart.yaml │ ├── OWNERS │ ├── README.md │ ├── grafana-dashboards │ ├── kubernetes-cluster.json │ ├── kubernetes-core-cluster.json │ ├── kubernetes-pods.json │ ├── logging.json │ ├── nodes-overview.json │ └── prometheus.json │ ├── prometheus-alerts │ ├── alertmanager.rules.yml │ ├── configmap.rules.yml │ ├── container.rules.yml │ ├── daemonset.rules.yml │ ├── deployment.rules.yml │ ├── disk.rules.yml │ ├── fluentbit.rules.yml │ ├── ingress-controller.rules.yml │ ├── job.rules.yml │ ├── logging-data.rules.yml │ ├── network.rules.yml │ ├── node.rules.yml │ ├── pod.rules.yml │ ├── prometheus.rules.yml │ ├── up.rules.yml │ └── volume.rules.yml │ ├── templates │ ├── NOTES.txt │ ├── _helpers.tpl │ ├── alertmanager-configmap.yaml │ ├── alertmanager-deployment.yaml │ ├── alertmanager-ingress.yaml │ ├── alertmanager-networkpolicy.yaml │ ├── alertmanager-pvc.yaml │ ├── alertmanager-service.yaml │ ├── alertmanager-serviceaccount.yaml │ ├── grafana-configmap.yaml │ ├── grafana-deployment.yaml │ ├── grafana-ingress.yaml │ ├── grafana-podsecuritypolicy.yaml │ ├── grafana-role.yaml │ ├── grafana-rolebinding.yaml │ ├── grafana-secret.yaml │ ├── grafana-service.yaml │ ├── grafana-serviceaccount.yaml │ ├── prometheus-clusterrole.yaml │ ├── prometheus-clusterrolebinding.yaml │ ├── 
prometheus-configmap.yaml │ ├── prometheus-deployment.yaml │ ├── prometheus-ingress.yaml │ ├── prometheus-networkpolicy.yaml │ ├── prometheus-pvc.yaml │ ├── prometheus-rules.yaml │ ├── prometheus-service.yaml │ └── prometheus-serviceaccount.yaml │ └── values.yaml ├── manifests-all.yaml └── manifests ├── 0-namespace.yaml ├── 01-rbac.yaml ├── alertmanager ├── alertmanager-templates.yaml ├── configmap.yaml ├── deployment.yaml └── service.yaml ├── grafana ├── deployment.yaml ├── import-dashboards │ ├── configmap.yaml │ └── job.yaml ├── ingress.yaml ├── secret.yaml └── service.yaml └── prometheus ├── configmap.yaml ├── deployment.yaml ├── kube-state-metrics ├── deployment.yaml ├── rbac.yaml └── service.yaml ├── node-directory-size-metrics └── daemonset.yaml ├── node-exporter ├── daemonset.yaml └── service.yaml ├── prometheus-rules.yaml └── service.yaml /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | jobs: 3 | lint: 4 | docker: 5 | - image: quay.io/giantswarm/shellcheck-alpine:v0.6.0 6 | steps: 7 | - checkout 8 | 9 | - run: 10 | name: lint scripts 11 | command: shellcheck -x ci-scripts/* 12 | 13 | build: 14 | working_directory: /home/circleci/.go_workspace/src/github.com/giantswarm/prometheus 15 | machine: true 16 | steps: 17 | - checkout 18 | 19 | - run: | 20 | wget -q $(curl -sS -H "Authorization: token $RELEASE_TOKEN" https://api.github.com/repos/giantswarm/architect/releases/latest | grep browser_download_url | head -n 1 | cut -d '"' -f 4) 21 | chmod +x ./architect 22 | ./architect version 23 | 24 | - run: 25 | name: Template and push to quay 26 | command: ./architect build 27 | 28 | - store_test_results: 29 | path: /tmp/results 30 | 31 | publish-github-release: 32 | docker: 33 | - image: quay.io/giantswarm/github:0.12 34 | steps: 35 | - checkout 36 | 37 | - run: 38 | name: "Package Helm Chart" 39 | command: | 40 | ./ci-scripts/package.sh ${CIRCLE_PROJECT_REPONAME} ${CIRCLE_TAG} 41 | 42 | - 
run: 43 | name: "Publish Release on GitHub" 44 | command: | 45 | ghr -t ${PERSONAL_ACCESS_TOKEN} -u giantswarm -r ${CIRCLE_PROJECT_REPONAME} -c ${CIRCLE_SHA1} ${CIRCLE_TAG} "${CIRCLE_PROJECT_REPONAME}-chart-${CIRCLE_TAG:1}.tgz" 46 | 47 | workflows: 48 | version: 2 49 | build-deploy: 50 | jobs: 51 | - lint 52 | - build: 53 | filters: 54 | tags: 55 | only: /^v\d+\.\d+\.\d+$/ 56 | - publish-github-release: 57 | requires: 58 | - build 59 | filters: 60 | branches: 61 | ignore: /.*/ 62 | tags: 63 | only: /^v\d+\.\d+\.\d+$/ 64 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to contribute 2 | 3 | Prometheus is Apache 2.0 licensed and accepts contributions via GitHub pull 4 | requests. This document outlines some of the conventions on commit message 5 | formatting, contact points for developers and other resources to make getting 6 | your contribution into PROJECT easier. 7 | 8 | # Email and chat 9 | 10 | - Email: [giantswarm](https://groups.google.com/forum/#!forum/giantswarm) 11 | - IRC: #[giantswarm](irc://irc.freenode.org:6667/#giantswarm) IRC channel on 12 | freenode.org 13 | 14 | ## Getting started 15 | 16 | - Fork the repository on GitHub 17 | - Read the [README](README.md) for build instructions 18 | 19 | ## Reporting Bugs and Creating Issues 20 | 21 | Reporting bugs is one of the best ways to contribute. If you find bugs or 22 | documentation mistakes in the PROJECT project, please let us know by [opening an 23 | issue](https://github.com/giantswarm/PROJECT/issues/new). We treat bugs and 24 | mistakes very seriously and believe no issue is too small. Before creating a bug 25 | report, please check there that one does not already exist. 26 | 27 | To make your bug report accurate and easy to understand, please try to create 28 | bug reports that are: 29 | 30 | - Specific. 
Include as many details as possible:
69 | - When PR gets approval, all commits are squashed into one and merged by one of 70 | the owners. 71 | 72 | Thanks for your contributions! 73 | 74 | ### Code style 75 | 76 | The coding style suggested by the Golang community is used. See the [style 77 | doc](https://github.com/golang/go/wiki/CodeReviewComments) for details. 78 | 79 | Please follow this style to make the code easy to review, maintain, and develop. 80 | 81 | ### Format of the Commit Message 82 | 83 | We follow a rough convention for commit messages that is designed to answer two 84 | questions: what changed and why. The subject line should feature the what and 85 | the body of the commit should describe the why. -------------------------------------------------------------------------------- /DCO: -------------------------------------------------------------------------------- 1 | Developer Certificate of Origin 2 | Version 1.1 3 | 4 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 5 | 660 York Street, Suite 102, 6 | San Francisco, CA 94110 USA 7 | 8 | Everyone is permitted to copy and distribute verbatim copies of this 9 | license document, but changing it is not allowed. 
10 | 11 | 12 | Developer's Certificate of Origin 1.1 13 | 14 | By making a contribution to this project, I certify that: 15 | 16 | (a) The contribution was created in whole or in part by me and I 17 | have the right to submit it under the open source license 18 | indicated in the file; or 19 | 20 | (b) The contribution is based upon previous work that, to the best 21 | of my knowledge, is covered under an appropriate open source 22 | license and I have the right under that license to submit that 23 | work with modifications, whether created in whole or in part 24 | by me, under the same open source license (unless I am 25 | permitted to submit under a different license), as indicated 26 | in the file; or 27 | 28 | (c) The contribution was provided directly to me by some other 29 | person who certified (a), (b) or (c) and I have not modified 30 | it. 31 | 32 | (d) I understand and agree that this project and the contribution 33 | are public and that a record of the contribution (including all 34 | personal information I submit with it, including my sign-off) is 35 | maintained indefinitely and may be redistributed consistent with 36 | this project or the open source license(s) involved. 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2016 - 2019 Giant Swarm GmbH 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![CircleCI](https://circleci.com/gh/giantswarm/prometheus.svg?style=svg)](https://circleci.com/gh/giantswarm/prometheus) 2 | # Kubernetes Setup for Prometheus and Grafana 3 | 4 | ## Quick start 5 | 6 | To quickly start all the things just do this: 7 | ```bash 8 | kubectl apply \ 9 | --filename https://raw.githubusercontent.com/giantswarm/prometheus/master/manifests-all.yaml 10 | ``` 11 | 12 | This will create the namespace `monitoring` and bring up all components in there. 
13 | 14 | To shut down all components again you can just delete that namespace: 15 | ```bash 16 | kubectl delete namespace monitoring 17 | ``` 18 | 19 | ## Default Dashboards 20 | 21 | If you want to re-import the default dashboards from this setup run this job: 22 | ```bash 23 | kubectl apply --filename ./manifests/grafana/import-dashboards/job.yaml 24 | ``` 25 | 26 | In case the job already exists from an earlier run, delete it before: 27 | ```bash 28 | kubectl --namespace monitoring delete job grafana-import-dashboards 29 | ``` 30 | 31 | To access grafana you can use port forward functionality 32 | ```bash 33 | kubectl port-forward --namespace monitoring service/grafana 3000:3000 34 | ``` 35 | And you should be able to access grafana on `http://localhost:3000/login` 36 | 37 | ## More Dashboards 38 | 39 | See grafana.net for some example [dashboards](https://grafana.net/dashboards) and [plugins](https://grafana.net/plugins). 40 | 41 | - Configure [Prometheus](https://grafana.net/plugins/prometheus) data source for Grafana.
42 | `Grafana UI / Data Sources / Add data source` 43 | - `Name`: `prometheus` 44 | - `Type`: `Prometheus` 45 | - `Url`: `http://prometheus:9090` 46 | - `Add` 47 | 48 | - Import [Prometheus Stats](https://grafana.net/dashboards/2):
49 | `Grafana UI / Dashboards / Import` 50 | - `Grafana.net Dashboard`: `https://grafana.net/dashboards/2` 51 | - `Load` 52 | - `Prometheus`: `prometheus` 53 | - `Save & Open` 54 | 55 | - Import [Kubernetes cluster monitoring](https://grafana.net/dashboards/162):
56 | `Grafana UI / Dashboards / Import` 57 | - `Grafana.net Dashboard`: `https://grafana.net/dashboards/162` 58 | - `Load` 59 | - `Prometheus`: `prometheus` 60 | - `Save & Open` 61 | 62 | ## Credit 63 | 64 | Alertmanager configs and integration in this repository was heavily inspired by the implementation in [kayrus/prometheus-kubernetes](https://github.com/kayrus/prometheus-kubernetes). 65 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Reporting a Vulnerability 4 | 5 | Please visit https://www.giantswarm.io/responsible-disclosure for information on reporting security issues. 6 | -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Create ConfigMap with prometheus rules for alerting 4 | kubectl --namespace monitoring create configmap --dry-run prometheus-rules \ 5 | --from-file=configs/prometheus/rules \ 6 | --output yaml \ 7 | > ./manifests/prometheus/prometheus-rules.yaml 8 | # Workaround since `--namespace monitoring` from above is not preserved 9 | echo " namespace: monitoring" >> ./manifests/prometheus/prometheus-rules.yaml 10 | 11 | # Create ConfigMap for an external url 12 | kubectl --namespace monitoring create configmap --dry-run alertmanager-templates \ 13 | --from-file=configs/alertmanager-templates \ 14 | --output yaml \ 15 | > ./manifests/alertmanager/alertmanager-templates.yaml 16 | # Workaround since `--namespace monitoring` from above is not preserved 17 | echo " namespace: monitoring" >> ./manifests/alertmanager/alertmanager-templates.yaml 18 | 19 | # Create ConfigMap with Grafana dashboards and datasources 20 | kubectl --namespace monitoring create configmap --dry-run grafana-import-dashboards \ 21 | --from-file=configs/grafana \ 22 | --output 
yaml \ 23 | > ./manifests/grafana/import-dashboards/configmap.yaml 24 | # Workaround since `--namespace monitoring` from above is not preserved 25 | echo " namespace: monitoring" >> ./manifests/grafana/import-dashboards/configmap.yaml 26 | 27 | # Create ConfigMap with Prometheus config 28 | kubectl --namespace monitoring create configmap --dry-run prometheus-core \ 29 | --from-file=configs/prometheus/prometheus.yaml \ 30 | --output yaml \ 31 | > ./manifests/prometheus/configmap.yaml 32 | # Workaround since `--namespace monitoring` from above is not preserved 33 | echo " namespace: monitoring" >> ./manifests/prometheus/configmap.yaml 34 | 35 | # Create one single manifest file 36 | target="./manifests-all.yaml" 37 | rm "$target" 38 | echo "# Derived from ./manifests" >> "$target" 39 | for file in $(find ./manifests -type f -name "*.yaml" | sort) ; do 40 | echo "---" >> "$target" 41 | cat "$file" >> "$target" 42 | done 43 | -------------------------------------------------------------------------------- /ci-scripts/package.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o errexit 4 | set -o nounset 5 | set -o pipefail 6 | 7 | readonly PROJECT=$1 8 | readonly TAG=$2 9 | readonly VERSION=${TAG:1} 10 | 11 | readonly HELM_URL=https://get.helm.sh 12 | readonly HELM_TARBALL=helm-v2.17.0-linux-amd64.tar.gz 13 | 14 | main() { 15 | if ! setup_helm_client; then 16 | log_error "Helm client could not get installed." 17 | exit 1 18 | fi 19 | 20 | if ! package_chart "${PROJECT}" "${VERSION}"; then 21 | log_error "Helm Chart could not be packaged." 22 | exit 1 23 | fi 24 | 25 | echo "Successfully packaged ${PROJECT}-chart-${VERSION}.tgz" 26 | } 27 | 28 | setup_helm_client() { 29 | echo "Setting up Helm client..." 
30 | 31 | curl --user-agent curl-ci-sync -sSL -o "${HELM_TARBALL}" "${HELM_URL}/${HELM_TARBALL}" 32 | tar xzf "${HELM_TARBALL}" 33 | 34 | PATH="$(pwd)/linux-amd64/:$PATH" 35 | helm init --client-only 36 | } 37 | 38 | package_chart() { 39 | local project="${1?Specify project}" 40 | local version="${2?Specify version}" 41 | 42 | # Replace CI version with release version 43 | sed -i 's/version:.*/version: '"${version}"'/' "helm/${project}-chart/Chart.yaml" 44 | 45 | helm package --save=false "helm/${project}-chart" 46 | } 47 | 48 | log_error() { 49 | printf '\e[31mERROR: %s\n\e[39m' "$1" >&2 50 | } 51 | 52 | main 53 | -------------------------------------------------------------------------------- /configs/alertmanager-templates/default.tmpl: -------------------------------------------------------------------------------- 1 | {{ define "__alertmanager" }}AlertManager{{ end }} 2 | {{ define "__alertmanagerURL" }}{{ .ExternalURL }}/#/alerts?receiver={{ .Receiver }}{{ end }} 3 | 4 | {{ define "__subject" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}{{ end }} 5 | {{ define "__description" }}{{ end }} 6 | 7 | {{ define "__text_alert_list" }}{{ range . }}Labels: 8 | {{ range .Labels.SortedPairs }} - {{ .Name }} = {{ .Value }} 9 | {{ end }}Annotations: 10 | {{ range .Annotations.SortedPairs }} - {{ .Name }} = {{ .Value }} 11 | {{ end }}Source: {{ .GeneratorURL }} 12 | {{ end }}{{ end }} 13 | 14 | 15 | {{ define "slack.default.title" }}{{ template "__subject" . }}{{ end }} 16 | {{ define "slack.default.username" }}{{ template "__alertmanager" . }}{{ end }} 17 | {{ define "slack.default.fallback" }}{{ template "slack.default.title" . }} | {{ template "slack.default.titlelink" . 
}}{{ end }} 18 | {{ define "slack.default.pretext" }}{{ end }} 19 | {{ define "slack.default.titlelink" }}{{ template "__alertmanagerURL" . }}{{ end }} 20 | {{ define "slack.default.iconemoji" }}{{ end }} 21 | {{ define "slack.default.iconurl" }}{{ end }} 22 | {{ define "slack.default.text" }}{{ end }} 23 | 24 | 25 | {{ define "hipchat.default.from" }}{{ template "__alertmanager" . }}{{ end }} 26 | {{ define "hipchat.default.message" }}{{ template "__subject" . }}{{ end }} 27 | 28 | 29 | {{ define "pagerduty.default.description" }}{{ template "__subject" . }}{{ end }} 30 | {{ define "pagerduty.default.client" }}{{ template "__alertmanager" . }}{{ end }} 31 | {{ define "pagerduty.default.clientURL" }}{{ template "__alertmanagerURL" . }}{{ end }} 32 | {{ define "pagerduty.default.instances" }}{{ template "__text_alert_list" . }}{{ end }} 33 | 34 | 35 | {{ define "opsgenie.default.message" }}{{ template "__subject" . }}{{ end }} 36 | {{ define "opsgenie.default.description" }}{{ .CommonAnnotations.SortedPairs.Values | join " " }} 37 | {{ if gt (len .Alerts.Firing) 0 -}} 38 | Alerts Firing: 39 | {{ template "__text_alert_list" .Alerts.Firing }} 40 | {{- end }} 41 | {{ if gt (len .Alerts.Resolved) 0 -}} 42 | Alerts Resolved: 43 | {{ template "__text_alert_list" .Alerts.Resolved }} 44 | {{- end }} 45 | {{- end }} 46 | {{ define "opsgenie.default.source" }}{{ template "__alertmanagerURL" . }}{{ end }} 47 | 48 | 49 | {{ define "victorops.default.message" }}{{ template "__subject" . }} | {{ template "__alertmanagerURL" . }}{{ end }} 50 | {{ define "victorops.default.from" }}{{ template "__alertmanager" . }}{{ end }} 51 | 52 | 53 | {{ define "email.default.subject" }}{{ template "__subject" . }}{{ end }} 54 | {{ define "email.default.html" }} 55 | 56 | 82 | 83 | 84 | 85 | 86 | {{ template "__subject" . }} 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 172 | 173 | 174 |
96 |
97 | 98 | 99 | 104 | 105 | 106 | 161 | 162 |
100 | {{ .Alerts | len }} alert{{ if gt (len .Alerts) 1 }}s{{ end }} for {{ range .GroupLabels.SortedPairs }} 101 | {{ .Name }}={{ .Value }} 102 | {{ end }} 103 |
107 | 108 | 109 | 112 | 113 | {{ if gt (len .Alerts.Firing) 0 }} 114 | 115 | 118 | 119 | {{ end }} 120 | {{ range .Alerts.Firing }} 121 | 122 | 129 | 130 | {{ end }} 131 | 132 | {{ if gt (len .Alerts.Resolved) 0 }} 133 | {{ if gt (len .Alerts.Firing) 0 }} 134 | 135 | 140 | 141 | {{ end }} 142 | 143 | 146 | 147 | {{ end }} 148 | {{ range .Alerts.Resolved }} 149 | 150 | 157 | 158 | {{ end }} 159 |
110 | View in {{ template "__alertmanager" . }} 111 |
116 | [{{ .Alerts.Firing | len }}] Firing 117 |
123 | Labels
124 | {{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} 125 | {{ if gt (len .Annotations) 0 }}Annotations
{{ end }} 126 | {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} 127 | Source
128 |
136 |
137 |
138 |
139 |
144 | [{{ .Alerts.Resolved | len }}] Resolved 145 |
151 | Labels
152 | {{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} 153 | {{ if gt (len .Annotations) 0 }}Annotations
{{ end }} 154 | {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} 155 | Source
156 |
160 |
163 | 164 |
165 | 166 | 167 | 168 | 169 |
Sent by {{ template "__alertmanager" . }}
170 |
171 |
175 | 176 | 177 | 178 | 179 | {{ end }} 180 | 181 | {{ define "pushover.default.title" }}{{ template "__subject" . }}{{ end }} 182 | {{ define "pushover.default.message" }}{{ .CommonAnnotations.SortedPairs.Values | join " " }} 183 | {{ if gt (len .Alerts.Firing) 0 }} 184 | Alerts Firing: 185 | {{ template "__text_alert_list" .Alerts.Firing }} 186 | {{ end }} 187 | {{ if gt (len .Alerts.Resolved) 0 }} 188 | Alerts Resolved: 189 | {{ template "__text_alert_list" .Alerts.Resolved }} 190 | {{ end }} 191 | {{ end }} 192 | {{ define "pushover.default.url" }}{{ template "__alertmanagerURL" . }}{{ end }} 193 | -------------------------------------------------------------------------------- /configs/alertmanager-templates/slack.tmpl: -------------------------------------------------------------------------------- 1 | {{ define "slack.devops.text" }} 2 | {{range .Alerts}}{{.Annotations.DESCRIPTION}} 3 | {{end}} 4 | {{ end }} 5 | -------------------------------------------------------------------------------- /configs/grafana/prometheus-datasource.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "prometheus", 3 | "type": "prometheus", 4 | "url": "http://prometheus:9090", 5 | "access": "proxy", 6 | "basicAuth": false 7 | } 8 | -------------------------------------------------------------------------------- /configs/prometheus/prometheus.yaml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 10s 3 | scrape_timeout: 10s 4 | evaluation_interval: 10s 5 | rule_files: 6 | - "/etc/prometheus-rules/*.rules" 7 | scrape_configs: 8 | 9 | # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L37 10 | - job_name: 'kubernetes-nodes' 11 | tls_config: 12 | ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt 13 | bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token 14 | kubernetes_sd_configs: 15 | 
- role: node 16 | relabel_configs: 17 | - source_labels: [__address__] 18 | regex: '(.*):10250' 19 | replacement: '${1}:10255' 20 | target_label: __address__ 21 | 22 | # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L79 23 | - job_name: 'kubernetes-endpoints' 24 | kubernetes_sd_configs: 25 | - role: endpoints 26 | relabel_configs: 27 | - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] 28 | action: keep 29 | regex: true 30 | - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] 31 | action: replace 32 | target_label: __scheme__ 33 | regex: (https?) 34 | - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] 35 | action: replace 36 | target_label: __metrics_path__ 37 | regex: (.+) 38 | - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] 39 | action: replace 40 | target_label: __address__ 41 | regex: (.+)(?::\d+);(\d+) 42 | replacement: $1:$2 43 | - action: labelmap 44 | regex: __meta_kubernetes_service_label_(.+) 45 | - source_labels: [__meta_kubernetes_namespace] 46 | action: replace 47 | target_label: kubernetes_namespace 48 | - source_labels: [__meta_kubernetes_service_name] 49 | action: replace 50 | target_label: kubernetes_name 51 | 52 | # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L119 53 | - job_name: 'kubernetes-services' 54 | metrics_path: /probe 55 | params: 56 | module: [http_2xx] 57 | kubernetes_sd_configs: 58 | - role: service 59 | relabel_configs: 60 | - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe] 61 | action: keep 62 | regex: true 63 | - source_labels: [__address__] 64 | target_label: __param_target 65 | - target_label: __address__ 66 | replacement: blackbox 67 | - source_labels: [__param_target] 68 | target_label: instance 69 | - action: labelmap 70 | regex: __meta_kubernetes_service_label_(.+) 71 | - 
source_labels: [__meta_kubernetes_namespace] 72 | target_label: kubernetes_namespace 73 | - source_labels: [__meta_kubernetes_service_name] 74 | target_label: kubernetes_name 75 | 76 | # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L156 77 | - job_name: 'kubernetes-pods' 78 | kubernetes_sd_configs: 79 | - role: pod 80 | relabel_configs: 81 | - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] 82 | action: keep 83 | regex: true 84 | - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] 85 | action: replace 86 | target_label: __metrics_path__ 87 | regex: (.+) 88 | - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] 89 | action: replace 90 | regex: (.+):(?:\d+);(\d+) 91 | replacement: ${1}:${2} 92 | target_label: __address__ 93 | - action: labelmap 94 | regex: __meta_kubernetes_pod_label_(.+) 95 | - source_labels: [__meta_kubernetes_namespace] 96 | action: replace 97 | target_label: kubernetes_namespace 98 | - source_labels: [__meta_kubernetes_pod_name] 99 | action: replace 100 | target_label: kubernetes_pod_name 101 | - source_labels: [__meta_kubernetes_pod_container_port_number] 102 | action: keep 103 | regex: 9\d{3} 104 | 105 | - job_name: 'kubernetes-cadvisor' 106 | scheme: https 107 | tls_config: 108 | ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt 109 | bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token 110 | kubernetes_sd_configs: 111 | - role: node 112 | relabel_configs: 113 | - action: labelmap 114 | regex: __meta_kubernetes_node_label_(.+) 115 | - target_label: __address__ 116 | replacement: kubernetes.default.svc:443 117 | - source_labels: [__meta_kubernetes_node_name] 118 | regex: (.+) 119 | target_label: __metrics_path__ 120 | replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor 121 | -------------------------------------------------------------------------------- 
/configs/prometheus/rules/cpu-usage.rules: -------------------------------------------------------------------------------- 1 | ALERT NodeCPUUsage 2 | IF (100 - (avg by (instance) (irate(node_cpu{name="node-exporter",mode="idle"}[5m])) * 100)) > 75 3 | FOR 2m 4 | LABELS { 5 | severity="page" 6 | } 7 | ANNOTATIONS { 8 | SUMMARY = "{{$labels.instance}}: High CPU usage detected", 9 | DESCRIPTION = "{{$labels.instance}}: CPU usage is above 75% (current value is: {{ $value }})" 10 | } 11 | -------------------------------------------------------------------------------- /configs/prometheus/rules/instance-availability.rules: -------------------------------------------------------------------------------- 1 | ALERT InstanceDown 2 | IF up == 0 3 | FOR 1m 4 | LABELS { severity = "page" } 5 | ANNOTATIONS { 6 | summary = "Instance {{ $labels.instance }} down", 7 | description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute.", 8 | } 9 | -------------------------------------------------------------------------------- /configs/prometheus/rules/low-disk-space.rules: -------------------------------------------------------------------------------- 1 | ALERT NodeLowRootDisk 2 | IF ((node_filesystem_size{mountpoint="/root-disk"} - node_filesystem_free{mountpoint="/root-disk"} ) / node_filesystem_size{mountpoint="/root-disk"} * 100) > 75 3 | FOR 2m 4 | LABELS { 5 | severity="page" 6 | } 7 | ANNOTATIONS { 8 | SUMMARY = "{{$labels.instance}}: Low root disk space", 9 | DESCRIPTION = "{{$labels.instance}}: Root disk usage is above 75% (current value is: {{ $value }})" 10 | } 11 | 12 | ALERT NodeLowDataDisk 13 | IF ((node_filesystem_size{mountpoint="/data-disk"} - node_filesystem_free{mountpoint="/data-disk"} ) / node_filesystem_size{mountpoint="/data-disk"} * 100) > 75 14 | FOR 2m 15 | LABELS { 16 | severity="page" 17 | } 18 | ANNOTATIONS { 19 | SUMMARY = "{{$labels.instance}}: Low data disk space", 20 | DESCRIPTION = "{{$labels.instance}}: Data disk 
usage is above 75% (current value is: {{ $value }})" 21 | } 22 | -------------------------------------------------------------------------------- /configs/prometheus/rules/mem-usage.rules: -------------------------------------------------------------------------------- 1 | ALERT NodeSwapUsage 2 | IF (((node_memory_SwapTotal-node_memory_SwapFree)/node_memory_SwapTotal)*100) > 75 3 | FOR 2m 4 | LABELS { 5 | severity="page" 6 | } 7 | ANNOTATIONS { 8 | SUMMARY = "{{$labels.instance}}: Swap usage detected", 9 | DESCRIPTION = "{{$labels.instance}}: Swap usage is above 75% (current value is: {{ $value }})" 10 | } 11 | 12 | ALERT NodeMemoryUsage 13 | IF (((node_memory_MemTotal-node_memory_MemAvailable)/(node_memory_MemTotal)*100)) > 75 14 | FOR 2m 15 | LABELS { 16 | severity="page" 17 | } 18 | ANNOTATIONS { 19 | SUMMARY = "{{$labels.instance}}: High memory usage detected", 20 | DESCRIPTION = "{{$labels.instance}}: Memory usage is above 75% (current value is: {{ $value }})" 21 | } 22 | -------------------------------------------------------------------------------- /development.md: -------------------------------------------------------------------------------- 1 | # Before pushing to this repo 2 | 3 | Create derived `ConfigMap`s and `manifests-all.yaml` by running the following commands or executing `build.sh`: 4 | 5 | ```bash 6 | # Create ConfigMap with prometheus rules for alerting 7 | kubectl --namespace monitoring create configmap --dry-run prometheus-rules \ 8 | --from-file=configs/prometheus/rules \ 9 | --output yaml \ 10 | > ./manifests/prometheus/prometheus-rules.yaml 11 | # Workaround since `--namespace monitoring` from above is not preserved 12 | echo " namespace: monitoring" >> ./manifests/prometheus/prometheus-rules.yaml 13 | 14 | # Create ConfigMap for an external url 15 | kubectl --namespace monitoring create configmap --dry-run alertmanager-templates \ 16 | --from-file=configs/alertmanager-templates \ 17 | --output yaml \ 18 | > 
./manifests/alertmanager/alertmanager-templates.yaml 19 | # Workaround since `--namespace monitoring` from above is not preserved 20 | echo " namespace: monitoring" >> ./manifests/alertmanager/alertmanager-templates.yaml 21 | 22 | # Create ConfigMap with Grafana dashboards and datasources 23 | kubectl --namespace monitoring create configmap --dry-run grafana-import-dashboards \ 24 | --from-file=configs/grafana \ 25 | --output yaml \ 26 | > ./manifests/grafana/import-dashboards/configmap.yaml 27 | # Workaround since `--namespace monitoring` from above is not preserved 28 | echo " namespace: monitoring" >> ./manifests/grafana/import-dashboards/configmap.yaml 29 | 30 | # Create ConfigMap with Prometheus config 31 | kubectl --namespace monitoring create configmap --dry-run prometheus-core \ 32 | --from-file=configs/prometheus/prometheus.yaml \ 33 | --output yaml \ 34 | > ./manifests/prometheus/configmap.yaml 35 | # Workaround since `--namespace monitoring` from above is not preserved 36 | echo " namespace: monitoring" >> ./manifests/prometheus/configmap.yaml 37 | 38 | # Create one single manifest file 39 | target="./manifests-all.yaml" 40 | rm "$target" 41 | echo "# Derived from ./manifests" >> "$target" 42 | for file in $(find ./manifests -type f -name "*.yaml" | sort) ; do 43 | echo "---" >> "$target" 44 | cat "$file" >> "$target" 45 | done 46 | ``` 47 | -------------------------------------------------------------------------------- /docs/grafana_cluster_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/giantswarm/prometheus/e58028316a2489dcd037183a0f9f0431717d68b0/docs/grafana_cluster_overview.png -------------------------------------------------------------------------------- /docs/grafana_edit_admin.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/giantswarm/prometheus/e58028316a2489dcd037183a0f9f0431717d68b0/docs/grafana_edit_admin.png -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | +++ 2 | title = "Monitoring with Prometheus and Grafana" 3 | description = "Recipe to spin up a monitoring setup with Prometheus and Grafana on Kubernetes." 4 | date = "2017-02-10" 5 | type = "page" 6 | weight = 100 7 | tags = ["recipe"] 8 | +++ 9 | 10 | # Monitoring with Prometheus and Grafana 11 | 12 | [Prometheus](https://prometheus.io/) is an open-source monitoring solution that includes the gathering of metrics, their storage in an internal time series database as well as querying and alerting based on that data. 13 | 14 | It offers a lot of integrations incl. Docker, Kubernetes, etc. 15 | 16 | Prometheus can also visualize your data. However, in this recipe we include another open-source tool, [Grafana](http://grafana.org/), for the visualization part, as it offers a more powerful and flexible way to generate visuals and dashboards. 17 | 18 | ## Deploying Prometheus and Grafana 19 | 20 | The following command will set you up with all necessary components and some first dashboards to check out. 21 | 22 | ```bash 23 | kubectl apply --filename https://raw.githubusercontent.com/giantswarm/kubernetes-prometheus/master/manifests-all.yaml 24 | ``` 25 | 26 | ## Checking Prometheus 27 | 28 | Wait a bit for all the pods to come up. Then Prometheus should be ready and running. We can check the Prometheus targets at `https://api.<clusterid>.k8s.gigantic.io/api/v1/proxy/namespaces/monitoring/services/prometheus:9090/targets` 29 | 30 | ![Prometheus Targets](prometheus_targets.png) 31 | 32 | *Note*: The above URL uses your Kubernetes API to proxy to the service. 
As the API is guarded with your credentials, you need to [set them up in your system](/guides/accessing-services-from-the-outside/) (and/or browser). We do not recommend to set up an Ingress for Prometheus at this time, as it currently does not support any kind of authentication and thus your cluster would be open to everyone. 33 | 34 | ## Checking Alertmanager 35 | 36 | Prometheus shows the active alerts and rules under `/alerts` in the prometheus frontend: 37 | 38 | ![Prometheus Alerts](prometheus_alerts.png) 39 | 40 | A full list of all rules can also be seen under `/rules`. The Alertmanager frontend offers more options to look at active and silenced alerts. 41 | 42 | ## Checking Grafana 43 | 44 | Now that we know Prometheus is up and running we can check for Grafana. 45 | 46 | There's an Ingress set up for Grafana, however, you need to set it to your desired domain. You can do this by editing the ingress: 47 | 48 | ```bash 49 | kubectl --namespace monitoring edit ingress grafana 50 | ``` 51 | 52 | This will open the ingress YAML in your standard editor. In the `host` field replace `yourchoice` with a subdomain of your choice and `clusterid` with your cluster ID. After saving and exiting your editor, wait a while and Grafana should be available at `http://<yourchoice>.<clusterid>.k8s.gigantic.io/`. 53 | 54 | You can use the default admin (`admin:admin`) user for your first login. You should change this admin user to reflect your desired username, your email, and a secure password ASAP! 55 | 56 | _Note:_ If persistent storage is not set up in your cluster, changes like the above will be reset to defaults if the Grafana Pod gets rescheduled. You would need to set them again after that. 57 | 58 | ## Changing the admin 59 | 60 | You can change the default admin user at `http://grafana.monitoring.<clusterid>.k8s.gigantic.io/admin/users/edit/1` 61 | 62 | ![Grafana Datasource](grafana_edit_admin.png) 63 | 64 | Please note, that you need to update the password and the user data (username, email, etc.) 
separately with the respective update buttons below each section. 65 | 66 | ## Check out your dashboards 67 | 68 | You can now checkout the included dashboards, e.g. the [Cluster Monitoring Overview](http://grafana.monitoring.l8.k8s.gigantic.io/dashboard/db/kubernetes-cluster-monitoring-via-prometheus). 69 | 70 | ![Grafana Import Dashboard](grafana_cluster_overview.png) 71 | 72 | _Note:_ If persistent storage is not set up in your cluster, the preset datasource and dashboards will vanish if the Grafana Pod gets rescheduled. To get them back run: 73 | 74 | ```nohighlight 75 | kubectl --namespace=monitoring delete job grafana-import-dashboards 76 | kubectl --namespace=monitoring create --filename https://raw.githubusercontent.com/giantswarm/prometheus/master/manifests/grafana/import-dashboards/job.yaml 77 | ``` 78 | 79 | ## Next Steps 80 | 81 | Next, you should get into the [Grafana](http://docs.grafana.org/) and [Prometheus](https://prometheus.io/docs/introduction/overview/) documentations to get to know the tools and either build your own dashboards or extend the samples from above. 82 | 83 | You can also check out grafana.net for some more example [dashboards](https://grafana.net/dashboards) and [plugins](https://grafana.net/plugins). 84 | 85 | You might also want to set up some [alerting](https://prometheus.io/docs/alerting/overview/). 
86 | -------------------------------------------------------------------------------- /docs/prometheus_alerts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/giantswarm/prometheus/e58028316a2489dcd037183a0f9f0431717d68b0/docs/prometheus_alerts.png -------------------------------------------------------------------------------- /docs/prometheus_targets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/giantswarm/prometheus/e58028316a2489dcd037183a0f9f0431717d68b0/docs/prometheus_targets.png -------------------------------------------------------------------------------- /helm/prometheus-chart/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *~ 18 | # Various IDEs 19 | .project 20 | .idea/ 21 | *.tmproj 22 | -------------------------------------------------------------------------------- /helm/prometheus-chart/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: 2.4.3 3 | description: Prometheus is a monitoring system and time series database. 
4 | engine: gotpl 5 | home: https://prometheus.io/ 6 | icon: https://raw.githubusercontent.com/prometheus/prometheus.github.io/master/assets/prometheus_logo-cb55bb5c346.png 7 | maintainers: 8 | - name: giantswarm 9 | email: info@giantswarm.io 10 | name: prometheus-chart 11 | sources: 12 | - https://github.com/prometheus/alertmanager 13 | - https://github.com/prometheus/prometheus 14 | tillerVersion: ">=2.8.0" 15 | version: 0.1.0-[[ .SHA ]] 16 | -------------------------------------------------------------------------------- /helm/prometheus-chart/OWNERS: -------------------------------------------------------------------------------- 1 | approvers: 2 | - giantswarm 3 | reviewers: 4 | - giantswarm 5 | -------------------------------------------------------------------------------- /helm/prometheus-chart/README.md: -------------------------------------------------------------------------------- 1 | # Prometheus 2 | 3 | An opinionated [Prometheus](https://prometheus.io/) Helm chart for deployment in Giant Swarm clusters. 4 | 5 | ## Prerequisites 6 | 7 | - Kubernetes 1.3+ with Beta APIs enabled 8 | - [Helm](https://helm.sh) package manager. 9 | 10 | ## Installing the Chart 11 | 12 | To install the chart: 13 | 14 | ```console 15 | $ helm install ./helm --name my-prometheus 16 | ``` 17 | 18 | The command deploys Prometheus on the Kubernetes cluster in the default configuration. The [configuration](#configuration) section lists the parameters that can be configured during installation. 19 | 20 | ## Uninstalling the Chart 21 | 22 | To uninstall/delete the deployment: 23 | 24 | ```console 25 | $ helm delete my-prometheus --purge 26 | ``` 27 | 28 | The command removes all the Kubernetes components associated with the chart and deletes the release. 29 | 30 | ## Configuration 31 | 32 | The following table lists the configurable parameters of the Prometheus chart and their default values. 
33 | 34 | Parameter | Description | Default 35 | --------- | ----------- | ------- 36 | `alertmanager.enabled` | If true, create alertmanager | `true` 37 | `alertmanager.name` | alertmanager container name | `alertmanager` 38 | `alertmanager.image.repository` | alertmanager container image repository | `prom/alertmanager` 39 | `alertmanager.image.tag` | alertmanager container image tag | `v0.15.2` 40 | `alertmanager.image.pullPolicy` | alertmanager container image pull policy | `IfNotPresent` 41 | `alertmanager.prefixURL` | The prefix slug at which the server can be accessed | `` 42 | `alertmanager.baseURL` | The external url at which the server can be accessed | `/` 43 | `alertmanager.extraArgs` | Additional alertmanager container arguments | `{}` 44 | `alertmanager.configMapOverrideName` | Prometheus alertmanager ConfigMap override where full-name is `{{.Release.Name}}-{{.Values.alertmanager.configMapOverrideName}}` and setting this value will prevent the default alertmanager ConfigMap from being generated | `""` 45 | `alertmanager.ingress.enabled` | If true, alertmanager Ingress will be created | `false` 46 | `alertmanager.ingress.annotations` | alertmanager Ingress annotations | `{}` 47 | `alertmanager.ingress.extraLabels` | alertmanager Ingress additional labels | `{}` 48 | `alertmanager.ingress.hosts` | alertmanager Ingress hostnames | `[]` 49 | `alertmanager.ingress.tls` | alertmanager Ingress TLS configuration (YAML) | `[]` 50 | `alertmanager.nodeSelector` | node labels for alertmanager pod assignment | `{}` 51 | `alertmanager.tolerations` | node taints to tolerate (requires Kubernetes >=1.6) | `[]` 52 | `alertmanager.affinity` | pod affinity | `{}` 53 | `alertmanager.schedulerName` | alertmanager alternate scheduler name | `nil` 54 | `alertmanager.persistentVolume.enabled` | If true, alertmanager will create a Persistent Volume Claim | `true` 55 | `alertmanager.persistentVolume.accessModes` | alertmanager data Persistent Volume access modes | 
`[ReadWriteOnce]` 56 | `alertmanager.persistentVolume.annotations` | Annotations for alertmanager Persistent Volume Claim | `{}` 57 | `alertmanager.persistentVolume.existingClaim` | alertmanager data Persistent Volume existing claim name | `""` 58 | `alertmanager.persistentVolume.mountPath` | alertmanager data Persistent Volume mount root path | `/data` 59 | `alertmanager.persistentVolume.size` | alertmanager data Persistent Volume size | `2Gi` 60 | `alertmanager.persistentVolume.storageClass` | alertmanager data Persistent Volume Storage Class | `unset` 61 | `alertmanager.persistentVolume.subPath` | Subdirectory of alertmanager data Persistent Volume to mount | `""` 62 | `alertmanager.podAnnotations` | annotations to be added to alertmanager pods | `{}` 63 | `alertmanager.replicaCount` | desired number of alertmanager pods | `1` 64 | `alertmanager.priorityClassName` | alertmanager priorityClassName | `nil` 65 | `alertmanager.resources` | alertmanager pod resource requests & limits | `{}` 66 | `alertmanager.securityContext` | Custom [security context](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/) for Alert Manager containers | `{}` 67 | `alertmanager.service.annotations` | annotations for alertmanager service | `{}` 68 | `alertmanager.service.clusterIP` | internal alertmanager cluster service IP | `""` 69 | `alertmanager.service.externalIPs` | alertmanager service external IP addresses | `[]` 70 | `alertmanager.service.loadBalancerIP` | IP address to assign to load balancer (if supported) | `""` 71 | `alertmanager.service.loadBalancerSourceRanges` | list of IP CIDRs allowed access to load balancer (if supported) | `[]` 72 | `alertmanager.service.servicePort` | alertmanager service port | `80` 73 | `alertmanager.service.type` | type of alertmanager service to create | `ClusterIP` 74 | `alertmanagerFiles.alertmanager.yml` | Prometheus alertmanager configuration | example configuration 75 | `configmapReload.name` | configmap-reload 
container name | `configmap-reload` 76 | `configmapReload.image.repository` | configmap-reload container image repository | `jimmidyson/configmap-reload` 77 | `configmapReload.image.tag` | configmap-reload container image tag | `v0.2.2` 78 | `configmapReload.image.pullPolicy` | configmap-reload container image pull policy | `IfNotPresent` 79 | `configmapReload.extraArgs` | Additional configmap-reload container arguments | `{}` 80 | `configmapReload.extraVolumeDirs` | Additional configmap-reload volume directories | `{}` 81 | `configmapReload.extraConfigmapMounts` | Additional configmap-reload configMap mounts | `[]` 82 | `configmapReload.resources` | configmap-reload pod resource requests & limits | `{}` 83 | `initChownData.enabled` | If false, don't reset data ownership at startup | true 84 | `initChownData.name` | init-chown-data container name | `init-chown-data` 85 | `initChownData.image.repository` | init-chown-data container image repository | `busybox` 86 | `initChownData.image.tag` | init-chown-data container image tag | `latest` 87 | `initChownData.image.pullPolicy` | init-chown-data container image pull policy | `IfNotPresent` 88 | `initChownData.resources` | init-chown-data pod resource requests & limits | `{}` 89 | `server.name` | Prometheus server container name | `server` 90 | `server.image.repository` | Prometheus server container image repository | `prom/prometheus` 91 | `server.image.tag` | Prometheus server container image tag | `v2.4.3` 92 | `server.image.pullPolicy` | Prometheus server container image pull policy | `IfNotPresent` 93 | `server.enableAdminApi` | If true, Prometheus administrative HTTP API will be enabled. Please note, that you should take care of administrative API access protection (ingress or some frontend Nginx with auth) before enabling it. 
| `false` 94 | `server.global.scrape_interval` | How frequently to scrape targets by default | `1m` 95 | `server.global.scrape_timeout` | How long until a scrape request times out | `10s` 96 | `server.global.evaluation_interval` | How frequently to evaluate rules | `1m` 97 | `server.extraArgs` | Additional Prometheus server container arguments | `{}` 98 | `server.prefixURL` | The prefix slug at which the server can be accessed | `` 99 | `server.baseURL` | The external url at which the server can be accessed | `` 100 | `server.extraHostPathMounts` | Additional Prometheus server hostPath mounts | `[]` 101 | `server.extraConfigmapMounts` | Additional Prometheus server configMap mounts | `[]` 102 | `server.extraSecretMounts` | Additional Prometheus server Secret mounts | `[]` 103 | `server.configMapOverrideName` | Prometheus server ConfigMap override where full-name is `{{.Release.Name}}-{{.Values.server.configMapOverrideName}}` and setting this value will prevent the default server ConfigMap from being generated | `""` 104 | `server.ingress.enabled` | If true, Prometheus server Ingress will be created | `false` 105 | `server.ingress.annotations` | Prometheus server Ingress annotations | `[]` 106 | `server.ingress.extraLabels` | Prometheus server Ingress additional labels | `{}` 107 | `server.ingress.hosts` | Prometheus server Ingress hostnames | `[]` 108 | `server.ingress.tls` | Prometheus server Ingress TLS configuration (YAML) | `[]` 109 | `server.nodeSelector` | node labels for Prometheus server pod assignment | `{}` 110 | `server.tolerations` | node taints to tolerate (requires Kubernetes >=1.6) | `[]` 111 | `server.affinity` | pod affinity | `{}` 112 | `server.priorityClassName` | Prometheus server priorityClassName | `nil` 113 | `server.schedulerName` | Prometheus server alternate scheduler name | `nil` 114 | `server.persistentVolume.enabled` | If true, Prometheus server will create a Persistent Volume Claim | `true` 115 | `server.persistentVolume.accessModes` | 
Prometheus server data Persistent Volume access modes | `[ReadWriteOnce]` 116 | `server.persistentVolume.annotations` | Prometheus server data Persistent Volume annotations | `{}` 117 | `server.persistentVolume.existingClaim` | Prometheus server data Persistent Volume existing claim name | `""` 118 | `server.persistentVolume.mountPath` | Prometheus server data Persistent Volume mount root path | `/data` 119 | `server.persistentVolume.size` | Prometheus server data Persistent Volume size | `8Gi` 120 | `server.persistentVolume.storageClass` | Prometheus server data Persistent Volume Storage Class | `unset` 121 | `server.persistentVolume.subPath` | Subdirectory of Prometheus server data Persistent Volume to mount | `""` 122 | `server.podAnnotations` | annotations to be added to Prometheus server pods | `{}` 123 | `server.deploymentAnnotations` | annotations to be added to Prometheus server deployment | `{}' 124 | `server.replicaCount` | desired number of Prometheus server pods | `1` 125 | `server.resources` | Prometheus server resource requests and limits | `{}` 126 | `server.securityContext` | Custom [security context](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/) for server containers | `{}` 127 | `server.service.annotations` | annotations for Prometheus server service | `{}` 128 | `server.service.clusterIP` | internal Prometheus server cluster service IP | `""` 129 | `server.service.externalIPs` | Prometheus server service external IP addresses | `[]` 130 | `server.service.loadBalancerIP` | IP address to assign to load balancer (if supported) | `""` 131 | `server.service.loadBalancerSourceRanges` | list of IP CIDRs allowed access to load balancer (if supported) | `[]` 132 | `server.service.nodePort` | Port to be used as the service NodePort (ignored if `server.service.type` is not `NodePort`) | `0` 133 | `server.service.servicePort` | Prometheus server service port | `80` 134 | `server.service.type` | type of Prometheus server service 
to create | `ClusterIP` 135 | `serviceAccounts.alertmanager.create` | If true, create the alertmanager service account | `true` 136 | `serviceAccounts.alertmanager.name` | name of the alertmanager service account to use or create | `{{ prometheus.alertmanager.fullname }}` 137 | `serviceAccounts.kubeStateMetrics.create` | If true, create the kubeStateMetrics service account | `true` 138 | `serviceAccounts.kubeStateMetrics.name` | name of the kubeStateMetrics service account to use or create | `{{ prometheus.kubeStateMetrics.fullname }}` 139 | `serviceAccounts.nodeExporter.create` | If true, create the nodeExporter service account | `true` 140 | `serviceAccounts.nodeExporter.name` | name of the nodeExporter service account to use or create | `{{ prometheus.nodeExporter.fullname }}` 141 | `serviceAccounts.pushgateway.create` | If true, create the pushgateway service account | `true` 142 | `serviceAccounts.pushgateway.name` | name of the pushgateway service account to use or create | `{{ prometheus.pushgateway.fullname }}` 143 | `serviceAccounts.server.create` | If true, create the server service account | `true` 144 | `serviceAccounts.server.name` | name of the server service account to use or create | `{{ prometheus.server.fullname }}` 145 | `server.terminationGracePeriodSeconds` | Prometheus server Pod termination grace period | `300` 146 | `server.retention` | (optional) Prometheus data retention | `""` 147 | `serverFiles.alerts` | Prometheus server alerts configuration | `{}` 148 | `serverFiles.rules` | Prometheus server rules configuration | `{}` 149 | `serverFiles.prometheus.yml` | Prometheus server scrape configuration | example configuration 150 | `extraScrapeConfigs` | Prometheus server additional scrape configuration | "" 151 | `networkPolicy.enabled` | Enable NetworkPolicy | `false` | 152 | `grafana.replicas` | Number of nodes | `1` 153 | `grafana.deploymentStrategy` | Deployment strategy | `RollingUpdate` 154 | `grafana.livenessProbe` | Liveness Probe 
settings | `{ "httpGet": { "path": "/api/health", "port": 3000 }, "initialDelaySeconds": 60, "timeoutSeconds": 30, "failureThreshold": 10 }` 155 | `grafana.readinessProbe` | Readiness Probe settings | `{ "httpGet": { "path": "/api/health", "port": 3000 } }` 156 | `grafana.securityContext` | Deployment securityContext | `{"runAsUser": 472, "fsGroup": 472}` 157 | `grafana.image.repository` | Image repository | `grafana/grafana` 158 | `grafana.image.tag` | Image tag. (`Must be >= 5.0.0`) | `5.3.4` 159 | `grafana.image.pullPolicy` | Image pull policy | `IfNotPresent` 160 | `grafana.service.type` | Kubernetes service type | `ClusterIP` 161 | `grafana.service.port` | Kubernetes port where service is exposed | `80` 162 | `grafana.service.annotations` | Service annotations | `{}` 163 | `grafana.service.labels` | Custom labels | `{}` 164 | `grafana.ingress.enabled` | Enables Ingress | `false` 165 | `grafana.ingress.annotations` | Ingress annotations | `{}` 166 | `grafana.ingress.labels` | Custom labels | `{}` 167 | `grafana.ingress.hosts` | Ingress accepted hostnames | `[]` 168 | `grafana.ingress.tls` | Ingress TLS configuration | `[]` 169 | `grafana.resources` | CPU/Memory resource requests/limits | `{}` 170 | `grafana.nodeSelector` | Node labels for pod assignment | `{}` 171 | `grafana.tolerations` | Toleration labels for pod assignment | `[]` 172 | `grafana.affinity` | Affinity settings for pod assignment | `{}` 173 | `grafana.schedulerName` | Alternate scheduler name | `nil` 174 | `grafana.env` | Extra environment variables passed to pods | `{}` 175 | `grafana.custom.ini` | Grafana's primary configuration | `{}` 176 | `grafana.annotations` | Deployment annotations | `{}` 177 | `grafana.podAnnotations` | Pod annotations | `{}` 178 | 179 | Specify each parameter using the `--set key=value[,key=value]` argument to `helm install`.
For example, 180 | 181 | ```console 182 | $ helm install ./helm --name my-prometheus \ 183 | --set server.terminationGracePeriodSeconds=360 184 | ``` 185 | 186 | Alternatively, a YAML file that specifies the values for the above parameters can be provided while installing the chart. For example, 187 | 188 | ```console 189 | $ helm install ./helm --name my-prometheus -f values.yaml 190 | ``` 191 | 192 | ### ConfigMap Files 193 | AlertManager is configured through [alertmanager.yml](https://prometheus.io/docs/alerting/configuration/). This file (and any others listed in `alertmanagerFiles`) will be mounted into the `alertmanager` pod. 194 | 195 | Prometheus is configured through [prometheus.yml](https://prometheus.io/docs/operating/configuration/). This file (and any others listed in `serverFiles`) will be mounted into the `server` pod. 196 | 197 | ### Ingress TLS 198 | If your cluster allows automatic creation/retrieval of TLS certificates (e.g. [cert manager](https://github.com/jetstack/cert-manager)), please refer to the documentation for that mechanism. 199 | 200 | To manually configure TLS, first create/retrieve a key & certificate pair for the address(es) you wish to protect. 
Then create a TLS secret in the namespace: 201 | 202 | ```console 203 | kubectl create secret tls prometheus-server-tls --cert=path/to/tls.cert --key=path/to/tls.key 204 | ``` 205 | 206 | Include the secret's name, along with the desired hostnames, in the alertmanager/server Ingress TLS section of your custom `values.yaml` file: 207 | 208 | ```yaml 209 | server: 210 | ingress: 211 | ## If true, Prometheus server Ingress will be created 212 | ## 213 | enabled: true 214 | 215 | ## Prometheus server Ingress hostnames 216 | ## Must be provided if Ingress is enabled 217 | ## 218 | hosts: 219 | - prometheus.domain.com 220 | 221 | ## Prometheus server Ingress TLS configuration 222 | ## Secrets must be manually created in the namespace 223 | ## 224 | tls: 225 | - secretName: prometheus-server-tls 226 | hosts: 227 | - prometheus.domain.com 228 | ``` 229 | 230 | ### NetworkPolicy 231 | 232 | Enabling Network Policy for Prometheus will secure connections to Alert Manager 233 | and Kube State Metrics by only accepting connections from Prometheus Server. 234 | All inbound connections to Prometheus Server are still allowed. 235 | 236 | To enable network policy for Prometheus, install a networking plugin that 237 | implements the Kubernetes NetworkPolicy spec, and set `networkPolicy.enabled` to true. 238 | 239 | If NetworkPolicy is enabled for Prometheus' scrape targets, you may also need 240 | to manually create a networkpolicy which allows it. 241 | 242 | 243 | __Note__: This chart is based off of the [upstream community chart](https://github.com/helm/charts/tree/master/stable/prometheus). 
244 | -------------------------------------------------------------------------------- /helm/prometheus-chart/grafana-dashboards/kubernetes-cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": "-- Grafana --", 7 | "enable": true, 8 | "hide": true, 9 | "iconColor": "rgba(0, 211, 255, 1)", 10 | "name": "Annotations & Alerts", 11 | "type": "dashboard" 12 | } 13 | ] 14 | }, 15 | "editable": true, 16 | "gnetId": null, 17 | "graphTooltip": 1, 18 | "hideControls": false, 19 | "id": 2, 20 | "links": [], 21 | "refresh": "5m", 22 | "rows": [ 23 | { 24 | "collapse": false, 25 | "height": 250, 26 | "panels": [ 27 | { 28 | "aliasColors": {}, 29 | "bars": false, 30 | "dashLength": 10, 31 | "dashes": false, 32 | "datasource": "Prometheus", 33 | "decimals": 0, 34 | "description": "Number of pods being able to be scheduled per node.", 35 | "editable": true, 36 | "error": false, 37 | "fill": 1, 38 | "grid": { 39 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 40 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 41 | }, 42 | "height": "350", 43 | "id": 3, 44 | "legend": { 45 | "alignAsTable": true, 46 | "avg": false, 47 | "current": true, 48 | "hideEmpty": false, 49 | "hideZero": false, 50 | "max": true, 51 | "min": true, 52 | "rightSide": false, 53 | "show": true, 54 | "total": false, 55 | "values": true 56 | }, 57 | "lines": true, 58 | "linewidth": 1, 59 | "links": [], 60 | "nullPointMode": "null as zero", 61 | "percentage": false, 62 | "pointradius": 5, 63 | "points": false, 64 | "renderer": "flot", 65 | "seriesOverrides": [], 66 | "spaceLength": 11, 67 | "span": 12, 68 | "stack": false, 69 | "steppedLine": false, 70 | "targets": [ 71 | { 72 | "expr": "(sum(kube_node_status_capacity_pods) by (node)) - (sum(kube_pod_info) by (node))", 73 | "format": "time_series", 74 | "intervalFactor": 2, 75 | "legendFormat": "{{node}}", 76 | "refId": "C" 77 | } 78 | ], 79 | 
"thresholds": [], 80 | "timeFrom": null, 81 | "timeShift": null, 82 | "title": "Pods Schedulable", 83 | "tooltip": { 84 | "msResolution": false, 85 | "shared": true, 86 | "sort": 0, 87 | "value_type": "individual" 88 | }, 89 | "type": "graph", 90 | "xaxis": { 91 | "buckets": null, 92 | "mode": "time", 93 | "name": null, 94 | "show": true, 95 | "values": [] 96 | }, 97 | "yaxes": [ 98 | { 99 | "format": "short", 100 | "logBase": 1, 101 | "min": "0", 102 | "show": true 103 | }, 104 | { 105 | "format": "short", 106 | "logBase": 1, 107 | "show": false 108 | } 109 | ] 110 | } 111 | ], 112 | "repeat": null, 113 | "repeatIteration": null, 114 | "repeatRowId": null, 115 | "showTitle": false, 116 | "title": "Dashboard Row", 117 | "titleSize": "h6" 118 | }, 119 | { 120 | "collapse": false, 121 | "height": 250, 122 | "panels": [ 123 | { 124 | "aliasColors": {}, 125 | "bars": false, 126 | "dashLength": 10, 127 | "dashes": false, 128 | "datasource": "Prometheus", 129 | "decimals": 0, 130 | "description": "Number of pods restarting within the Kubernetes cluster.", 131 | "fill": 8, 132 | "height": "350", 133 | "id": 4, 134 | "legend": { 135 | "alignAsTable": true, 136 | "avg": false, 137 | "current": false, 138 | "max": true, 139 | "min": true, 140 | "show": true, 141 | "total": false, 142 | "values": true 143 | }, 144 | "lines": true, 145 | "linewidth": 0, 146 | "links": [], 147 | "nullPointMode": "null as zero", 148 | "percentage": false, 149 | "pointradius": 5, 150 | "points": false, 151 | "renderer": "flot", 152 | "seriesOverrides": [], 153 | "spaceLength": 10, 154 | "span": 12, 155 | "stack": false, 156 | "steppedLine": true, 157 | "targets": [ 158 | { 159 | "expr": "sum(changes(kube_pod_container_status_restarts_total[5m]))", 160 | "format": "time_series", 161 | "intervalFactor": 2, 162 | "legendFormat": "restarts", 163 | "refId": "A" 164 | } 165 | ], 166 | "thresholds": [], 167 | "timeFrom": null, 168 | "timeShift": null, 169 | "title": "Pods Restarting", 170 | "tooltip": { 
171 | "shared": true, 172 | "sort": 0, 173 | "value_type": "individual" 174 | }, 175 | "type": "graph", 176 | "xaxis": { 177 | "buckets": null, 178 | "mode": "time", 179 | "name": null, 180 | "show": true, 181 | "values": [] 182 | }, 183 | "yaxes": [ 184 | { 185 | "decimals": 0, 186 | "format": "short", 187 | "label": null, 188 | "logBase": 1, 189 | "max": null, 190 | "min": "0", 191 | "show": true 192 | }, 193 | { 194 | "format": "short", 195 | "label": null, 196 | "logBase": 1, 197 | "max": null, 198 | "min": null, 199 | "show": false 200 | } 201 | ] 202 | } 203 | ], 204 | "repeat": null, 205 | "repeatIteration": null, 206 | "repeatRowId": null, 207 | "showTitle": false, 208 | "title": "Dashboard Row", 209 | "titleSize": "h6" 210 | }, 211 | { 212 | "collapse": false, 213 | "height": 250, 214 | "panels": [ 215 | { 216 | "aliasColors": {}, 217 | "bars": false, 218 | "dashLength": 10, 219 | "dashes": false, 220 | "datasource": "Prometheus", 221 | "decimals": 2, 222 | "description": "Error rates of the Kubernetes apiserver grouped by request verb. 
Note that the graph's values are stacked to have a better idea about the overall error rate within the Kubernetes cluster.", 223 | "editable": true, 224 | "error": false, 225 | "fill": 8, 226 | "grid": { 227 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 228 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 229 | }, 230 | "height": "350", 231 | "id": 5, 232 | "isNew": false, 233 | "legend": { 234 | "alignAsTable": true, 235 | "avg": false, 236 | "current": true, 237 | "hideEmpty": false, 238 | "hideZero": false, 239 | "max": true, 240 | "min": true, 241 | "rightSide": false, 242 | "show": true, 243 | "total": false, 244 | "values": true 245 | }, 246 | "lines": true, 247 | "linewidth": 0, 248 | "links": [], 249 | "nullPointMode": "null", 250 | "percentage": false, 251 | "pointradius": 5, 252 | "points": false, 253 | "renderer": "flot", 254 | "seriesOverrides": [], 255 | "spaceLength": 10, 256 | "span": 12, 257 | "stack": true, 258 | "steppedLine": true, 259 | "targets": [ 260 | { 261 | "expr": "sum(rate(apiserver_request_count{code!~\"2..\"}[5m])) by (verb)", 262 | "format": "time_series", 263 | "intervalFactor": 2, 264 | "legendFormat": "{{verb}}", 265 | "refId": "A", 266 | "step": 40 267 | } 268 | ], 269 | "thresholds": [], 270 | "timeFrom": null, 271 | "timeShift": null, 272 | "title": "Error Rates", 273 | "tooltip": { 274 | "msResolution": false, 275 | "shared": true, 276 | "sort": 0, 277 | "value_type": "individual" 278 | }, 279 | "type": "graph", 280 | "xaxis": { 281 | "buckets": null, 282 | "mode": "time", 283 | "name": null, 284 | "show": true, 285 | "values": [] 286 | }, 287 | "yaxes": [ 288 | { 289 | "decimals": 2, 290 | "format": "short", 291 | "label": "", 292 | "logBase": 1, 293 | "max": null, 294 | "min": "0", 295 | "show": true 296 | }, 297 | { 298 | "format": "short", 299 | "logBase": 1, 300 | "show": false 301 | } 302 | ] 303 | } 304 | ], 305 | "repeat": null, 306 | "repeatIteration": null, 307 | "repeatRowId": null, 308 | "showTitle": false, 309 | 
"title": "Dashboard Row", 310 | "titleSize": "h6" 311 | } 312 | ], 313 | "schemaVersion": 14, 314 | "style": "dark", 315 | "tags": [ 316 | "kubernetes" 317 | ], 318 | "templating": { 319 | "list": [] 320 | }, 321 | "time": { 322 | "from": "now-15m", 323 | "to": "now" 324 | }, 325 | "timepicker": { 326 | "refresh_intervals": [ 327 | "5s", 328 | "10s", 329 | "30s", 330 | "1m", 331 | "5m", 332 | "15m", 333 | "30m", 334 | "1h", 335 | "2h", 336 | "1d" 337 | ], 338 | "time_options": [ 339 | "5m", 340 | "15m", 341 | "1h", 342 | "6h", 343 | "12h", 344 | "24h", 345 | "2d", 346 | "7d", 347 | "30d" 348 | ] 349 | }, 350 | "timezone": "utc", 351 | "title": "Kubernetes Health", 352 | "version": 1 353 | } 354 | -------------------------------------------------------------------------------- /helm/prometheus-chart/grafana-dashboards/prometheus.json: -------------------------------------------------------------------------------- 1 | { 2 | "__inputs": [ 3 | { 4 | "name": "Prometheus", 5 | "label": "Prometheus", 6 | "description": "", 7 | "type": "datasource", 8 | "pluginId": "prometheus", 9 | "pluginName": "Prometheus" 10 | } 11 | ], 12 | "__requires": [ 13 | { 14 | "type": "grafana", 15 | "id": "grafana", 16 | "name": "Grafana", 17 | "version": "4.4.1" 18 | }, 19 | { 20 | "type": "panel", 21 | "id": "graph", 22 | "name": "Graph", 23 | "version": "" 24 | }, 25 | { 26 | "type": "datasource", 27 | "id": "prometheus", 28 | "name": "Prometheus", 29 | "version": "1.0.0" 30 | } 31 | ], 32 | "annotations": { 33 | "list": [] 34 | }, 35 | "editable": true, 36 | "gnetId": null, 37 | "graphTooltip": 1, 38 | "links": [], 39 | "panels": [ 40 | { 41 | "aliasColors": {}, 42 | "bars": false, 43 | "dashLength": 10, 44 | "dashes": false, 45 | "datasource": "Prometheus", 46 | "description": "5 minute rate of samples appended", 47 | "fill": 1, 48 | "gridPos": { 49 | "h": 7, 50 | "w": 18, 51 | "x": 0, 52 | "y": 0 53 | }, 54 | "id": 4, 55 | "legend": { 56 | "alignAsTable": true, 57 | "avg": false, 58 | 
"current": true, 59 | "max": true, 60 | "min": true, 61 | "show": true, 62 | "total": false, 63 | "values": true 64 | }, 65 | "lines": true, 66 | "linewidth": 1, 67 | "links": [], 68 | "nullPointMode": "null", 69 | "percentage": false, 70 | "pointradius": 5, 71 | "points": false, 72 | "renderer": "flot", 73 | "seriesOverrides": [], 74 | "spaceLength": 10, 75 | "stack": false, 76 | "steppedLine": false, 77 | "targets": [ 78 | { 79 | "expr": "rate(prometheus_tsdb_head_samples_appended_total[5m])", 80 | "format": "time_series", 81 | "intervalFactor": 2, 82 | "legendFormat": "{{ instance }}", 83 | "refId": "A", 84 | "step": 60 85 | } 86 | ], 87 | "thresholds": [], 88 | "timeFrom": null, 89 | "timeShift": null, 90 | "title": "Rate of Samples Appended", 91 | "tooltip": { 92 | "shared": true, 93 | "sort": 0, 94 | "value_type": "individual" 95 | }, 96 | "type": "graph", 97 | "xaxis": { 98 | "buckets": null, 99 | "mode": "time", 100 | "name": null, 101 | "show": true, 102 | "values": [] 103 | }, 104 | "yaxes": [ 105 | { 106 | "format": "short", 107 | "label": null, 108 | "logBase": 1, 109 | "max": null, 110 | "min": "0", 111 | "show": true 112 | }, 113 | { 114 | "format": "short", 115 | "label": null, 116 | "logBase": 1, 117 | "max": null, 118 | "min": null, 119 | "show": true 120 | } 121 | ], 122 | "yaxis": { 123 | "align": false, 124 | "alignLevel": null 125 | } 126 | }, 127 | { 128 | "cacheTimeout": null, 129 | "colorBackground": false, 130 | "colorValue": true, 131 | "colors": [ 132 | "rgba(50, 172, 45, 0.97)", 133 | "rgba(237, 129, 40, 0.89)", 134 | "rgba(245, 54, 54, 0.9)" 135 | ], 136 | "datasource": "Prometheus", 137 | "description": "Total number of time series in prometheus", 138 | "format": "none", 139 | "gauge": { 140 | "maxValue": 100, 141 | "minValue": 0, 142 | "show": false, 143 | "thresholdLabels": false, 144 | "thresholdMarkers": true 145 | }, 146 | "gridPos": { 147 | "h": 7, 148 | "w": 6, 149 | "x": 18, 150 | "y": 0 151 | }, 152 | "id": 13, 153 | 
"interval": null, 154 | "links": [], 155 | "mappingType": 1, 156 | "mappingTypes": [ 157 | { 158 | "name": "value to text", 159 | "value": 1 160 | }, 161 | { 162 | "name": "range to text", 163 | "value": 2 164 | } 165 | ], 166 | "maxDataPoints": 100, 167 | "nullPointMode": "connected", 168 | "nullText": null, 169 | "postfix": "", 170 | "postfixFontSize": "50%", 171 | "prefix": "", 172 | "prefixFontSize": "50%", 173 | "rangeMaps": [ 174 | { 175 | "from": "null", 176 | "text": "N/A", 177 | "to": "null" 178 | } 179 | ], 180 | "sparkline": { 181 | "fillColor": "rgba(31, 118, 189, 0.18)", 182 | "full": true, 183 | "lineColor": "rgb(31, 120, 193)", 184 | "show": true 185 | }, 186 | "tableColumn": "", 187 | "targets": [ 188 | { 189 | "expr": "sum(prometheus_tsdb_head_series)", 190 | "format": "time_series", 191 | "intervalFactor": 2, 192 | "refId": "B", 193 | "step": 40 194 | } 195 | ], 196 | "thresholds": "1000000,2000000", 197 | "title": "Total Series", 198 | "type": "singlestat", 199 | "valueFontSize": "100%", 200 | "valueMaps": [ 201 | { 202 | "op": "=", 203 | "text": "N/A", 204 | "value": "null" 205 | } 206 | ], 207 | "valueName": "current" 208 | }, 209 | { 210 | "aliasColors": {}, 211 | "bars": false, 212 | "dashLength": 10, 213 | "dashes": false, 214 | "datasource": "Prometheus", 215 | "description": "Time take for rule evaluation", 216 | "fill": 1, 217 | "gridPos": { 218 | "h": 7, 219 | "w": 12, 220 | "x": 0, 221 | "y": 7 222 | }, 223 | "id": 5, 224 | "legend": { 225 | "alignAsTable": true, 226 | "avg": false, 227 | "current": true, 228 | "max": true, 229 | "min": true, 230 | "show": true, 231 | "total": false, 232 | "values": true 233 | }, 234 | "lines": true, 235 | "linewidth": 1, 236 | "links": [], 237 | "nullPointMode": "null", 238 | "percentage": false, 239 | "pointradius": 5, 240 | "points": false, 241 | "renderer": "flot", 242 | "seriesOverrides": [], 243 | "spaceLength": 10, 244 | "stack": false, 245 | "steppedLine": false, 246 | "targets": [ 247 | { 248 | 
"expr": "prometheus_rule_evaluation_duration_seconds{quantile=\"0.99\"}", 249 | "format": "time_series", 250 | "intervalFactor": 2, 251 | "legendFormat": "{{ instance }} - 0.99 quantile", 252 | "refId": "A", 253 | "step": 120 254 | }, 255 | { 256 | "expr": "prometheus_rule_evaluation_duration_seconds{quantile=\"0.9\"}", 257 | "format": "time_series", 258 | "intervalFactor": 2, 259 | "legendFormat": "{{ instance }} - 0.9 quantile", 260 | "refId": "B", 261 | "step": 120 262 | }, 263 | { 264 | "expr": "prometheus_rule_evaluation_duration_seconds{quantile=\"0.5\"}", 265 | "format": "time_series", 266 | "intervalFactor": 2, 267 | "legendFormat": "{{ instance }} - 0.5 quantile", 268 | "refId": "C", 269 | "step": 120 270 | } 271 | ], 272 | "thresholds": [], 273 | "timeFrom": null, 274 | "timeShift": null, 275 | "title": "Rule Evaluation Duration", 276 | "tooltip": { 277 | "shared": true, 278 | "sort": 0, 279 | "value_type": "individual" 280 | }, 281 | "type": "graph", 282 | "xaxis": { 283 | "buckets": null, 284 | "mode": "time", 285 | "name": null, 286 | "show": true, 287 | "values": [] 288 | }, 289 | "yaxes": [ 290 | { 291 | "format": "s", 292 | "label": null, 293 | "logBase": 1, 294 | "max": null, 295 | "min": "0", 296 | "show": true 297 | }, 298 | { 299 | "format": "short", 300 | "label": null, 301 | "logBase": 1, 302 | "max": null, 303 | "min": null, 304 | "show": true 305 | } 306 | ], 307 | "yaxis": { 308 | "align": false, 309 | "alignLevel": null 310 | } 311 | }, 312 | { 313 | "aliasColors": {}, 314 | "bars": false, 315 | "dashLength": 10, 316 | "dashes": false, 317 | "datasource": "Prometheus", 318 | "description": "TIme taken to send notifications", 319 | "fill": 1, 320 | "gridPos": { 321 | "h": 7, 322 | "w": 12, 323 | "x": 12, 324 | "y": 7 325 | }, 326 | "id": 6, 327 | "legend": { 328 | "alignAsTable": true, 329 | "avg": false, 330 | "current": true, 331 | "max": true, 332 | "min": true, 333 | "show": true, 334 | "total": false, 335 | "values": true 336 | }, 337 
| "lines": true, 338 | "linewidth": 1, 339 | "links": [], 340 | "nullPointMode": "null", 341 | "percentage": false, 342 | "pointradius": 5, 343 | "points": false, 344 | "renderer": "flot", 345 | "seriesOverrides": [], 346 | "spaceLength": 10, 347 | "stack": false, 348 | "steppedLine": false, 349 | "targets": [ 350 | { 351 | "expr": "prometheus_notifications_latency_seconds{quantile=\"0.99\"}", 352 | "format": "time_series", 353 | "intervalFactor": 2, 354 | "legendFormat": "{{ instance }} - 0.99 quantile", 355 | "refId": "A", 356 | "step": 120 357 | }, 358 | { 359 | "expr": "prometheus_notifications_latency_seconds{quantile=\"0.9\"}", 360 | "format": "time_series", 361 | "intervalFactor": 2, 362 | "legendFormat": "{{ instance }} - 0.9 quantile", 363 | "refId": "B", 364 | "step": 120 365 | }, 366 | { 367 | "expr": "prometheus_notifications_latency_seconds{quantile=\"0.5\"}", 368 | "format": "time_series", 369 | "intervalFactor": 2, 370 | "legendFormat": "{{ instance }} - 0.5 quantile", 371 | "refId": "C", 372 | "step": 120 373 | } 374 | ], 375 | "thresholds": [], 376 | "timeFrom": null, 377 | "timeShift": null, 378 | "title": "Notification Latency", 379 | "tooltip": { 380 | "shared": true, 381 | "sort": 0, 382 | "value_type": "individual" 383 | }, 384 | "type": "graph", 385 | "xaxis": { 386 | "buckets": null, 387 | "mode": "time", 388 | "name": null, 389 | "show": true, 390 | "values": [] 391 | }, 392 | "yaxes": [ 393 | { 394 | "format": "s", 395 | "label": null, 396 | "logBase": 1, 397 | "max": null, 398 | "min": "0", 399 | "show": true 400 | }, 401 | { 402 | "format": "short", 403 | "label": null, 404 | "logBase": 1, 405 | "max": null, 406 | "min": null, 407 | "show": true 408 | } 409 | ], 410 | "yaxis": { 411 | "align": false, 412 | "alignLevel": null 413 | } 414 | }, 415 | { 416 | "aliasColors": {}, 417 | "bars": false, 418 | "dashLength": 10, 419 | "dashes": false, 420 | "datasource": "Prometheus", 421 | "fill": 1, 422 | "gridPos": { 423 | "h": 7, 424 | "w": 24, 
425 | "x": 0, 426 | "y": 14 427 | }, 428 | "id": 8, 429 | "legend": { 430 | "alignAsTable": true, 431 | "avg": false, 432 | "current": true, 433 | "max": true, 434 | "min": true, 435 | "show": true, 436 | "total": false, 437 | "values": true 438 | }, 439 | "lines": true, 440 | "linewidth": 1, 441 | "links": [], 442 | "nullPointMode": "null", 443 | "percentage": false, 444 | "pointradius": 5, 445 | "points": false, 446 | "renderer": "flot", 447 | "seriesOverrides": [], 448 | "spaceLength": 10, 449 | "stack": false, 450 | "steppedLine": false, 451 | "targets": [ 452 | { 453 | "expr": "kube_pod_container_resource_limits_memory_bytes{container=\"prometheus\"}", 454 | "format": "time_series", 455 | "instant": false, 456 | "intervalFactor": 2, 457 | "legendFormat": "memory limit", 458 | "refId": "B" 459 | }, 460 | { 461 | "expr": "container_memory_usage_bytes{container_name=\"prometheus\"}", 462 | "format": "time_series", 463 | "instant": false, 464 | "intervalFactor": 2, 465 | "legendFormat": "memory used", 466 | "metric": "container_memory_usage_bytes", 467 | "refId": "A", 468 | "step": 60 469 | } 470 | ], 471 | "thresholds": [], 472 | "timeFrom": null, 473 | "timeShift": null, 474 | "title": "Memory usage/limit", 475 | "tooltip": { 476 | "shared": true, 477 | "sort": 0, 478 | "value_type": "individual" 479 | }, 480 | "type": "graph", 481 | "xaxis": { 482 | "buckets": null, 483 | "mode": "time", 484 | "name": null, 485 | "show": true, 486 | "values": [] 487 | }, 488 | "yaxes": [ 489 | { 490 | "format": "bytes", 491 | "label": null, 492 | "logBase": 1, 493 | "max": null, 494 | "min": "0", 495 | "show": true 496 | }, 497 | { 498 | "format": "short", 499 | "label": null, 500 | "logBase": 1, 501 | "max": null, 502 | "min": null, 503 | "show": true 504 | } 505 | ], 506 | "yaxis": { 507 | "align": false, 508 | "alignLevel": null 509 | } 510 | }, 511 | { 512 | "aliasColors": {}, 513 | "bars": false, 514 | "dashLength": 10, 515 | "dashes": false, 516 | "datasource": 
"Prometheus", 517 | "fill": 1, 518 | "gridPos": { 519 | "h": 7, 520 | "w": 24, 521 | "x": 0, 522 | "y": 21 523 | }, 524 | "id": 9, 525 | "legend": { 526 | "alignAsTable": true, 527 | "avg": false, 528 | "current": true, 529 | "max": true, 530 | "min": true, 531 | "show": true, 532 | "total": false, 533 | "values": true 534 | }, 535 | "lines": true, 536 | "linewidth": 1, 537 | "links": [], 538 | "nullPointMode": "null", 539 | "percentage": false, 540 | "pointradius": 5, 541 | "points": false, 542 | "renderer": "flot", 543 | "seriesOverrides": [], 544 | "spaceLength": 10, 545 | "stack": false, 546 | "steppedLine": false, 547 | "targets": [ 548 | { 549 | "expr": "kube_pod_container_resource_limits_cpu_cores{container=\"prometheus\"}", 550 | "format": "time_series", 551 | "instant": false, 552 | "intervalFactor": 2, 553 | "legendFormat": "cpu limit", 554 | "refId": "B" 555 | }, 556 | { 557 | "expr": "sum(rate(container_cpu_usage_seconds_total{container_name=\"prometheus\"}[3m]))", 558 | "format": "time_series", 559 | "instant": false, 560 | "intervalFactor": 2, 561 | "legendFormat": "cpu used", 562 | "metric": "container_cpu_usage_seconds_total", 563 | "refId": "A", 564 | "step": 60 565 | } 566 | ], 567 | "thresholds": [], 568 | "timeFrom": null, 569 | "timeShift": null, 570 | "title": "CPU usage/limit", 571 | "tooltip": { 572 | "shared": true, 573 | "sort": 0, 574 | "value_type": "individual" 575 | }, 576 | "type": "graph", 577 | "xaxis": { 578 | "buckets": null, 579 | "mode": "time", 580 | "name": null, 581 | "show": true, 582 | "values": [] 583 | }, 584 | "yaxes": [ 585 | { 586 | "format": "none", 587 | "label": "cores", 588 | "logBase": 1, 589 | "max": null, 590 | "min": null, 591 | "show": true 592 | }, 593 | { 594 | "format": "short", 595 | "label": null, 596 | "logBase": 1, 597 | "max": null, 598 | "min": null, 599 | "show": false 600 | } 601 | ], 602 | "yaxis": { 603 | "align": false, 604 | "alignLevel": null 605 | } 606 | } 607 | ], 608 | "refresh": "1m", 609 
| "schemaVersion": 16, 610 | "style": "dark", 611 | "tags": [ 612 | ], 613 | "templating": { 614 | "list": [] 615 | }, 616 | "time": { 617 | "from": "now-15m", 618 | "to": "now" 619 | }, 620 | "timepicker": { 621 | "refresh_intervals": [ 622 | "5s", 623 | "10s", 624 | "30s", 625 | "1m", 626 | "5m", 627 | "15m", 628 | "30m", 629 | "1h", 630 | "2h", 631 | "1d" 632 | ], 633 | "time_options": [ 634 | "5m", 635 | "15m", 636 | "1h", 637 | "6h", 638 | "12h", 639 | "24h", 640 | "2d", 641 | "7d", 642 | "30d" 643 | ] 644 | }, 645 | "timezone": "utc", 646 | "title": "Prometheus", 647 | "uid": "iWowmlSmk", 648 | "version": 1 649 | } 650 | -------------------------------------------------------------------------------- /helm/prometheus-chart/prometheus-alerts/alertmanager.rules.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: alertmanager 3 | rules: 4 | - alert: AlertManagerNotificationsFailing 5 | expr: rate(alertmanager_notifications_failed_total[5m]) > 0 6 | for: 10m 7 | labels: 8 | severity: notify 9 | annotations: 10 | description: AlertManager {{ $labels.integration }} notifications are failing. 11 | -------------------------------------------------------------------------------- /helm/prometheus-chart/prometheus-alerts/configmap.rules.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: configmap 3 | rules: 4 | - alert: ConfigMapCountTooHigh 5 | expr: kube_configmap_info > 1000 6 | for: 15m 7 | labels: 8 | severity: notify 9 | annotations: 10 | description: ConfigMap count too high. 
11 | -------------------------------------------------------------------------------- /helm/prometheus-chart/prometheus-alerts/container.rules.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: container 3 | rules: 4 | - alert: ContainerIsRestartingTooFrequently 5 | expr: increase(kube_pod_container_status_restarts_total{}[1h]) > 5 6 | for: 5m 7 | labels: 8 | severity: notify 9 | annotations: 10 | description: Container {{ $labels.container }} in pod {{ $labels.exported_namespace }}/{{ $labels.pod }} is restarting too often. 11 | -------------------------------------------------------------------------------- /helm/prometheus-chart/prometheus-alerts/daemonset.rules.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: daemonset 3 | rules: 4 | - alert: DaemonSetNotSatisfied 5 | expr: kube_daemonset_status_number_unavailable{} > 0 6 | for: 15m 7 | labels: 8 | severity: page 9 | annotations: 10 | description: Daemonset {{ $labels.exported_namespace}}/{{ $labels.daemonset }} 11 | is not satisfied. 12 | 13 | -------------------------------------------------------------------------------- /helm/prometheus-chart/prometheus-alerts/deployment.rules.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: deployment 3 | rules: 4 | - alert: DeploymentNotSatisfied 5 | expr: kube_deployment_status_replicas_unavailable{} > 0 6 | for: 5m 7 | labels: 8 | severity: page 9 | annotations: 10 | description: Deployment {{ $labels.exported_namespace}}/{{ $labels.deployment }} 11 | is not satisfied. 
12 | -------------------------------------------------------------------------------- /helm/prometheus-chart/prometheus-alerts/disk.rules.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: disk 3 | rules: 4 | - alert: PersistentVolumeSpaceTooLow 5 | expr: 100 * node_filesystem_free{mountpoint=~"/rootfs/var/lib/kubelet/.*"} / node_filesystem_size{mountpoint=~"/rootfs/var/lib/kubelet/.*"} < 10 6 | for: 10m 7 | labels: 8 | severity: page 9 | annotations: 10 | description: Persistent volume {{ $labels.mountpoint}} on {{ $labels.instance }} does not have enough free space. 11 | 12 | - alert: RootVolumeSpaceTooLow 13 | expr: 100 * node_filesystem_free{mountpoint="/rootfs"} / node_filesystem_size{mountpoint="/rootfs"} < 10 14 | for: 10m 15 | labels: 16 | severity: page 17 | annotations: 18 | description: Root volume {{ $labels.mountpoint}} on {{ $labels.instance }} does not have enough free space. 19 | -------------------------------------------------------------------------------- /helm/prometheus-chart/prometheus-alerts/fluentbit.rules.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: fluentbit 3 | rules: 4 | - alert: FluentbitTooManyErrors 5 | expr: rate(fluentbit_output_retries_failed_total[10m]) > 0 6 | for: 10m 7 | labels: 8 | severity: notify 9 | annotations: 10 | description: Fluentbit ({{ $labels.instance }}) is erroring. 
11 | -------------------------------------------------------------------------------- /helm/prometheus-chart/prometheus-alerts/ingress-controller.rules.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: ingress-controller 3 | rules: 4 | - alert: IngressControllerReloadIsFailing 5 | expr: increase(ingress_controller_errors{count="reloads"}[5m]) > 1 6 | for: 5m 7 | labels: 8 | severity: notify 9 | annotations: 10 | description: Ingress Controller cannot reload new configuration. Please check IC logs. 11 | 12 | - alert: IngressControllerSSLCertificateWillExpireSoon 13 | expr: avg(ingress_controller_ssl_expire_time_seconds{host!~"(etcd|api).*"}) without (instance) < (time() + (10 * 24 * 3600)) 14 | for: 5m 15 | labels: 16 | severity: notify 17 | annotations: 18 | description: SSL certificate for {{ $labels.host }} will expire in less than 10 days. 19 | -------------------------------------------------------------------------------- /helm/prometheus-chart/prometheus-alerts/job.rules.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: job 3 | rules: 4 | - alert: JobFailed 5 | expr: kube_job_failed{condition="true"} == 1 6 | for: 15m 7 | labels: 8 | severity: notify 9 | annotations: 10 | description: Job {{ $labels.exported_namespace }}/{{ $labels.exported_job }} has failed.
11 | -------------------------------------------------------------------------------- /helm/prometheus-chart/prometheus-alerts/logging-data.rules.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: logging-data 3 | rules: 4 | - alert: LoggingDataNotAvailable 5 | # threshold value is 3e+8 bytes = 300 MB 6 | expr: kubelet_volume_stats_used_bytes{persistentvolumeclaim="elasticsearch-data"} < (300 * 1000 * 1000) 7 | for: 60m 8 | labels: 9 | severity: notify 10 | annotations: 11 | description: elasticsearch has no or very little log data in its volume. 12 | -------------------------------------------------------------------------------- /helm/prometheus-chart/prometheus-alerts/network.rules.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: network 3 | rules: 4 | - alert: DNSErrorRateTooHigh 5 | expr: rate(dns_error_total[15m]) > 0 6 | for: 15m 7 | labels: 8 | severity: notify 9 | annotations: 10 | description: DNS error rate is too high for {{ $labels.pod_name }}. 11 | 12 | - alert: NetworkErrorRateTooHigh 13 | expr: rate(network_error_total[15m]) > 0 14 | for: 15m 15 | labels: 16 | severity: notify 17 | annotations: 18 | description: Network error rate is too high for {{ $labels.pod_name }}. 19 | 20 | - alert: SYNRetransmissionRateTooHigh 21 | expr: rate(node_netstat_TcpExt_TCPSynRetrans[15m]) > 3 22 | for: 15m 23 | labels: 24 | severity: notify 25 | annotations: 26 | description: SYN retransmission rate is too high for {{ $labels.instance }}.
27 | -------------------------------------------------------------------------------- /helm/prometheus-chart/prometheus-alerts/node.rules.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: node 3 | rules: 4 | - alert: NodeStateFlapping 5 | # check for flapping node status 6 | # "changes" is returning the amount of value changes in the vector 7 | expr: changes(kube_node_status_condition{condition="Ready",status="true"}[30m]) > 6 8 | for: 5m 9 | labels: 10 | severity: notify 11 | annotations: 12 | description: Node {{ $labels.node }} status is flapping. 13 | 14 | - alert: NodeHasConstantOOMKills 15 | # Alert if node has more than 3 OOM kills for last hour (1 every 5 minutes). 16 | # This is only way to detect OOM kills at the moment. 17 | expr: increase(node_vmstat_oom_kill{}[1h]) > 3 18 | for: 10m 19 | labels: 20 | severity: notify 21 | annotations: 22 | description: Node {{ $labels.ip }} has constant OOM kills. 23 | 24 | - alert: NodeIsUnschedulable 25 | expr: kube_node_spec_unschedulable != 0 26 | for: 45m 27 | labels: 28 | severity: notify 29 | annotations: 30 | description: Node {{ $labels.node }} is unschedulable. 31 | -------------------------------------------------------------------------------- /helm/prometheus-chart/prometheus-alerts/pod.rules.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: pod 3 | rules: 4 | - alert: PodStuck 5 | expr: kube_pod_status_phase{phase="Pending"} == 1 6 | for: 15m 7 | labels: 8 | severity: notify 9 | annotations: 10 | description: Pod {{ $labels.exported_namespace }}/{{ $labels.pod }} is stuck in Pending. 
11 | -------------------------------------------------------------------------------- /helm/prometheus-chart/prometheus-alerts/prometheus.rules.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: prometheus 3 | rules: 4 | - alert: PrometheusCompactionFailed 5 | expr: prometheus_tsdb_compactions_failed_total > 1 6 | for: 5m 7 | labels: 8 | severity: notify 9 | annotations: 10 | description: Prometheus compaction has failed. 11 | 12 | - alert: PrometheusCPUUsageTooHigh 13 | expr: sum(rate(container_cpu_usage_seconds_total{container_name="prometheus"}[20m])) / kube_pod_container_resource_limits_cpu_cores{container="prometheus"} > 0.93 14 | for: 5m 15 | labels: 16 | severity: page 17 | annotations: 18 | description: Prometheus cpu usage is reaching the threshold. Monitoring of the installation might be unreliable. 19 | 20 | 21 | - alert: PrometheusMemoryUsageTooHigh 22 | expr: avg_over_time(container_memory_usage_bytes{container_name="prometheus"}[20m]) / kube_pod_container_resource_limits_memory_bytes{container="prometheus"} > 0.93 23 | for: 5m 24 | labels: 25 | severity: page 26 | annotations: 27 | description: Prometheus memory usage is reaching the threshold. Monitoring of the installation might be unreliable. 28 | 29 | 30 | - alert: PrometheusWALCorrupted 31 | expr: prometheus_tsdb_wal_corruptions_total > 1 32 | for: 5m 33 | labels: 34 | severity: notify 35 | annotations: 36 | description: Prometheus WAL is corrupted. 37 | 38 | - alert: PrometheusIsRestarting 39 | # This alert covers the issue when Prometheus restarted (OOM killed) due to 40 | # lack of resources. Just check if total number of restarts more than 3. 41 | # In that case, we should be as quick as possible, so Prometheus will be able 42 | # to alarm before it will be killed by OOM killer. 
43 | expr: kube_pod_container_status_restarts_total{container="prometheus"} > 3 44 | for: 1m 45 | labels: 46 | severity: notify 47 | annotations: 48 | description: Prometheus {{ $labels.exported_namespace }}/{{ $labels.pod }} is restarting too much probably due to OOM kills. 49 | -------------------------------------------------------------------------------- /helm/prometheus-chart/prometheus-alerts/up.rules.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: up 3 | rules: 4 | - alert: APIServerDown 5 | expr: up{app="kubernetes"} == 0 6 | for: 10m 7 | labels: 8 | severity: page 9 | annotations: 10 | description: Kubernetes API Server ({{ $labels.instance }}) is down. 11 | 12 | - alert: CadvisorDown 13 | expr: up{app="cadvisor"} == 0 14 | for: 15m 15 | labels: 16 | severity: notify 17 | annotations: 18 | description: Cadvisor ({{ $labels.ip }}) is down. 19 | 20 | - alert: KubeletDown 21 | expr: up{app="kubelet"} == 0 22 | for: 10m 23 | labels: 24 | severity: page 25 | annotations: 26 | description: Kubelet ({{ $labels.ip }}) is down. 27 | opsrecipe: https://github.com/giantswarm/ops-recipes/blob/master/020-kubelet-is-down.md 28 | 29 | - alert: KubeStateMetricsDown 30 | expr: up{app="kube-state-metrics"} == 0 31 | for: 10m 32 | labels: 33 | severity: page 34 | annotations: 35 | description: KubeStateMetrics ({{ $labels.instance }}) is down. 36 | 37 | - alert: NodeExporterDown 38 | expr: up{app="node-exporter"} == 0 39 | for: 10m 40 | labels: 41 | severity: page 42 | annotations: 43 | description: NodeExporter ({{ $labels.ip }}) is down. 44 | 45 | - alert: TargetDown 46 | expr: up{cluster_type="host", app!~"cadvisor|etcd|kubelet|kubernetes|master|node-exporter|worker"} == 0 47 | for: 10m 48 | labels: 49 | severity: page 50 | annotations: 51 | description: Target {{ $labels.namespace }}/{{ $labels.app }} ({{ $labels.instance }}) is down. 
52 | 53 | - alert: TargetIsFlapping 54 | expr: changes(up[30m]) > 5 55 | for: 5m 56 | labels: 57 | severity: notify 58 | annotations: 59 | description: Target {{ $labels.app }} ({{ $labels.instance }}) is flapping. 60 | -------------------------------------------------------------------------------- /helm/prometheus-chart/prometheus-alerts/volume.rules.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: volume 3 | rules: 4 | - alert: VolumeUsedInodes 5 | expr: kubelet_volume_stats_inodes_used{} / kubelet_volume_stats_inodes{} > 0.9 6 | for: 10m 7 | labels: 8 | severity: notify 9 | annotations: 10 | description: Volume {{ $labels.persistentvolumeclaim }} has more than 90% inodes used. 11 | 12 | - alert: VolumeUsedSpace 13 | expr: kubelet_volume_stats_used_bytes{} / kubelet_volume_stats_capacity_bytes{} > 0.9 14 | for: 10m 15 | labels: 16 | severity: notify 17 | annotations: 18 | description: Volume {{ $labels.persistentvolumeclaim }} has more than 90% space used. 19 | -------------------------------------------------------------------------------- /helm/prometheus-chart/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | The Prometheus server can be accessed via port {{ .Values.server.service.servicePort }} on the following DNS name from within your cluster: 2 | {{ template "prometheus.server.fullname" . }}.{{ .Release.Namespace }}.svc.cluster.local 3 | 4 | {{ if .Values.server.ingress.enabled -}} 5 | From outside the cluster, the server URL(s) are: 6 | {{- range .Values.server.ingress.hosts }} 7 | http://{{ . }} 8 | {{- end }} 9 | {{- else }} 10 | Get the Prometheus server URL by running these commands in the same shell: 11 | {{- if contains "NodePort" .Values.server.service.type }} 12 | export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ template "prometheus.server.fullname" . 
}}) 13 | export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") 14 | echo http://$NODE_IP:$NODE_PORT 15 | {{- else if contains "LoadBalancer" .Values.server.service.type }} 16 | NOTE: It may take a few minutes for the LoadBalancer IP to be available. 17 | You can watch the status of by running 'kubectl get svc --namespace {{ .Release.Namespace }} -w {{ template "prometheus.server.fullname" . }}' 18 | 19 | export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ template "prometheus.server.fullname" . }} -o jsonpath='{.status.loadBalancer.ingress[0].ip}') 20 | echo http://$SERVICE_IP:{{ .Values.server.service.servicePort }} 21 | {{- else if contains "ClusterIP" .Values.server.service.type }} 22 | export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app={{ template "prometheus.name" . }},component={{ .Values.server.name }}" -o jsonpath="{.items[0].metadata.name}") 23 | kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 9090 24 | {{- end }} 25 | {{- end }} 26 | 27 | {{- if .Values.server.persistentVolume.enabled }} 28 | {{- else }} 29 | ################################################################################# 30 | ###### WARNING: Persistence is disabled!!! You will lose your data when ##### 31 | ###### the Server pod is terminated. ##### 32 | ################################################################################# 33 | {{- end }} 34 | 35 | {{ if .Values.alertmanager.enabled }} 36 | The Prometheus alertmanager can be accessed via port {{ .Values.alertmanager.service.servicePort }} on the following DNS name from within your cluster: 37 | {{ template "prometheus.alertmanager.fullname" . }}.{{ .Release.Namespace }}.svc.cluster.local 38 | 39 | {{ if .Values.alertmanager.ingress.enabled -}} 40 | From outside the cluster, the alertmanager URL(s) are: 41 | {{- range .Values.alertmanager.ingress.hosts }} 42 | http://{{ . 
}} 43 | {{- end }} 44 | {{- else }} 45 | Get the Alertmanager URL by running these commands in the same shell: 46 | {{- if contains "NodePort" .Values.alertmanager.service.type }} 47 | export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ template "prometheus.alertmanager.fullname" . }}) 48 | export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") 49 | echo http://$NODE_IP:$NODE_PORT 50 | {{- else if contains "LoadBalancer" .Values.alertmanager.service.type }} 51 | NOTE: It may take a few minutes for the LoadBalancer IP to be available. 52 | You can watch the status of by running 'kubectl get svc --namespace {{ .Release.Namespace }} -w {{ template "prometheus.alertmanager.fullname" . }}' 53 | 54 | export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ template "prometheus.alertmanager.fullname" . }} -o jsonpath='{.status.loadBalancer.ingress[0].ip}') 55 | echo http://$SERVICE_IP:{{ .Values.alertmanager.service.servicePort }} 56 | {{- else if contains "ClusterIP" .Values.alertmanager.service.type }} 57 | export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app={{ template "prometheus.name" . }},component={{ .Values.alertmanager.name }}" -o jsonpath="{.items[0].metadata.name}") 58 | kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 9093 59 | {{- end }} 60 | {{- end }} 61 | 62 | {{- if .Values.alertmanager.persistentVolume.enabled }} 63 | {{- else }} 64 | ################################################################################# 65 | ###### WARNING: Persistence is disabled!!! You will lose your data when ##### 66 | ###### the AlertManager pod is terminated. ##### 67 | ################################################################################# 68 | {{- end }} 69 | {{- end }} 70 | 71 | Grafana 72 | 73 | 1. 
Get your '{{ .Values.grafana.adminUser }}' user password by running: 74 | 75 | kubectl get secret --namespace {{ .Release.Namespace }} {{ template "prometheus.grafana.fullname" . }} -o jsonpath="{.data.admin-password}" | base64 --decode ; echo 76 | 77 | 2. The Grafana server can be accessed via port {{ .Values.grafana.service.port }} on the following DNS name from within your cluster: 78 | 79 | {{ template "prometheus.grafana.fullname" . }}.{{ .Release.Namespace }}.svc.cluster.local 80 | {{ if .Values.grafana.ingress.enabled }} 81 | From outside the cluster, the server URL(s) are: 82 | {{- range .Values.grafana.ingress.hosts }} 83 | http://{{ . }} 84 | {{- end }} 85 | {{ else }} 86 | Get the Grafana URL to visit by running these commands in the same shell: 87 | {{ if contains "NodePort" .Values.grafana.service.type -}} 88 | export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ template "prometheus.grafana.fullname" . }}) 89 | export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") 90 | echo http://$NODE_IP:$NODE_PORT 91 | {{ else if contains "LoadBalancer" .Values.grafana.service.type -}} 92 | NOTE: It may take a few minutes for the LoadBalancer IP to be available. 93 | You can watch the status of it by running 'kubectl get svc --namespace {{ .Release.Namespace }} -w {{ template "prometheus.grafana.fullname" . }}' 94 | export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ template "prometheus.grafana.fullname" . }} -o jsonpath='{.status.loadBalancer.ingress[0].ip}') 95 | echo http://$SERVICE_IP:{{ .Values.grafana.service.port -}} 96 | {{ else if contains "ClusterIP" .Values.grafana.service.type }} 97 | export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app={{ template "prometheus.name" . 
}},component={{ .Values.grafana.name }}" -o jsonpath="{.items[0].metadata.name}") 98 | kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 3000 99 | {{- end }} 100 | {{- end }} 101 | 102 | 3. Login with the password from step 1 and the username: {{ .Values.grafana.adminUser }} 103 | 104 | For more information on running Prometheus, visit: 105 | https://prometheus.io/ -------------------------------------------------------------------------------- /helm/prometheus-chart/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* vim: set filetype=mustache: */}} 2 | {{/* 3 | Expand the name of the chart. 4 | */}} 5 | {{- define "prometheus.name" -}} 6 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} 7 | {{- end -}} 8 | 9 | {{/* 10 | Create a default fully qualified app name. 11 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 12 | */}} 13 | {{- define "prometheus.fullname" -}} 14 | {{- if .Values.fullnameOverride -}} 15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} 16 | {{- else -}} 17 | {{- $name := default .Chart.Name .Values.nameOverride -}} 18 | {{- if contains $name .Release.Name -}} 19 | {{- .Release.Name | trunc 63 | trimSuffix "-" -}} 20 | {{- else -}} 21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} 22 | {{- end -}} 23 | {{- end -}} 24 | {{- end -}} 25 | 26 | {{/* 27 | Create a fully qualified grafana name. 28 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 
29 | */}} 30 | 31 | {{- define "prometheus.alertmanager.fullname" -}} 32 | {{- if .Values.alertmanager.fullnameOverride -}} 33 | {{- .Values.alertmanager.fullnameOverride | trunc 63 | trimSuffix "-" -}} 34 | {{- else -}} 35 | {{- $name := default .Chart.Name .Values.nameOverride -}} 36 | {{- if contains $name .Release.Name -}} 37 | {{- printf "%s-%s" .Release.Name .Values.alertmanager.name | trunc 63 | trimSuffix "-" -}} 38 | {{- else -}} 39 | {{- printf "%s-%s-%s" .Release.Name $name .Values.alertmanager.name | trunc 63 | trimSuffix "-" -}} 40 | {{- end -}} 41 | {{- end -}} 42 | {{- end -}} 43 | 44 | 45 | {{- define "prometheus.grafana.fullname" -}} 46 | {{- if .Values.grafana.fullnameOverride -}} 47 | {{- .Values.grafana.fullnameOverride | trunc 63 | trimSuffix "-" -}} 48 | {{- else -}} 49 | {{- $name := default .Chart.Name .Values.nameOverride -}} 50 | {{- if contains $name .Release.Name -}} 51 | {{- printf "%s-%s" .Release.Name .Values.grafana.name | trunc 63 | trimSuffix "-" -}} 52 | {{- else -}} 53 | {{- printf "%s-%s-%s" .Release.Name $name .Values.grafana.name | trunc 63 | trimSuffix "-" -}} 54 | {{- end -}} 55 | {{- end -}} 56 | {{- end -}} 57 | 58 | 59 | {{/* 60 | Create a fully qualified kube-state-metrics name. 61 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 
62 | */}} 63 | {{- define "prometheus.kubeStateMetrics.fullname" -}} 64 | {{- if .Values.kubeStateMetrics.fullnameOverride -}} 65 | {{- .Values.kubeStateMetrics.fullnameOverride | trunc 63 | trimSuffix "-" -}} 66 | {{- else -}} 67 | {{- $name := default .Chart.Name .Values.nameOverride -}} 68 | {{- if contains $name .Release.Name -}} 69 | {{- printf "%s-%s" .Release.Name .Values.kubeStateMetrics.name | trunc 63 | trimSuffix "-" -}} 70 | {{- else -}} 71 | {{- printf "%s-%s-%s" .Release.Name $name .Values.kubeStateMetrics.name | trunc 63 | trimSuffix "-" -}} 72 | {{- end -}} 73 | {{- end -}} 74 | {{- end -}} 75 | 76 | {{/* 77 | Create a fully qualified node-exporter name. 78 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 79 | */}} 80 | {{- define "prometheus.nodeExporter.fullname" -}} 81 | {{- if .Values.nodeExporter.fullnameOverride -}} 82 | {{- .Values.nodeExporter.fullnameOverride | trunc 63 | trimSuffix "-" -}} 83 | {{- else -}} 84 | {{- $name := default .Chart.Name .Values.nameOverride -}} 85 | {{- if contains $name .Release.Name -}} 86 | {{- printf "%s-%s" .Release.Name .Values.nodeExporter.name | trunc 63 | trimSuffix "-" -}} 87 | {{- else -}} 88 | {{- printf "%s-%s-%s" .Release.Name $name .Values.nodeExporter.name | trunc 63 | trimSuffix "-" -}} 89 | {{- end -}} 90 | {{- end -}} 91 | {{- end -}} 92 | 93 | {{/* 94 | Create a fully qualified Prometheus server name. 95 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 
96 | */}} 97 | {{- define "prometheus.server.fullname" -}} 98 | {{- if .Values.server.fullnameOverride -}} 99 | {{- .Values.server.fullnameOverride | trunc 63 | trimSuffix "-" -}} 100 | {{- else -}} 101 | {{- $name := default .Chart.Name .Values.nameOverride -}} 102 | {{- if contains $name .Release.Name -}} 103 | {{- printf "%s-%s" .Release.Name .Values.server.name | trunc 63 | trimSuffix "-" -}} 104 | {{- else -}} 105 | {{- printf "%s-%s-%s" .Release.Name $name .Values.server.name | trunc 63 | trimSuffix "-" -}} 106 | {{- end -}} 107 | {{- end -}} 108 | {{- end -}} 109 | 110 | {{/* 111 | Create a fully qualified pushgateway name. 112 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 113 | */}} 114 | {{- define "prometheus.pushgateway.fullname" -}} 115 | {{- if .Values.pushgateway.fullnameOverride -}} 116 | {{- .Values.pushgateway.fullnameOverride | trunc 63 | trimSuffix "-" -}} 117 | {{- else -}} 118 | {{- $name := default .Chart.Name .Values.nameOverride -}} 119 | {{- if contains $name .Release.Name -}} 120 | {{- printf "%s-%s" .Release.Name .Values.pushgateway.name | trunc 63 | trimSuffix "-" -}} 121 | {{- else -}} 122 | {{- printf "%s-%s-%s" .Release.Name $name .Values.pushgateway.name | trunc 63 | trimSuffix "-" -}} 123 | {{- end -}} 124 | {{- end -}} 125 | {{- end -}} 126 | 127 | {{/* 128 | Return the appropriate apiVersion for networkpolicy. 
129 | */}} 130 | {{- define "prometheus.networkPolicy.apiVersion" -}} 131 | {{- if semverCompare ">=1.4-0, <1.7-0" .Capabilities.KubeVersion.GitVersion -}} 132 | {{- print "extensions/v1beta1" -}} 133 | {{- else if semverCompare "^1.7-0" .Capabilities.KubeVersion.GitVersion -}} 134 | {{- print "networking.k8s.io/v1" -}} 135 | {{- end -}} 136 | {{- end -}} 137 | 138 | -------------------------------------------------------------------------------- /helm/prometheus-chart/templates/alertmanager-configmap.yaml: -------------------------------------------------------------------------------- 1 | {{- if and .Values.alertmanager.enabled (empty .Values.alertmanager.configMapOverrideName) -}} 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | labels: 6 | app: {{ template "prometheus.name" . }} 7 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 8 | component: "{{ .Values.alertmanager.name }}" 9 | heritage: {{ .Release.Service }} 10 | release: {{ .Release.Name }} 11 | name: {{ template "prometheus.alertmanager.fullname" . }} 12 | data: 13 | {{- $root := . -}} 14 | {{- range $key, $value := .Values.alertmanagerFiles }} 15 | {{ $key }}: | 16 | {{ toYaml $value | default "{}" | indent 4 }} 17 | {{- end -}} 18 | {{- end -}} 19 | -------------------------------------------------------------------------------- /helm/prometheus-chart/templates/alertmanager-deployment.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.alertmanager.enabled -}} 2 | apiVersion: extensions/v1beta1 3 | kind: Deployment 4 | metadata: 5 | labels: 6 | app: {{ template "prometheus.name" . }} 7 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 8 | component: "{{ .Values.alertmanager.name }}" 9 | heritage: {{ .Release.Service }} 10 | release: {{ .Release.Name }} 11 | name: {{ template "prometheus.alertmanager.fullname" . 
}} 12 | spec: 13 | replicas: {{ .Values.alertmanager.replicaCount }} 14 | {{- if .Values.server.strategy }} 15 | strategy: 16 | {{ toYaml .Values.server.strategy | indent 4 }} 17 | {{- end }} 18 | template: 19 | metadata: 20 | {{- if .Values.alertmanager.podAnnotations }} 21 | annotations: 22 | {{ toYaml .Values.alertmanager.podAnnotations | indent 8 }} 23 | {{- end }} 24 | labels: 25 | app: {{ template "prometheus.name" . }} 26 | component: "{{ .Values.alertmanager.name }}" 27 | release: {{ .Release.Name }} 28 | spec: 29 | {{- if .Values.alertmanager.affinity }} 30 | affinity: 31 | {{ toYaml .Values.alertmanager.affinity | indent 8 }} 32 | {{- end }} 33 | {{- if .Values.alertmanager.schedulerName }} 34 | schedulerName: "{{ .Values.alertmanager.schedulerName }}" 35 | {{- end }} 36 | serviceAccountName: alertmanager 37 | {{- if .Values.alertmanager.priorityClassName }} 38 | priorityClassName: "{{ .Values.alertmanager.priorityClassName }}" 39 | {{- end }} 40 | containers: 41 | - name: {{ template "prometheus.name" . 
}}-{{ .Values.alertmanager.name }} 42 | image: "{{ .Values.alertmanager.image.repository }}:{{ .Values.alertmanager.image.tag }}" 43 | imagePullPolicy: "{{ .Values.alertmanager.image.pullPolicy }}" 44 | env: 45 | {{- range $key, $value := .Values.alertmanager.extraEnv }} 46 | - name: {{ $key }} 47 | value: {{ $value }} 48 | {{- end }} 49 | - name: POD_IP 50 | valueFrom: 51 | fieldRef: 52 | apiVersion: v1 53 | fieldPath: status.podIP 54 | args: 55 | - --config.file=/etc/config/alertmanager.yml 56 | - --storage.path={{ .Values.alertmanager.persistentVolume.mountPath }} 57 | - --cluster.advertise-address=$(POD_IP):6783 58 | {{- range $key, $value := .Values.alertmanager.extraArgs }} 59 | - --{{ $key }}={{ $value }} 60 | {{- end }} 61 | {{- if .Values.alertmanager.baseURL }} 62 | - --web.external-url={{ .Values.alertmanager.baseURL }} 63 | {{- end }} 64 | 65 | ports: 66 | - containerPort: 9093 67 | readinessProbe: 68 | httpGet: 69 | path: {{ .Values.alertmanager.prefixURL }}/#/status 70 | port: 9093 71 | initialDelaySeconds: 30 72 | timeoutSeconds: 30 73 | resources: 74 | {{ toYaml .Values.alertmanager.resources | indent 12 }} 75 | volumeMounts: 76 | - name: config-volume 77 | mountPath: /etc/config 78 | - name: storage-volume 79 | mountPath: "{{ .Values.alertmanager.persistentVolume.mountPath }}" 80 | subPath: "{{ .Values.alertmanager.persistentVolume.subPath }}" 81 | 82 | - name: {{ template "prometheus.name" . 
}}-{{ .Values.alertmanager.name }}-{{ .Values.configmapReload.name }} 83 | image: "{{ .Values.configmapReload.image.repository }}:{{ .Values.configmapReload.image.tag }}" 84 | imagePullPolicy: "{{ .Values.configmapReload.image.pullPolicy }}" 85 | args: 86 | - --volume-dir=/etc/config 87 | - --webhook-url=http://localhost:9093{{ .Values.alertmanager.prefixURL }}/-/reload 88 | resources: 89 | {{ toYaml .Values.configmapReload.resources | indent 12 }} 90 | volumeMounts: 91 | - name: config-volume 92 | mountPath: /etc/config 93 | readOnly: true 94 | {{- if .Values.imagePullSecrets }} 95 | imagePullSecrets: 96 | {{ toYaml .Values.imagePullSecrets | indent 2 }} 97 | {{- end }} 98 | {{- if .Values.alertmanager.nodeSelector }} 99 | nodeSelector: 100 | {{ toYaml .Values.alertmanager.nodeSelector | indent 8 }} 101 | {{- end }} 102 | {{- if .Values.alertmanager.securityContext }} 103 | securityContext: 104 | {{ toYaml .Values.alertmanager.securityContext | indent 8 }} 105 | {{- end }} 106 | {{- if .Values.alertmanager.tolerations }} 107 | tolerations: 108 | {{ toYaml .Values.alertmanager.tolerations | indent 8 }} 109 | {{- end }} 110 | {{- if .Values.alertmanager.affinity }} 111 | affinity: 112 | {{ toYaml .Values.alertmanager.affinity | indent 8 }} 113 | {{- end }} 114 | volumes: 115 | - name: config-volume 116 | configMap: 117 | name: {{ if .Values.alertmanager.configMapOverrideName }}{{ .Release.Name }}-{{ .Values.alertmanager.configMapOverrideName }}{{- else }}{{ template "prometheus.alertmanager.fullname" . }}{{- end }} 118 | - name: storage-volume 119 | {{- if .Values.alertmanager.persistentVolume.enabled }} 120 | persistentVolumeClaim: 121 | claimName: {{ if .Values.alertmanager.persistentVolume.existingClaim }}{{ .Values.alertmanager.persistentVolume.existingClaim }}{{- else }}{{ template "prometheus.alertmanager.fullname" . 
}}{{- end }} 122 | {{- else }} 123 | emptyDir: {} 124 | {{- end -}} 125 | {{- end }} 126 | -------------------------------------------------------------------------------- /helm/prometheus-chart/templates/alertmanager-ingress.yaml: -------------------------------------------------------------------------------- 1 | {{- if and .Values.alertmanager.enabled .Values.alertmanager.ingress.enabled -}} 2 | {{- $releaseName := .Release.Name -}} 3 | {{- $serviceName := include "prometheus.alertmanager.fullname" . }} 4 | {{- $servicePort := .Values.alertmanager.service.servicePort -}} 5 | apiVersion: extensions/v1beta1 6 | kind: Ingress 7 | metadata: 8 | {{- if .Values.alertmanager.ingress.annotations }} 9 | annotations: 10 | {{ toYaml .Values.alertmanager.ingress.annotations | indent 4 }} 11 | {{- end }} 12 | labels: 13 | app: {{ template "prometheus.name" . }} 14 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 15 | component: "{{ .Values.alertmanager.name }}" 16 | heritage: {{ .Release.Service }} 17 | release: {{ .Release.Name }} 18 | {{- range $key, $value := .Values.alertmanager.ingress.extraLabels }} 19 | {{ $key }}: {{ $value }} 20 | {{- end }} 21 | name: {{ template "prometheus.alertmanager.fullname" . }} 22 | spec: 23 | rules: 24 | {{- range .Values.alertmanager.ingress.hosts }} 25 | {{- $url := splitList "/" . 
}} 26 | - host: {{ first $url }} 27 | http: 28 | paths: 29 | - path: /{{ rest $url | join "/" }} 30 | backend: 31 | serviceName: {{ $serviceName }} 32 | servicePort: {{ $servicePort }} 33 | {{- end -}} 34 | {{- if .Values.alertmanager.ingress.tls }} 35 | tls: 36 | {{ toYaml .Values.alertmanager.ingress.tls | indent 4 }} 37 | {{- end -}} 38 | {{- end -}} 39 | -------------------------------------------------------------------------------- /helm/prometheus-chart/templates/alertmanager-networkpolicy.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.networkPolicy.enabled }} 2 | apiVersion: {{ template "prometheus.networkPolicy.apiVersion" . }} 3 | kind: NetworkPolicy 4 | metadata: 5 | name: {{ template "prometheus.alertmanager.fullname" . }} 6 | labels: 7 | app: {{ template "prometheus.name" . }} 8 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 9 | component: "{{ .Values.alertmanager.name }}" 10 | heritage: {{ .Release.Service }} 11 | release: {{ .Release.Name }} 12 | spec: 13 | podSelector: 14 | matchLabels: 15 | app: {{ template "prometheus.name" . 
}} 16 | component: "{{ .Values.alertmanager.name }}" 17 | release: {{ .Release.Name }} 18 | ingress: 19 | - from: 20 | - podSelector: 21 | matchLabels: 22 | release: {{ .Release.Name }} 23 | component: "{{ .Values.server.name }}" 24 | - ports: 25 | - port: 9093 26 | {{- end }} 27 | -------------------------------------------------------------------------------- /helm/prometheus-chart/templates/alertmanager-pvc.yaml: -------------------------------------------------------------------------------- 1 | {{- if and .Values.alertmanager.enabled .Values.alertmanager.persistentVolume.enabled -}} 2 | {{- if not .Values.alertmanager.persistentVolume.existingClaim -}} 3 | apiVersion: v1 4 | kind: PersistentVolumeClaim 5 | metadata: 6 | {{- if .Values.alertmanager.persistentVolume.annotations }} 7 | annotations: 8 | {{ toYaml .Values.alertmanager.persistentVolume.annotations | indent 4 }} 9 | {{- end }} 10 | labels: 11 | app: {{ template "prometheus.name" . }} 12 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 13 | component: "{{ .Values.alertmanager.name }}" 14 | heritage: {{ .Release.Service }} 15 | release: {{ .Release.Name }} 16 | name: {{ template "prometheus.alertmanager.fullname" . 
}} 17 | spec: 18 | accessModes: 19 | {{ toYaml .Values.alertmanager.persistentVolume.accessModes | indent 4 }} 20 | {{- if .Values.alertmanager.persistentVolume.storageClass }} 21 | {{- if (eq "-" .Values.alertmanager.persistentVolume.storageClass) }} 22 | storageClassName: "" 23 | {{- else }} 24 | storageClassName: "{{ .Values.alertmanager.persistentVolume.storageClass }}" 25 | {{- end }} 26 | {{- end }} 27 | resources: 28 | requests: 29 | storage: "{{ .Values.alertmanager.persistentVolume.size }}" 30 | {{- end -}} 31 | {{- end -}} 32 | -------------------------------------------------------------------------------- /helm/prometheus-chart/templates/alertmanager-service.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.alertmanager.enabled -}} 2 | apiVersion: v1 3 | kind: Service 4 | metadata: 5 | {{- if .Values.alertmanager.service.annotations }} 6 | annotations: 7 | {{ toYaml .Values.alertmanager.service.annotations | indent 4 }} 8 | {{- end }} 9 | labels: 10 | app: {{ template "prometheus.name" . }} 11 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 12 | component: "{{ .Values.alertmanager.name }}" 13 | heritage: {{ .Release.Service }} 14 | release: {{ .Release.Name }} 15 | {{- if .Values.alertmanager.service.labels }} 16 | {{ toYaml .Values.alertmanager.service.labels | indent 4 }} 17 | {{- end }} 18 | name: {{ template "prometheus.alertmanager.fullname" . 
}} 19 | spec: 20 | {{- if .Values.alertmanager.service.clusterIP }} 21 | clusterIP: {{ .Values.alertmanager.service.clusterIP }} 22 | {{- end }} 23 | {{- if .Values.alertmanager.service.externalIPs }} 24 | externalIPs: 25 | {{ toYaml .Values.alertmanager.service.externalIPs | indent 4 }} 26 | {{- end }} 27 | {{- if .Values.alertmanager.service.loadBalancerIP }} 28 | loadBalancerIP: {{ .Values.alertmanager.service.loadBalancerIP }} 29 | {{- end }} 30 | {{- if .Values.alertmanager.service.loadBalancerSourceRanges }} 31 | loadBalancerSourceRanges: 32 | {{- range $cidr := .Values.alertmanager.service.loadBalancerSourceRanges }} 33 | - {{ $cidr }} 34 | {{- end }} 35 | {{- end }} 36 | ports: 37 | - name: http 38 | port: {{ .Values.alertmanager.service.servicePort }} 39 | protocol: TCP 40 | targetPort: 9093 41 | {{- if .Values.alertmanager.service.nodePort }} 42 | nodePort: {{ .Values.alertmanager.service.nodePort }} 43 | {{- end }} 44 | {{- if .Values.alertmanager.service.enableMeshPeer }} 45 | - name: meshpeer 46 | port: 6783 47 | protocol: TCP 48 | targetPort: 6783 49 | {{- end }} 50 | selector: 51 | app: {{ template "prometheus.name" . }} 52 | component: "{{ .Values.alertmanager.name }}" 53 | release: {{ .Release.Name }} 54 | type: "{{ .Values.alertmanager.service.type }}" 55 | {{- end }} 56 | -------------------------------------------------------------------------------- /helm/prometheus-chart/templates/alertmanager-serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | labels: 5 | app: {{ template "prometheus.name" . 
}} 6 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 7 | component: "{{ .Values.alertmanager.name }}" 8 | heritage: {{ .Release.Service }} 9 | release: {{ .Release.Name }} 10 | name: alertmanager 11 | -------------------------------------------------------------------------------- /helm/prometheus-chart/templates/grafana-configmap.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: {{ template "prometheus.grafana.fullname" . }} 6 | labels: 7 | app: {{ template "prometheus.name" . }} 8 | component: "{{ .Values.grafana.name }}" 9 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 10 | release: {{ .Release.Name }} 11 | heritage: {{ .Release.Service }} 12 | data: 13 | grafana.ini: | 14 | {{- range $key, $value := index .Values.grafana "grafana.ini" }} 15 | [{{ $key }}] 16 | {{- range $elem, $elemVal := $value }} 17 | {{ $elem }} = {{ $elemVal }} 18 | {{- end }} 19 | {{- end }} 20 | --- 21 | apiVersion: v1 22 | kind: ConfigMap 23 | metadata: 24 | name: grafana-datasources 25 | labels: 26 | app: {{ template "prometheus.name" . }} 27 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 28 | release: {{ .Release.Name }} 29 | heritage: {{ .Release.Service }} 30 | data: 31 | datasources.yaml: | 32 | apiVersion: 1 33 | datasources: 34 | - name: Prometheus 35 | type: prometheus 36 | access: proxy 37 | orgId: 1 38 | url: http://{{ template "prometheus.server.fullname" . 
}}:{{ .Values.server.service.servicePort }} 39 | basicAuth: false 40 | withCredentials: false 41 | isDefault: true 42 | editable: false 43 | version: 1 44 | - name: Elasticsearch 45 | type: elasticsearch 46 | access: proxy 47 | orgId: 1 48 | url: http://elasticsearch:9200 49 | basicAuth: false 50 | withCredentials: false 51 | isDefault: false 52 | editable: false 53 | version: 1 54 | database: "[gslogs-]YYYY.MM.DD" 55 | jsonData: 56 | interval: Daily 57 | timeField: '@timestamp' 58 | --- 59 | apiVersion: v1 60 | kind: ConfigMap 61 | metadata: 62 | name: grafana-dashboards-main 63 | labels: 64 | app: {{ template "prometheus.name" . }} 65 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 66 | release: {{ .Release.Name }} 67 | heritage: {{ .Release.Service }} 68 | data: 69 | dashboards.yaml: | 70 | apiVersion: 1 71 | providers: 72 | - name: 'default' 73 | orgId: 1 74 | folder: '' 75 | type: file 76 | disableDeletion: true 77 | options: 78 | path: /etc/grafana/provisioning/dashboards-json 79 | --- 80 | apiVersion: v1 81 | kind: ConfigMap 82 | metadata: 83 | name: grafana-dashboards-json 84 | labels: 85 | app: {{ template "prometheus.name" . }} 86 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 87 | release: {{ .Release.Name }} 88 | heritage: {{ .Release.Service }} 89 | data: 90 | {{ (.Files.Glob "grafana-dashboards/*").AsConfig | indent 4 }} 91 | -------------------------------------------------------------------------------- /helm/prometheus-chart/templates/grafana-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1beta2 2 | kind: Deployment 3 | metadata: 4 | name: {{ template "prometheus.grafana.fullname" . }} 5 | labels: 6 | app: {{ template "prometheus.name" . 
}} 7 | component: "{{ .Values.grafana.name }}" 8 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 9 | release: {{ .Release.Name }} 10 | heritage: {{ .Release.Service }} 11 | {{- with .Values.grafana.annotations }} 12 | annotations: 13 | {{ toYaml . | indent 4 }} 14 | {{- end }} 15 | spec: 16 | replicas: {{ .Values.grafana.replicas }} 17 | selector: 18 | matchLabels: 19 | app: {{ template "prometheus.name" . }} 20 | release: {{ .Release.Name }} 21 | strategy: 22 | type: {{ .Values.grafana.deploymentStrategy }} 23 | {{- if ne .Values.grafana.deploymentStrategy "RollingUpdate" }} 24 | rollingUpdate: null 25 | {{- end }} 26 | template: 27 | metadata: 28 | labels: 29 | app: {{ template "prometheus.name" . }} 30 | component: "{{ .Values.grafana.name }}" 31 | release: {{ .Release.Name }} 32 | {{- with .Values.grafana.podAnnotations }} 33 | annotations: 34 | {{ toYaml . | indent 8 }} 35 | {{- end }} 36 | spec: 37 | serviceAccountName: grafana 38 | {{- if .Values.grafana.schedulerName }} 39 | schedulerName: "{{ .Values.grafana.schedulerName }}" 40 | {{- end }} 41 | {{- if .Values.grafana.securityContext }} 42 | securityContext: 43 | {{ toYaml .Values.grafana.securityContext | indent 8 }} 44 | {{- end }} 45 | {{- if .Values.grafana.dashboards }} 46 | initContainers: 47 | - name: download-dashboards 48 | image: "{{ .Values.grafana.downloadDashboardsImage.repository }}:{{ .Values.grafana.downloadDashboardsImage.tag }}" 49 | imagePullPolicy: {{ .Values.grafana.downloadDashboardsImage.pullPolicy }} 50 | command: ["sh", "/etc/grafana/download_dashboards.sh"] 51 | volumeMounts: 52 | - name: config 53 | mountPath: "/etc/grafana/download_dashboards.sh" 54 | subPath: download_dashboards.sh 55 | - name: storage 56 | mountPath: "/var/lib/grafana" 57 | subPath: {{ .Values.grafana.persistence.subPath }} 58 | {{- range .Values.grafana.extraSecretMounts }} 59 | - name: {{ .name }} 60 | mountPath: {{ .mountPath }} 61 | readOnly: {{ .readOnly }} 62 | {{- end }} 63 | {{- end }} 64 | {{- if 
.Values.grafana.image.pullSecrets }} 65 | imagePullSecrets: 66 | {{- range .Values.grafana.image.pullSecrets }} 67 | - name: {{ . }} 68 | {{- end}} 69 | {{- end }} 70 | containers: 71 | - name: {{ .Chart.Name }} 72 | image: "{{ .Values.grafana.image.repository }}:{{ .Values.grafana.image.tag }}" 73 | imagePullPolicy: {{ .Values.grafana.image.pullPolicy }} 74 | volumeMounts: 75 | - name: config 76 | mountPath: "/etc/grafana/grafana.ini" 77 | subPath: grafana.ini 78 | - name: ldap 79 | mountPath: "/etc/grafana/ldap.toml" 80 | subPath: ldap.toml 81 | # Data sources to provision on startup 82 | - name: datasources 83 | mountPath: /etc/grafana/provisioning/datasources 84 | # Main dashboard provisioning file directory 85 | - name: dashboards-main 86 | mountPath: /etc/grafana/provisioning/dashboards 87 | # Individual dashboards JSON directory 88 | - name: dashboards-json 89 | mountPath: /etc/grafana/provisioning/dashboards-json 90 | ports: 91 | - name: service 92 | containerPort: {{ .Values.grafana.service.port }} 93 | protocol: TCP 94 | - name: grafana 95 | containerPort: 3000 96 | protocol: TCP 97 | env: 98 | - name: GF_SECURITY_ADMIN_USER 99 | valueFrom: 100 | secretKeyRef: 101 | name: {{ template "prometheus.grafana.fullname" . }} 102 | key: admin-user 103 | - name: GF_SECURITY_ADMIN_PASSWORD 104 | valueFrom: 105 | secretKeyRef: 106 | name: {{ template "prometheus.grafana.fullname" . }} 107 | key: admin-password 108 | livenessProbe: 109 | {{ toYaml .Values.grafana.livenessProbe | indent 12 }} 110 | readinessProbe: 111 | {{ toYaml .Values.grafana.readinessProbe | indent 12 }} 112 | resources: 113 | {{ toYaml .Values.grafana.resources | indent 12 }} 114 | {{- with .Values.grafana.nodeSelector }} 115 | nodeSelector: 116 | {{ toYaml . | indent 8 }} 117 | {{- end }} 118 | {{- with .Values.grafana.affinity }} 119 | affinity: 120 | {{ toYaml . | indent 8 }} 121 | {{- end }} 122 | {{- with .Values.grafana.tolerations }} 123 | tolerations: 124 | {{ toYaml . 
| indent 8 }} 125 | {{- end }} 126 | volumes: 127 | - name: config 128 | configMap: 129 | name: {{ template "prometheus.grafana.fullname" . }} 130 | - name: ldap 131 | secret: 132 | secretName: {{ template "prometheus.grafana.fullname" . }} 133 | items: 134 | - key: ldap-toml 135 | path: ldap.toml 136 | - name: datasources 137 | configMap: 138 | name: grafana-datasources 139 | - name: dashboards-main 140 | configMap: 141 | name: grafana-dashboards-main 142 | - name: dashboards-json 143 | configMap: 144 | name: grafana-dashboards-json 145 | -------------------------------------------------------------------------------- /helm/prometheus-chart/templates/grafana-ingress.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.grafana.ingress.enabled -}} 2 | {{- $fullName := include "prometheus.grafana.fullname" . -}} 3 | {{- $servicePort := .Values.grafana.service.port -}} 4 | {{- $ingressPath := .Values.grafana.ingress.path -}} 5 | apiVersion: extensions/v1beta1 6 | kind: Ingress 7 | metadata: 8 | name: {{ $fullName }} 9 | labels: 10 | app: {{ template "prometheus.name" . }} 11 | component: "{{ .Values.grafana.name }}" 12 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 13 | release: {{ .Release.Name }} 14 | heritage: {{ .Release.Service }} 15 | {{- if .Values.grafana.ingress.labels }} 16 | {{ toYaml .Values.grafana.ingress.labels | indent 4 }} 17 | {{- end }} 18 | {{- with .Values.grafana.ingress.annotations }} 19 | annotations: 20 | {{ toYaml . | indent 4 }} 21 | {{- end }} 22 | spec: 23 | {{- if .Values.grafana.ingress.tls }} 24 | tls: 25 | {{- range .Values.grafana.ingress.tls }} 26 | - hosts: 27 | {{- range .hosts }} 28 | - {{ . | quote }} 29 | {{- end }} 30 | secretName: {{ .secretName }} 31 | {{- end }} 32 | {{- end }} 33 | rules: 34 | {{- range .Values.grafana.ingress.hosts }} 35 | - host: {{ . 
}} 36 | http: 37 | paths: 38 | - path: {{ $ingressPath }} 39 | backend: 40 | serviceName: {{ $fullName }} 41 | servicePort: {{ $servicePort }} 42 | {{- end }} 43 | {{- end }} 44 | -------------------------------------------------------------------------------- /helm/prometheus-chart/templates/grafana-podsecuritypolicy.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: extensions/v1beta1 2 | kind: PodSecurityPolicy 3 | metadata: 4 | name: {{ template "prometheus.grafana.fullname" . }} 5 | labels: 6 | app: {{ template "prometheus.name" . }} 7 | component: "{{ .Values.grafana.name }}" 8 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 9 | heritage: {{ .Release.Service }} 10 | release: {{ .Release.Name }} 11 | annotations: 12 | seccomp.security.alpha.kubernetes.io/allowedProfileNames: 'docker/default' 13 | apparmor.security.beta.kubernetes.io/allowedProfileNames: 'runtime/default' 14 | seccomp.security.alpha.kubernetes.io/defaultProfileName: 'docker/default' 15 | apparmor.security.beta.kubernetes.io/defaultProfileName: 'runtime/default' 16 | spec: 17 | privileged: false 18 | allowPrivilegeEscalation: false 19 | requiredDropCapabilities: 20 | - ALL 21 | volumes: 22 | - 'configMap' 23 | - 'emptyDir' 24 | - 'projected' 25 | - 'secret' 26 | - 'downwardAPI' 27 | - 'persistentVolumeClaim' 28 | hostNetwork: false 29 | hostIPC: false 30 | hostPID: false 31 | runAsUser: 32 | rule: 'RunAsAny' 33 | seLinux: 34 | rule: 'RunAsAny' 35 | supplementalGroups: 36 | rule: 'RunAsAny' 37 | fsGroup: 38 | rule: 'RunAsAny' 39 | readOnlyRootFilesystem: false 40 | -------------------------------------------------------------------------------- /helm/prometheus-chart/templates/grafana-role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1beta1 2 | kind: Role 3 | metadata: 4 | name: {{ template "prometheus.grafana.fullname" . 
}} 5 | labels: 6 | app: {{ template "prometheus.name" . }} 7 | component: "{{ .Values.grafana.name }}" 8 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 9 | heritage: {{ .Release.Service }} 10 | release: {{ .Release.Name }} 11 | rules: 12 | - apiGroups: ['extensions'] 13 | resources: ['podsecuritypolicies'] 14 | verbs: ['use'] 15 | resourceNames: [{{ template "prometheus.grafana.fullname" . }}] 16 | -------------------------------------------------------------------------------- /helm/prometheus-chart/templates/grafana-rolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1beta1 2 | kind: RoleBinding 3 | metadata: 4 | name: {{ template "prometheus.grafana.fullname" . }} 5 | labels: 6 | app: {{ template "prometheus.name" . }} 7 | component: "{{ .Values.grafana.name }}" 8 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 9 | heritage: {{ .Release.Service }} 10 | release: {{ .Release.Name }} 11 | roleRef: 12 | apiGroup: rbac.authorization.k8s.io 13 | kind: Role 14 | name: {{ template "prometheus.grafana.fullname" . }} 15 | subjects: 16 | - kind: ServiceAccount 17 | name: grafana 18 | -------------------------------------------------------------------------------- /helm/prometheus-chart/templates/grafana-secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Secret 3 | metadata: 4 | name: {{ template "prometheus.grafana.fullname" . }} 5 | labels: 6 | app: {{ template "prometheus.name" . 
}} 7 | component: "{{ .Values.grafana.name }}" 8 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 9 | release: {{ .Release.Name }} 10 | heritage: {{ .Release.Service }} 11 | type: Opaque 12 | data: 13 | admin-user: {{ .Values.grafana.adminUser | b64enc | quote }} 14 | {{- if .Values.grafana.adminPassword }} 15 | admin-password: {{ .Values.grafana.adminPassword | b64enc | quote }} 16 | {{- else }} 17 | admin-password: {{ randAlphaNum 40 | b64enc | quote }} 18 | {{- end }} 19 | -------------------------------------------------------------------------------- /helm/prometheus-chart/templates/grafana-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ template "prometheus.grafana.fullname" . }} 5 | labels: 6 | app: {{ template "prometheus.name" . }} 7 | component: "{{ .Values.grafana.name }}" 8 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 9 | release: {{ .Release.Name }} 10 | heritage: {{ .Release.Service }} 11 | {{- if .Values.grafana.service.labels }} 12 | {{ toYaml .Values.grafana.service.labels | indent 4 }} 13 | {{- end }} 14 | {{- with .Values.grafana.service.annotations }} 15 | annotations: 16 | {{ toYaml . 
| indent 4 }} 17 | {{- end }} 18 | spec: 19 | {{- if (or (eq .Values.grafana.service.type "ClusterIP") (empty .Values.grafana.service.type)) }} 20 | type: ClusterIP 21 | {{- if .Values.grafana.service.clusterIP }} 22 | clusterIP: {{ .Values.grafana.service.clusterIP }} 23 | {{end}} 24 | {{- else if eq .Values.grafana.service.type "LoadBalancer" }} 25 | type: {{ .Values.grafana.service.type }} 26 | {{- if .Values.grafana.service.loadBalancerIP }} 27 | loadBalancerIP: {{ .Values.grafana.service.loadBalancerIP }} 28 | {{- end }} 29 | {{- if .Values.grafana.service.loadBalancerSourceRanges }} 30 | loadBalancerSourceRanges: 31 | {{ toYaml .Values.grafana.service.loadBalancerSourceRanges | indent 4 }} 32 | {{- end -}} 33 | {{- else }} 34 | type: {{ .Values.grafana.service.type }} 35 | {{- end }} 36 | {{- if .Values.grafana.service.externalIPs }} 37 | externalIPs: 38 | {{ toYaml .Values.grafana.service.externalIPs | indent 4 }} 39 | {{- end }} 40 | ports: 41 | - name: service 42 | port: {{ .Values.grafana.service.port }} 43 | protocol: TCP 44 | targetPort: 3000 45 | {{ if (and (eq .Values.grafana.service.type "NodePort") (not (empty .Values.grafana.service.nodePort))) }} 46 | nodePort: {{.Values.grafana.service.nodePort}} 47 | {{ end }} 48 | selector: 49 | app: {{ template "prometheus.name" . }} 50 | release: {{ .Release.Name }} 51 | -------------------------------------------------------------------------------- /helm/prometheus-chart/templates/grafana-serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | labels: 5 | app: {{ template "prometheus.name" . 
}} 6 | component: "{{ .Values.grafana.name }}" 7 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 8 | heritage: {{ .Release.Service }} 9 | release: {{ .Release.Name }} 10 | name: grafana 11 | -------------------------------------------------------------------------------- /helm/prometheus-chart/templates/prometheus-clusterrole.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1beta1 2 | kind: ClusterRole 3 | metadata: 4 | labels: 5 | app: {{ template "prometheus.name" . }} 6 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 7 | component: "{{ .Values.server.name }}" 8 | heritage: {{ .Release.Service }} 9 | release: {{ .Release.Name }} 10 | name: {{ template "prometheus.server.fullname" . }} 11 | rules: 12 | - apiGroups: 13 | - "" 14 | resources: 15 | - nodes 16 | - nodes/proxy 17 | - services 18 | - endpoints 19 | - pods 20 | - ingresses 21 | verbs: 22 | - get 23 | - list 24 | - watch 25 | - apiGroups: 26 | - "" 27 | resources: 28 | - configmaps 29 | verbs: 30 | - get 31 | - apiGroups: 32 | - "extensions" 33 | resources: 34 | - ingresses/status 35 | - ingresses 36 | verbs: 37 | - get 38 | - list 39 | - watch 40 | - nonResourceURLs: 41 | - "/metrics" 42 | verbs: 43 | - get 44 | -------------------------------------------------------------------------------- /helm/prometheus-chart/templates/prometheus-clusterrolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1beta1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | labels: 5 | app: {{ template "prometheus.name" . }} 6 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 7 | component: "{{ .Values.server.name }}" 8 | heritage: {{ .Release.Service }} 9 | release: {{ .Release.Name }} 10 | name: {{ template "prometheus.server.fullname" . 
}} 11 | subjects: 12 | - kind: ServiceAccount 13 | name: prometheus 14 | namespace: {{ .Release.Namespace }} 15 | roleRef: 16 | apiGroup: rbac.authorization.k8s.io 17 | kind: ClusterRole 18 | name: {{ template "prometheus.server.fullname" . }} 19 | -------------------------------------------------------------------------------- /helm/prometheus-chart/templates/prometheus-configmap.yaml: -------------------------------------------------------------------------------- 1 | {{- if (empty .Values.server.configMapOverrideName) -}} 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | labels: 6 | app: {{ template "prometheus.name" . }} 7 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 8 | component: "{{ .Values.server.name }}" 9 | heritage: {{ .Release.Service }} 10 | release: {{ .Release.Name }} 11 | name: {{ template "prometheus.server.fullname" . }} 12 | data: 13 | {{- $root := . -}} 14 | {{- range $key, $value := .Values.serverFiles }} 15 | {{ $key }}: | 16 | {{- if eq $key "prometheus.yml" }} 17 | global: 18 | {{ $root.Values.server.global | toYaml | indent 6 }} 19 | {{- end }} 20 | {{ toYaml $value | default "{}" | indent 4 }} 21 | {{- if eq $key "prometheus.yml" -}} 22 | {{- if $root.Values.extraScrapeConfigs }} 23 | {{ tpl $root.Values.extraScrapeConfigs $root | indent 4 }} 24 | {{- end -}} 25 | {{- if $root.Values.alertmanager.enabled }} 26 | alerting: 27 | alertmanagers: 28 | - kubernetes_sd_configs: 29 | - role: pod 30 | tls_config: 31 | ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt 32 | bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token 33 | {{- if $root.Values.alertmanager.prefixURL }} 34 | path_prefix: {{ $root.Values.alertmanager.prefixURL }} 35 | {{- end }} 36 | relabel_configs: 37 | - source_labels: [__meta_kubernetes_namespace] 38 | regex: {{ $root.Release.Namespace }} 39 | action: keep 40 | - source_labels: [__meta_kubernetes_pod_label_app] 41 | regex: {{ template "prometheus.name" $root }} 42 | action: keep 43 | - 
source_labels: [__meta_kubernetes_pod_label_component] 44 | regex: alertmanager 45 | action: keep 46 | - source_labels: [__meta_kubernetes_pod_container_port_number] 47 | regex: 48 | action: drop 49 | {{- end -}} 50 | {{- end -}} 51 | {{- end -}} 52 | {{- end -}} 53 | -------------------------------------------------------------------------------- /helm/prometheus-chart/templates/prometheus-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: extensions/v1beta1 2 | kind: Deployment 3 | metadata: 4 | {{- if .Values.server.deploymentAnnotations }} 5 | annotations: 6 | {{ toYaml .Values.server.deploymentAnnotations | indent 4 }} 7 | {{- end }} 8 | labels: 9 | app: {{ template "prometheus.name" . }} 10 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 11 | component: "{{ .Values.server.name }}" 12 | heritage: {{ .Release.Service }} 13 | release: {{ .Release.Name }} 14 | name: {{ template "prometheus.server.fullname" . }} 15 | spec: 16 | replicas: {{ .Values.server.replicaCount }} 17 | {{- if .Values.server.strategy }} 18 | strategy: 19 | {{ toYaml .Values.server.strategy | indent 4 }} 20 | {{- end }} 21 | template: 22 | metadata: 23 | {{- if .Values.server.podAnnotations }} 24 | annotations: 25 | {{ toYaml .Values.server.podAnnotations | indent 8 }} 26 | {{- end }} 27 | labels: 28 | app: {{ template "prometheus.name" . 
}} 29 | component: "{{ .Values.server.name }}" 30 | release: {{ .Release.Name }} 31 | spec: 32 | {{- if .Values.server.affinity }} 33 | affinity: 34 | {{ toYaml .Values.server.affinity | indent 8 }} 35 | {{- end }} 36 | {{- if .Values.server.priorityClassName }} 37 | priorityClassName: "{{ .Values.server.priorityClassName }}" 38 | {{- end }} 39 | {{- if .Values.server.schedulerName }} 40 | schedulerName: "{{ .Values.server.schedulerName }}" 41 | {{- end }} 42 | serviceAccountName: prometheus 43 | {{- if .Values.initChownData.enabled }} 44 | initContainers: 45 | - name: "{{ .Values.initChownData.name }}" 46 | image: "{{ .Values.initChownData.image.repository }}:{{ .Values.initChownData.image.tag }}" 47 | imagePullPolicy: "{{ .Values.initChownData.image.pullPolicy }}" 48 | resources: 49 | {{ toYaml .Values.initChownData.resources | indent 12 }} 50 | # 65534 is the nobody user that prometheus uses. 51 | command: ["chown", "-R", "65534:65534", "{{ .Values.server.persistentVolume.mountPath }}"] 52 | volumeMounts: 53 | - name: storage-volume 54 | mountPath: {{ .Values.server.persistentVolume.mountPath }} 55 | subPath: "{{ .Values.server.persistentVolume.subPath }}" 56 | {{- end }} 57 | containers: 58 | - name: {{ template "prometheus.name" . }}-{{ .Values.server.name }}-{{ .Values.configmapReload.name }} 59 | image: "{{ .Values.configmapReload.image.repository }}:{{ .Values.configmapReload.image.tag }}" 60 | imagePullPolicy: "{{ .Values.configmapReload.image.pullPolicy }}" 61 | args: 62 | - --volume-dir=/etc/config 63 | - --webhook-url=http://127.0.0.1:9090{{ .Values.server.prefixURL }}/-/reload 64 | {{- range $key, $value := .Values.configmapReload.extraArgs }} 65 | - --{{ $key }}={{ $value }} 66 | {{- end }} 67 | {{- range .Values.configmapReload.extraVolumeDirs }} 68 | - --volume-dir={{ . 
}} 69 | {{- end }} 70 | resources: 71 | {{ toYaml .Values.configmapReload.resources | indent 12 }} 72 | volumeMounts: 73 | - name: config-volume 74 | mountPath: /etc/config 75 | readOnly: true 76 | {{- range .Values.configmapReload.extraConfigmapMounts }} 77 | - name: {{ $.Values.configmapReload.name }}-{{ .name }} 78 | mountPath: {{ .mountPath }} 79 | subPath: {{ .subPath }} 80 | readOnly: {{ .readOnly }} 81 | {{- end }} 82 | 83 | - name: {{ template "prometheus.name" . }}-{{ .Values.server.name }} 84 | image: "{{ .Values.server.image.repository }}:{{ .Values.server.image.tag }}" 85 | imagePullPolicy: "{{ .Values.server.image.pullPolicy }}" 86 | args: 87 | {{- if .Values.server.retention }} 88 | - --storage.tsdb.retention={{ .Values.server.retention }} 89 | {{- end }} 90 | - --config.file=/etc/config/prometheus.yml 91 | - --storage.tsdb.path={{ .Values.server.persistentVolume.mountPath }} 92 | - --web.console.libraries=/etc/prometheus/console_libraries 93 | - --web.console.templates=/etc/prometheus/consoles 94 | - --web.enable-lifecycle 95 | {{- range $key, $value := .Values.server.extraArgs }} 96 | - --{{ $key }}={{ $value }} 97 | {{- end }} 98 | {{- if .Values.server.baseURL }} 99 | - --web.external-url={{ .Values.server.baseURL }} 100 | {{- end }} 101 | {{- if .Values.server.enableAdminApi }} 102 | - --web.enable-admin-api 103 | {{- end }} 104 | ports: 105 | - containerPort: 9090 106 | readinessProbe: 107 | httpGet: 108 | path: {{ .Values.server.prefixURL }}/-/ready 109 | port: 9090 110 | initialDelaySeconds: 30 111 | timeoutSeconds: 30 112 | livenessProbe: 113 | httpGet: 114 | path: {{ .Values.server.prefixURL }}/-/healthy 115 | port: 9090 116 | initialDelaySeconds: 30 117 | timeoutSeconds: 30 118 | resources: 119 | {{ toYaml .Values.server.resources | indent 12 }} 120 | volumeMounts: 121 | - name: rules-volume 122 | mountPath: /etc/prometheus-rules 123 | - name: config-volume 124 | mountPath: /etc/config 125 | - name: storage-volume 126 | mountPath: {{ 
.Values.server.persistentVolume.mountPath }} 127 | subPath: "{{ .Values.server.persistentVolume.subPath }}" 128 | {{- range .Values.server.extraHostPathMounts }} 129 | - name: {{ .name }} 130 | mountPath: {{ .mountPath }} 131 | subPath: {{ .subPath }} 132 | readOnly: {{ .readOnly }} 133 | {{- end }} 134 | {{- range .Values.server.extraConfigmapMounts }} 135 | - name: {{ $.Values.server.name }}-{{ .name }} 136 | mountPath: {{ .mountPath }} 137 | subPath: {{ .subPath }} 138 | readOnly: {{ .readOnly }} 139 | {{- end }} 140 | {{- range .Values.server.extraSecretMounts }} 141 | - name: {{ .name }} 142 | mountPath: {{ .mountPath }} 143 | subPath: {{ .subPath }} 144 | readOnly: {{ .readOnly }} 145 | {{- end }} 146 | {{- if .Values.imagePullSecrets }} 147 | imagePullSecrets: 148 | {{ toYaml .Values.imagePullSecrets | indent 2 }} 149 | {{- end }} 150 | {{- if .Values.server.nodeSelector }} 151 | nodeSelector: 152 | {{ toYaml .Values.server.nodeSelector | indent 8 }} 153 | {{- end }} 154 | {{- if .Values.server.securityContext }} 155 | securityContext: 156 | {{ toYaml .Values.server.securityContext | indent 8 }} 157 | {{- end }} 158 | {{- if .Values.server.tolerations }} 159 | tolerations: 160 | {{ toYaml .Values.server.tolerations | indent 8 }} 161 | {{- end }} 162 | {{- if .Values.server.affinity }} 163 | affinity: 164 | {{ toYaml .Values.server.affinity | indent 8 }} 165 | {{- end }} 166 | terminationGracePeriodSeconds: {{ .Values.server.terminationGracePeriodSeconds }} 167 | volumes: 168 | - name: rules-volume 169 | configMap: 170 | name: prometheus-rules 171 | - name: config-volume 172 | configMap: 173 | name: {{ if .Values.server.configMapOverrideName }}{{ .Release.Name }}-{{ .Values.server.configMapOverrideName }}{{- else }}{{ template "prometheus.server.fullname" . 
}}{{- end }} 174 | - name: storage-volume 175 | {{- if .Values.server.persistentVolume.enabled }} 176 | persistentVolumeClaim: 177 | claimName: {{ if .Values.server.persistentVolume.existingClaim }}{{ .Values.server.persistentVolume.existingClaim }}{{- else }}{{ template "prometheus.server.fullname" . }}{{- end }} 178 | {{- else }} 179 | emptyDir: {} 180 | {{- end -}} 181 | {{- range .Values.server.extraHostPathMounts }} 182 | - name: {{ .name }} 183 | hostPath: 184 | path: {{ .hostPath }} 185 | {{- end }} 186 | {{- range .Values.configmapReload.extraConfigmapMounts }} 187 | - name: {{ $.Values.configmapReload.name }}-{{ .name }} 188 | configMap: 189 | name: {{ .configMap }} 190 | {{- end }} 191 | {{- range .Values.server.extraConfigmapMounts }} 192 | - name: {{ $.Values.server.name }}-{{ .name }} 193 | configMap: 194 | name: {{ .configMap }} 195 | {{- end }} 196 | {{- range .Values.server.extraSecretMounts }} 197 | - name: {{ .name }} 198 | secret: 199 | secretName: {{ .secretName }} 200 | {{- end }} 201 | {{- range .Values.configmapReload.extraConfigmapMounts }} 202 | - name: {{ .name }} 203 | configMap: 204 | name: {{ .configMap }} 205 | {{- end }} 206 | -------------------------------------------------------------------------------- /helm/prometheus-chart/templates/prometheus-ingress.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.server.ingress.enabled -}} 2 | {{- $releaseName := .Release.Name -}} 3 | {{- $serviceName := include "prometheus.server.fullname" . }} 4 | {{- $servicePort := .Values.server.service.servicePort -}} 5 | apiVersion: extensions/v1beta1 6 | kind: Ingress 7 | metadata: 8 | {{- if .Values.server.ingress.annotations }} 9 | annotations: 10 | {{ toYaml .Values.server.ingress.annotations | indent 4 }} 11 | {{- end }} 12 | labels: 13 | app: {{ template "prometheus.name" . 
}} 14 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 15 | component: "{{ .Values.server.name }}" 16 | heritage: {{ .Release.Service }} 17 | release: {{ .Release.Name }} 18 | {{- range $key, $value := .Values.server.ingress.extraLabels }} 19 | {{ $key }}: {{ $value }} 20 | {{- end }} 21 | name: {{ template "prometheus.server.fullname" . }} 22 | spec: 23 | rules: 24 | {{- range .Values.server.ingress.hosts }} 25 | {{- $url := splitList "/" . }} 26 | - host: {{ first $url }} 27 | http: 28 | paths: 29 | - path: /{{ rest $url | join "/" }} 30 | backend: 31 | serviceName: {{ $serviceName }} 32 | servicePort: {{ $servicePort }} 33 | {{- end -}} 34 | {{- if .Values.server.ingress.tls }} 35 | tls: 36 | {{ toYaml .Values.server.ingress.tls | indent 4 }} 37 | {{- end -}} 38 | {{- end -}} 39 | -------------------------------------------------------------------------------- /helm/prometheus-chart/templates/prometheus-networkpolicy.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.networkPolicy.enabled }} 2 | apiVersion: {{ template "prometheus.networkPolicy.apiVersion" . }} 3 | kind: NetworkPolicy 4 | metadata: 5 | name: {{ template "prometheus.server.fullname" . }} 6 | labels: 7 | app: {{ template "prometheus.name" . }} 8 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 9 | component: "{{ .Values.server.name }}" 10 | heritage: {{ .Release.Service }} 11 | release: {{ .Release.Name }} 12 | spec: 13 | podSelector: 14 | matchLabels: 15 | app: {{ template "prometheus.name" . 
}} 16 | component: "{{ .Values.server.name }}" 17 | release: {{ .Release.Name }} 18 | ingress: 19 | - ports: 20 | - port: 9090 21 | {{- end }} 22 | -------------------------------------------------------------------------------- /helm/prometheus-chart/templates/prometheus-pvc.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.server.persistentVolume.enabled -}} 2 | {{- if not .Values.server.persistentVolume.existingClaim -}} 3 | apiVersion: v1 4 | kind: PersistentVolumeClaim 5 | metadata: 6 | {{- if .Values.server.persistentVolume.annotations }} 7 | annotations: 8 | {{ toYaml .Values.server.persistentVolume.annotations | indent 4 }} 9 | {{- end }} 10 | labels: 11 | app: {{ template "prometheus.name" . }} 12 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 13 | component: "{{ .Values.server.name }}" 14 | heritage: {{ .Release.Service }} 15 | release: {{ .Release.Name }} 16 | name: {{ template "prometheus.server.fullname" . }} 17 | spec: 18 | accessModes: 19 | {{ toYaml .Values.server.persistentVolume.accessModes | indent 4 }} 20 | {{- if .Values.server.persistentVolume.storageClass }} 21 | {{- if (eq "-" .Values.server.persistentVolume.storageClass) }} 22 | storageClassName: "" 23 | {{- else }} 24 | storageClassName: "{{ .Values.server.persistentVolume.storageClass }}" 25 | {{- end }} 26 | {{- end }} 27 | resources: 28 | requests: 29 | storage: "{{ .Values.server.persistentVolume.size }}" 30 | {{- end -}} 31 | {{- end -}} 32 | -------------------------------------------------------------------------------- /helm/prometheus-chart/templates/prometheus-rules.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: prometheus-rules 5 | labels: 6 | app: prometheus 7 | data: 8 | {{ (.Files.Glob "prometheus-alerts/*").AsConfig | indent 2 }} 9 | -------------------------------------------------------------------------------- 
/helm/prometheus-chart/templates/prometheus-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | {{- if .Values.server.service.annotations }} 5 | annotations: 6 | {{ toYaml .Values.server.service.annotations | indent 4 }} 7 | {{- end }} 8 | labels: 9 | app: {{ template "prometheus.name" . }} 10 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 11 | component: "{{ .Values.server.name }}" 12 | heritage: {{ .Release.Service }} 13 | release: {{ .Release.Name }} 14 | {{- if .Values.server.service.labels }} 15 | {{ toYaml .Values.server.service.labels | indent 4 }} 16 | {{- end }} 17 | name: {{ template "prometheus.server.fullname" . }} 18 | spec: 19 | {{- if .Values.server.service.clusterIP }} 20 | clusterIP: {{ .Values.server.service.clusterIP }} 21 | {{- end }} 22 | {{- if .Values.server.service.externalIPs }} 23 | externalIPs: 24 | {{ toYaml .Values.server.service.externalIPs | indent 4 }} 25 | {{- end }} 26 | {{- if .Values.server.service.loadBalancerIP }} 27 | loadBalancerIP: {{ .Values.server.service.loadBalancerIP }} 28 | {{- end }} 29 | {{- if .Values.server.service.loadBalancerSourceRanges }} 30 | loadBalancerSourceRanges: 31 | {{- range $cidr := .Values.server.service.loadBalancerSourceRanges }} 32 | - {{ $cidr }} 33 | {{- end }} 34 | {{- end }} 35 | ports: 36 | - name: http 37 | port: {{ .Values.server.service.servicePort }} 38 | protocol: TCP 39 | targetPort: 9090 40 | {{- if .Values.server.service.nodePort }} 41 | nodePort: {{ .Values.server.service.nodePort }} 42 | {{- end }} 43 | selector: 44 | app: {{ template "prometheus.name" . 
}} 45 | component: "{{ .Values.server.name }}" 46 | release: {{ .Release.Name }} 47 | type: "{{ .Values.server.service.type }}" 48 | -------------------------------------------------------------------------------- /helm/prometheus-chart/templates/prometheus-serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | labels: 5 | app: {{ template "prometheus.name" . }} 6 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 7 | component: "{{ .Values.server.name }}" 8 | heritage: {{ .Release.Service }} 9 | release: {{ .Release.Name }} 10 | name: prometheus 11 | -------------------------------------------------------------------------------- /manifests/0-namespace.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: monitoring 5 | -------------------------------------------------------------------------------- /manifests/01-rbac.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRoleBinding 4 | metadata: 5 | name: prometheus 6 | roleRef: 7 | apiGroup: rbac.authorization.k8s.io 8 | kind: ClusterRole 9 | name: prometheus 10 | subjects: 11 | - kind: ServiceAccount 12 | name: prometheus-k8s 13 | namespace: monitoring 14 | --- 15 | apiVersion: rbac.authorization.k8s.io/v1 16 | kind: ClusterRole 17 | metadata: 18 | name: prometheus 19 | rules: 20 | - apiGroups: [""] 21 | resources: 22 | - nodes 23 | - nodes/proxy 24 | - services 25 | - endpoints 26 | - pods 27 | verbs: ["get", "list", "watch"] 28 | - apiGroups: [""] 29 | resources: 30 | - configmaps 31 | verbs: ["get"] 32 | - nonResourceURLs: ["/metrics"] 33 | verbs: ["get"] 34 | --- 35 | apiVersion: v1 36 | kind: ServiceAccount 37 | metadata: 38 | name: prometheus-k8s 39 | namespace: monitoring 40 | 
-------------------------------------------------------------------------------- /manifests/alertmanager/alertmanager-templates.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | default.tmpl: | 4 | {{ define "__alertmanager" }}AlertManager{{ end }} 5 | {{ define "__alertmanagerURL" }}{{ .ExternalURL }}/#/alerts?receiver={{ .Receiver }}{{ end }} 6 | 7 | {{ define "__subject" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}{{ end }} 8 | {{ define "__description" }}{{ end }} 9 | 10 | {{ define "__text_alert_list" }}{{ range . }}Labels: 11 | {{ range .Labels.SortedPairs }} - {{ .Name }} = {{ .Value }} 12 | {{ end }}Annotations: 13 | {{ range .Annotations.SortedPairs }} - {{ .Name }} = {{ .Value }} 14 | {{ end }}Source: {{ .GeneratorURL }} 15 | {{ end }}{{ end }} 16 | 17 | 18 | {{ define "slack.default.title" }}{{ template "__subject" . }}{{ end }} 19 | {{ define "slack.default.username" }}{{ template "__alertmanager" . }}{{ end }} 20 | {{ define "slack.default.fallback" }}{{ template "slack.default.title" . }} | {{ template "slack.default.titlelink" . }}{{ end }} 21 | {{ define "slack.default.pretext" }}{{ end }} 22 | {{ define "slack.default.titlelink" }}{{ template "__alertmanagerURL" . }}{{ end }} 23 | {{ define "slack.default.iconemoji" }}{{ end }} 24 | {{ define "slack.default.iconurl" }}{{ end }} 25 | {{ define "slack.default.text" }}{{ end }} 26 | 27 | 28 | {{ define "hipchat.default.from" }}{{ template "__alertmanager" . }}{{ end }} 29 | {{ define "hipchat.default.message" }}{{ template "__subject" . }}{{ end }} 30 | 31 | 32 | {{ define "pagerduty.default.description" }}{{ template "__subject" . 
}}{{ end }} 33 | {{ define "pagerduty.default.client" }}{{ template "__alertmanager" . }}{{ end }} 34 | {{ define "pagerduty.default.clientURL" }}{{ template "__alertmanagerURL" . }}{{ end }} 35 | {{ define "pagerduty.default.instances" }}{{ template "__text_alert_list" . }}{{ end }} 36 | 37 | 38 | {{ define "opsgenie.default.message" }}{{ template "__subject" . }}{{ end }} 39 | {{ define "opsgenie.default.description" }}{{ .CommonAnnotations.SortedPairs.Values | join " " }} 40 | {{ if gt (len .Alerts.Firing) 0 -}} 41 | Alerts Firing: 42 | {{ template "__text_alert_list" .Alerts.Firing }} 43 | {{- end }} 44 | {{ if gt (len .Alerts.Resolved) 0 -}} 45 | Alerts Resolved: 46 | {{ template "__text_alert_list" .Alerts.Resolved }} 47 | {{- end }} 48 | {{- end }} 49 | {{ define "opsgenie.default.source" }}{{ template "__alertmanagerURL" . }}{{ end }} 50 | 51 | 52 | {{ define "victorops.default.message" }}{{ template "__subject" . }} | {{ template "__alertmanagerURL" . }}{{ end }} 53 | {{ define "victorops.default.from" }}{{ template "__alertmanager" . }}{{ end }} 54 | 55 | 56 | {{ define "email.default.subject" }}{{ template "__subject" . }}{{ end }} 57 | {{ define "email.default.html" }} 58 | 59 | 85 | 86 | 87 | 88 | 89 | {{ template "__subject" . }} 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 175 | 176 | 177 |
99 |
100 | 101 | 102 | 107 | 108 | 109 | 164 | 165 |
103 | {{ .Alerts | len }} alert{{ if gt (len .Alerts) 1 }}s{{ end }} for {{ range .GroupLabels.SortedPairs }} 104 | {{ .Name }}={{ .Value }} 105 | {{ end }} 106 |
110 | 111 | 112 | 115 | 116 | {{ if gt (len .Alerts.Firing) 0 }} 117 | 118 | 121 | 122 | {{ end }} 123 | {{ range .Alerts.Firing }} 124 | 125 | 132 | 133 | {{ end }} 134 | 135 | {{ if gt (len .Alerts.Resolved) 0 }} 136 | {{ if gt (len .Alerts.Firing) 0 }} 137 | 138 | 143 | 144 | {{ end }} 145 | 146 | 149 | 150 | {{ end }} 151 | {{ range .Alerts.Resolved }} 152 | 153 | 160 | 161 | {{ end }} 162 |
113 | View in {{ template "__alertmanager" . }} 114 |
119 | [{{ .Alerts.Firing | len }}] Firing 120 |
126 | Labels
127 | {{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} 128 | {{ if gt (len .Annotations) 0 }}Annotations
{{ end }} 129 | {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} 130 | Source
131 |
139 |
140 |
141 |
142 |
147 | [{{ .Alerts.Resolved | len }}] Resolved 148 |
154 | Labels
155 | {{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} 156 | {{ if gt (len .Annotations) 0 }}Annotations
{{ end }} 157 | {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} 158 | Source
159 |
163 |
166 | 167 |
168 | 169 | 170 | 171 | 172 |
Sent by {{ template "__alertmanager" . }}
173 |
174 |
178 | 179 | 180 | 181 | 182 | {{ end }} 183 | 184 | {{ define "pushover.default.title" }}{{ template "__subject" . }}{{ end }} 185 | {{ define "pushover.default.message" }}{{ .CommonAnnotations.SortedPairs.Values | join " " }} 186 | {{ if gt (len .Alerts.Firing) 0 }} 187 | Alerts Firing: 188 | {{ template "__text_alert_list" .Alerts.Firing }} 189 | {{ end }} 190 | {{ if gt (len .Alerts.Resolved) 0 }} 191 | Alerts Resolved: 192 | {{ template "__text_alert_list" .Alerts.Resolved }} 193 | {{ end }} 194 | {{ end }} 195 | {{ define "pushover.default.url" }}{{ template "__alertmanagerURL" . }}{{ end }} 196 | slack.tmpl: | 197 | {{ define "slack.devops.text" }} 198 | {{range .Alerts}}{{.Annotations.DESCRIPTION}} 199 | {{end}} 200 | {{ end }} 201 | kind: ConfigMap 202 | metadata: 203 | creationTimestamp: null 204 | name: alertmanager-templates 205 | namespace: monitoring 206 | -------------------------------------------------------------------------------- /manifests/alertmanager/configmap.yaml: -------------------------------------------------------------------------------- 1 | kind: ConfigMap 2 | apiVersion: v1 3 | metadata: 4 | name: alertmanager 5 | namespace: monitoring 6 | data: 7 | config.yml: |- 8 | global: 9 | # ResolveTimeout is the time after which an alert is declared resolved 10 | # if it has not been updated. 11 | resolve_timeout: 5m 12 | 13 | # The smarthost and SMTP sender used for mail notifications. 14 | smtp_smarthost: 'smtp.gmail.com:587' 15 | smtp_from: 'foo@bar.com' 16 | smtp_auth_username: 'foo@bar.com' 17 | smtp_auth_password: 'barfoo' 18 | 19 | # The API URL to use for Slack notifications. 20 | slack_api_url: 'https://hooks.slack.com/services/some/api/token' 21 | 22 | # # The directory from which notification templates are read. 23 | templates: 24 | - '/etc/alertmanager-templates/*.tmpl' 25 | 26 | # The root route on which each incoming alert enters. 27 | route: 28 | 29 | # The labels by which incoming alerts are grouped together. 
For example, 30 | # multiple alerts coming in for cluster=A and alertname=LatencyHigh would 31 | # be batched into a single group. 32 | 33 | group_by: ['alertname', 'cluster', 'service'] 34 | 35 | # When a new group of alerts is created by an incoming alert, wait at 36 | # least 'group_wait' to send the initial notification. 37 | # This way ensures that you get multiple alerts for the same group that start 38 | # firing shortly after another are batched together on the first 39 | # notification. 40 | 41 | group_wait: 30s 42 | 43 | # When the first notification was sent, wait 'group_interval' to send a batch 44 | # of new alerts that started firing for that group. 45 | 46 | group_interval: 5m 47 | 48 | # If an alert has successfully been sent, wait 'repeat_interval' to 49 | # resend them. 50 | 51 | #repeat_interval: 1m 52 | repeat_interval: 15m 53 | 54 | # A default receiver 55 | 56 | # If an alert isn't caught by a route, send it to default. 57 | receiver: default 58 | 59 | # All the above attributes are inherited by all child routes and can 60 | # overwritten on each. 61 | 62 | # The child route trees. 63 | routes: 64 | # Send severity=slack alerts to slack. 65 | - match: 66 | severity: slack 67 | receiver: slack_alert 68 | # - match: 69 | # severity: email 70 | # receiver: email_alert 71 | 72 | receivers: 73 | - name: 'default' 74 | slack_configs: 75 | - channel: '#alertmanager-test' 76 | text: '{{ template "slack.devops.text" . 
}}' 77 | send_resolved: true 78 | 79 | - name: 'slack_alert' 80 | slack_configs: 81 | - channel: '#alertmanager-test' 82 | send_resolved: true 83 | -------------------------------------------------------------------------------- /manifests/alertmanager/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: alertmanager 5 | namespace: monitoring 6 | spec: 7 | replicas: 1 8 | selector: 9 | matchLabels: 10 | app: alertmanager 11 | template: 12 | metadata: 13 | name: alertmanager 14 | labels: 15 | app: alertmanager 16 | spec: 17 | containers: 18 | - name: alertmanager 19 | image: quay.io/prometheus/alertmanager:v0.7.1 20 | args: 21 | - '-config.file=/etc/alertmanager/config.yml' 22 | - '-storage.path=/alertmanager' 23 | ports: 24 | - name: alertmanager 25 | containerPort: 9093 26 | volumeMounts: 27 | - name: config-volume 28 | mountPath: /etc/alertmanager 29 | - name: templates-volume 30 | mountPath: /etc/alertmanager-templates 31 | - name: alertmanager 32 | mountPath: /alertmanager 33 | volumes: 34 | - name: config-volume 35 | configMap: 36 | name: alertmanager 37 | - name: templates-volume 38 | configMap: 39 | name: alertmanager-templates 40 | - name: alertmanager 41 | emptyDir: {} 42 | -------------------------------------------------------------------------------- /manifests/alertmanager/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | annotations: 5 | prometheus.io/scrape: 'true' 6 | prometheus.io/path: '/metrics' 7 | labels: 8 | name: alertmanager 9 | name: alertmanager 10 | namespace: monitoring 11 | spec: 12 | selector: 13 | app: alertmanager 14 | type: NodePort 15 | ports: 16 | - name: alertmanager 17 | protocol: TCP 18 | port: 9093 19 | targetPort: 9093 20 | -------------------------------------------------------------------------------- 
/manifests/grafana/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: grafana-core 5 | namespace: monitoring 6 | labels: 7 | app: grafana 8 | component: core 9 | spec: 10 | replicas: 1 11 | selector: 12 | matchLabels: 13 | app: grafana 14 | template: 15 | metadata: 16 | labels: 17 | app: grafana 18 | component: core 19 | spec: 20 | containers: 21 | - image: grafana/grafana:4.2.0 22 | name: grafana-core 23 | imagePullPolicy: IfNotPresent 24 | # env: 25 | resources: 26 | # keep request = limit to keep this container in guaranteed class 27 | limits: 28 | cpu: 100m 29 | memory: 100Mi 30 | requests: 31 | cpu: 100m 32 | memory: 100Mi 33 | env: 34 | # The following env variables set up basic auth twith the default admin user and admin password. 35 | - name: GF_AUTH_BASIC_ENABLED 36 | value: "true" 37 | - name: GF_SECURITY_ADMIN_USER 38 | valueFrom: 39 | secretKeyRef: 40 | name: grafana 41 | key: admin-username 42 | - name: GF_SECURITY_ADMIN_PASSWORD 43 | valueFrom: 44 | secretKeyRef: 45 | name: grafana 46 | key: admin-password 47 | - name: GF_AUTH_ANONYMOUS_ENABLED 48 | value: "false" 49 | # - name: GF_AUTH_ANONYMOUS_ORG_ROLE 50 | # value: Admin 51 | # does not really work, because of template variables in exported dashboards: 52 | # - name: GF_DASHBOARDS_JSON_ENABLED 53 | # value: "true" 54 | readinessProbe: 55 | httpGet: 56 | path: /login 57 | port: 3000 58 | # initialDelaySeconds: 30 59 | # timeoutSeconds: 1 60 | volumeMounts: 61 | - name: grafana-persistent-storage 62 | mountPath: /var/lib/grafana 63 | volumes: 64 | - name: grafana-persistent-storage 65 | emptyDir: {} 66 | -------------------------------------------------------------------------------- /manifests/grafana/import-dashboards/job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: 
grafana-import-dashboards 5 | namespace: monitoring 6 | labels: 7 | app: grafana 8 | component: import-dashboards 9 | spec: 10 | template: 11 | metadata: 12 | name: grafana-import-dashboards 13 | labels: 14 | app: grafana 15 | component: import-dashboards 16 | spec: 17 | serviceAccountName: prometheus-k8s 18 | initContainers: 19 | - name: wait-for-grafana 20 | image: giantswarm/tiny-tools 21 | args: 22 | - /bin/sh 23 | - -c 24 | - > 25 | set -x; 26 | while [ $(curl -Lsw '%{http_code}' "http://grafana:3000" -o /dev/null) -ne 200 ]; do 27 | echo '.' 28 | sleep 15; 29 | done 30 | containers: 31 | - name: grafana-import-dashboards 32 | image: giantswarm/tiny-tools 33 | command: ["/bin/sh", "-c"] 34 | workingDir: /opt/grafana-import-dashboards 35 | args: 36 | - > 37 | for file in *-datasource.json ; do 38 | if [ -e "$file" ] ; then 39 | echo "importing $file" && 40 | curl --silent --fail --show-error \ 41 | --request POST http://${GF_ADMIN_USER}:${GF_ADMIN_PASSWORD}@grafana:3000/api/datasources \ 42 | --header "Content-Type: application/json" \ 43 | --data-binary "@$file" ; 44 | echo "" ; 45 | fi 46 | done ; 47 | for file in *-dashboard.json ; do 48 | if [ -e "$file" ] ; then 49 | echo "importing $file" && 50 | ( echo '{"dashboard":'; \ 51 | cat "$file"; \ 52 | echo ',"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]}' ) \ 53 | | jq -c '.' 
\ 54 | | curl --silent --fail --show-error \ 55 | --request POST http://${GF_ADMIN_USER}:${GF_ADMIN_PASSWORD}@grafana:3000/api/dashboards/import \ 56 | --header "Content-Type: application/json" \ 57 | --data-binary "@-" ; 58 | echo "" ; 59 | fi 60 | done 61 | 62 | env: 63 | - name: GF_ADMIN_USER 64 | valueFrom: 65 | secretKeyRef: 66 | name: grafana 67 | key: admin-username 68 | - name: GF_ADMIN_PASSWORD 69 | valueFrom: 70 | secretKeyRef: 71 | name: grafana 72 | key: admin-password 73 | volumeMounts: 74 | - name: config-volume 75 | mountPath: /opt/grafana-import-dashboards 76 | restartPolicy: Never 77 | volumes: 78 | - name: config-volume 79 | configMap: 80 | name: grafana-import-dashboards 81 | -------------------------------------------------------------------------------- /manifests/grafana/ingress.yaml: -------------------------------------------------------------------------------- 1 | # apiVersion: apps/v1 2 | # kind: Ingress 3 | # metadata: 4 | # name: grafana 5 | # namespace: monitoring 6 | # spec: 7 | # rules: 8 | # - host: ..k8s.gigantic.io 9 | # http: 10 | # paths: 11 | # - path: / 12 | # backend: 13 | # serviceName: grafana 14 | # servicePort: 3000 15 | -------------------------------------------------------------------------------- /manifests/grafana/secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Secret 3 | data: 4 | admin-password: YWRtaW4= 5 | admin-username: YWRtaW4= 6 | metadata: 7 | name: grafana 8 | namespace: monitoring 9 | type: Opaque 10 | -------------------------------------------------------------------------------- /manifests/grafana/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: grafana 5 | namespace: monitoring 6 | labels: 7 | app: grafana 8 | component: core 9 | spec: 10 | type: NodePort 11 | ports: 12 | - port: 3000 13 | selector: 14 | app: grafana 15 | 
component: core 16 | -------------------------------------------------------------------------------- /manifests/prometheus/configmap.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | prometheus.yaml: | 4 | global: 5 | scrape_interval: 10s 6 | scrape_timeout: 10s 7 | evaluation_interval: 10s 8 | rule_files: 9 | - "/etc/prometheus-rules/*.rules" 10 | scrape_configs: 11 | 12 | # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L37 13 | - job_name: 'kubernetes-nodes' 14 | tls_config: 15 | ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt 16 | bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token 17 | kubernetes_sd_configs: 18 | - role: node 19 | relabel_configs: 20 | - source_labels: [__address__] 21 | regex: '(.*):10250' 22 | replacement: '${1}:10255' 23 | target_label: __address__ 24 | 25 | # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L79 26 | - job_name: 'kubernetes-endpoints' 27 | kubernetes_sd_configs: 28 | - role: endpoints 29 | relabel_configs: 30 | - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] 31 | action: keep 32 | regex: true 33 | - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] 34 | action: replace 35 | target_label: __scheme__ 36 | regex: (https?) 
37 | - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] 38 | action: replace 39 | target_label: __metrics_path__ 40 | regex: (.+) 41 | - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] 42 | action: replace 43 | target_label: __address__ 44 | regex: (.+)(?::\d+);(\d+) 45 | replacement: $1:$2 46 | - action: labelmap 47 | regex: __meta_kubernetes_service_label_(.+) 48 | - source_labels: [__meta_kubernetes_namespace] 49 | action: replace 50 | target_label: kubernetes_namespace 51 | - source_labels: [__meta_kubernetes_service_name] 52 | action: replace 53 | target_label: kubernetes_name 54 | 55 | # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L119 56 | - job_name: 'kubernetes-services' 57 | metrics_path: /probe 58 | params: 59 | module: [http_2xx] 60 | kubernetes_sd_configs: 61 | - role: service 62 | relabel_configs: 63 | - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe] 64 | action: keep 65 | regex: true 66 | - source_labels: [__address__] 67 | target_label: __param_target 68 | - target_label: __address__ 69 | replacement: blackbox 70 | - source_labels: [__param_target] 71 | target_label: instance 72 | - action: labelmap 73 | regex: __meta_kubernetes_service_label_(.+) 74 | - source_labels: [__meta_kubernetes_namespace] 75 | target_label: kubernetes_namespace 76 | - source_labels: [__meta_kubernetes_service_name] 77 | target_label: kubernetes_name 78 | 79 | # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L156 80 | - job_name: 'kubernetes-pods' 81 | kubernetes_sd_configs: 82 | - role: pod 83 | relabel_configs: 84 | - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] 85 | action: keep 86 | regex: true 87 | - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] 88 | action: replace 89 | target_label: __metrics_path__ 90 | regex: (.+) 91 | 
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] 92 | action: replace 93 | regex: ([^:]+)(?::\d+)?;(\d+) 94 | replacement: ${1}:${2} 95 | target_label: __address__ 96 | - action: labelmap 97 | regex: __meta_kubernetes_pod_label_(.+) 98 | - source_labels: [__meta_kubernetes_namespace] 99 | action: replace 100 | target_label: kubernetes_namespace 101 | - source_labels: [__meta_kubernetes_pod_name] 102 | action: replace 103 | target_label: kubernetes_pod_name 104 | - source_labels: [__meta_kubernetes_pod_container_port_number] 105 | action: keep 106 | regex: 9\d{3} 107 | 108 | - job_name: 'kubernetes-cadvisor' 109 | scheme: https 110 | tls_config: 111 | ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt 112 | bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token 113 | kubernetes_sd_configs: 114 | - role: node 115 | relabel_configs: 116 | - action: labelmap 117 | regex: __meta_kubernetes_node_label_(.+) 118 | - target_label: __address__ 119 | replacement: kubernetes.default.svc:443 120 | - source_labels: [__meta_kubernetes_node_name] 121 | regex: (.+) 122 | target_label: __metrics_path__ 123 | replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor 124 | kind: ConfigMap 125 | metadata: 126 | creationTimestamp: null 127 | name: prometheus-core 128 | namespace: monitoring 129 | -------------------------------------------------------------------------------- /manifests/prometheus/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: prometheus-core 5 | namespace: monitoring 6 | labels: 7 | app: prometheus 8 | component: core 9 | spec: 10 | replicas: 1 11 | selector: 12 | matchLabels: 13 | app: prometheus 14 | template: 15 | metadata: 16 | name: prometheus-main 17 | labels: 18 | app: prometheus 19 | component: core 20 | spec: 21 | serviceAccountName: prometheus-k8s 22 | containers: 23 | - name: prometheus
24 | image: prom/prometheus:v1.7.0 25 | args: 26 | - '-storage.local.retention=12h' 27 | - '-storage.local.memory-chunks=500000' 28 | - '-config.file=/etc/prometheus/prometheus.yaml' 29 | - '-alertmanager.url=http://alertmanager:9093/' 30 | ports: 31 | - name: webui 32 | containerPort: 9090 33 | resources: 34 | requests: 35 | cpu: 500m 36 | memory: 500M 37 | limits: 38 | cpu: 500m 39 | memory: 500M 40 | volumeMounts: 41 | - name: config-volume 42 | mountPath: /etc/prometheus 43 | - name: rules-volume 44 | mountPath: /etc/prometheus-rules 45 | volumes: 46 | - name: config-volume 47 | configMap: 48 | name: prometheus-core 49 | - name: rules-volume 50 | configMap: 51 | name: prometheus-rules 52 | -------------------------------------------------------------------------------- /manifests/prometheus/kube-state-metrics/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: kube-state-metrics 5 | namespace: monitoring 6 | spec: 7 | replicas: 1 8 | selector: 9 | matchLabels: 10 | app: kube-state-metrics 11 | template: 12 | metadata: 13 | labels: 14 | app: kube-state-metrics 15 | spec: 16 | serviceAccountName: kube-state-metrics 17 | containers: 18 | - name: kube-state-metrics 19 | image: gcr.io/google_containers/kube-state-metrics:v0.5.0 20 | ports: 21 | - containerPort: 8080 22 | -------------------------------------------------------------------------------- /manifests/prometheus/kube-state-metrics/rbac.yaml: -------------------------------------------------------------------------------- 1 | # --- 2 | # apiVersion: rbac.authorization.k8s.io/v1 3 | # kind: ClusterRoleBinding 4 | # metadata: 5 | # name: kube-state-metrics 6 | # roleRef: 7 | # apiGroup: rbac.authorization.k8s.io 8 | # kind: ClusterRole 9 | # name: kube-state-metrics 10 | # subjects: 11 | # - kind: ServiceAccount 12 | # name: kube-state-metrics 13 | # namespace: monitoring 14 | # --- 15 | # apiVersion: rbac.authorization.k8s.io/v1 16 | # kind: ClusterRole 17 
| # metadata: 18 | # name: kube-state-metrics 19 | # rules: 20 | # - apiGroups: [""] 21 | # resources: 22 | # - nodes 23 | # - pods 24 | # - services 25 | # - resourcequotas 26 | # - replicationcontrollers 27 | # - limitranges 28 | # verbs: ["list", "watch"] 29 | # - apiGroups: ["extensions"] 30 | # resources: 31 | # - daemonsets 32 | # - deployments 33 | # - replicasets 34 | # verbs: ["list", "watch"] 35 | # --- 36 | apiVersion: v1 37 | kind: ServiceAccount 38 | metadata: 39 | name: kube-state-metrics 40 | namespace: monitoring 41 | -------------------------------------------------------------------------------- /manifests/prometheus/kube-state-metrics/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | annotations: 5 | prometheus.io/scrape: 'true' 6 | name: kube-state-metrics 7 | namespace: monitoring 8 | labels: 9 | app: kube-state-metrics 10 | spec: 11 | ports: 12 | - name: kube-state-metrics 13 | port: 8080 14 | protocol: TCP 15 | selector: 16 | app: kube-state-metrics 17 | 18 | -------------------------------------------------------------------------------- /manifests/prometheus/node-directory-size-metrics/daemonset.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: node-directory-size-metrics 5 | namespace: monitoring 6 | labels: 7 | app: node-directory-size-metrics 8 | annotations: 9 | description: | 10 | This `DaemonSet` provides metrics in Prometheus format about disk usage on the nodes. 11 | The container `read-du` reads in sizes of all directories below /mnt and writes that to `/tmp/metrics`. It only reports directories larger then `100M` for now. 12 | The other container `caddy` just hands out the contents of that file on request via `http` on `/metrics` at port `9102` which are the defaults for Prometheus. 
13 | These are scheduled on every node in the Kubernetes cluster. 14 | To choose directories from the node to check, just mount them on the `read-du` container below `/mnt`. 15 | spec: 16 | selector: 17 | matchLabels: 18 | app: node-directory-size-metrics 19 | template: 20 | metadata: 21 | labels: 22 | app: node-directory-size-metrics 23 | annotations: 24 | prometheus.io/scrape: 'true' 25 | prometheus.io/port: '9102' 26 | description: | 27 | This `Pod` provides metrics in Prometheus format about disk usage on the node. 28 | The container `read-du` reads in sizes of all directories below /mnt and writes that to `/tmp/metrics`. It only reports directories larger then `100M` for now. 29 | The other container `caddy` just hands out the contents of that file on request on `/metrics` at port `9102` which are the defaults for Prometheus. 30 | This `Pod` is scheduled on every node in the Kubernetes cluster. 31 | To choose directories from the node to check just mount them on `read-du` below `/mnt`. 
32 | spec: 33 | containers: 34 | - name: read-du 35 | image: giantswarm/tiny-tools 36 | imagePullPolicy: Always 37 | # FIXME threshold via env var 38 | # The 39 | command: 40 | - fish 41 | - --command 42 | - | 43 | touch /tmp/metrics-temp 44 | while true 45 | for directory in (du --bytes --separate-dirs --threshold=100M /mnt) 46 | echo $directory | read size path 47 | echo "node_directory_size_bytes{path=\"$path\"} $size" \ 48 | >> /tmp/metrics-temp 49 | end 50 | mv /tmp/metrics-temp /tmp/metrics 51 | sleep 300 52 | end 53 | volumeMounts: 54 | - name: host-fs-var 55 | mountPath: /mnt/var 56 | readOnly: true 57 | - name: metrics 58 | mountPath: /tmp 59 | - name: caddy 60 | image: dockermuenster/caddy:0.9.3 61 | command: 62 | - "caddy" 63 | - "-port=9102" 64 | - "-root=/var/www" 65 | ports: 66 | - containerPort: 9102 67 | volumeMounts: 68 | - name: metrics 69 | mountPath: /var/www 70 | volumes: 71 | - name: host-fs-var 72 | hostPath: 73 | path: /var 74 | - name: metrics 75 | emptyDir: 76 | medium: Memory 77 | -------------------------------------------------------------------------------- /manifests/prometheus/node-exporter/daemonset.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: prometheus-node-exporter 5 | namespace: monitoring 6 | labels: 7 | app: prometheus 8 | component: node-exporter 9 | spec: 10 | selector: 11 | matchLabels: 12 | app: prometheus 13 | template: 14 | metadata: 15 | name: prometheus-node-exporter 16 | labels: 17 | app: prometheus 18 | component: node-exporter 19 | spec: 20 | containers: 21 | - image: prom/node-exporter:v0.14.0 22 | name: prometheus-node-exporter 23 | ports: 24 | - name: prom-node-exp 25 | #^ must be an IANA_SVC_NAME (at most 15 characters, ..) 
26 | containerPort: 9100 27 | hostPort: 9100 28 | hostNetwork: true 29 | hostPID: true 30 | -------------------------------------------------------------------------------- /manifests/prometheus/node-exporter/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | annotations: 5 | prometheus.io/scrape: 'true' 6 | name: prometheus-node-exporter 7 | namespace: monitoring 8 | labels: 9 | app: prometheus 10 | component: node-exporter 11 | spec: 12 | clusterIP: None 13 | ports: 14 | - name: prometheus-node-exporter 15 | port: 9100 16 | protocol: TCP 17 | selector: 18 | app: prometheus 19 | component: node-exporter 20 | type: ClusterIP 21 | -------------------------------------------------------------------------------- /manifests/prometheus/prometheus-rules.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | cpu-usage.rules: | 4 | ALERT NodeCPUUsage 5 | IF (100 - (avg by (instance) (irate(node_cpu{name="node-exporter",mode="idle"}[5m])) * 100)) > 75 6 | FOR 2m 7 | LABELS { 8 | severity="page" 9 | } 10 | ANNOTATIONS { 11 | SUMMARY = "{{$labels.instance}}: High CPU usage detected", 12 | DESCRIPTION = "{{$labels.instance}}: CPU usage is above 75% (current value is: {{ $value }})" 13 | } 14 | instance-availability.rules: | 15 | ALERT InstanceDown 16 | IF up == 0 17 | FOR 1m 18 | LABELS { severity = "page" } 19 | ANNOTATIONS { 20 | summary = "Instance {{ $labels.instance }} down", 21 | description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute.", 22 | } 23 | low-disk-space.rules: | 24 | ALERT NodeLowRootDisk 25 | IF ((node_filesystem_size{mountpoint="/root-disk"} - node_filesystem_free{mountpoint="/root-disk"} ) / node_filesystem_size{mountpoint="/root-disk"} * 100) > 75 26 | FOR 2m 27 | LABELS { 28 | severity="page" 29 | } 30 | ANNOTATIONS { 31 | SUMMARY = 
"{{$labels.instance}}: Low root disk space", 32 | DESCRIPTION = "{{$labels.instance}}: Root disk usage is above 75% (current value is: {{ $value }})" 33 | } 34 | 35 | ALERT NodeLowDataDisk 36 | IF ((node_filesystem_size{mountpoint="/data-disk"} - node_filesystem_free{mountpoint="/data-disk"} ) / node_filesystem_size{mountpoint="/data-disk"} * 100) > 75 37 | FOR 2m 38 | LABELS { 39 | severity="page" 40 | } 41 | ANNOTATIONS { 42 | SUMMARY = "{{$labels.instance}}: Low data disk space", 43 | DESCRIPTION = "{{$labels.instance}}: Data disk usage is above 75% (current value is: {{ $value }})" 44 | } 45 | mem-usage.rules: | 46 | ALERT NodeSwapUsage 47 | IF (((node_memory_SwapTotal-node_memory_SwapFree)/node_memory_SwapTotal)*100) > 75 48 | FOR 2m 49 | LABELS { 50 | severity="page" 51 | } 52 | ANNOTATIONS { 53 | SUMMARY = "{{$labels.instance}}: Swap usage detected", 54 | DESCRIPTION = "{{$labels.instance}}: Swap usage usage is above 75% (current value is: {{ $value }})" 55 | } 56 | 57 | ALERT NodeMemoryUsage 58 | IF (((node_memory_MemTotal-node_memory_MemAvailable)/(node_memory_MemTotal)*100)) > 75 59 | FOR 2m 60 | LABELS { 61 | severity="page" 62 | } 63 | ANNOTATIONS { 64 | SUMMARY = "{{$labels.instance}}: High memory usage detected", 65 | DESCRIPTION = "{{$labels.instance}}: Memory usage is above 75% (current value is: {{ $value }})" 66 | } 67 | kind: ConfigMap 68 | metadata: 69 | creationTimestamp: null 70 | name: prometheus-rules 71 | namespace: monitoring 72 | -------------------------------------------------------------------------------- /manifests/prometheus/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: prometheus 5 | namespace: monitoring 6 | labels: 7 | app: prometheus 8 | component: core 9 | annotations: 10 | prometheus.io/scrape: 'true' 11 | spec: 12 | type: NodePort 13 | ports: 14 | - port: 9090 15 | protocol: TCP 16 | name: webui 17 | selector: 18 | 
app: prometheus 19 | component: core 20 | --------------------------------------------------------------------------------