├── .circleci
└── config.yml
├── CONTRIBUTING.md
├── DCO
├── LICENSE
├── README.md
├── SECURITY.md
├── build.sh
├── ci-scripts
└── package.sh
├── configs
├── alertmanager-templates
│ ├── default.tmpl
│ └── slack.tmpl
├── grafana
│ ├── grafana-net-2-dashboard.json
│ ├── grafana-net-737-dashboard.json
│ └── prometheus-datasource.json
└── prometheus
│ ├── prometheus.yaml
│ └── rules
│ ├── cpu-usage.rules
│ ├── instance-availability.rules
│ ├── low-disk-space.rules
│ └── mem-usage.rules
├── development.md
├── docs
├── grafana_cluster_overview.png
├── grafana_edit_admin.png
├── index.md
├── prometheus_alerts.png
└── prometheus_targets.png
├── helm
└── prometheus-chart
│ ├── .helmignore
│ ├── Chart.yaml
│ ├── OWNERS
│ ├── README.md
│ ├── grafana-dashboards
│ ├── kubernetes-cluster.json
│ ├── kubernetes-core-cluster.json
│ ├── kubernetes-pods.json
│ ├── logging.json
│ ├── nodes-overview.json
│ └── prometheus.json
│ ├── prometheus-alerts
│ ├── alertmanager.rules.yml
│ ├── configmap.rules.yml
│ ├── container.rules.yml
│ ├── daemonset.rules.yml
│ ├── deployment.rules.yml
│ ├── disk.rules.yml
│ ├── fluentbit.rules.yml
│ ├── ingress-controller.rules.yml
│ ├── job.rules.yml
│ ├── logging-data.rules.yml
│ ├── network.rules.yml
│ ├── node.rules.yml
│ ├── pod.rules.yml
│ ├── prometheus.rules.yml
│ ├── up.rules.yml
│ └── volume.rules.yml
│ ├── templates
│ ├── NOTES.txt
│ ├── _helpers.tpl
│ ├── alertmanager-configmap.yaml
│ ├── alertmanager-deployment.yaml
│ ├── alertmanager-ingress.yaml
│ ├── alertmanager-networkpolicy.yaml
│ ├── alertmanager-pvc.yaml
│ ├── alertmanager-service.yaml
│ ├── alertmanager-serviceaccount.yaml
│ ├── grafana-configmap.yaml
│ ├── grafana-deployment.yaml
│ ├── grafana-ingress.yaml
│ ├── grafana-podsecuritypolicy.yaml
│ ├── grafana-role.yaml
│ ├── grafana-rolebinding.yaml
│ ├── grafana-secret.yaml
│ ├── grafana-service.yaml
│ ├── grafana-serviceaccount.yaml
│ ├── prometheus-clusterrole.yaml
│ ├── prometheus-clusterrolebinding.yaml
│ ├── prometheus-configmap.yaml
│ ├── prometheus-deployment.yaml
│ ├── prometheus-ingress.yaml
│ ├── prometheus-networkpolicy.yaml
│ ├── prometheus-pvc.yaml
│ ├── prometheus-rules.yaml
│ ├── prometheus-service.yaml
│ └── prometheus-serviceaccount.yaml
│ └── values.yaml
├── manifests-all.yaml
└── manifests
├── 0-namespace.yaml
├── 01-rbac.yaml
├── alertmanager
├── alertmanager-templates.yaml
├── configmap.yaml
├── deployment.yaml
└── service.yaml
├── grafana
├── deployment.yaml
├── import-dashboards
│ ├── configmap.yaml
│ └── job.yaml
├── ingress.yaml
├── secret.yaml
└── service.yaml
└── prometheus
├── configmap.yaml
├── deployment.yaml
├── kube-state-metrics
├── deployment.yaml
├── rbac.yaml
└── service.yaml
├── node-directory-size-metrics
└── daemonset.yaml
├── node-exporter
├── daemonset.yaml
└── service.yaml
├── prometheus-rules.yaml
└── service.yaml
/.circleci/config.yml:
--------------------------------------------------------------------------------
version: 2
jobs:
  # Shellcheck-lint all CI helper scripts; -x follows sourced files.
  lint:
    docker:
      - image: quay.io/giantswarm/shellcheck-alpine:v0.6.0
    steps:
      - checkout

      - run:
          name: lint scripts
          command: shellcheck -x ci-scripts/*

  build:
    working_directory: /home/circleci/.go_workspace/src/github.com/giantswarm/prometheus
    machine: true
    steps:
      - checkout

      # Fetch the latest architect release binary from GitHub.
      # The command substitution is quoted (shellcheck SC2046) so the
      # resolved URL is passed to wget as a single argument even if it
      # ever contains shell-special characters.
      - run: |
          wget -q "$(curl -sS -H "Authorization: token $RELEASE_TOKEN" https://api.github.com/repos/giantswarm/architect/releases/latest | grep browser_download_url | head -n 1 | cut -d '"' -f 4)"
          chmod +x ./architect
          ./architect version

      - run:
          name: Template and push to quay
          command: ./architect build

      - store_test_results:
          path: /tmp/results

  publish-github-release:
    docker:
      - image: quay.io/giantswarm/github:0.12
    steps:
      - checkout

      # Quote the env-var arguments so empty/odd values reach the
      # script as single (possibly empty) arguments instead of vanishing.
      - run:
          name: "Package Helm Chart"
          command: |
            ./ci-scripts/package.sh "${CIRCLE_PROJECT_REPONAME}" "${CIRCLE_TAG}"

      - run:
          name: "Publish Release on GitHub"
          command: |
            ghr -t ${PERSONAL_ACCESS_TOKEN} -u giantswarm -r ${CIRCLE_PROJECT_REPONAME} -c ${CIRCLE_SHA1} ${CIRCLE_TAG} "${CIRCLE_PROJECT_REPONAME}-chart-${CIRCLE_TAG:1}.tgz"

workflows:
  version: 2
  build-deploy:
    jobs:
      - lint
      # build runs on every branch and additionally on vX.Y.Z tags
      # (CircleCI does not run tag pushes through jobs by default).
      - build:
          filters:
            tags:
              only: /^v\d+\.\d+\.\d+$/
      # Publishing runs only for release tags, never for branch pushes.
      - publish-github-release:
          requires:
            - build
          filters:
            branches:
              ignore: /.*/
            tags:
              only: /^v\d+\.\d+\.\d+$/
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to contribute
2 |
3 | Prometheus is Apache 2.0 licensed and accepts contributions via GitHub pull
4 | requests. This document outlines some of the conventions on commit message
5 | formatting, contact points for developers and other resources to make getting
6 | your contribution into prometheus easier.
7 |
8 | # Email and chat
9 |
10 | - Email: [giantswarm](https://groups.google.com/forum/#!forum/giantswarm)
11 | - IRC: #[giantswarm](irc://irc.freenode.org:6667/#giantswarm) IRC channel on
12 | freenode.org
13 |
14 | ## Getting started
15 |
16 | - Fork the repository on GitHub
17 | - Read the [README](README.md) for build instructions
18 |
19 | ## Reporting Bugs and Creating Issues
20 |
21 | Reporting bugs is one of the best ways to contribute. If you find bugs or
22 | documentation mistakes in the prometheus project, please let us know by [opening an
23 | issue](https://github.com/giantswarm/prometheus/issues/new). We treat bugs and
24 | mistakes very seriously and believe no issue is too small. Before creating a bug
25 | report, please check there that one does not already exist.
26 |
27 | To make your bug report accurate and easy to understand, please try to create
28 | bug reports that are:
29 |
30 | - Specific. Include as many details as possible: which version, what
31 | environment, what configuration, etc. You can also attach logs.
32 |
33 | - Reproducible. Include the steps to reproduce the problem. We understand some
34 | issues might be hard to reproduce, please include the steps that might lead
35 | to the problem. If applicable, you can also attach affected data dir(s) and a
36 | stack trace to the bug report.
37 |
38 | - Isolated. Please try to isolate and reproduce the bug with minimum
39 | dependencies. It would significantly slow down the speed to fix a bug if too
40 | many dependencies are involved in a bug report. Debugging external systems
41 | that rely on prometheus is out of scope, but we are happy to point you in the
42 | right direction or help you interact with prometheus in the correct manner.
43 |
44 | - Unique. Do not duplicate existing bug reports.
45 |
46 | - Scoped. One bug per report. Do not follow up with another bug inside one
47 | report.
48 |
49 | You might also want to read [Elika Etemad’s article on filing good bug
50 | reports](http://fantasai.inkedblade.net/style/talks/filing-good-bugs/) before
51 | creating a bug report.
52 |
53 | We might ask you for further information to locate a bug. A duplicated bug
54 | report will be closed.
55 |
56 | ## Contribution flow
57 |
58 | This is a rough outline of what a contributor's workflow looks like:
59 |
60 | - Create a feature branch from where you want to base your work. This is usually
61 | master.
62 | - Make commits of logical units.
63 | - Make sure your commit messages are in the proper format (see below).
64 | - Push your changes to a topic branch in your fork of the repository.
65 | - Submit a pull request to giantswarm/prometheus.
66 | - Adding unit tests will greatly improve the chance for getting a quick review
67 | and your PR accepted.
68 | - Your PR must receive a LGTM from project owners.
69 | - When PR gets approval, all commits are squashed into one and merged by one of
70 | the owners.
71 |
72 | Thanks for your contributions!
73 |
74 | ### Code style
75 |
76 | The coding style suggested by the Golang community is used. See the [style
77 | doc](https://github.com/golang/go/wiki/CodeReviewComments) for details.
78 |
79 | Please follow this style to make the code easy to review, maintain, and develop.
80 |
81 | ### Format of the Commit Message
82 |
83 | We follow a rough convention for commit messages that is designed to answer two
84 | questions: what changed and why. The subject line should feature the what and
85 | the body of the commit should describe the why.
--------------------------------------------------------------------------------
/DCO:
--------------------------------------------------------------------------------
1 | Developer Certificate of Origin
2 | Version 1.1
3 |
4 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
5 | 660 York Street, Suite 102,
6 | San Francisco, CA 94110 USA
7 |
8 | Everyone is permitted to copy and distribute verbatim copies of this
9 | license document, but changing it is not allowed.
10 |
11 |
12 | Developer's Certificate of Origin 1.1
13 |
14 | By making a contribution to this project, I certify that:
15 |
16 | (a) The contribution was created in whole or in part by me and I
17 | have the right to submit it under the open source license
18 | indicated in the file; or
19 |
20 | (b) The contribution is based upon previous work that, to the best
21 | of my knowledge, is covered under an appropriate open source
22 | license and I have the right under that license to submit that
23 | work with modifications, whether created in whole or in part
24 | by me, under the same open source license (unless I am
25 | permitted to submit under a different license), as indicated
26 | in the file; or
27 |
28 | (c) The contribution was provided directly to me by some other
29 | person who certified (a), (b) or (c) and I have not modified
30 | it.
31 |
32 | (d) I understand and agree that this project and the contribution
33 | are public and that a record of the contribution (including all
34 | personal information I submit with it, including my sign-off) is
35 | maintained indefinitely and may be redistributed consistent with
36 | this project or the open source license(s) involved.
37 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2016 - 2019 Giant Swarm GmbH
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [](https://circleci.com/gh/giantswarm/prometheus)
2 | # Kubernetes Setup for Prometheus and Grafana
3 |
4 | ## Quick start
5 |
6 | To quickly start all the things just do this:
7 | ```bash
8 | kubectl apply \
9 | --filename https://raw.githubusercontent.com/giantswarm/prometheus/master/manifests-all.yaml
10 | ```
11 |
12 | This will create the namespace `monitoring` and bring up all components in there.
13 |
14 | To shut down all components again you can just delete that namespace:
15 | ```bash
16 | kubectl delete namespace monitoring
17 | ```
18 |
19 | ## Default Dashboards
20 |
21 | If you want to re-import the default dashboards from this setup run this job:
22 | ```bash
23 | kubectl apply --filename ./manifests/grafana/import-dashboards/job.yaml
24 | ```
25 |
26 | In case the job already exists from an earlier run, delete it before:
27 | ```bash
28 | kubectl --namespace monitoring delete job grafana-import-dashboards
29 | ```
30 |
31 | To access grafana you can use port forward functionality
32 | ```bash
33 | kubectl port-forward --namespace monitoring service/grafana 3000:3000
34 | ```
35 | And you should be able to access grafana on `http://localhost:3000/login`
36 |
37 | ## More Dashboards
38 |
39 | See grafana.net for some example [dashboards](https://grafana.net/dashboards) and [plugins](https://grafana.net/plugins).
40 |
41 | - Configure [Prometheus](https://grafana.net/plugins/prometheus) data source for Grafana.
42 | `Grafana UI / Data Sources / Add data source`
43 | - `Name`: `prometheus`
44 | - `Type`: `Prometheus`
45 | - `Url`: `http://prometheus:9090`
46 | - `Add`
47 |
48 | - Import [Prometheus Stats](https://grafana.net/dashboards/2):
49 | `Grafana UI / Dashboards / Import`
50 | - `Grafana.net Dashboard`: `https://grafana.net/dashboards/2`
51 | - `Load`
52 | - `Prometheus`: `prometheus`
53 | - `Save & Open`
54 |
55 | - Import [Kubernetes cluster monitoring](https://grafana.net/dashboards/162):
56 | `Grafana UI / Dashboards / Import`
57 | - `Grafana.net Dashboard`: `https://grafana.net/dashboards/162`
58 | - `Load`
59 | - `Prometheus`: `prometheus`
60 | - `Save & Open`
61 |
62 | ## Credit
63 |
64 | Alertmanager configs and integration in this repository was heavily inspired by the implementation in [kayrus/prometheus-kubernetes](https://github.com/kayrus/prometheus-kubernetes).
65 |
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 | # Security Policy
2 |
3 | ## Reporting a Vulnerability
4 |
5 | Please visit https://www.giantswarm.io/responsible-disclosure for information on reporting security issues.
6 |
--------------------------------------------------------------------------------
/build.sh:
--------------------------------------------------------------------------------
#!/bin/sh

# Regenerate the derived Kubernetes manifests from ./configs and bundle
# everything found under ./manifests into a single ./manifests-all.yaml.

# Abort on the first failing command so a broken kubectl invocation does
# not leave stale, half-written manifest files behind.
set -e

# Create ConfigMap with prometheus rules for alerting
kubectl --namespace monitoring create configmap --dry-run prometheus-rules \
    --from-file=configs/prometheus/rules \
    --output yaml \
    > ./manifests/prometheus/prometheus-rules.yaml
# Workaround since `--namespace monitoring` from above is not preserved
echo " namespace: monitoring" >> ./manifests/prometheus/prometheus-rules.yaml

# Create ConfigMap for an external url
kubectl --namespace monitoring create configmap --dry-run alertmanager-templates \
    --from-file=configs/alertmanager-templates \
    --output yaml \
    > ./manifests/alertmanager/alertmanager-templates.yaml
# Workaround since `--namespace monitoring` from above is not preserved
echo " namespace: monitoring" >> ./manifests/alertmanager/alertmanager-templates.yaml

# Create ConfigMap with Grafana dashboards and datasources
kubectl --namespace monitoring create configmap --dry-run grafana-import-dashboards \
    --from-file=configs/grafana \
    --output yaml \
    > ./manifests/grafana/import-dashboards/configmap.yaml
# Workaround since `--namespace monitoring` from above is not preserved
echo " namespace: monitoring" >> ./manifests/grafana/import-dashboards/configmap.yaml

# Create ConfigMap with Prometheus config
kubectl --namespace monitoring create configmap --dry-run prometheus-core \
    --from-file=configs/prometheus/prometheus.yaml \
    --output yaml \
    > ./manifests/prometheus/configmap.yaml
# Workaround since `--namespace monitoring` from above is not preserved
echo " namespace: monitoring" >> ./manifests/prometheus/configmap.yaml

# Create one single manifest file
target="./manifests-all.yaml"
# -f so the first-ever run does not abort when the file is absent yet.
rm -f "$target"
echo "# Derived from ./manifests" >> "$target"
# Word-splitting the find output is safe here: the repository's manifest
# paths contain no whitespace. Each document is separated by "---".
for file in $(find ./manifests -type f -name "*.yaml" | sort) ; do
    echo "---" >> "$target"
    cat "$file" >> "$target"
done
--------------------------------------------------------------------------------
/ci-scripts/package.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

# Package this repository's Helm chart for a tagged release.
#
# Usage: package.sh <project> <tag>
#   <project>  chart/repo name, e.g. "prometheus"
#   <tag>      release tag in the form vX.Y.Z; the leading "v" is
#              stripped to obtain the chart version.

set -o errexit
set -o nounset
set -o pipefail

readonly PROJECT=$1
readonly TAG=$2
readonly VERSION=${TAG:1}

readonly HELM_URL=https://get.helm.sh
readonly HELM_TARBALL=helm-v2.17.0-linux-amd64.tar.gz

# Print an error message in red to stderr.
log_error() {
  printf '\e[31mERROR: %s\n\e[39m' "$1" >&2
}

# Download a pinned Helm v2 client tarball and put its binary on PATH.
setup_helm_client() {
  echo "Setting up Helm client..."

  curl --user-agent curl-ci-sync -sSL -o "${HELM_TARBALL}" "${HELM_URL}/${HELM_TARBALL}"
  tar xzf "${HELM_TARBALL}"

  PATH="$(pwd)/linux-amd64/:$PATH"
  helm init --client-only
}

# Stamp the release version into Chart.yaml, then build the .tgz archive.
package_chart() {
  local project="${1?Specify project}"
  local version="${2?Specify version}"

  # Replace CI version with release version
  sed -i 's/version:.*/version: '"${version}"'/' "helm/${project}-chart/Chart.yaml"

  helm package --save=false "helm/${project}-chart"
}

main() {
  if ! setup_helm_client; then
    log_error "Helm client could not get installed."
    exit 1
  fi

  if ! package_chart "${PROJECT}" "${VERSION}"; then
    log_error "Helm Chart could not be packaged."
    exit 1
  fi

  echo "Successfully packaged ${PROJECT}-chart-${VERSION}.tgz"
}

main
--------------------------------------------------------------------------------
/configs/alertmanager-templates/default.tmpl:
--------------------------------------------------------------------------------
1 | {{ define "__alertmanager" }}AlertManager{{ end }}
2 | {{ define "__alertmanagerURL" }}{{ .ExternalURL }}/#/alerts?receiver={{ .Receiver }}{{ end }}
3 |
4 | {{ define "__subject" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}{{ end }}
5 | {{ define "__description" }}{{ end }}
6 |
7 | {{ define "__text_alert_list" }}{{ range . }}Labels:
8 | {{ range .Labels.SortedPairs }} - {{ .Name }} = {{ .Value }}
9 | {{ end }}Annotations:
10 | {{ range .Annotations.SortedPairs }} - {{ .Name }} = {{ .Value }}
11 | {{ end }}Source: {{ .GeneratorURL }}
12 | {{ end }}{{ end }}
13 |
14 |
15 | {{ define "slack.default.title" }}{{ template "__subject" . }}{{ end }}
16 | {{ define "slack.default.username" }}{{ template "__alertmanager" . }}{{ end }}
17 | {{ define "slack.default.fallback" }}{{ template "slack.default.title" . }} | {{ template "slack.default.titlelink" . }}{{ end }}
18 | {{ define "slack.default.pretext" }}{{ end }}
19 | {{ define "slack.default.titlelink" }}{{ template "__alertmanagerURL" . }}{{ end }}
20 | {{ define "slack.default.iconemoji" }}{{ end }}
21 | {{ define "slack.default.iconurl" }}{{ end }}
22 | {{ define "slack.default.text" }}{{ end }}
23 |
24 |
25 | {{ define "hipchat.default.from" }}{{ template "__alertmanager" . }}{{ end }}
26 | {{ define "hipchat.default.message" }}{{ template "__subject" . }}{{ end }}
27 |
28 |
29 | {{ define "pagerduty.default.description" }}{{ template "__subject" . }}{{ end }}
30 | {{ define "pagerduty.default.client" }}{{ template "__alertmanager" . }}{{ end }}
31 | {{ define "pagerduty.default.clientURL" }}{{ template "__alertmanagerURL" . }}{{ end }}
32 | {{ define "pagerduty.default.instances" }}{{ template "__text_alert_list" . }}{{ end }}
33 |
34 |
35 | {{ define "opsgenie.default.message" }}{{ template "__subject" . }}{{ end }}
36 | {{ define "opsgenie.default.description" }}{{ .CommonAnnotations.SortedPairs.Values | join " " }}
37 | {{ if gt (len .Alerts.Firing) 0 -}}
38 | Alerts Firing:
39 | {{ template "__text_alert_list" .Alerts.Firing }}
40 | {{- end }}
41 | {{ if gt (len .Alerts.Resolved) 0 -}}
42 | Alerts Resolved:
43 | {{ template "__text_alert_list" .Alerts.Resolved }}
44 | {{- end }}
45 | {{- end }}
46 | {{ define "opsgenie.default.source" }}{{ template "__alertmanagerURL" . }}{{ end }}
47 |
48 |
49 | {{ define "victorops.default.message" }}{{ template "__subject" . }} | {{ template "__alertmanagerURL" . }}{{ end }}
50 | {{ define "victorops.default.from" }}{{ template "__alertmanager" . }}{{ end }}
51 |
52 |
53 | {{ define "email.default.subject" }}{{ template "__subject" . }}{{ end }}
54 | {{ define "email.default.html" }}
55 |
56 |
82 |
83 |
84 |
85 |
86 | {{ template "__subject" . }}
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 | |
95 |
96 |
97 |
98 |
99 |
100 | {{ .Alerts | len }} alert{{ if gt (len .Alerts) 1 }}s{{ end }} for {{ range .GroupLabels.SortedPairs }}
101 | {{ .Name }}={{ .Value }}
102 | {{ end }}
103 | |
104 |
105 |
106 |
107 |
108 |
109 |
110 | View in {{ template "__alertmanager" . }}
111 | |
112 |
113 | {{ if gt (len .Alerts.Firing) 0 }}
114 |
115 |
116 | [{{ .Alerts.Firing | len }}] Firing
117 | |
118 |
119 | {{ end }}
120 | {{ range .Alerts.Firing }}
121 |
122 |
123 | Labels
124 | {{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }} {{ end }}
125 | {{ if gt (len .Annotations) 0 }}Annotations {{ end }}
126 | {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }} {{ end }}
127 | Source
128 | |
129 |
130 | {{ end }}
131 |
132 | {{ if gt (len .Alerts.Resolved) 0 }}
133 | {{ if gt (len .Alerts.Firing) 0 }}
134 |
135 |
136 |
137 |
138 |
139 | |
140 |
141 | {{ end }}
142 |
143 |
144 | [{{ .Alerts.Resolved | len }}] Resolved
145 | |
146 |
147 | {{ end }}
148 | {{ range .Alerts.Resolved }}
149 |
150 |
151 | Labels
152 | {{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }} {{ end }}
153 | {{ if gt (len .Annotations) 0 }}Annotations {{ end }}
154 | {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }} {{ end }}
155 | Source
156 | |
157 |
158 | {{ end }}
159 |
160 | |
161 |
162 |
163 |
164 |
171 | |
172 | |
173 |
174 |
175 |
176 |
177 |
178 |
179 | {{ end }}
180 |
181 | {{ define "pushover.default.title" }}{{ template "__subject" . }}{{ end }}
182 | {{ define "pushover.default.message" }}{{ .CommonAnnotations.SortedPairs.Values | join " " }}
183 | {{ if gt (len .Alerts.Firing) 0 }}
184 | Alerts Firing:
185 | {{ template "__text_alert_list" .Alerts.Firing }}
186 | {{ end }}
187 | {{ if gt (len .Alerts.Resolved) 0 }}
188 | Alerts Resolved:
189 | {{ template "__text_alert_list" .Alerts.Resolved }}
190 | {{ end }}
191 | {{ end }}
192 | {{ define "pushover.default.url" }}{{ template "__alertmanagerURL" . }}{{ end }}
193 |
--------------------------------------------------------------------------------
/configs/alertmanager-templates/slack.tmpl:
--------------------------------------------------------------------------------
1 | {{ define "slack.devops.text" }}
2 | {{range .Alerts}}{{.Annotations.DESCRIPTION}}
3 | {{end}}
4 | {{ end }}
5 |
--------------------------------------------------------------------------------
/configs/grafana/prometheus-datasource.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "prometheus",
3 | "type": "prometheus",
4 | "url": "http://prometheus:9090",
5 | "access": "proxy",
6 | "basicAuth": false
7 | }
8 |
--------------------------------------------------------------------------------
/configs/prometheus/prometheus.yaml:
--------------------------------------------------------------------------------
1 | global:
2 | scrape_interval: 10s
3 | scrape_timeout: 10s
4 | evaluation_interval: 10s
5 | rule_files:
6 | - "/etc/prometheus-rules/*.rules"
7 | scrape_configs:
8 |
9 | # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L37
10 | - job_name: 'kubernetes-nodes'
11 | tls_config:
12 | ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
13 | bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
14 | kubernetes_sd_configs:
15 | - role: node
16 | relabel_configs:
17 | - source_labels: [__address__]
18 | regex: '(.*):10250'
19 | replacement: '${1}:10255'
20 | target_label: __address__
21 |
22 | # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L79
23 | - job_name: 'kubernetes-endpoints'
24 | kubernetes_sd_configs:
25 | - role: endpoints
26 | relabel_configs:
27 | - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
28 | action: keep
29 | regex: true
30 | - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
31 | action: replace
32 | target_label: __scheme__
33 | regex: (https?)
34 | - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
35 | action: replace
36 | target_label: __metrics_path__
37 | regex: (.+)
38 | - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
39 | action: replace
40 | target_label: __address__
41 | regex: ([^:]+)(?::\d+)?;(\d+)
42 | replacement: $1:$2
43 | - action: labelmap
44 | regex: __meta_kubernetes_service_label_(.+)
45 | - source_labels: [__meta_kubernetes_namespace]
46 | action: replace
47 | target_label: kubernetes_namespace
48 | - source_labels: [__meta_kubernetes_service_name]
49 | action: replace
50 | target_label: kubernetes_name
51 |
52 | # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L119
53 | - job_name: 'kubernetes-services'
54 | metrics_path: /probe
55 | params:
56 | module: [http_2xx]
57 | kubernetes_sd_configs:
58 | - role: service
59 | relabel_configs:
60 | - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
61 | action: keep
62 | regex: true
63 | - source_labels: [__address__]
64 | target_label: __param_target
65 | - target_label: __address__
66 | replacement: blackbox
67 | - source_labels: [__param_target]
68 | target_label: instance
69 | - action: labelmap
70 | regex: __meta_kubernetes_service_label_(.+)
71 | - source_labels: [__meta_kubernetes_namespace]
72 | target_label: kubernetes_namespace
73 | - source_labels: [__meta_kubernetes_service_name]
74 | target_label: kubernetes_name
75 |
76 | # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L156
77 | - job_name: 'kubernetes-pods'
78 | kubernetes_sd_configs:
79 | - role: pod
80 | relabel_configs:
81 | - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
82 | action: keep
83 | regex: true
84 | - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
85 | action: replace
86 | target_label: __metrics_path__
87 | regex: (.+)
88 | - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
89 | action: replace
90 | regex: ([^:]+)(?::\d+)?;(\d+)
91 | replacement: ${1}:${2}
92 | target_label: __address__
93 | - action: labelmap
94 | regex: __meta_kubernetes_pod_label_(.+)
95 | - source_labels: [__meta_kubernetes_namespace]
96 | action: replace
97 | target_label: kubernetes_namespace
98 | - source_labels: [__meta_kubernetes_pod_name]
99 | action: replace
100 | target_label: kubernetes_pod_name
101 | - source_labels: [__meta_kubernetes_pod_container_port_number]
102 | action: keep
103 | regex: 9\d{3}
104 |
105 | - job_name: 'kubernetes-cadvisor'
106 | scheme: https
107 | tls_config:
108 | ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
109 | bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
110 | kubernetes_sd_configs:
111 | - role: node
112 | relabel_configs:
113 | - action: labelmap
114 | regex: __meta_kubernetes_node_label_(.+)
115 | - target_label: __address__
116 | replacement: kubernetes.default.svc:443
117 | - source_labels: [__meta_kubernetes_node_name]
118 | regex: (.+)
119 | target_label: __metrics_path__
120 | replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
121 |
--------------------------------------------------------------------------------
/configs/prometheus/rules/cpu-usage.rules:
--------------------------------------------------------------------------------
1 | ALERT NodeCPUUsage
2 | IF (100 - (avg by (instance) (irate(node_cpu{name="node-exporter",mode="idle"}[5m])) * 100)) > 75
3 | FOR 2m
4 | LABELS {
5 | severity="page"
6 | }
7 | ANNOTATIONS {
8 | SUMMARY = "{{$labels.instance}}: High CPU usage detected",
9 | DESCRIPTION = "{{$labels.instance}}: CPU usage is above 75% (current value is: {{ $value }})"
10 | }
11 |
--------------------------------------------------------------------------------
/configs/prometheus/rules/instance-availability.rules:
--------------------------------------------------------------------------------
1 | ALERT InstanceDown
2 | IF up == 0
3 | FOR 1m
4 | LABELS { severity = "page" }
5 | ANNOTATIONS {
6 | summary = "Instance {{ $labels.instance }} down",
7 | description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute.",
8 | }
9 |
--------------------------------------------------------------------------------
/configs/prometheus/rules/low-disk-space.rules:
--------------------------------------------------------------------------------
1 | ALERT NodeLowRootDisk
2 | IF ((node_filesystem_size{mountpoint="/root-disk"} - node_filesystem_free{mountpoint="/root-disk"} ) / node_filesystem_size{mountpoint="/root-disk"} * 100) > 75
3 | FOR 2m
4 | LABELS {
5 | severity="page"
6 | }
7 | ANNOTATIONS {
8 | SUMMARY = "{{$labels.instance}}: Low root disk space",
9 | DESCRIPTION = "{{$labels.instance}}: Root disk usage is above 75% (current value is: {{ $value }})"
10 | }
11 |
12 | ALERT NodeLowDataDisk
13 | IF ((node_filesystem_size{mountpoint="/data-disk"} - node_filesystem_free{mountpoint="/data-disk"} ) / node_filesystem_size{mountpoint="/data-disk"} * 100) > 75
14 | FOR 2m
15 | LABELS {
16 | severity="page"
17 | }
18 | ANNOTATIONS {
19 | SUMMARY = "{{$labels.instance}}: Low data disk space",
20 | DESCRIPTION = "{{$labels.instance}}: Data disk usage is above 75% (current value is: {{ $value }})"
21 | }
22 |
--------------------------------------------------------------------------------
/configs/prometheus/rules/mem-usage.rules:
--------------------------------------------------------------------------------
1 | ALERT NodeSwapUsage
2 | IF (((node_memory_SwapTotal-node_memory_SwapFree)/node_memory_SwapTotal)*100) > 75
3 | FOR 2m
4 | LABELS {
5 | severity="page"
6 | }
7 | ANNOTATIONS {
8 | SUMMARY = "{{$labels.instance}}: Swap usage detected",
9 |   DESCRIPTION = "{{$labels.instance}}: Swap usage is above 75% (current value is: {{ $value }})"
10 | }
11 |
12 | ALERT NodeMemoryUsage
13 | IF (((node_memory_MemTotal-node_memory_MemAvailable)/(node_memory_MemTotal)*100)) > 75
14 | FOR 2m
15 | LABELS {
16 | severity="page"
17 | }
18 | ANNOTATIONS {
19 | SUMMARY = "{{$labels.instance}}: High memory usage detected",
20 | DESCRIPTION = "{{$labels.instance}}: Memory usage is above 75% (current value is: {{ $value }})"
21 | }
22 |
--------------------------------------------------------------------------------
/development.md:
--------------------------------------------------------------------------------
1 | # Before pushing to this repo
2 |
3 | Create derived `ConfigMap`s and `manifests-all.yaml` by running the following commands or executing `build.sh`:
4 |
5 | ```bash
6 | # Create ConfigMap with prometheus rules for alerting
7 | kubectl --namespace monitoring create configmap --dry-run prometheus-rules \
8 | --from-file=configs/prometheus/rules \
9 | --output yaml \
10 | > ./manifests/prometheus/prometheus-rules.yaml
11 | # Workaround since `--namespace monitoring` from above is not preserved
12 | echo " namespace: monitoring" >> ./manifests/prometheus/prometheus-rules.yaml
13 |
14 | # Create ConfigMap for an external url
15 | kubectl --namespace monitoring create configmap --dry-run alertmanager-templates \
16 | --from-file=configs/alertmanager-templates \
17 | --output yaml \
18 | > ./manifests/alertmanager/alertmanager-templates.yaml
19 | # Workaround since `--namespace monitoring` from above is not preserved
20 | echo " namespace: monitoring" >> ./manifests/alertmanager/alertmanager-templates.yaml
21 |
22 | # Create ConfigMap with Grafana dashboards and datasources
23 | kubectl --namespace monitoring create configmap --dry-run grafana-import-dashboards \
24 | --from-file=configs/grafana \
25 | --output yaml \
26 | > ./manifests/grafana/import-dashboards/configmap.yaml
27 | # Workaround since `--namespace monitoring` from above is not preserved
28 | echo " namespace: monitoring" >> ./manifests/grafana/import-dashboards/configmap.yaml
29 |
30 | # Create ConfigMap with Prometheus config
31 | kubectl --namespace monitoring create configmap --dry-run prometheus-core \
32 | --from-file=configs/prometheus/prometheus.yaml \
33 | --output yaml \
34 | > ./manifests/prometheus/configmap.yaml
35 | # Workaround since `--namespace monitoring` from above is not preserved
36 | echo " namespace: monitoring" >> ./manifests/prometheus/configmap.yaml
37 |
38 | # Create one single manifest file
39 | target="./manifests-all.yaml"
40 | rm -f "$target"
41 | echo "# Derived from ./manifests" >> "$target"
42 | for file in $(find ./manifests -type f -name "*.yaml" | sort) ; do
43 | echo "---" >> "$target"
44 | cat "$file" >> "$target"
45 | done
46 | ```
47 |
--------------------------------------------------------------------------------
/docs/grafana_cluster_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/giantswarm/prometheus/e58028316a2489dcd037183a0f9f0431717d68b0/docs/grafana_cluster_overview.png
--------------------------------------------------------------------------------
/docs/grafana_edit_admin.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/giantswarm/prometheus/e58028316a2489dcd037183a0f9f0431717d68b0/docs/grafana_edit_admin.png
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | +++
2 | title = "Monitoring with Prometheus and Grafana"
3 | description = "Recipe to spin up a monitoring setup with Prometheus and Grafana on Kubernetes."
4 | date = "2017-02-10"
5 | type = "page"
6 | weight = 100
7 | tags = ["recipe"]
8 | +++
9 |
10 | # Monitoring with Prometheus and Grafana
11 |
12 | [Prometheus](https://prometheus.io/) is an open-source monitoring solution that includes the gathering of metrics, their storage in an internal time series database as well as querying and alerting based on that data.
13 |
14 | It offers a lot of integrations incl. Docker, Kubernetes, etc.
15 |
16 | Prometheus can also visualize your data. However, in this recipe we include another open-source tool, [Grafana](http://grafana.org/), for the visualization part, as it offers a more powerful and flexible way to generate visuals and dashboards.
17 |
18 | ## Deploying Prometheus and Grafana
19 |
20 | The following command will set you up with all necessary components and some first dashboards to check out.
21 |
22 | ```bash
23 | kubectl apply --filename https://raw.githubusercontent.com/giantswarm/kubernetes-prometheus/master/manifests-all.yaml
24 | ```
25 |
26 | ## Checking Prometheus
27 |
28 | Wait a bit for all the pods to come up. Then Prometheus should be ready and running. We can check the Prometheus targets at `https://api.<clusterid>.k8s.gigantic.io/api/v1/proxy/namespaces/monitoring/services/prometheus:9090/targets`
29 |
30 | 
31 |
32 | *Note*: The above URL uses your Kubernetes API to proxy to the service. As the API is guarded with your credentials, you need to [set them up in your system](/guides/accessing-services-from-the-outside/) (and/or browser). We do not recommend to set up an Ingress for Prometheus at this time, as it currently does not support any kind of authentication and thus your cluster would be open to everyone.
33 |
34 | ## Checking Alertmanager
35 |
36 | Prometheus shows the active alerts and rules under `/alerts` in the prometheus frontend:
37 |
38 | 
39 |
40 | A full list of all rules can also be seen under `/rules`. The Alertmanager frontend offers more options to look at active and silenced alerts.
41 |
42 | ## Checking Grafana
43 |
44 | Now that we know Prometheus is up and running we can check for Grafana.
45 |
46 | There's an Ingress set up for Grafana, however, you need to set it to your desired domain. You can do this by editing the ingress:
47 |
48 | ```bash
49 | kubectl --namespace monitoring edit ingress grafana
50 | ```
51 |
52 | This will open the ingress YAML in your standard editor. In the `host` field replace `yourchoice` with a subdomain of your choice and `clusterid` with your cluster ID. After saving and exiting your editor, wait a while and Grafana should be available at `http://<yourchoice>.<clusterid>.k8s.gigantic.io/`.
53 |
54 | You can use the default admin (`admin:admin`) user for your first login. You should change this admin user to reflect your desired username, your email, and a secure password ASAP!
55 |
56 | _Note:_ If persistent storage is not set up in your cluster, changes like the above will be reset to defaults if the Grafana Pod gets rescheduled. You would need to set them again after that.
57 |
58 | ## Changing the admin
59 |
60 | You can change the default admin user at `http://grafana.monitoring.<clusterid>.k8s.gigantic.io/admin/users/edit/1`
61 |
62 | 
63 |
64 | Please note, that you need to update the password and the user data (username, email, etc.) separately with the respective update buttons below each section.
65 |
66 | ## Check out your dashboards
67 |
68 | You can now checkout the included dashboards, e.g. the [Cluster Monitoring Overview](http://grafana.monitoring.l8.k8s.gigantic.io/dashboard/db/kubernetes-cluster-monitoring-via-prometheus).
69 |
70 | 
71 |
72 | _Note:_ If persistent storage is not set up in your cluster, the preset datasource and dashboards will vanish if the Grafana Pod gets rescheduled. To get them back run:
73 |
74 | ```nohighlight
75 | kubectl --namespace=monitoring delete job grafana-import-dashboards
76 | kubectl --namespace=monitoring create --filename https://raw.githubusercontent.com/giantswarm/prometheus/master/manifests/grafana/import-dashboards/job.yaml
77 | ```
78 |
79 | ## Next Steps
80 |
81 | Next, you should get into the [Grafana](http://docs.grafana.org/) and [Prometheus](https://prometheus.io/docs/introduction/overview/) documentations to get to know the tools and either build your own dashboards or extend the samples from above.
82 |
83 | You can also check out grafana.net for some more example [dashboards](https://grafana.net/dashboards) and [plugins](https://grafana.net/plugins).
84 |
85 | You might also want to set up some [alerting](https://prometheus.io/docs/alerting/overview/).
86 |
--------------------------------------------------------------------------------
/docs/prometheus_alerts.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/giantswarm/prometheus/e58028316a2489dcd037183a0f9f0431717d68b0/docs/prometheus_alerts.png
--------------------------------------------------------------------------------
/docs/prometheus_targets.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/giantswarm/prometheus/e58028316a2489dcd037183a0f9f0431717d68b0/docs/prometheus_targets.png
--------------------------------------------------------------------------------
/helm/prometheus-chart/.helmignore:
--------------------------------------------------------------------------------
1 | # Patterns to ignore when building packages.
2 | # This supports shell glob matching, relative path matching, and
3 | # negation (prefixed with !). Only one pattern per line.
4 | .DS_Store
5 | # Common VCS dirs
6 | .git/
7 | .gitignore
8 | .bzr/
9 | .bzrignore
10 | .hg/
11 | .hgignore
12 | .svn/
13 | # Common backup files
14 | *.swp
15 | *.bak
16 | *.tmp
17 | *~
18 | # Various IDEs
19 | .project
20 | .idea/
21 | *.tmproj
22 |
--------------------------------------------------------------------------------
/helm/prometheus-chart/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | appVersion: 2.4.3
3 | description: Prometheus is a monitoring system and time series database.
4 | engine: gotpl
5 | home: https://prometheus.io/
6 | icon: https://raw.githubusercontent.com/prometheus/prometheus.github.io/master/assets/prometheus_logo-cb55bb5c346.png
7 | maintainers:
8 | - name: giantswarm
9 | email: info@giantswarm.io
10 | name: prometheus-chart
11 | sources:
12 | - https://github.com/prometheus/alertmanager
13 | - https://github.com/prometheus/prometheus
14 | tillerVersion: ">=2.8.0"
15 | version: 0.1.0-[[ .SHA ]]
16 |
--------------------------------------------------------------------------------
/helm/prometheus-chart/OWNERS:
--------------------------------------------------------------------------------
1 | approvers:
2 | - giantswarm
3 | reviewers:
4 | - giantswarm
5 |
--------------------------------------------------------------------------------
/helm/prometheus-chart/README.md:
--------------------------------------------------------------------------------
1 | # Prometheus
2 |
3 | An opinionated [Prometheus](https://prometheus.io/) Helm chart for deployment in Giant Swarm clusters.
4 |
5 | ## Prerequisites
6 |
7 | - Kubernetes 1.3+ with Beta APIs enabled
8 | - [Helm](https://helm.sh) package manager.
9 |
10 | ## Installing the Chart
11 |
12 | To install the chart:
13 |
14 | ```console
15 | $ helm install ./helm --name my-prometheus
16 | ```
17 |
18 | The command deploys Prometheus on the Kubernetes cluster in the default configuration. The [configuration](#configuration) section lists the parameters that can be configured during installation.
19 |
20 | ## Uninstalling the Chart
21 |
22 | To uninstall/delete the deployment:
23 |
24 | ```console
25 | $ helm delete my-prometheus --purge
26 | ```
27 |
28 | The command removes all the Kubernetes components associated with the chart and deletes the release.
29 |
30 | ## Configuration
31 |
32 | The following table lists the configurable parameters of the Prometheus chart and their default values.
33 |
34 | Parameter | Description | Default
35 | --------- | ----------- | -------
36 | `alertmanager.enabled` | If true, create alertmanager | `true`
37 | `alertmanager.name` | alertmanager container name | `alertmanager`
38 | `alertmanager.image.repository` | alertmanager container image repository | `prom/alertmanager`
39 | `alertmanager.image.tag` | alertmanager container image tag | `v0.15.2`
40 | `alertmanager.image.pullPolicy` | alertmanager container image pull policy | `IfNotPresent`
41 | `alertmanager.prefixURL` | The prefix slug at which the server can be accessed | ``
42 | `alertmanager.baseURL` | The external url at which the server can be accessed | `/`
43 | `alertmanager.extraArgs` | Additional alertmanager container arguments | `{}`
44 | `alertmanager.configMapOverrideName` | Prometheus alertmanager ConfigMap override where full-name is `{{.Release.Name}}-{{.Values.alertmanager.configMapOverrideName}}` and setting this value will prevent the default alertmanager ConfigMap from being generated | `""`
45 | `alertmanager.ingress.enabled` | If true, alertmanager Ingress will be created | `false`
46 | `alertmanager.ingress.annotations` | alertmanager Ingress annotations | `{}`
47 | `alertmanager.ingress.extraLabels` | alertmanager Ingress additional labels | `{}`
48 | `alertmanager.ingress.hosts` | alertmanager Ingress hostnames | `[]`
49 | `alertmanager.ingress.tls` | alertmanager Ingress TLS configuration (YAML) | `[]`
50 | `alertmanager.nodeSelector` | node labels for alertmanager pod assignment | `{}`
51 | `alertmanager.tolerations` | node taints to tolerate (requires Kubernetes >=1.6) | `[]`
52 | `alertmanager.affinity` | pod affinity | `{}`
53 | `alertmanager.schedulerName` | alertmanager alternate scheduler name | `nil`
54 | `alertmanager.persistentVolume.enabled` | If true, alertmanager will create a Persistent Volume Claim | `true`
55 | `alertmanager.persistentVolume.accessModes` | alertmanager data Persistent Volume access modes | `[ReadWriteOnce]`
56 | `alertmanager.persistentVolume.annotations` | Annotations for alertmanager Persistent Volume Claim | `{}`
57 | `alertmanager.persistentVolume.existingClaim` | alertmanager data Persistent Volume existing claim name | `""`
58 | `alertmanager.persistentVolume.mountPath` | alertmanager data Persistent Volume mount root path | `/data`
59 | `alertmanager.persistentVolume.size` | alertmanager data Persistent Volume size | `2Gi`
60 | `alertmanager.persistentVolume.storageClass` | alertmanager data Persistent Volume Storage Class | `unset`
61 | `alertmanager.persistentVolume.subPath` | Subdirectory of alertmanager data Persistent Volume to mount | `""`
62 | `alertmanager.podAnnotations` | annotations to be added to alertmanager pods | `{}`
63 | `alertmanager.replicaCount` | desired number of alertmanager pods | `1`
64 | `alertmanager.priorityClassName` | alertmanager priorityClassName | `nil`
65 | `alertmanager.resources` | alertmanager pod resource requests & limits | `{}`
66 | `alertmanager.securityContext` | Custom [security context](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/) for Alert Manager containers | `{}`
67 | `alertmanager.service.annotations` | annotations for alertmanager service | `{}`
68 | `alertmanager.service.clusterIP` | internal alertmanager cluster service IP | `""`
69 | `alertmanager.service.externalIPs` | alertmanager service external IP addresses | `[]`
70 | `alertmanager.service.loadBalancerIP` | IP address to assign to load balancer (if supported) | `""`
71 | `alertmanager.service.loadBalancerSourceRanges` | list of IP CIDRs allowed access to load balancer (if supported) | `[]`
72 | `alertmanager.service.servicePort` | alertmanager service port | `80`
73 | `alertmanager.service.type` | type of alertmanager service to create | `ClusterIP`
74 | `alertmanagerFiles.alertmanager.yml` | Prometheus alertmanager configuration | example configuration
75 | `configmapReload.name` | configmap-reload container name | `configmap-reload`
76 | `configmapReload.image.repository` | configmap-reload container image repository | `jimmidyson/configmap-reload`
77 | `configmapReload.image.tag` | configmap-reload container image tag | `v0.2.2`
78 | `configmapReload.image.pullPolicy` | configmap-reload container image pull policy | `IfNotPresent`
79 | `configmapReload.extraArgs` | Additional configmap-reload container arguments | `{}`
80 | `configmapReload.extraVolumeDirs` | Additional configmap-reload volume directories | `{}`
81 | `configmapReload.extraConfigmapMounts` | Additional configmap-reload configMap mounts | `[]`
82 | `configmapReload.resources` | configmap-reload pod resource requests & limits | `{}`
83 | `initChownData.enabled` | If false, don't reset data ownership at startup | true
84 | `initChownData.name` | init-chown-data container name | `init-chown-data`
85 | `initChownData.image.repository` | init-chown-data container image repository | `busybox`
86 | `initChownData.image.tag` | init-chown-data container image tag | `latest`
87 | `initChownData.image.pullPolicy` | init-chown-data container image pull policy | `IfNotPresent`
88 | `initChownData.resources` | init-chown-data pod resource requests & limits | `{}`
89 | `server.name` | Prometheus server container name | `server`
90 | `server.image.repository` | Prometheus server container image repository | `prom/prometheus`
91 | `server.image.tag` | Prometheus server container image tag | `v2.4.3`
92 | `server.image.pullPolicy` | Prometheus server container image pull policy | `IfNotPresent`
93 | `server.enableAdminApi` | If true, Prometheus administrative HTTP API will be enabled. Please note, that you should take care of administrative API access protection (ingress or some frontend Nginx with auth) before enabling it. | `false`
94 | `server.global.scrape_interval` | How frequently to scrape targets by default | `1m`
95 | `server.global.scrape_timeout` | How long until a scrape request times out | `10s`
96 | `server.global.evaluation_interval` | How frequently to evaluate rules | `1m`
97 | `server.extraArgs` | Additional Prometheus server container arguments | `{}`
98 | `server.prefixURL` | The prefix slug at which the server can be accessed | ``
99 | `server.baseURL` | The external url at which the server can be accessed | ``
100 | `server.extraHostPathMounts` | Additional Prometheus server hostPath mounts | `[]`
101 | `server.extraConfigmapMounts` | Additional Prometheus server configMap mounts | `[]`
102 | `server.extraSecretMounts` | Additional Prometheus server Secret mounts | `[]`
103 | `server.configMapOverrideName` | Prometheus server ConfigMap override where full-name is `{{.Release.Name}}-{{.Values.server.configMapOverrideName}}` and setting this value will prevent the default server ConfigMap from being generated | `""`
104 | `server.ingress.enabled` | If true, Prometheus server Ingress will be created | `false`
105 | `server.ingress.annotations` | Prometheus server Ingress annotations | `[]`
106 | `server.ingress.extraLabels` | Prometheus server Ingress additional labels | `{}`
107 | `server.ingress.hosts` | Prometheus server Ingress hostnames | `[]`
108 | `server.ingress.tls` | Prometheus server Ingress TLS configuration (YAML) | `[]`
109 | `server.nodeSelector` | node labels for Prometheus server pod assignment | `{}`
110 | `server.tolerations` | node taints to tolerate (requires Kubernetes >=1.6) | `[]`
111 | `server.affinity` | pod affinity | `{}`
112 | `server.priorityClassName` | Prometheus server priorityClassName | `nil`
113 | `server.schedulerName` | Prometheus server alternate scheduler name | `nil`
114 | `server.persistentVolume.enabled` | If true, Prometheus server will create a Persistent Volume Claim | `true`
115 | `server.persistentVolume.accessModes` | Prometheus server data Persistent Volume access modes | `[ReadWriteOnce]`
116 | `server.persistentVolume.annotations` | Prometheus server data Persistent Volume annotations | `{}`
117 | `server.persistentVolume.existingClaim` | Prometheus server data Persistent Volume existing claim name | `""`
118 | `server.persistentVolume.mountPath` | Prometheus server data Persistent Volume mount root path | `/data`
119 | `server.persistentVolume.size` | Prometheus server data Persistent Volume size | `8Gi`
120 | `server.persistentVolume.storageClass` | Prometheus server data Persistent Volume Storage Class | `unset`
121 | `server.persistentVolume.subPath` | Subdirectory of Prometheus server data Persistent Volume to mount | `""`
122 | `server.podAnnotations` | annotations to be added to Prometheus server pods | `{}`
123 | `server.deploymentAnnotations` | annotations to be added to Prometheus server deployment | `{}`
124 | `server.replicaCount` | desired number of Prometheus server pods | `1`
125 | `server.resources` | Prometheus server resource requests and limits | `{}`
126 | `server.securityContext` | Custom [security context](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/) for server containers | `{}`
127 | `server.service.annotations` | annotations for Prometheus server service | `{}`
128 | `server.service.clusterIP` | internal Prometheus server cluster service IP | `""`
129 | `server.service.externalIPs` | Prometheus server service external IP addresses | `[]`
130 | `server.service.loadBalancerIP` | IP address to assign to load balancer (if supported) | `""`
131 | `server.service.loadBalancerSourceRanges` | list of IP CIDRs allowed access to load balancer (if supported) | `[]`
132 | `server.service.nodePort` | Port to be used as the service NodePort (ignored if `server.service.type` is not `NodePort`) | `0`
133 | `server.service.servicePort` | Prometheus server service port | `80`
134 | `server.service.type` | type of Prometheus server service to create | `ClusterIP`
135 | `serviceAccounts.alertmanager.create` | If true, create the alertmanager service account | `true`
136 | `serviceAccounts.alertmanager.name` | name of the alertmanager service account to use or create | `{{ prometheus.alertmanager.fullname }}`
137 | `serviceAccounts.kubeStateMetrics.create` | If true, create the kubeStateMetrics service account | `true`
138 | `serviceAccounts.kubeStateMetrics.name` | name of the kubeStateMetrics service account to use or create | `{{ prometheus.kubeStateMetrics.fullname }}`
139 | `serviceAccounts.nodeExporter.create` | If true, create the nodeExporter service account | `true`
140 | `serviceAccounts.nodeExporter.name` | name of the nodeExporter service account to use or create | `{{ prometheus.nodeExporter.fullname }}`
141 | `serviceAccounts.pushgateway.create` | If true, create the pushgateway service account | `true`
142 | `serviceAccounts.pushgateway.name` | name of the pushgateway service account to use or create | `{{ prometheus.pushgateway.fullname }}`
143 | `serviceAccounts.server.create` | If true, create the server service account | `true`
144 | `serviceAccounts.server.name` | name of the server service account to use or create | `{{ prometheus.server.fullname }}`
145 | `server.terminationGracePeriodSeconds` | Prometheus server Pod termination grace period | `300`
146 | `server.retention` | (optional) Prometheus data retention | `""`
147 | `serverFiles.alerts` | Prometheus server alerts configuration | `{}`
148 | `serverFiles.rules` | Prometheus server rules configuration | `{}`
149 | `serverFiles.prometheus.yml` | Prometheus server scrape configuration | example configuration
150 | `extraScrapeConfigs` | Prometheus server additional scrape configuration | `""`
151 | `networkPolicy.enabled` | Enable NetworkPolicy | `false`
152 | `grafana.replicas` | Number of nodes | `1`
153 | `grafana.deploymentStrategy` | Deployment strategy | `RollingUpdate`
154 | `grafana.livenessProbe` | Liveness Probe settings | `{ "httpGet": { "path": "/api/health", "port": 3000 }, "initialDelaySeconds": 60, "timeoutSeconds": 30, "failureThreshold": 10 }`
155 | `grafana.readinessProbe` | Readiness Probe settings | `{ "httpGet": { "path": "/api/health", "port": 3000 } }`
156 | `grafana.securityContext` | Deployment securityContext | `{"runAsUser": 472, "fsGroup": 472}`
157 | `grafana.image.repository` | Image repository | `grafana/grafana`
158 | `grafana.image.tag` | Image tag. (`Must be >= 5.0.0`) | `5.3.4`
159 | `grafana.image.pullPolicy` | Image pull policy | `IfNotPresent`
160 | `grafana.service.type` | Kubernetes service type | `ClusterIP`
161 | `grafana.service.port` | Kubernetes port where service is exposed | `80`
162 | `grafana.service.annotations` | Service annotations | `{}`
163 | `grafana.service.labels` | Custom labels | `{}`
164 | `grafana.ingress.enabled` | Enables Ingress | `false`
165 | `grafana.ingress.annotations` | Ingress annotations | `{}`
166 | `grafana.ingress.labels` | Custom labels | `{}`
167 | `grafana.ingress.hosts` | Ingress accepted hostnames | `[]`
168 | `grafana.ingress.tls` | Ingress TLS configuration | `[]`
169 | `grafana.resources` | CPU/Memory resource requests/limits | `{}`
170 | `grafana.nodeSelector` | Node labels for pod assignment | `{}`
171 | `grafana.tolerations` | Toleration labels for pod assignment | `[]`
172 | `grafana.affinity` | Affinity settings for pod assignment | `{}`
173 | `grafana.schedulerName` | Alternate scheduler name | `nil`
174 | `grafana.env` | Extra environment variables passed to pods | `{}`
175 | `grafana.custom.ini` | Grafana's primary configuration | `{}`
176 | `grafana.annotations` | Deployment annotations | `{}`
177 | `grafana.podAnnotations` | Pod annotations | `{}`
178 |
179 | Specify each parameter using the `--set key=value[,key=value]` argument to `helm install`. For example,
180 |
181 | ```console
182 | $ helm install ./helm --name my-prometheus \
183 | --set server.terminationGracePeriodSeconds=360
184 | ```
185 |
186 | Alternatively, a YAML file that specifies the values for the above parameters can be provided while installing the chart. For example,
187 |
188 | ```console
189 | $ helm install ./helm --name my-prometheus -f values.yaml
190 | ```
191 |
192 | ### ConfigMap Files
193 | AlertManager is configured through [alertmanager.yml](https://prometheus.io/docs/alerting/configuration/). This file (and any others listed in `alertmanagerFiles`) will be mounted into the `alertmanager` pod.
194 |
195 | Prometheus is configured through [prometheus.yml](https://prometheus.io/docs/operating/configuration/). This file (and any others listed in `serverFiles`) will be mounted into the `server` pod.
196 |
197 | ### Ingress TLS
198 | If your cluster allows automatic creation/retrieval of TLS certificates (e.g. [cert manager](https://github.com/jetstack/cert-manager)), please refer to the documentation for that mechanism.
199 |
200 | To manually configure TLS, first create/retrieve a key & certificate pair for the address(es) you wish to protect. Then create a TLS secret in the namespace:
201 |
202 | ```console
203 | kubectl create secret tls prometheus-server-tls --cert=path/to/tls.cert --key=path/to/tls.key
204 | ```
205 |
206 | Include the secret's name, along with the desired hostnames, in the alertmanager/server Ingress TLS section of your custom `values.yaml` file:
207 |
208 | ```yaml
209 | server:
210 | ingress:
211 | ## If true, Prometheus server Ingress will be created
212 | ##
213 | enabled: true
214 |
215 | ## Prometheus server Ingress hostnames
216 | ## Must be provided if Ingress is enabled
217 | ##
218 | hosts:
219 | - prometheus.domain.com
220 |
221 | ## Prometheus server Ingress TLS configuration
222 | ## Secrets must be manually created in the namespace
223 | ##
224 | tls:
225 | - secretName: prometheus-server-tls
226 | hosts:
227 | - prometheus.domain.com
228 | ```
229 |
230 | ### NetworkPolicy
231 |
232 | Enabling Network Policy for Prometheus will secure connections to Alert Manager
233 | and Kube State Metrics by only accepting connections from Prometheus Server.
234 | All inbound connections to Prometheus Server are still allowed.
235 |
236 | To enable network policy for Prometheus, install a networking plugin that
237 | implements the Kubernetes NetworkPolicy spec, and set `networkPolicy.enabled` to true.
238 |
239 | If NetworkPolicy is enabled for Prometheus' scrape targets, you may also need
240 | to manually create a networkpolicy which allows it.
241 |
242 |
243 | __Note__: This chart is based off of the [upstream community chart](https://github.com/helm/charts/tree/master/stable/prometheus).
244 |
--------------------------------------------------------------------------------
/helm/prometheus-chart/grafana-dashboards/kubernetes-cluster.json:
--------------------------------------------------------------------------------
1 | {
2 | "annotations": {
3 | "list": [
4 | {
5 | "builtIn": 1,
6 | "datasource": "-- Grafana --",
7 | "enable": true,
8 | "hide": true,
9 | "iconColor": "rgba(0, 211, 255, 1)",
10 | "name": "Annotations & Alerts",
11 | "type": "dashboard"
12 | }
13 | ]
14 | },
15 | "editable": true,
16 | "gnetId": null,
17 | "graphTooltip": 1,
18 | "hideControls": false,
19 | "id": 2,
20 | "links": [],
21 | "refresh": "5m",
22 | "rows": [
23 | {
24 | "collapse": false,
25 | "height": 250,
26 | "panels": [
27 | {
28 | "aliasColors": {},
29 | "bars": false,
30 | "dashLength": 10,
31 | "dashes": false,
32 | "datasource": "Prometheus",
33 | "decimals": 0,
34 |         "description": "Number of additional pods that can be scheduled on each node.",
35 | "editable": true,
36 | "error": false,
37 | "fill": 1,
38 | "grid": {
39 | "threshold1Color": "rgba(216, 200, 27, 0.27)",
40 | "threshold2Color": "rgba(234, 112, 112, 0.22)"
41 | },
42 | "height": "350",
43 | "id": 3,
44 | "legend": {
45 | "alignAsTable": true,
46 | "avg": false,
47 | "current": true,
48 | "hideEmpty": false,
49 | "hideZero": false,
50 | "max": true,
51 | "min": true,
52 | "rightSide": false,
53 | "show": true,
54 | "total": false,
55 | "values": true
56 | },
57 | "lines": true,
58 | "linewidth": 1,
59 | "links": [],
60 | "nullPointMode": "null as zero",
61 | "percentage": false,
62 | "pointradius": 5,
63 | "points": false,
64 | "renderer": "flot",
65 | "seriesOverrides": [],
66 | "spaceLength": 11,
67 | "span": 12,
68 | "stack": false,
69 | "steppedLine": false,
70 | "targets": [
71 | {
72 | "expr": "(sum(kube_node_status_capacity_pods) by (node)) - (sum(kube_pod_info) by (node))",
73 | "format": "time_series",
74 | "intervalFactor": 2,
75 | "legendFormat": "{{node}}",
76 | "refId": "C"
77 | }
78 | ],
79 | "thresholds": [],
80 | "timeFrom": null,
81 | "timeShift": null,
82 | "title": "Pods Schedulable",
83 | "tooltip": {
84 | "msResolution": false,
85 | "shared": true,
86 | "sort": 0,
87 | "value_type": "individual"
88 | },
89 | "type": "graph",
90 | "xaxis": {
91 | "buckets": null,
92 | "mode": "time",
93 | "name": null,
94 | "show": true,
95 | "values": []
96 | },
97 | "yaxes": [
98 | {
99 | "format": "short",
100 | "logBase": 1,
101 | "min": "0",
102 | "show": true
103 | },
104 | {
105 | "format": "short",
106 | "logBase": 1,
107 | "show": false
108 | }
109 | ]
110 | }
111 | ],
112 | "repeat": null,
113 | "repeatIteration": null,
114 | "repeatRowId": null,
115 | "showTitle": false,
116 | "title": "Dashboard Row",
117 | "titleSize": "h6"
118 | },
119 | {
120 | "collapse": false,
121 | "height": 250,
122 | "panels": [
123 | {
124 | "aliasColors": {},
125 | "bars": false,
126 | "dashLength": 10,
127 | "dashes": false,
128 | "datasource": "Prometheus",
129 | "decimals": 0,
130 | "description": "Number of pods restarting within the Kubernetes cluster.",
131 | "fill": 8,
132 | "height": "350",
133 | "id": 4,
134 | "legend": {
135 | "alignAsTable": true,
136 | "avg": false,
137 | "current": false,
138 | "max": true,
139 | "min": true,
140 | "show": true,
141 | "total": false,
142 | "values": true
143 | },
144 | "lines": true,
145 | "linewidth": 0,
146 | "links": [],
147 | "nullPointMode": "null as zero",
148 | "percentage": false,
149 | "pointradius": 5,
150 | "points": false,
151 | "renderer": "flot",
152 | "seriesOverrides": [],
153 | "spaceLength": 10,
154 | "span": 12,
155 | "stack": false,
156 | "steppedLine": true,
157 | "targets": [
158 | {
159 | "expr": "sum(changes(kube_pod_container_status_restarts_total[5m]))",
160 | "format": "time_series",
161 | "intervalFactor": 2,
162 | "legendFormat": "restarts",
163 | "refId": "A"
164 | }
165 | ],
166 | "thresholds": [],
167 | "timeFrom": null,
168 | "timeShift": null,
169 | "title": "Pods Restarting",
170 | "tooltip": {
171 | "shared": true,
172 | "sort": 0,
173 | "value_type": "individual"
174 | },
175 | "type": "graph",
176 | "xaxis": {
177 | "buckets": null,
178 | "mode": "time",
179 | "name": null,
180 | "show": true,
181 | "values": []
182 | },
183 | "yaxes": [
184 | {
185 | "decimals": 0,
186 | "format": "short",
187 | "label": null,
188 | "logBase": 1,
189 | "max": null,
190 | "min": "0",
191 | "show": true
192 | },
193 | {
194 | "format": "short",
195 | "label": null,
196 | "logBase": 1,
197 | "max": null,
198 | "min": null,
199 | "show": false
200 | }
201 | ]
202 | }
203 | ],
204 | "repeat": null,
205 | "repeatIteration": null,
206 | "repeatRowId": null,
207 | "showTitle": false,
208 | "title": "Dashboard Row",
209 | "titleSize": "h6"
210 | },
211 | {
212 | "collapse": false,
213 | "height": 250,
214 | "panels": [
215 | {
216 | "aliasColors": {},
217 | "bars": false,
218 | "dashLength": 10,
219 | "dashes": false,
220 | "datasource": "Prometheus",
221 | "decimals": 2,
222 | "description": "Error rates of the Kubernetes apiserver grouped by request verb. Note that the graph's values are stacked to have a better idea about the overall error rate within the Kubernetes cluster.",
223 | "editable": true,
224 | "error": false,
225 | "fill": 8,
226 | "grid": {
227 | "threshold1Color": "rgba(216, 200, 27, 0.27)",
228 | "threshold2Color": "rgba(234, 112, 112, 0.22)"
229 | },
230 | "height": "350",
231 | "id": 5,
232 | "isNew": false,
233 | "legend": {
234 | "alignAsTable": true,
235 | "avg": false,
236 | "current": true,
237 | "hideEmpty": false,
238 | "hideZero": false,
239 | "max": true,
240 | "min": true,
241 | "rightSide": false,
242 | "show": true,
243 | "total": false,
244 | "values": true
245 | },
246 | "lines": true,
247 | "linewidth": 0,
248 | "links": [],
249 | "nullPointMode": "null",
250 | "percentage": false,
251 | "pointradius": 5,
252 | "points": false,
253 | "renderer": "flot",
254 | "seriesOverrides": [],
255 | "spaceLength": 10,
256 | "span": 12,
257 | "stack": true,
258 | "steppedLine": true,
259 | "targets": [
260 | {
261 | "expr": "sum(rate(apiserver_request_count{code!~\"2..\"}[5m])) by (verb)",
262 | "format": "time_series",
263 | "intervalFactor": 2,
264 | "legendFormat": "{{verb}}",
265 | "refId": "A",
266 | "step": 40
267 | }
268 | ],
269 | "thresholds": [],
270 | "timeFrom": null,
271 | "timeShift": null,
272 | "title": "Error Rates",
273 | "tooltip": {
274 | "msResolution": false,
275 | "shared": true,
276 | "sort": 0,
277 | "value_type": "individual"
278 | },
279 | "type": "graph",
280 | "xaxis": {
281 | "buckets": null,
282 | "mode": "time",
283 | "name": null,
284 | "show": true,
285 | "values": []
286 | },
287 | "yaxes": [
288 | {
289 | "decimals": 2,
290 | "format": "short",
291 | "label": "",
292 | "logBase": 1,
293 | "max": null,
294 | "min": "0",
295 | "show": true
296 | },
297 | {
298 | "format": "short",
299 | "logBase": 1,
300 | "show": false
301 | }
302 | ]
303 | }
304 | ],
305 | "repeat": null,
306 | "repeatIteration": null,
307 | "repeatRowId": null,
308 | "showTitle": false,
309 | "title": "Dashboard Row",
310 | "titleSize": "h6"
311 | }
312 | ],
313 | "schemaVersion": 14,
314 | "style": "dark",
315 | "tags": [
316 | "kubernetes"
317 | ],
318 | "templating": {
319 | "list": []
320 | },
321 | "time": {
322 | "from": "now-15m",
323 | "to": "now"
324 | },
325 | "timepicker": {
326 | "refresh_intervals": [
327 | "5s",
328 | "10s",
329 | "30s",
330 | "1m",
331 | "5m",
332 | "15m",
333 | "30m",
334 | "1h",
335 | "2h",
336 | "1d"
337 | ],
338 | "time_options": [
339 | "5m",
340 | "15m",
341 | "1h",
342 | "6h",
343 | "12h",
344 | "24h",
345 | "2d",
346 | "7d",
347 | "30d"
348 | ]
349 | },
350 | "timezone": "utc",
351 | "title": "Kubernetes Health",
352 | "version": 1
353 | }
354 |
--------------------------------------------------------------------------------
/helm/prometheus-chart/grafana-dashboards/prometheus.json:
--------------------------------------------------------------------------------
1 | {
2 | "__inputs": [
3 | {
4 | "name": "Prometheus",
5 | "label": "Prometheus",
6 | "description": "",
7 | "type": "datasource",
8 | "pluginId": "prometheus",
9 | "pluginName": "Prometheus"
10 | }
11 | ],
12 | "__requires": [
13 | {
14 | "type": "grafana",
15 | "id": "grafana",
16 | "name": "Grafana",
17 | "version": "4.4.1"
18 | },
19 | {
20 | "type": "panel",
21 | "id": "graph",
22 | "name": "Graph",
23 | "version": ""
24 | },
25 | {
26 | "type": "datasource",
27 | "id": "prometheus",
28 | "name": "Prometheus",
29 | "version": "1.0.0"
30 | }
31 | ],
32 | "annotations": {
33 | "list": []
34 | },
35 | "editable": true,
36 | "gnetId": null,
37 | "graphTooltip": 1,
38 | "links": [],
39 | "panels": [
40 | {
41 | "aliasColors": {},
42 | "bars": false,
43 | "dashLength": 10,
44 | "dashes": false,
45 | "datasource": "Prometheus",
46 | "description": "5 minute rate of samples appended",
47 | "fill": 1,
48 | "gridPos": {
49 | "h": 7,
50 | "w": 18,
51 | "x": 0,
52 | "y": 0
53 | },
54 | "id": 4,
55 | "legend": {
56 | "alignAsTable": true,
57 | "avg": false,
58 | "current": true,
59 | "max": true,
60 | "min": true,
61 | "show": true,
62 | "total": false,
63 | "values": true
64 | },
65 | "lines": true,
66 | "linewidth": 1,
67 | "links": [],
68 | "nullPointMode": "null",
69 | "percentage": false,
70 | "pointradius": 5,
71 | "points": false,
72 | "renderer": "flot",
73 | "seriesOverrides": [],
74 | "spaceLength": 10,
75 | "stack": false,
76 | "steppedLine": false,
77 | "targets": [
78 | {
79 | "expr": "rate(prometheus_tsdb_head_samples_appended_total[5m])",
80 | "format": "time_series",
81 | "intervalFactor": 2,
82 | "legendFormat": "{{ instance }}",
83 | "refId": "A",
84 | "step": 60
85 | }
86 | ],
87 | "thresholds": [],
88 | "timeFrom": null,
89 | "timeShift": null,
90 | "title": "Rate of Samples Appended",
91 | "tooltip": {
92 | "shared": true,
93 | "sort": 0,
94 | "value_type": "individual"
95 | },
96 | "type": "graph",
97 | "xaxis": {
98 | "buckets": null,
99 | "mode": "time",
100 | "name": null,
101 | "show": true,
102 | "values": []
103 | },
104 | "yaxes": [
105 | {
106 | "format": "short",
107 | "label": null,
108 | "logBase": 1,
109 | "max": null,
110 | "min": "0",
111 | "show": true
112 | },
113 | {
114 | "format": "short",
115 | "label": null,
116 | "logBase": 1,
117 | "max": null,
118 | "min": null,
119 | "show": true
120 | }
121 | ],
122 | "yaxis": {
123 | "align": false,
124 | "alignLevel": null
125 | }
126 | },
127 | {
128 | "cacheTimeout": null,
129 | "colorBackground": false,
130 | "colorValue": true,
131 | "colors": [
132 | "rgba(50, 172, 45, 0.97)",
133 | "rgba(237, 129, 40, 0.89)",
134 | "rgba(245, 54, 54, 0.9)"
135 | ],
136 | "datasource": "Prometheus",
137 | "description": "Total number of time series in prometheus",
138 | "format": "none",
139 | "gauge": {
140 | "maxValue": 100,
141 | "minValue": 0,
142 | "show": false,
143 | "thresholdLabels": false,
144 | "thresholdMarkers": true
145 | },
146 | "gridPos": {
147 | "h": 7,
148 | "w": 6,
149 | "x": 18,
150 | "y": 0
151 | },
152 | "id": 13,
153 | "interval": null,
154 | "links": [],
155 | "mappingType": 1,
156 | "mappingTypes": [
157 | {
158 | "name": "value to text",
159 | "value": 1
160 | },
161 | {
162 | "name": "range to text",
163 | "value": 2
164 | }
165 | ],
166 | "maxDataPoints": 100,
167 | "nullPointMode": "connected",
168 | "nullText": null,
169 | "postfix": "",
170 | "postfixFontSize": "50%",
171 | "prefix": "",
172 | "prefixFontSize": "50%",
173 | "rangeMaps": [
174 | {
175 | "from": "null",
176 | "text": "N/A",
177 | "to": "null"
178 | }
179 | ],
180 | "sparkline": {
181 | "fillColor": "rgba(31, 118, 189, 0.18)",
182 | "full": true,
183 | "lineColor": "rgb(31, 120, 193)",
184 | "show": true
185 | },
186 | "tableColumn": "",
187 | "targets": [
188 | {
189 | "expr": "sum(prometheus_tsdb_head_series)",
190 | "format": "time_series",
191 | "intervalFactor": 2,
192 | "refId": "B",
193 | "step": 40
194 | }
195 | ],
196 | "thresholds": "1000000,2000000",
197 | "title": "Total Series",
198 | "type": "singlestat",
199 | "valueFontSize": "100%",
200 | "valueMaps": [
201 | {
202 | "op": "=",
203 | "text": "N/A",
204 | "value": "null"
205 | }
206 | ],
207 | "valueName": "current"
208 | },
209 | {
210 | "aliasColors": {},
211 | "bars": false,
212 | "dashLength": 10,
213 | "dashes": false,
214 | "datasource": "Prometheus",
215 |       "description": "Time taken for rule evaluation",
216 | "fill": 1,
217 | "gridPos": {
218 | "h": 7,
219 | "w": 12,
220 | "x": 0,
221 | "y": 7
222 | },
223 | "id": 5,
224 | "legend": {
225 | "alignAsTable": true,
226 | "avg": false,
227 | "current": true,
228 | "max": true,
229 | "min": true,
230 | "show": true,
231 | "total": false,
232 | "values": true
233 | },
234 | "lines": true,
235 | "linewidth": 1,
236 | "links": [],
237 | "nullPointMode": "null",
238 | "percentage": false,
239 | "pointradius": 5,
240 | "points": false,
241 | "renderer": "flot",
242 | "seriesOverrides": [],
243 | "spaceLength": 10,
244 | "stack": false,
245 | "steppedLine": false,
246 | "targets": [
247 | {
248 | "expr": "prometheus_rule_evaluation_duration_seconds{quantile=\"0.99\"}",
249 | "format": "time_series",
250 | "intervalFactor": 2,
251 | "legendFormat": "{{ instance }} - 0.99 quantile",
252 | "refId": "A",
253 | "step": 120
254 | },
255 | {
256 | "expr": "prometheus_rule_evaluation_duration_seconds{quantile=\"0.9\"}",
257 | "format": "time_series",
258 | "intervalFactor": 2,
259 | "legendFormat": "{{ instance }} - 0.9 quantile",
260 | "refId": "B",
261 | "step": 120
262 | },
263 | {
264 | "expr": "prometheus_rule_evaluation_duration_seconds{quantile=\"0.5\"}",
265 | "format": "time_series",
266 | "intervalFactor": 2,
267 | "legendFormat": "{{ instance }} - 0.5 quantile",
268 | "refId": "C",
269 | "step": 120
270 | }
271 | ],
272 | "thresholds": [],
273 | "timeFrom": null,
274 | "timeShift": null,
275 | "title": "Rule Evaluation Duration",
276 | "tooltip": {
277 | "shared": true,
278 | "sort": 0,
279 | "value_type": "individual"
280 | },
281 | "type": "graph",
282 | "xaxis": {
283 | "buckets": null,
284 | "mode": "time",
285 | "name": null,
286 | "show": true,
287 | "values": []
288 | },
289 | "yaxes": [
290 | {
291 | "format": "s",
292 | "label": null,
293 | "logBase": 1,
294 | "max": null,
295 | "min": "0",
296 | "show": true
297 | },
298 | {
299 | "format": "short",
300 | "label": null,
301 | "logBase": 1,
302 | "max": null,
303 | "min": null,
304 | "show": true
305 | }
306 | ],
307 | "yaxis": {
308 | "align": false,
309 | "alignLevel": null
310 | }
311 | },
312 | {
313 | "aliasColors": {},
314 | "bars": false,
315 | "dashLength": 10,
316 | "dashes": false,
317 | "datasource": "Prometheus",
318 | "description": "TIme taken to send notifications",
319 | "fill": 1,
320 | "gridPos": {
321 | "h": 7,
322 | "w": 12,
323 | "x": 12,
324 | "y": 7
325 | },
326 | "id": 6,
327 | "legend": {
328 | "alignAsTable": true,
329 | "avg": false,
330 | "current": true,
331 | "max": true,
332 | "min": true,
333 | "show": true,
334 | "total": false,
335 | "values": true
336 | },
337 | "lines": true,
338 | "linewidth": 1,
339 | "links": [],
340 | "nullPointMode": "null",
341 | "percentage": false,
342 | "pointradius": 5,
343 | "points": false,
344 | "renderer": "flot",
345 | "seriesOverrides": [],
346 | "spaceLength": 10,
347 | "stack": false,
348 | "steppedLine": false,
349 | "targets": [
350 | {
351 | "expr": "prometheus_notifications_latency_seconds{quantile=\"0.99\"}",
352 | "format": "time_series",
353 | "intervalFactor": 2,
354 | "legendFormat": "{{ instance }} - 0.99 quantile",
355 | "refId": "A",
356 | "step": 120
357 | },
358 | {
359 | "expr": "prometheus_notifications_latency_seconds{quantile=\"0.9\"}",
360 | "format": "time_series",
361 | "intervalFactor": 2,
362 | "legendFormat": "{{ instance }} - 0.9 quantile",
363 | "refId": "B",
364 | "step": 120
365 | },
366 | {
367 | "expr": "prometheus_notifications_latency_seconds{quantile=\"0.5\"}",
368 | "format": "time_series",
369 | "intervalFactor": 2,
370 | "legendFormat": "{{ instance }} - 0.5 quantile",
371 | "refId": "C",
372 | "step": 120
373 | }
374 | ],
375 | "thresholds": [],
376 | "timeFrom": null,
377 | "timeShift": null,
378 | "title": "Notification Latency",
379 | "tooltip": {
380 | "shared": true,
381 | "sort": 0,
382 | "value_type": "individual"
383 | },
384 | "type": "graph",
385 | "xaxis": {
386 | "buckets": null,
387 | "mode": "time",
388 | "name": null,
389 | "show": true,
390 | "values": []
391 | },
392 | "yaxes": [
393 | {
394 | "format": "s",
395 | "label": null,
396 | "logBase": 1,
397 | "max": null,
398 | "min": "0",
399 | "show": true
400 | },
401 | {
402 | "format": "short",
403 | "label": null,
404 | "logBase": 1,
405 | "max": null,
406 | "min": null,
407 | "show": true
408 | }
409 | ],
410 | "yaxis": {
411 | "align": false,
412 | "alignLevel": null
413 | }
414 | },
415 | {
416 | "aliasColors": {},
417 | "bars": false,
418 | "dashLength": 10,
419 | "dashes": false,
420 | "datasource": "Prometheus",
421 | "fill": 1,
422 | "gridPos": {
423 | "h": 7,
424 | "w": 24,
425 | "x": 0,
426 | "y": 14
427 | },
428 | "id": 8,
429 | "legend": {
430 | "alignAsTable": true,
431 | "avg": false,
432 | "current": true,
433 | "max": true,
434 | "min": true,
435 | "show": true,
436 | "total": false,
437 | "values": true
438 | },
439 | "lines": true,
440 | "linewidth": 1,
441 | "links": [],
442 | "nullPointMode": "null",
443 | "percentage": false,
444 | "pointradius": 5,
445 | "points": false,
446 | "renderer": "flot",
447 | "seriesOverrides": [],
448 | "spaceLength": 10,
449 | "stack": false,
450 | "steppedLine": false,
451 | "targets": [
452 | {
453 | "expr": "kube_pod_container_resource_limits_memory_bytes{container=\"prometheus\"}",
454 | "format": "time_series",
455 | "instant": false,
456 | "intervalFactor": 2,
457 | "legendFormat": "memory limit",
458 | "refId": "B"
459 | },
460 | {
461 | "expr": "container_memory_usage_bytes{container_name=\"prometheus\"}",
462 | "format": "time_series",
463 | "instant": false,
464 | "intervalFactor": 2,
465 | "legendFormat": "memory used",
466 | "metric": "container_memory_usage_bytes",
467 | "refId": "A",
468 | "step": 60
469 | }
470 | ],
471 | "thresholds": [],
472 | "timeFrom": null,
473 | "timeShift": null,
474 | "title": "Memory usage/limit",
475 | "tooltip": {
476 | "shared": true,
477 | "sort": 0,
478 | "value_type": "individual"
479 | },
480 | "type": "graph",
481 | "xaxis": {
482 | "buckets": null,
483 | "mode": "time",
484 | "name": null,
485 | "show": true,
486 | "values": []
487 | },
488 | "yaxes": [
489 | {
490 | "format": "bytes",
491 | "label": null,
492 | "logBase": 1,
493 | "max": null,
494 | "min": "0",
495 | "show": true
496 | },
497 | {
498 | "format": "short",
499 | "label": null,
500 | "logBase": 1,
501 | "max": null,
502 | "min": null,
503 | "show": true
504 | }
505 | ],
506 | "yaxis": {
507 | "align": false,
508 | "alignLevel": null
509 | }
510 | },
511 | {
512 | "aliasColors": {},
513 | "bars": false,
514 | "dashLength": 10,
515 | "dashes": false,
516 | "datasource": "Prometheus",
517 | "fill": 1,
518 | "gridPos": {
519 | "h": 7,
520 | "w": 24,
521 | "x": 0,
522 | "y": 21
523 | },
524 | "id": 9,
525 | "legend": {
526 | "alignAsTable": true,
527 | "avg": false,
528 | "current": true,
529 | "max": true,
530 | "min": true,
531 | "show": true,
532 | "total": false,
533 | "values": true
534 | },
535 | "lines": true,
536 | "linewidth": 1,
537 | "links": [],
538 | "nullPointMode": "null",
539 | "percentage": false,
540 | "pointradius": 5,
541 | "points": false,
542 | "renderer": "flot",
543 | "seriesOverrides": [],
544 | "spaceLength": 10,
545 | "stack": false,
546 | "steppedLine": false,
547 | "targets": [
548 | {
549 | "expr": "kube_pod_container_resource_limits_cpu_cores{container=\"prometheus\"}",
550 | "format": "time_series",
551 | "instant": false,
552 | "intervalFactor": 2,
553 | "legendFormat": "cpu limit",
554 | "refId": "B"
555 | },
556 | {
557 | "expr": "sum(rate(container_cpu_usage_seconds_total{container_name=\"prometheus\"}[3m]))",
558 | "format": "time_series",
559 | "instant": false,
560 | "intervalFactor": 2,
561 | "legendFormat": "cpu used",
562 | "metric": "container_cpu_usage_seconds_total",
563 | "refId": "A",
564 | "step": 60
565 | }
566 | ],
567 | "thresholds": [],
568 | "timeFrom": null,
569 | "timeShift": null,
570 | "title": "CPU usage/limit",
571 | "tooltip": {
572 | "shared": true,
573 | "sort": 0,
574 | "value_type": "individual"
575 | },
576 | "type": "graph",
577 | "xaxis": {
578 | "buckets": null,
579 | "mode": "time",
580 | "name": null,
581 | "show": true,
582 | "values": []
583 | },
584 | "yaxes": [
585 | {
586 | "format": "none",
587 | "label": "cores",
588 | "logBase": 1,
589 | "max": null,
590 | "min": null,
591 | "show": true
592 | },
593 | {
594 | "format": "short",
595 | "label": null,
596 | "logBase": 1,
597 | "max": null,
598 | "min": null,
599 | "show": false
600 | }
601 | ],
602 | "yaxis": {
603 | "align": false,
604 | "alignLevel": null
605 | }
606 | }
607 | ],
608 | "refresh": "1m",
609 | "schemaVersion": 16,
610 | "style": "dark",
611 | "tags": [
612 | ],
613 | "templating": {
614 | "list": []
615 | },
616 | "time": {
617 | "from": "now-15m",
618 | "to": "now"
619 | },
620 | "timepicker": {
621 | "refresh_intervals": [
622 | "5s",
623 | "10s",
624 | "30s",
625 | "1m",
626 | "5m",
627 | "15m",
628 | "30m",
629 | "1h",
630 | "2h",
631 | "1d"
632 | ],
633 | "time_options": [
634 | "5m",
635 | "15m",
636 | "1h",
637 | "6h",
638 | "12h",
639 | "24h",
640 | "2d",
641 | "7d",
642 | "30d"
643 | ]
644 | },
645 | "timezone": "utc",
646 | "title": "Prometheus",
647 | "uid": "iWowmlSmk",
648 | "version": 1
649 | }
650 |
--------------------------------------------------------------------------------
/helm/prometheus-chart/prometheus-alerts/alertmanager.rules.yml:
--------------------------------------------------------------------------------
1 | groups:
2 | - name: alertmanager
3 | rules:
4 | - alert: AlertManagerNotificationsFailing
5 | expr: rate(alertmanager_notifications_failed_total[5m]) > 0
6 | for: 10m
7 | labels:
8 | severity: notify
9 | annotations:
10 | description: AlertManager {{ $labels.integration }} notifications are failing.
11 |
--------------------------------------------------------------------------------
/helm/prometheus-chart/prometheus-alerts/configmap.rules.yml:
--------------------------------------------------------------------------------
1 | groups:
2 | - name: configmap
3 | rules:
4 | - alert: ConfigMapCountTooHigh
5 | expr: kube_configmap_info > 1000
6 | for: 15m
7 | labels:
8 | severity: notify
9 | annotations:
10 | description: ConfigMap count too high.
11 |
--------------------------------------------------------------------------------
/helm/prometheus-chart/prometheus-alerts/container.rules.yml:
--------------------------------------------------------------------------------
1 | groups:
2 | - name: container
3 | rules:
4 | - alert: ContainerIsRestartingTooFrequently
5 | expr: increase(kube_pod_container_status_restarts_total{}[1h]) > 5
6 | for: 5m
7 | labels:
8 | severity: notify
9 | annotations:
10 | description: Container {{ $labels.container }} in pod {{ $labels.exported_namespace }}/{{ $labels.pod }} is restarting too often.
11 |
--------------------------------------------------------------------------------
/helm/prometheus-chart/prometheus-alerts/daemonset.rules.yml:
--------------------------------------------------------------------------------
1 | groups:
2 | - name: daemonset
3 | rules:
4 | - alert: DaemonSetNotSatisfied
5 | expr: kube_daemonset_status_number_unavailable{} > 0
6 | for: 15m
7 | labels:
8 | severity: page
9 | annotations:
10 | description: Daemonset {{ $labels.exported_namespace}}/{{ $labels.daemonset }}
11 | is not satisfied.
12 |
13 |
--------------------------------------------------------------------------------
/helm/prometheus-chart/prometheus-alerts/deployment.rules.yml:
--------------------------------------------------------------------------------
1 | groups:
2 | - name: deployment
3 | rules:
4 | - alert: DeploymentNotSatisfied
5 | expr: kube_deployment_status_replicas_unavailable{} > 0
6 | for: 5m
7 | labels:
8 | severity: page
9 | annotations:
10 | description: Deployment {{ $labels.exported_namespace}}/{{ $labels.deployment }}
11 | is not satisfied.
12 |
--------------------------------------------------------------------------------
/helm/prometheus-chart/prometheus-alerts/disk.rules.yml:
--------------------------------------------------------------------------------
1 | groups:
2 | - name: disk
3 | rules:
4 | - alert: PersistentVolumeSpaceTooLow
5 | expr: 100 * node_filesystem_free{mountpoint=~"/rootfs/var/lib/kubelet/.*"} / node_filesystem_size{mountpoint=~"/rootfs/var/lib/kubelet/.*"} < 10
6 | for: 10m
7 | labels:
8 | severity: page
9 | annotations:
10 | description: Persistent volume {{ $labels.mountpoint}} on {{ $labels.instance }} does not have enough free space.
11 |
12 | - alert: RootVolumeSpaceTooLow
13 | expr: 100 * node_filesystem_free{mountpoint="/rootfs"} / node_filesystem_size{mountpoint="/rootfs"} < 10
14 | for: 10m
15 | labels:
16 | severity: page
17 | annotations:
18 | description: Root volume {{ $labels.mountpoint}} on {{ $labels.instance }} does not have enough free space.
19 |
--------------------------------------------------------------------------------
/helm/prometheus-chart/prometheus-alerts/fluentbit.rules.yml:
--------------------------------------------------------------------------------
1 | groups:
2 | - name: fluentbit
3 | rules:
4 | - alert: FluentbitTooManyErrors
5 | expr: rate(fluentbit_output_retries_failed_total[10m]) > 0
6 | for: 10m
7 | labels:
8 | severity: notify
9 | annotations:
10 | description: Fluentbit ({{ $labels.instance }}) is erroring.
11 |
--------------------------------------------------------------------------------
/helm/prometheus-chart/prometheus-alerts/ingress-controller.rules.yml:
--------------------------------------------------------------------------------
groups:
- name: ingress-controller
  rules:
  # More than one failed configuration reload within any 5-minute window.
  - alert: IngressControllerReloadIsFailing
    expr: increase(ingress_controller_errors{count="reloads"}[5m]) > 1
    for: 5m
    labels:
      severity: notify
    annotations:
      description: Ingress Controller cannot reload new configuration. Please check IC logs.

  # ingress_controller_ssl_expire_time_seconds carries the certificate expiry
  # as a unix timestamp (compared against time() below). Averaged across
  # controller instances per host; etcd/api hosts are excluded from the check.
  - alert: IngressControllerSSLCertificateWillExpireSoon
    expr: avg(ingress_controller_ssl_expire_time_seconds{host!~"(etcd|api).*"}) without (instance) < (time() + (10 * 24 * 3600))
    for: 5m
    labels:
      severity: notify
    annotations:
      description: SSL certificate for {{ $labels.host }} will expire in less than 10 days.

--------------------------------------------------------------------------------
/helm/prometheus-chart/prometheus-alerts/job.rules.yml:
--------------------------------------------------------------------------------
groups:
- name: job
  rules:
  # kube-state-metrics sets kube_job_failed{condition="true"} to 1 for Jobs
  # whose Failed condition is set.
  - alert: JobFailed
    expr: kube_job_failed{condition="true"} == 1
    for: 15m
    labels:
      severity: notify
    annotations:
      description: Job {{ $labels.exported_namespace }}/{{ $labels.exported_job }} is failed.

--------------------------------------------------------------------------------
/helm/prometheus-chart/prometheus-alerts/logging-data.rules.yml:
--------------------------------------------------------------------------------
groups:
# Group renamed from "fluentbit" to "logging-data": the old name duplicated
# the group in fluentbit.rules.yml and did not match this file's contents.
- name: logging-data
  rules:
  - alert: LoggingDataNotAvailable
    # threshold value is 3e+8 bytes = 300 MB
    expr: kubelet_volume_stats_used_bytes{persistentvolumeclaim="elasticsearch-data"} < (300 * 1000 * 1000)
    for: 60m
    labels:
      severity: notify
    annotations:
      description: elasticsearch has no or very little log data in its volume.

--------------------------------------------------------------------------------
/helm/prometheus-chart/prometheus-alerts/network.rules.yml:
--------------------------------------------------------------------------------
groups:
- name: network
  rules:
  # Any non-zero DNS error rate sustained for 15 minutes.
  - alert: DNSErrorRateTooHigh
    expr: rate(dns_error_total[15m]) > 0
    for: 15m
    labels:
      severity: notify
    annotations:
      description: DNS error rate is too high for {{ $labels.pod_name }}.

  # Any non-zero generic network error rate sustained for 15 minutes.
  - alert: NetworkErrorRateTooHigh
    expr: rate(network_error_total[15m]) > 0
    for: 15m
    labels:
      severity: notify
    annotations:
      description: Network error rate is too high for {{ $labels.pod_name }}.

  # More than 3 TCP SYN retransmissions per second over 15 minutes,
  # from node-exporter netstat counters (per node).
  - alert: SYNRetransmissionRateTooHigh
    expr: rate(node_netstat_TcpExt_TCPSynRetrans[15m]) > 3
    for: 15m
    labels:
      severity: notify
    annotations:
      description: SYN retransmission rate is too high for {{ $labels.instance }}.

--------------------------------------------------------------------------------
/helm/prometheus-chart/prometheus-alerts/node.rules.yml:
--------------------------------------------------------------------------------
groups:
- name: node
  rules:
  - alert: NodeStateFlapping
    # check for flapping node status
    # "changes" is returning the amount of value changes in the vector:
    # more than 6 Ready-condition transitions within 30 minutes.
    expr: changes(kube_node_status_condition{condition="Ready",status="true"}[30m]) > 6
    for: 5m
    labels:
      severity: notify
    annotations:
      description: Node {{ $labels.node }} status is flapping.

  - alert: NodeHasConstantOOMKills
    # Alert if node has more than 3 OOM kills for last hour (1 every 5 minutes).
    # This is only way to detect OOM kills at the moment.
    expr: increase(node_vmstat_oom_kill{}[1h]) > 3
    for: 10m
    labels:
      severity: notify
    annotations:
      description: Node {{ $labels.ip }} has constant OOM kills.

  - alert: NodeIsUnschedulable
    # NOTE(review): 45m grace period — presumably so routine cordon/drain
    # during maintenance does not alert; confirm before shortening.
    expr: kube_node_spec_unschedulable != 0
    for: 45m
    labels:
      severity: notify
    annotations:
      description: Node {{ $labels.node }} is unschedulable.

--------------------------------------------------------------------------------
/helm/prometheus-chart/prometheus-alerts/pod.rules.yml:
--------------------------------------------------------------------------------
groups:
- name: pod
  rules:
  # A pod in phase Pending for 15 minutes. NOTE(review): typical causes are
  # unschedulable pods (resources, taints) or unbound PVCs — check events.
  - alert: PodStuck
    expr: kube_pod_status_phase{phase="Pending"} == 1
    for: 15m
    labels:
      severity: notify
    annotations:
      description: Pod {{ $labels.exported_namespace }}/{{ $labels.pod }} is stuck in Pending.

--------------------------------------------------------------------------------
/helm/prometheus-chart/prometheus-alerts/prometheus.rules.yml:
--------------------------------------------------------------------------------
groups:
- name: prometheus
  rules:
  - alert: PrometheusCompactionFailed
    # NOTE(review): compares the raw counter, so this only fires after the
    # second failure ever and then stays firing until the counter resets on
    # restart. Consider increase(...[1h]) > 0 for per-incident alerting.
    expr: prometheus_tsdb_compactions_failed_total > 1
    for: 5m
    labels:
      severity: notify
    annotations:
      description: Prometheus compaction has failed.

  - alert: PrometheusCPUUsageTooHigh
    # Aggregate both sides with sum(): the cadvisor series (container_name
    # label) and the kube-state-metrics series (container label) carry
    # different label sets, so the original division (sum() on the left only)
    # had no matching labels and could never produce a sample.
    expr: sum(rate(container_cpu_usage_seconds_total{container_name="prometheus"}[20m])) / sum(kube_pod_container_resource_limits_cpu_cores{container="prometheus"}) > 0.93
    for: 5m
    labels:
      severity: page
    annotations:
      description: Prometheus cpu usage is reaching the threshold. Monitoring of the installation might be unreliable.

  - alert: PrometheusMemoryUsageTooHigh
    # Same label-mismatch fix as PrometheusCPUUsageTooHigh: sum() both sides
    # so the ratio of usage to limit can actually be computed.
    expr: sum(avg_over_time(container_memory_usage_bytes{container_name="prometheus"}[20m])) / sum(kube_pod_container_resource_limits_memory_bytes{container="prometheus"}) > 0.93
    for: 5m
    labels:
      severity: page
    annotations:
      description: Prometheus memory usage is reaching the threshold. Monitoring of the installation might be unreliable.

  - alert: PrometheusWALCorrupted
    expr: prometheus_tsdb_wal_corruptions_total > 1
    for: 5m
    labels:
      severity: notify
    annotations:
      description: Prometheus WAL is corrupted.

  - alert: PrometheusIsRestarting
    # This alert covers the issue when Prometheus restarted (OOM killed) due to
    # lack of resources. Just check if total number of restarts more than 3.
    # In that case, we should be as quick as possible, so Prometheus will be able
    # to alarm before it will be killed by OOM killer.
    expr: kube_pod_container_status_restarts_total{container="prometheus"} > 3
    for: 1m
    labels:
      severity: notify
    annotations:
      description: Prometheus {{ $labels.exported_namespace }}/{{ $labels.pod }} is restarting too much probably due to OOM kills.

--------------------------------------------------------------------------------
/helm/prometheus-chart/prometheus-alerts/up.rules.yml:
--------------------------------------------------------------------------------
1 | groups:
2 | - name: up
3 | rules:
4 | - alert: APIServerDown
5 | expr: up{app="kubernetes"} == 0
6 | for: 10m
7 | labels:
8 | severity: page
9 | annotations:
10 | description: Kubernetes API Server ({{ $labels.instance }}) is down.
11 |
12 | - alert: CadvisorDown
13 | expr: up{app="cadvisor"} == 0
14 | for: 15m
15 | labels:
16 | severity: notify
17 | annotations:
18 | description: Cadvisor ({{ $labels.ip }}) is down.
19 |
20 | - alert: KubeletDown
21 | expr: up{app="kubelet"} == 0
22 | for: 10m
23 | labels:
24 | severity: page
25 | annotations:
26 | description: Kubelet ({{ $labels.ip }}) is down.
27 | opsrecipe: https://github.com/giantswarm/ops-recipes/blob/master/020-kubelet-is-down.md
28 |
29 | - alert: KubeStateMetricsDown
30 | expr: up{app="kube-state-metrics"} == 0
31 | for: 10m
32 | labels:
33 | severity: page
34 | annotations:
35 | description: KubeStateMetrics ({{ $labels.instance }}) is down.
36 |
37 | - alert: NodeExporterDown
38 | expr: up{app="node-exporter"} == 0
39 | for: 10m
40 | labels:
41 | severity: page
42 | annotations:
43 | description: NodeExporter ({{ $labels.ip }}) is down.
44 |
45 | - alert: TargetDown
46 | expr: up{cluster_type="host", app!~"cadvisor|etcd|kubelet|kubernetes|master|node-exporter|worker"} == 0
47 | for: 10m
48 | labels:
49 | severity: page
50 | annotations:
51 | description: Target {{ $labels.namespace }}/{{ $labels.app }} ({{ $labels.instance }}) is down.
52 |
53 | - alert: TargetIsFlapping
54 | expr: changes(up[30m]) > 5
55 | for: 5m
56 | labels:
57 | severity: notify
58 | annotations:
59 | description: Target {{ $labels.app }} ({{ $labels.instance }}) is flapping.
60 |
--------------------------------------------------------------------------------
/helm/prometheus-chart/prometheus-alerts/volume.rules.yml:
--------------------------------------------------------------------------------
1 | groups:
2 | - name: volume
3 | rules:
4 | - alert: VolumeUsedInodes
5 | expr: kubelet_volume_stats_inodes_used{} / kubelet_volume_stats_inodes{} > 0.9
6 | for: 10m
7 | labels:
8 | severity: notify
9 | annotations:
10 | description: Volume {{ $labels.persistentvolumeclaim }} has more than 90% inodes used.
11 |
12 | - alert: VolumeUsedSpace
13 | expr: kubelet_volume_stats_used_bytes{} / kubelet_volume_stats_capacity_bytes{} > 0.9
14 | for: 10m
15 | labels:
16 | severity: notify
17 | annotations:
18 | description: Volume {{ $labels.persistentvolumeclaim }} has more than 90% space used.
19 |
--------------------------------------------------------------------------------
/helm/prometheus-chart/templates/NOTES.txt:
--------------------------------------------------------------------------------
1 | The Prometheus server can be accessed via port {{ .Values.server.service.servicePort }} on the following DNS name from within your cluster:
2 | {{ template "prometheus.server.fullname" . }}.{{ .Release.Namespace }}.svc.cluster.local
3 |
4 | {{ if .Values.server.ingress.enabled -}}
5 | From outside the cluster, the server URL(s) are:
6 | {{- range .Values.server.ingress.hosts }}
7 | http://{{ . }}
8 | {{- end }}
9 | {{- else }}
10 | Get the Prometheus server URL by running these commands in the same shell:
11 | {{- if contains "NodePort" .Values.server.service.type }}
12 | export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ template "prometheus.server.fullname" . }})
13 | export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}")
14 | echo http://$NODE_IP:$NODE_PORT
15 | {{- else if contains "LoadBalancer" .Values.server.service.type }}
16 | NOTE: It may take a few minutes for the LoadBalancer IP to be available.
           You can watch the status of it by running 'kubectl get svc --namespace {{ .Release.Namespace }} -w {{ template "prometheus.server.fullname" . }}'
18 |
19 | export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ template "prometheus.server.fullname" . }} -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
20 | echo http://$SERVICE_IP:{{ .Values.server.service.servicePort }}
21 | {{- else if contains "ClusterIP" .Values.server.service.type }}
22 | export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app={{ template "prometheus.name" . }},component={{ .Values.server.name }}" -o jsonpath="{.items[0].metadata.name}")
23 | kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 9090
24 | {{- end }}
25 | {{- end }}
26 |
27 | {{- if .Values.server.persistentVolume.enabled }}
28 | {{- else }}
29 | #################################################################################
30 | ###### WARNING: Persistence is disabled!!! You will lose your data when #####
31 | ###### the Server pod is terminated. #####
32 | #################################################################################
33 | {{- end }}
34 |
35 | {{ if .Values.alertmanager.enabled }}
36 | The Prometheus alertmanager can be accessed via port {{ .Values.alertmanager.service.servicePort }} on the following DNS name from within your cluster:
37 | {{ template "prometheus.alertmanager.fullname" . }}.{{ .Release.Namespace }}.svc.cluster.local
38 |
39 | {{ if .Values.alertmanager.ingress.enabled -}}
40 | From outside the cluster, the alertmanager URL(s) are:
41 | {{- range .Values.alertmanager.ingress.hosts }}
42 | http://{{ . }}
43 | {{- end }}
44 | {{- else }}
45 | Get the Alertmanager URL by running these commands in the same shell:
46 | {{- if contains "NodePort" .Values.alertmanager.service.type }}
47 | export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ template "prometheus.alertmanager.fullname" . }})
48 | export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}")
49 | echo http://$NODE_IP:$NODE_PORT
50 | {{- else if contains "LoadBalancer" .Values.alertmanager.service.type }}
51 | NOTE: It may take a few minutes for the LoadBalancer IP to be available.
           You can watch the status of it by running 'kubectl get svc --namespace {{ .Release.Namespace }} -w {{ template "prometheus.alertmanager.fullname" . }}'
53 |
54 | export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ template "prometheus.alertmanager.fullname" . }} -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
55 | echo http://$SERVICE_IP:{{ .Values.alertmanager.service.servicePort }}
56 | {{- else if contains "ClusterIP" .Values.alertmanager.service.type }}
57 | export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app={{ template "prometheus.name" . }},component={{ .Values.alertmanager.name }}" -o jsonpath="{.items[0].metadata.name}")
58 | kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 9093
59 | {{- end }}
60 | {{- end }}
61 |
62 | {{- if .Values.alertmanager.persistentVolume.enabled }}
63 | {{- else }}
64 | #################################################################################
65 | ###### WARNING: Persistence is disabled!!! You will lose your data when #####
66 | ###### the AlertManager pod is terminated. #####
67 | #################################################################################
68 | {{- end }}
69 | {{- end }}
70 |
71 | Grafana
72 |
73 | 1. Get your '{{ .Values.grafana.adminUser }}' user password by running:
74 |
75 | kubectl get secret --namespace {{ .Release.Namespace }} {{ template "prometheus.grafana.fullname" . }} -o jsonpath="{.data.admin-password}" | base64 --decode ; echo
76 |
77 | 2. The Grafana server can be accessed via port {{ .Values.grafana.service.port }} on the following DNS name from within your cluster:
78 |
79 | {{ template "prometheus.grafana.fullname" . }}.{{ .Release.Namespace }}.svc.cluster.local
80 | {{ if .Values.grafana.ingress.enabled }}
81 | From outside the cluster, the server URL(s) are:
82 | {{- range .Values.grafana.ingress.hosts }}
83 | http://{{ . }}
84 | {{- end }}
85 | {{ else }}
86 | Get the Grafana URL to visit by running these commands in the same shell:
87 | {{ if contains "NodePort" .Values.grafana.service.type -}}
88 | export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ template "prometheus.grafana.fullname" . }})
89 | export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}")
90 | echo http://$NODE_IP:$NODE_PORT
{{ else if contains "LoadBalancer" .Values.grafana.service.type -}}
     NOTE: It may take a few minutes for the LoadBalancer IP to be available.
     You can watch the status of it by running 'kubectl get svc --namespace {{ .Release.Namespace }} -w {{ template "prometheus.grafana.fullname" . }}'
     export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ template "prometheus.grafana.fullname" . }} -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
     echo http://$SERVICE_IP:{{ .Values.grafana.service.port }}
96 | {{ else if contains "ClusterIP" .Values.grafana.service.type }}
97 | export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app={{ template "prometheus.name" . }},component={{ .Values.grafana.name }}" -o jsonpath="{.items[0].metadata.name}")
98 | kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 3000
99 | {{- end }}
100 | {{- end }}
101 |
102 | 3. Login with the password from step 1 and the username: {{ .Values.grafana.adminUser }}
103 |
104 | For more information on running Prometheus, visit:
105 | https://prometheus.io/
--------------------------------------------------------------------------------
/helm/prometheus-chart/templates/_helpers.tpl:
--------------------------------------------------------------------------------
{{/* vim: set filetype=mustache: */}}
{{/*
Expand the name of the chart.
Uses .Values.nameOverride when set; truncated to 63 chars (DNS label limit).
*/}}
{{- define "prometheus.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
{{- end -}}

{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If the release name already contains the chart name, the release name is
used alone to avoid "name-name" duplication.
*/}}
{{- define "prometheus.fullname" -}}
{{- if .Values.fullnameOverride -}}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- $name := default .Chart.Name .Values.nameOverride -}}
{{- if contains $name .Release.Name -}}
{{- .Release.Name | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{- end -}}
{{- end -}}
25 |
26 | {{/*
27 | Create a fully qualified grafana name.
28 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
29 | */}}
30 |
31 | {{- define "prometheus.alertmanager.fullname" -}}
32 | {{- if .Values.alertmanager.fullnameOverride -}}
33 | {{- .Values.alertmanager.fullnameOverride | trunc 63 | trimSuffix "-" -}}
34 | {{- else -}}
35 | {{- $name := default .Chart.Name .Values.nameOverride -}}
36 | {{- if contains $name .Release.Name -}}
37 | {{- printf "%s-%s" .Release.Name .Values.alertmanager.name | trunc 63 | trimSuffix "-" -}}
38 | {{- else -}}
39 | {{- printf "%s-%s-%s" .Release.Name $name .Values.alertmanager.name | trunc 63 | trimSuffix "-" -}}
40 | {{- end -}}
41 | {{- end -}}
42 | {{- end -}}
43 |
44 |
45 | {{- define "prometheus.grafana.fullname" -}}
46 | {{- if .Values.grafana.fullnameOverride -}}
47 | {{- .Values.grafana.fullnameOverride | trunc 63 | trimSuffix "-" -}}
48 | {{- else -}}
49 | {{- $name := default .Chart.Name .Values.nameOverride -}}
50 | {{- if contains $name .Release.Name -}}
51 | {{- printf "%s-%s" .Release.Name .Values.grafana.name | trunc 63 | trimSuffix "-" -}}
52 | {{- else -}}
53 | {{- printf "%s-%s-%s" .Release.Name $name .Values.grafana.name | trunc 63 | trimSuffix "-" -}}
54 | {{- end -}}
55 | {{- end -}}
56 | {{- end -}}
57 |
58 |
59 | {{/*
60 | Create a fully qualified kube-state-metrics name.
61 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
62 | */}}
63 | {{- define "prometheus.kubeStateMetrics.fullname" -}}
64 | {{- if .Values.kubeStateMetrics.fullnameOverride -}}
65 | {{- .Values.kubeStateMetrics.fullnameOverride | trunc 63 | trimSuffix "-" -}}
66 | {{- else -}}
67 | {{- $name := default .Chart.Name .Values.nameOverride -}}
68 | {{- if contains $name .Release.Name -}}
69 | {{- printf "%s-%s" .Release.Name .Values.kubeStateMetrics.name | trunc 63 | trimSuffix "-" -}}
70 | {{- else -}}
71 | {{- printf "%s-%s-%s" .Release.Name $name .Values.kubeStateMetrics.name | trunc 63 | trimSuffix "-" -}}
72 | {{- end -}}
73 | {{- end -}}
74 | {{- end -}}
75 |
76 | {{/*
77 | Create a fully qualified node-exporter name.
78 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
79 | */}}
80 | {{- define "prometheus.nodeExporter.fullname" -}}
81 | {{- if .Values.nodeExporter.fullnameOverride -}}
82 | {{- .Values.nodeExporter.fullnameOverride | trunc 63 | trimSuffix "-" -}}
83 | {{- else -}}
84 | {{- $name := default .Chart.Name .Values.nameOverride -}}
85 | {{- if contains $name .Release.Name -}}
86 | {{- printf "%s-%s" .Release.Name .Values.nodeExporter.name | trunc 63 | trimSuffix "-" -}}
87 | {{- else -}}
88 | {{- printf "%s-%s-%s" .Release.Name $name .Values.nodeExporter.name | trunc 63 | trimSuffix "-" -}}
89 | {{- end -}}
90 | {{- end -}}
91 | {{- end -}}
92 |
93 | {{/*
94 | Create a fully qualified Prometheus server name.
95 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
96 | */}}
97 | {{- define "prometheus.server.fullname" -}}
98 | {{- if .Values.server.fullnameOverride -}}
99 | {{- .Values.server.fullnameOverride | trunc 63 | trimSuffix "-" -}}
100 | {{- else -}}
101 | {{- $name := default .Chart.Name .Values.nameOverride -}}
102 | {{- if contains $name .Release.Name -}}
103 | {{- printf "%s-%s" .Release.Name .Values.server.name | trunc 63 | trimSuffix "-" -}}
104 | {{- else -}}
105 | {{- printf "%s-%s-%s" .Release.Name $name .Values.server.name | trunc 63 | trimSuffix "-" -}}
106 | {{- end -}}
107 | {{- end -}}
108 | {{- end -}}
109 |
110 | {{/*
111 | Create a fully qualified pushgateway name.
112 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
113 | */}}
114 | {{- define "prometheus.pushgateway.fullname" -}}
115 | {{- if .Values.pushgateway.fullnameOverride -}}
116 | {{- .Values.pushgateway.fullnameOverride | trunc 63 | trimSuffix "-" -}}
117 | {{- else -}}
118 | {{- $name := default .Chart.Name .Values.nameOverride -}}
119 | {{- if contains $name .Release.Name -}}
120 | {{- printf "%s-%s" .Release.Name .Values.pushgateway.name | trunc 63 | trimSuffix "-" -}}
121 | {{- else -}}
122 | {{- printf "%s-%s-%s" .Release.Name $name .Values.pushgateway.name | trunc 63 | trimSuffix "-" -}}
123 | {{- end -}}
124 | {{- end -}}
125 | {{- end -}}
126 |
127 | {{/*
128 | Return the appropriate apiVersion for networkpolicy.
129 | */}}
130 | {{- define "prometheus.networkPolicy.apiVersion" -}}
131 | {{- if semverCompare ">=1.4-0, <1.7-0" .Capabilities.KubeVersion.GitVersion -}}
132 | {{- print "extensions/v1beta1" -}}
133 | {{- else if semverCompare "^1.7-0" .Capabilities.KubeVersion.GitVersion -}}
134 | {{- print "networking.k8s.io/v1" -}}
135 | {{- end -}}
136 | {{- end -}}
137 |
138 |
--------------------------------------------------------------------------------
/helm/prometheus-chart/templates/alertmanager-configmap.yaml:
--------------------------------------------------------------------------------
{{- if and .Values.alertmanager.enabled (empty .Values.alertmanager.configMapOverrideName) -}}
{{- /*
ConfigMap exposing every entry of .Values.alertmanagerFiles as one file
(map key -> file name). Not rendered at all when the user supplies
alertmanager.configMapOverrideName.
*/ -}}
apiVersion: v1
kind: ConfigMap
metadata:
  labels:
    app: {{ template "prometheus.name" . }}
    chart: {{ .Chart.Name }}-{{ .Chart.Version }}
    component: "{{ .Values.alertmanager.name }}"
    heritage: {{ .Release.Service }}
    release: {{ .Release.Name }}
  name: {{ template "prometheus.alertmanager.fullname" . }}
data:
{{- $root := . -}}
{{- range $key, $value := .Values.alertmanagerFiles }}
  {{ $key }}: |
{{ toYaml $value | default "{}" | indent 4 }}
{{- end -}}
{{- end -}}

--------------------------------------------------------------------------------
/helm/prometheus-chart/templates/alertmanager-deployment.yaml:
--------------------------------------------------------------------------------
1 | {{- if .Values.alertmanager.enabled -}}
2 | apiVersion: extensions/v1beta1
3 | kind: Deployment
4 | metadata:
5 | labels:
6 | app: {{ template "prometheus.name" . }}
7 | chart: {{ .Chart.Name }}-{{ .Chart.Version }}
8 | component: "{{ .Values.alertmanager.name }}"
9 | heritage: {{ .Release.Service }}
10 | release: {{ .Release.Name }}
11 | name: {{ template "prometheus.alertmanager.fullname" . }}
12 | spec:
13 | replicas: {{ .Values.alertmanager.replicaCount }}
14 | {{- if .Values.server.strategy }}
15 | strategy:
16 | {{ toYaml .Values.server.strategy | indent 4 }}
17 | {{- end }}
18 | template:
19 | metadata:
20 | {{- if .Values.alertmanager.podAnnotations }}
21 | annotations:
22 | {{ toYaml .Values.alertmanager.podAnnotations | indent 8 }}
23 | {{- end }}
24 | labels:
25 | app: {{ template "prometheus.name" . }}
26 | component: "{{ .Values.alertmanager.name }}"
27 | release: {{ .Release.Name }}
28 | spec:
29 | {{- if .Values.alertmanager.affinity }}
30 | affinity:
31 | {{ toYaml .Values.alertmanager.affinity | indent 8 }}
32 | {{- end }}
33 | {{- if .Values.alertmanager.schedulerName }}
34 | schedulerName: "{{ .Values.alertmanager.schedulerName }}"
35 | {{- end }}
36 | serviceAccountName: alertmanager
37 | {{- if .Values.alertmanager.priorityClassName }}
38 | priorityClassName: "{{ .Values.alertmanager.priorityClassName }}"
39 | {{- end }}
40 | containers:
41 | - name: {{ template "prometheus.name" . }}-{{ .Values.alertmanager.name }}
42 | image: "{{ .Values.alertmanager.image.repository }}:{{ .Values.alertmanager.image.tag }}"
43 | imagePullPolicy: "{{ .Values.alertmanager.image.pullPolicy }}"
44 | env:
45 | {{- range $key, $value := .Values.alertmanager.extraEnv }}
46 | - name: {{ $key }}
47 | value: {{ $value }}
48 | {{- end }}
49 | - name: POD_IP
50 | valueFrom:
51 | fieldRef:
52 | apiVersion: v1
53 | fieldPath: status.podIP
54 | args:
55 | - --config.file=/etc/config/alertmanager.yml
56 | - --storage.path={{ .Values.alertmanager.persistentVolume.mountPath }}
57 | - --cluster.advertise-address=$(POD_IP):6783
58 | {{- range $key, $value := .Values.alertmanager.extraArgs }}
59 | - --{{ $key }}={{ $value }}
60 | {{- end }}
61 | {{- if .Values.alertmanager.baseURL }}
62 | - --web.external-url={{ .Values.alertmanager.baseURL }}
63 | {{- end }}
64 |
65 | ports:
66 | - containerPort: 9093
67 | readinessProbe:
68 | httpGet:
69 | path: {{ .Values.alertmanager.prefixURL }}/#/status
70 | port: 9093
71 | initialDelaySeconds: 30
72 | timeoutSeconds: 30
73 | resources:
74 | {{ toYaml .Values.alertmanager.resources | indent 12 }}
75 | volumeMounts:
76 | - name: config-volume
77 | mountPath: /etc/config
78 | - name: storage-volume
79 | mountPath: "{{ .Values.alertmanager.persistentVolume.mountPath }}"
80 | subPath: "{{ .Values.alertmanager.persistentVolume.subPath }}"
81 |
82 | - name: {{ template "prometheus.name" . }}-{{ .Values.alertmanager.name }}-{{ .Values.configmapReload.name }}
83 | image: "{{ .Values.configmapReload.image.repository }}:{{ .Values.configmapReload.image.tag }}"
84 | imagePullPolicy: "{{ .Values.configmapReload.image.pullPolicy }}"
85 | args:
86 | - --volume-dir=/etc/config
87 | - --webhook-url=http://localhost:9093{{ .Values.alertmanager.prefixURL }}/-/reload
88 | resources:
89 | {{ toYaml .Values.configmapReload.resources | indent 12 }}
90 | volumeMounts:
91 | - name: config-volume
92 | mountPath: /etc/config
93 | readOnly: true
94 | {{- if .Values.imagePullSecrets }}
95 | imagePullSecrets:
96 | {{ toYaml .Values.imagePullSecrets | indent 2 }}
97 | {{- end }}
98 | {{- if .Values.alertmanager.nodeSelector }}
99 | nodeSelector:
100 | {{ toYaml .Values.alertmanager.nodeSelector | indent 8 }}
101 | {{- end }}
102 | {{- if .Values.alertmanager.securityContext }}
103 | securityContext:
104 | {{ toYaml .Values.alertmanager.securityContext | indent 8 }}
105 | {{- end }}
106 | {{- if .Values.alertmanager.tolerations }}
107 | tolerations:
108 | {{ toYaml .Values.alertmanager.tolerations | indent 8 }}
109 | {{- end }}
110 | {{- if .Values.alertmanager.affinity }}
111 | affinity:
112 | {{ toYaml .Values.alertmanager.affinity | indent 8 }}
113 | {{- end }}
114 | volumes:
115 | - name: config-volume
116 | configMap:
117 | name: {{ if .Values.alertmanager.configMapOverrideName }}{{ .Release.Name }}-{{ .Values.alertmanager.configMapOverrideName }}{{- else }}{{ template "prometheus.alertmanager.fullname" . }}{{- end }}
118 | - name: storage-volume
119 | {{- if .Values.alertmanager.persistentVolume.enabled }}
120 | persistentVolumeClaim:
121 | claimName: {{ if .Values.alertmanager.persistentVolume.existingClaim }}{{ .Values.alertmanager.persistentVolume.existingClaim }}{{- else }}{{ template "prometheus.alertmanager.fullname" . }}{{- end }}
122 | {{- else }}
123 | emptyDir: {}
124 | {{- end -}}
125 | {{- end }}
126 |
--------------------------------------------------------------------------------
/helm/prometheus-chart/templates/alertmanager-ingress.yaml:
--------------------------------------------------------------------------------
{{- if and .Values.alertmanager.enabled .Values.alertmanager.ingress.enabled -}}
{{- /*
Ingress for the Alertmanager service. Each entry of
.Values.alertmanager.ingress.hosts may carry a path suffix
("host.example.com/some/path"): the first "/"-separated element becomes
the host, the remainder the HTTP path.
*/ -}}
{{- $releaseName := .Release.Name -}}
{{- $serviceName := include "prometheus.alertmanager.fullname" . }}
{{- $servicePort := .Values.alertmanager.service.servicePort -}}
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
{{- if .Values.alertmanager.ingress.annotations }}
  annotations:
{{ toYaml .Values.alertmanager.ingress.annotations | indent 4 }}
{{- end }}
  labels:
    app: {{ template "prometheus.name" . }}
    chart: {{ .Chart.Name }}-{{ .Chart.Version }}
    component: "{{ .Values.alertmanager.name }}"
    heritage: {{ .Release.Service }}
    release: {{ .Release.Name }}
{{- range $key, $value := .Values.alertmanager.ingress.extraLabels }}
    {{ $key }}: {{ $value }}
{{- end }}
  name: {{ template "prometheus.alertmanager.fullname" . }}
spec:
  rules:
  {{- range .Values.alertmanager.ingress.hosts }}
  {{- $url := splitList "/" . }}
  - host: {{ first $url }}
    http:
      paths:
      - path: /{{ rest $url | join "/" }}
        backend:
          serviceName: {{ $serviceName }}
          servicePort: {{ $servicePort }}
  {{- end -}}
{{- if .Values.alertmanager.ingress.tls }}
  tls:
{{ toYaml .Values.alertmanager.ingress.tls | indent 4 }}
{{- end -}}
{{- end -}}

--------------------------------------------------------------------------------
/helm/prometheus-chart/templates/alertmanager-networkpolicy.yaml:
--------------------------------------------------------------------------------
{{- if .Values.networkPolicy.enabled }}
apiVersion: {{ template "prometheus.networkPolicy.apiVersion" . }}
kind: NetworkPolicy
metadata:
  name: {{ template "prometheus.alertmanager.fullname" . }}
  labels:
    app: {{ template "prometheus.name" . }}
    chart: {{ .Chart.Name }}-{{ .Chart.Version }}
    component: "{{ .Values.alertmanager.name }}"
    heritage: {{ .Release.Service }}
    release: {{ .Release.Name }}
spec:
  podSelector:
    matchLabels:
      app: {{ template "prometheus.name" . }}
      component: "{{ .Values.alertmanager.name }}"
      release: {{ .Release.Name }}
  ingress:
  # Single rule combining `from` and `ports`: only this release's Prometheus
  # server pods may reach Alertmanager, and only on port 9093. The previous
  # version listed `from` and `ports` as two SEPARATE rules; NetworkPolicy
  # rules are OR-ed, so the ports-only rule admitted port 9093 from any peer.
  - from:
    - podSelector:
        matchLabels:
          release: {{ .Release.Name }}
          component: "{{ .Values.server.name }}"
    ports:
    - port: 9093
{{- end }}

--------------------------------------------------------------------------------
/helm/prometheus-chart/templates/alertmanager-pvc.yaml:
--------------------------------------------------------------------------------
{{- if and .Values.alertmanager.enabled .Values.alertmanager.persistentVolume.enabled -}}
{{- if not .Values.alertmanager.persistentVolume.existingClaim -}}
{{- /*
PVC backing Alertmanager storage. Only rendered when persistence is
enabled and no existing claim is referenced. A storageClass value of "-"
selects storageClassName: "" (i.e. disables dynamic provisioning).
*/ -}}
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
{{- if .Values.alertmanager.persistentVolume.annotations }}
  annotations:
{{ toYaml .Values.alertmanager.persistentVolume.annotations | indent 4 }}
{{- end }}
  labels:
    app: {{ template "prometheus.name" . }}
    chart: {{ .Chart.Name }}-{{ .Chart.Version }}
    component: "{{ .Values.alertmanager.name }}"
    heritage: {{ .Release.Service }}
    release: {{ .Release.Name }}
  name: {{ template "prometheus.alertmanager.fullname" . }}
spec:
  accessModes:
{{ toYaml .Values.alertmanager.persistentVolume.accessModes | indent 4 }}
{{- if .Values.alertmanager.persistentVolume.storageClass }}
{{- if (eq "-" .Values.alertmanager.persistentVolume.storageClass) }}
  storageClassName: ""
{{- else }}
  storageClassName: "{{ .Values.alertmanager.persistentVolume.storageClass }}"
{{- end }}
{{- end }}
  resources:
    requests:
      storage: "{{ .Values.alertmanager.persistentVolume.size }}"
{{- end -}}
{{- end -}}

--------------------------------------------------------------------------------
/helm/prometheus-chart/templates/alertmanager-service.yaml:
--------------------------------------------------------------------------------
1 | {{- if .Values.alertmanager.enabled -}}
2 | apiVersion: v1
3 | kind: Service
4 | metadata:
5 | {{- if .Values.alertmanager.service.annotations }}
6 | annotations:
7 | {{ toYaml .Values.alertmanager.service.annotations | indent 4 }}
8 | {{- end }}
9 | labels:
10 | app: {{ template "prometheus.name" . }}
11 | chart: {{ .Chart.Name }}-{{ .Chart.Version }}
12 | component: "{{ .Values.alertmanager.name }}"
13 | heritage: {{ .Release.Service }}
14 | release: {{ .Release.Name }}
15 | {{- if .Values.alertmanager.service.labels }}
16 | {{ toYaml .Values.alertmanager.service.labels | indent 4 }}
17 | {{- end }}
18 | name: {{ template "prometheus.alertmanager.fullname" . }}
19 | spec:
20 | {{- if .Values.alertmanager.service.clusterIP }}
21 | clusterIP: {{ .Values.alertmanager.service.clusterIP }}
22 | {{- end }}
23 | {{- if .Values.alertmanager.service.externalIPs }}
24 | externalIPs:
25 | {{ toYaml .Values.alertmanager.service.externalIPs | indent 4 }}
26 | {{- end }}
27 | {{- if .Values.alertmanager.service.loadBalancerIP }}
28 | loadBalancerIP: {{ .Values.alertmanager.service.loadBalancerIP }}
29 | {{- end }}
30 | {{- if .Values.alertmanager.service.loadBalancerSourceRanges }}
31 | loadBalancerSourceRanges:
32 | {{- range $cidr := .Values.alertmanager.service.loadBalancerSourceRanges }}
33 | - {{ $cidr }}
34 | {{- end }}
35 | {{- end }}
36 | ports:
37 | - name: http
38 | port: {{ .Values.alertmanager.service.servicePort }}
39 | protocol: TCP
40 | targetPort: 9093  # alertmanager web/API container port (see alertmanager-networkpolicy)
41 | {{- if .Values.alertmanager.service.nodePort }}
42 | nodePort: {{ .Values.alertmanager.service.nodePort }}
43 | {{- end }}
44 | {{- if .Values.alertmanager.service.enableMeshPeer }}
45 | - name: meshpeer
46 | port: 6783  # NOTE(review): presumably the alertmanager mesh/HA peering port — confirm the deployed image exposes it
47 | protocol: TCP
48 | targetPort: 6783
49 | {{- end }}
50 | selector:  # app+component+release narrows routing to the alertmanager pods only
51 | app: {{ template "prometheus.name" . }}
52 | component: "{{ .Values.alertmanager.name }}"
53 | release: {{ .Release.Name }}
54 | type: "{{ .Values.alertmanager.service.type }}"
55 | {{- end }}
56 |
--------------------------------------------------------------------------------
/helm/prometheus-chart/templates/alertmanager-serviceaccount.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ServiceAccount
3 | metadata:
4 | labels:
5 | app: {{ template "prometheus.name" . }}
6 | chart: {{ .Chart.Name }}-{{ .Chart.Version }}
7 | component: "{{ .Values.alertmanager.name }}"
8 | heritage: {{ .Release.Service }}
9 | release: {{ .Release.Name }}
10 | name: alertmanager  # NOTE(review): hardcoded, not release-scoped — two releases in one namespace collide; presumably referenced literally by the alertmanager deployment, verify before renaming
11 |
--------------------------------------------------------------------------------
/helm/prometheus-chart/templates/grafana-configmap.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | apiVersion: v1
3 | kind: ConfigMap
4 | metadata:
5 | name: {{ template "prometheus.grafana.fullname" . }}
6 | labels:
7 | app: {{ template "prometheus.name" . }}
8 | component: "{{ .Values.grafana.name }}"
9 | chart: {{ .Chart.Name }}-{{ .Chart.Version }}
10 | release: {{ .Release.Name }}
11 | heritage: {{ .Release.Service }}
12 | data:
13 | grafana.ini: |
14 | {{- range $key, $value := index .Values.grafana "grafana.ini" }}
15 | [{{ $key }}]
16 | {{- range $elem, $elemVal := $value }}
17 | {{ $elem }} = {{ $elemVal }}
18 | {{- end }}
19 | {{- end }}
20 | ---
21 | apiVersion: v1
22 | kind: ConfigMap
23 | metadata:
24 | name: grafana-datasources  # NOTE(review): not release-scoped — grafana-deployment mounts this literal name; two releases per namespace collide
25 | labels:
26 | app: {{ template "prometheus.name" . }}
27 | chart: {{ .Chart.Name }}-{{ .Chart.Version }}
28 | release: {{ .Release.Name }}
29 | heritage: {{ .Release.Service }}
30 | data:
31 | datasources.yaml: |
32 | apiVersion: 1
33 | datasources:
34 | - name: Prometheus
35 | type: prometheus
36 | access: proxy
37 | orgId: 1
38 | url: http://{{ template "prometheus.server.fullname" . }}:{{ .Values.server.service.servicePort }}
39 | basicAuth: false
40 | withCredentials: false
41 | isDefault: true
42 | editable: false
43 | version: 1
44 | - name: Elasticsearch
45 | type: elasticsearch
46 | access: proxy
47 | orgId: 1
48 | url: http://elasticsearch:9200
49 | basicAuth: false
50 | withCredentials: false
51 | isDefault: false
52 | editable: false
53 | version: 1
54 | database: "[gslogs-]YYYY.MM.DD"
55 | jsonData:
56 | interval: Daily
57 | timeField: '@timestamp'
58 | ---
59 | apiVersion: v1
60 | kind: ConfigMap
61 | metadata:
62 | name: grafana-dashboards-main  # NOTE(review): literal name, mounted as-is by grafana-deployment
63 | labels:
64 | app: {{ template "prometheus.name" . }}
65 | chart: {{ .Chart.Name }}-{{ .Chart.Version }}
66 | release: {{ .Release.Name }}
67 | heritage: {{ .Release.Service }}
68 | data:
69 | dashboards.yaml: |
70 | apiVersion: 1
71 | providers:
72 | - name: 'default'
73 | orgId: 1
74 | folder: ''
75 | type: file
76 | disableDeletion: true
77 | options:
78 | path: /etc/grafana/provisioning/dashboards-json
79 | ---
80 | apiVersion: v1
81 | kind: ConfigMap
82 | metadata:
83 | name: grafana-dashboards-json  # NOTE(review): literal name, mounted as-is by grafana-deployment
84 | labels:
85 | app: {{ template "prometheus.name" . }}
86 | chart: {{ .Chart.Name }}-{{ .Chart.Version }}
87 | release: {{ .Release.Name }}
88 | heritage: {{ .Release.Service }}
89 | data:
90 | {{ (.Files.Glob "grafana-dashboards/*").AsConfig | indent 4 }}
91 |
--------------------------------------------------------------------------------
/helm/prometheus-chart/templates/grafana-deployment.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1beta2  # NOTE(review): apps/v1beta2 was removed in k8s 1.16 — migrate to apps/v1 when target clusters allow
2 | kind: Deployment
3 | metadata:
4 | name: {{ template "prometheus.grafana.fullname" . }}
5 | labels:
6 | app: {{ template "prometheus.name" . }}
7 | component: "{{ .Values.grafana.name }}"
8 | chart: {{ .Chart.Name }}-{{ .Chart.Version }}
9 | release: {{ .Release.Name }}
10 | heritage: {{ .Release.Service }}
11 | {{- with .Values.grafana.annotations }}
12 | annotations:
13 | {{ toYaml . | indent 4 }}
14 | {{- end }}
15 | spec:
16 | replicas: {{ .Values.grafana.replicas }}
17 | selector:  # NOTE: selector change is rejected on in-place upgrade of existing releases (selectors are immutable)
18 | matchLabels:
19 | app: {{ template "prometheus.name" . }}
20 | component: "{{ .Values.grafana.name }}"  # fix: app+release alone also matches this chart's prometheus/alertmanager pods
21 | release: {{ .Release.Name }}
22 | strategy:
23 | type: {{ .Values.grafana.deploymentStrategy }}
24 | {{- if ne .Values.grafana.deploymentStrategy "RollingUpdate" }}
25 | rollingUpdate: null
26 | {{- end }}
27 | template:
28 | metadata:
29 | labels:
30 | app: {{ template "prometheus.name" . }}
31 | component: "{{ .Values.grafana.name }}"
32 | release: {{ .Release.Name }}
33 | {{- with .Values.grafana.podAnnotations }}
34 | annotations:
35 | {{ toYaml . | indent 8 }}
36 | {{- end }}
37 | spec:
38 | serviceAccountName: grafana  # matches the hardcoded name in grafana-serviceaccount.yaml
39 | {{- if .Values.grafana.schedulerName }}
40 | schedulerName: "{{ .Values.grafana.schedulerName }}"
41 | {{- end }}
42 | {{- if .Values.grafana.securityContext }}
43 | securityContext:
44 | {{ toYaml .Values.grafana.securityContext | indent 8 }}
45 | {{- end }}
46 | {{- if .Values.grafana.dashboards }}
47 | initContainers:
48 | - name: download-dashboards
49 | image: "{{ .Values.grafana.downloadDashboardsImage.repository }}:{{ .Values.grafana.downloadDashboardsImage.tag }}"
50 | imagePullPolicy: {{ .Values.grafana.downloadDashboardsImage.pullPolicy }}
51 | command: ["sh", "/etc/grafana/download_dashboards.sh"]
52 | volumeMounts:
53 | - name: config
54 | mountPath: "/etc/grafana/download_dashboards.sh"
55 | subPath: download_dashboards.sh
56 | - name: storage
57 | mountPath: "/var/lib/grafana"
58 | subPath: {{ .Values.grafana.persistence.subPath }}
59 | {{- range .Values.grafana.extraSecretMounts }}
60 | - name: {{ .name }}
61 | mountPath: {{ .mountPath }}
62 | readOnly: {{ .readOnly }}
63 | {{- end }}
64 | {{- end }}
65 | {{- if .Values.grafana.image.pullSecrets }}
66 | imagePullSecrets:
67 | {{- range .Values.grafana.image.pullSecrets }}
68 | - name: {{ . }}
69 | {{- end}}
70 | {{- end }}
71 | containers:
72 | - name: {{ .Chart.Name }}
73 | image: "{{ .Values.grafana.image.repository }}:{{ .Values.grafana.image.tag }}"
74 | imagePullPolicy: {{ .Values.grafana.image.pullPolicy }}
75 | volumeMounts:
76 | - name: config
77 | mountPath: "/etc/grafana/grafana.ini"
78 | subPath: grafana.ini
79 | - name: ldap
80 | mountPath: "/etc/grafana/ldap.toml"
81 | subPath: ldap.toml
82 | # Data sources to provision on startup
83 | - name: datasources
84 | mountPath: /etc/grafana/provisioning/datasources
85 | # Main dashboard provisioning file directory
86 | - name: dashboards-main
87 | mountPath: /etc/grafana/provisioning/dashboards
88 | # Individual dashboards JSON directory
89 | - name: dashboards-json
90 | mountPath: /etc/grafana/provisioning/dashboards-json
91 | ports:
92 | - name: service
93 | containerPort: {{ .Values.grafana.service.port }}
94 | protocol: TCP
95 | - name: grafana
96 | containerPort: 3000
97 | protocol: TCP
98 | env:
99 | - name: GF_SECURITY_ADMIN_USER
100 | valueFrom:
101 | secretKeyRef:
102 | name: {{ template "prometheus.grafana.fullname" . }}
103 | key: admin-user
104 | - name: GF_SECURITY_ADMIN_PASSWORD
105 | valueFrom:
106 | secretKeyRef:
107 | name: {{ template "prometheus.grafana.fullname" . }}
108 | key: admin-password
109 | livenessProbe:
110 | {{ toYaml .Values.grafana.livenessProbe | indent 12 }}
111 | readinessProbe:
112 | {{ toYaml .Values.grafana.readinessProbe | indent 12 }}
113 | resources:
114 | {{ toYaml .Values.grafana.resources | indent 12 }}
115 | {{- with .Values.grafana.nodeSelector }}
116 | nodeSelector:
117 | {{ toYaml . | indent 8 }}
118 | {{- end }}
119 | {{- with .Values.grafana.affinity }}
120 | affinity:
121 | {{ toYaml . | indent 8 }}
122 | {{- end }}
123 | {{- with .Values.grafana.tolerations }}
124 | tolerations:
125 | {{ toYaml . | indent 8 }}
126 | {{- end }}
127 | volumes:
128 | - name: config
129 | configMap:
130 | name: {{ template "prometheus.grafana.fullname" . }}
131 | - name: ldap
132 | secret:
133 | secretName: {{ template "prometheus.grafana.fullname" . }}
134 | items:
135 | - key: ldap-toml
136 | path: ldap.toml
137 | - name: datasources
138 | configMap:
139 | name: grafana-datasources
140 | - name: dashboards-main
141 | configMap:
142 | name: grafana-dashboards-main
143 | - name: dashboards-json
144 | configMap:
145 | name: grafana-dashboards-json
146 |
--------------------------------------------------------------------------------
/helm/prometheus-chart/templates/grafana-ingress.yaml:
--------------------------------------------------------------------------------
1 | {{- if .Values.grafana.ingress.enabled -}}
2 | {{- $fullName := include "prometheus.grafana.fullname" . -}}
3 | {{- $servicePort := .Values.grafana.service.port -}}
4 | {{- $ingressPath := .Values.grafana.ingress.path -}}
5 | apiVersion: extensions/v1beta1  # NOTE(review): removed in k8s 1.22 — networking.k8s.io/v1 is the successor
6 | kind: Ingress
7 | metadata:
8 | name: {{ $fullName }}
9 | labels:
10 | app: {{ template "prometheus.name" . }}
11 | component: "{{ .Values.grafana.name }}"
12 | chart: {{ .Chart.Name }}-{{ .Chart.Version }}
13 | release: {{ .Release.Name }}
14 | heritage: {{ .Release.Service }}
15 | {{- if .Values.grafana.ingress.labels }}
16 | {{ toYaml .Values.grafana.ingress.labels | indent 4 }}
17 | {{- end }}
18 | {{- with .Values.grafana.ingress.annotations }}
19 | annotations:
20 | {{ toYaml . | indent 4 }}
21 | {{- end }}
22 | spec:
23 | {{- if .Values.grafana.ingress.tls }}
24 | tls:
25 | {{- range .Values.grafana.ingress.tls }}
26 | - hosts:
27 | {{- range .hosts }}
28 | - {{ . | quote }}
29 | {{- end }}
30 | secretName: {{ .secretName }}
31 | {{- end }}
32 | {{- end }}
33 | rules:
34 | {{- range .Values.grafana.ingress.hosts }}
35 | - host: {{ . }}  # one rule per entry in grafana.ingress.hosts, all sharing the single configured path
36 | http:
37 | paths:
38 | - path: {{ $ingressPath }}
39 | backend:
40 | serviceName: {{ $fullName }}
41 | servicePort: {{ $servicePort }}
42 | {{- end }}
43 | {{- end }}
44 |
--------------------------------------------------------------------------------
/helm/prometheus-chart/templates/grafana-podsecuritypolicy.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: extensions/v1beta1  # NOTE(review): PodSecurityPolicy was removed in k8s 1.25 — plan migration to Pod Security admission
2 | kind: PodSecurityPolicy
3 | metadata:
4 | name: {{ template "prometheus.grafana.fullname" . }}
5 | labels:
6 | app: {{ template "prometheus.name" . }}
7 | component: "{{ .Values.grafana.name }}"
8 | chart: {{ .Chart.Name }}-{{ .Chart.Version }}
9 | heritage: {{ .Release.Service }}
10 | release: {{ .Release.Name }}
11 | annotations:
12 | seccomp.security.alpha.kubernetes.io/allowedProfileNames: 'docker/default'
13 | apparmor.security.beta.kubernetes.io/allowedProfileNames: 'runtime/default'
14 | seccomp.security.alpha.kubernetes.io/defaultProfileName: 'docker/default'
15 | apparmor.security.beta.kubernetes.io/defaultProfileName: 'runtime/default'
16 | spec:
17 | privileged: false
18 | allowPrivilegeEscalation: false
19 | requiredDropCapabilities:
20 | - ALL  # drop every Linux capability
21 | volumes:
22 | - 'configMap'
23 | - 'emptyDir'
24 | - 'projected'
25 | - 'secret'
26 | - 'downwardAPI'
27 | - 'persistentVolumeClaim'
28 | hostNetwork: false
29 | hostIPC: false
30 | hostPID: false
31 | runAsUser:
32 | rule: 'RunAsAny'  # no UID restriction
33 | seLinux:
34 | rule: 'RunAsAny'
35 | supplementalGroups:
36 | rule: 'RunAsAny'
37 | fsGroup:
38 | rule: 'RunAsAny'
39 | readOnlyRootFilesystem: false  # NOTE(review): could likely be tightened to true — verify grafana writes nothing outside its volumes first
40 |
--------------------------------------------------------------------------------
/helm/prometheus-chart/templates/grafana-role.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: rbac.authorization.k8s.io/v1beta1  # NOTE(review): rbac v1beta1 removed in k8s 1.22 — use rbac.authorization.k8s.io/v1
2 | kind: Role
3 | metadata:
4 | name: {{ template "prometheus.grafana.fullname" . }}
5 | labels:
6 | app: {{ template "prometheus.name" . }}
7 | component: "{{ .Values.grafana.name }}"
8 | chart: {{ .Chart.Name }}-{{ .Chart.Version }}
9 | heritage: {{ .Release.Service }}
10 | release: {{ .Release.Name }}
11 | rules:
12 | - apiGroups: ['extensions']
13 | resources: ['podsecuritypolicies']
14 | verbs: ['use']
15 | resourceNames: [{{ template "prometheus.grafana.fullname" . }}]  # "use" is granted only for this chart's grafana PSP
16 |
--------------------------------------------------------------------------------
/helm/prometheus-chart/templates/grafana-rolebinding.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: rbac.authorization.k8s.io/v1beta1
2 | kind: RoleBinding
3 | metadata:
4 | name: {{ template "prometheus.grafana.fullname" . }}
5 | labels:
6 | app: {{ template "prometheus.name" . }}
7 | component: "{{ .Values.grafana.name }}"
8 | chart: {{ .Chart.Name }}-{{ .Chart.Version }}
9 | heritage: {{ .Release.Service }}
10 | release: {{ .Release.Name }}
11 | roleRef:
12 | apiGroup: rbac.authorization.k8s.io
13 | kind: Role
14 | name: {{ template "prometheus.grafana.fullname" . }}
15 | subjects:
16 | - kind: ServiceAccount
17 | name: grafana  # matches the hardcoded name in grafana-serviceaccount.yaml
18 | namespace: {{ .Release.Namespace }}  # fix: ServiceAccount subjects require a namespace (consistent with prometheus-clusterrolebinding)
19 |
--------------------------------------------------------------------------------
/helm/prometheus-chart/templates/grafana-secret.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Secret
3 | metadata:
4 | name: {{ template "prometheus.grafana.fullname" . }}
5 | labels:
6 | app: {{ template "prometheus.name" . }}
7 | component: "{{ .Values.grafana.name }}"
8 | chart: {{ .Chart.Name }}-{{ .Chart.Version }}
9 | release: {{ .Release.Name }}
10 | heritage: {{ .Release.Service }}
11 | type: Opaque
12 | data:
13 | admin-user: {{ .Values.grafana.adminUser | b64enc | quote }}
14 | {{- if .Values.grafana.adminPassword }}
15 | admin-password: {{ .Values.grafana.adminPassword | b64enc | quote }}
16 | {{- else }}
17 | admin-password: {{ randAlphaNum 40 | b64enc | quote }}  # NOTE(review): re-randomized on every render — helm upgrade rotates the password when adminPassword is unset; confirm this is intended
18 | {{- end }}
19 |
--------------------------------------------------------------------------------
/helm/prometheus-chart/templates/grafana-service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 | name: {{ template "prometheus.grafana.fullname" . }}
5 | labels:
6 | app: {{ template "prometheus.name" . }}
7 | component: "{{ .Values.grafana.name }}"
8 | chart: {{ .Chart.Name }}-{{ .Chart.Version }}
9 | release: {{ .Release.Name }}
10 | heritage: {{ .Release.Service }}
11 | {{- if .Values.grafana.service.labels }}
12 | {{ toYaml .Values.grafana.service.labels | indent 4 }}
13 | {{- end }}
14 | {{- with .Values.grafana.service.annotations }}
15 | annotations:
16 | {{ toYaml . | indent 4 }}
17 | {{- end }}
18 | spec:
19 | {{- if (or (eq .Values.grafana.service.type "ClusterIP") (empty .Values.grafana.service.type)) }}
20 | type: ClusterIP
21 | {{- if .Values.grafana.service.clusterIP }}
22 | clusterIP: {{ .Values.grafana.service.clusterIP }}
23 | {{- end }}
24 | {{- else if eq .Values.grafana.service.type "LoadBalancer" }}
25 | type: {{ .Values.grafana.service.type }}
26 | {{- if .Values.grafana.service.loadBalancerIP }}
27 | loadBalancerIP: {{ .Values.grafana.service.loadBalancerIP }}
28 | {{- end }}
29 | {{- if .Values.grafana.service.loadBalancerSourceRanges }}
30 | loadBalancerSourceRanges:
31 | {{ toYaml .Values.grafana.service.loadBalancerSourceRanges | indent 4 }}
32 | {{- end -}}
33 | {{- else }}
34 | type: {{ .Values.grafana.service.type }}
35 | {{- end }}
36 | {{- if .Values.grafana.service.externalIPs }}
37 | externalIPs:
38 | {{ toYaml .Values.grafana.service.externalIPs | indent 4 }}
39 | {{- end }}
40 | ports:
41 | - name: service
42 | port: {{ .Values.grafana.service.port }}
43 | protocol: TCP
44 | targetPort: 3000  # grafana's default listen port
45 | {{ if (and (eq .Values.grafana.service.type "NodePort") (not (empty .Values.grafana.service.nodePort))) }}
46 | nodePort: {{.Values.grafana.service.nodePort}}
47 | {{ end }}
48 | selector:
49 | app: {{ template "prometheus.name" . }}
50 | component: "{{ .Values.grafana.name }}"  # fix: app+release alone also matches the prometheus server and alertmanager pods of this release
51 | release: {{ .Release.Name }}
52 |
--------------------------------------------------------------------------------
/helm/prometheus-chart/templates/grafana-serviceaccount.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ServiceAccount
3 | metadata:
4 | labels:
5 | app: {{ template "prometheus.name" . }}
6 | component: "{{ .Values.grafana.name }}"
7 | chart: {{ .Chart.Name }}-{{ .Chart.Version }}
8 | heritage: {{ .Release.Service }}
9 | release: {{ .Release.Name }}
10 | name: grafana  # NOTE(review): hardcoded, not release-scoped; referenced literally by grafana-deployment (serviceAccountName) and grafana-rolebinding
11 |
--------------------------------------------------------------------------------
/helm/prometheus-chart/templates/prometheus-clusterrole.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: rbac.authorization.k8s.io/v1beta1
2 | kind: ClusterRole
3 | metadata:
4 | labels:
5 | app: {{ template "prometheus.name" . }}
6 | chart: {{ .Chart.Name }}-{{ .Chart.Version }}
7 | component: "{{ .Values.server.name }}"
8 | heritage: {{ .Release.Service }}
9 | release: {{ .Release.Name }}
10 | name: {{ template "prometheus.server.fullname" . }}
11 | rules:
12 | - apiGroups:
13 | - ""  # core API group; ingresses are not a core resource and are granted under "extensions" below
14 | resources:
15 | - nodes
16 | - nodes/proxy
17 | - services
18 | - endpoints
19 | - pods
20 | verbs:
21 | - get
22 | - list
23 | - watch
24 | - apiGroups:
25 | - ""
26 | resources:
27 | - configmaps
28 | verbs:
29 | - get
30 | - apiGroups:
31 | - "extensions"
32 | resources:
33 | - ingresses/status
34 | - ingresses
35 | verbs:
36 | - get
37 | - list
38 | - watch
39 | - nonResourceURLs:
40 | - "/metrics"
41 | verbs:
42 | - get
43 |
--------------------------------------------------------------------------------
/helm/prometheus-chart/templates/prometheus-clusterrolebinding.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: rbac.authorization.k8s.io/v1beta1
2 | kind: ClusterRoleBinding
3 | metadata:
4 | labels:
5 | app: {{ template "prometheus.name" . }}
6 | chart: {{ .Chart.Name }}-{{ .Chart.Version }}
7 | component: "{{ .Values.server.name }}"
8 | heritage: {{ .Release.Service }}
9 | release: {{ .Release.Name }}
10 | name: {{ template "prometheus.server.fullname" . }}
11 | subjects:
12 | - kind: ServiceAccount
13 | name: prometheus  # hardcoded — must match serviceAccountName in prometheus-deployment.yaml
14 | namespace: {{ .Release.Namespace }}
15 | roleRef:
16 | apiGroup: rbac.authorization.k8s.io
17 | kind: ClusterRole
18 | name: {{ template "prometheus.server.fullname" . }}
19 |
--------------------------------------------------------------------------------
/helm/prometheus-chart/templates/prometheus-configmap.yaml:
--------------------------------------------------------------------------------
1 | {{- if (empty .Values.server.configMapOverrideName) -}}
2 | apiVersion: v1
3 | kind: ConfigMap
4 | metadata:
5 | labels:
6 | app: {{ template "prometheus.name" . }}
7 | chart: {{ .Chart.Name }}-{{ .Chart.Version }}
8 | component: "{{ .Values.server.name }}"
9 | heritage: {{ .Release.Service }}
10 | release: {{ .Release.Name }}
11 | name: {{ template "prometheus.server.fullname" . }}  # whole manifest is skipped when server.configMapOverrideName is set (line 1)
12 | data:
13 | {{- $root := . -}}
14 | {{- range $key, $value := .Values.serverFiles }}
15 | {{ $key }}: |
16 | {{- if eq $key "prometheus.yml" }}
17 | global:
18 | {{ $root.Values.server.global | toYaml | indent 6 }}
19 | {{- end }}
20 | {{ toYaml $value | default "{}" | indent 4 }}
21 | {{- if eq $key "prometheus.yml" -}}
22 | {{- if $root.Values.extraScrapeConfigs }}
23 | {{ tpl $root.Values.extraScrapeConfigs $root | indent 4 }}
24 | {{- end -}}
25 | {{- if $root.Values.alertmanager.enabled }}
26 | alerting:
27 | alertmanagers:
28 | - kubernetes_sd_configs:
29 | - role: pod
30 | tls_config:
31 | ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
32 | bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
33 | {{- if $root.Values.alertmanager.prefixURL }}
34 | path_prefix: {{ $root.Values.alertmanager.prefixURL }}
35 | {{- end }}
36 | relabel_configs:
37 | - source_labels: [__meta_kubernetes_namespace]
38 | regex: {{ $root.Release.Namespace }}
39 | action: keep
40 | - source_labels: [__meta_kubernetes_pod_label_app]
41 | regex: {{ template "prometheus.name" $root }}
42 | action: keep
43 | - source_labels: [__meta_kubernetes_pod_label_component]
44 | regex: alertmanager
45 | action: keep
46 | - source_labels: [__meta_kubernetes_pod_container_port_number]
47 | regex:
48 | action: drop
49 | {{- end -}}
50 | {{- end -}}
51 | {{- end -}}
52 | {{- end -}}
53 |
--------------------------------------------------------------------------------
/helm/prometheus-chart/templates/prometheus-deployment.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: extensions/v1beta1  # NOTE(review): Deployment under extensions/v1beta1 was removed in k8s 1.16 — migrate to apps/v1 when target clusters allow
2 | kind: Deployment
3 | metadata:
4 | {{- if .Values.server.deploymentAnnotations }}
5 | annotations:
6 | {{ toYaml .Values.server.deploymentAnnotations | indent 4 }}
7 | {{- end }}
8 | labels:
9 | app: {{ template "prometheus.name" . }}
10 | chart: {{ .Chart.Name }}-{{ .Chart.Version }}
11 | component: "{{ .Values.server.name }}"
12 | heritage: {{ .Release.Service }}
13 | release: {{ .Release.Name }}
14 | name: {{ template "prometheus.server.fullname" . }}
15 | spec:
16 | replicas: {{ .Values.server.replicaCount }}
17 | {{- if .Values.server.strategy }}
18 | strategy:
19 | {{ toYaml .Values.server.strategy | indent 4 }}
20 | {{- end }}
21 | template:
22 | metadata:
23 | {{- if .Values.server.podAnnotations }}
24 | annotations:
25 | {{ toYaml .Values.server.podAnnotations | indent 8 }}
26 | {{- end }}
27 | labels:
28 | app: {{ template "prometheus.name" . }}
29 | component: "{{ .Values.server.name }}"
30 | release: {{ .Release.Name }}
31 | spec:
32 | {{- if .Values.server.affinity }}
33 | affinity:
34 | {{ toYaml .Values.server.affinity | indent 8 }}
35 | {{- end }}
36 | {{- if .Values.server.priorityClassName }}
37 | priorityClassName: "{{ .Values.server.priorityClassName }}"
38 | {{- end }}
39 | {{- if .Values.server.schedulerName }}
40 | schedulerName: "{{ .Values.server.schedulerName }}"
41 | {{- end }}
42 | serviceAccountName: prometheus  # hardcoded — bound to the cluster role via prometheus-clusterrolebinding
43 | {{- if .Values.initChownData.enabled }}
44 | initContainers:
45 | - name: "{{ .Values.initChownData.name }}"
46 | image: "{{ .Values.initChownData.image.repository }}:{{ .Values.initChownData.image.tag }}"
47 | imagePullPolicy: "{{ .Values.initChownData.image.pullPolicy }}"
48 | resources:
49 | {{ toYaml .Values.initChownData.resources | indent 12 }}
50 | # 65534 is the nobody user that prometheus uses.
51 | command: ["chown", "-R", "65534:65534", "{{ .Values.server.persistentVolume.mountPath }}"]
52 | volumeMounts:
53 | - name: storage-volume
54 | mountPath: {{ .Values.server.persistentVolume.mountPath }}
55 | subPath: "{{ .Values.server.persistentVolume.subPath }}"
56 | {{- end }}
57 | containers:
58 | - name: {{ template "prometheus.name" . }}-{{ .Values.server.name }}-{{ .Values.configmapReload.name }}
59 | image: "{{ .Values.configmapReload.image.repository }}:{{ .Values.configmapReload.image.tag }}"
60 | imagePullPolicy: "{{ .Values.configmapReload.image.pullPolicy }}"
61 | args:
62 | - --volume-dir=/etc/config
63 | - --webhook-url=http://127.0.0.1:9090{{ .Values.server.prefixURL }}/-/reload
64 | {{- range $key, $value := .Values.configmapReload.extraArgs }}
65 | - --{{ $key }}={{ $value }}
66 | {{- end }}
67 | {{- range .Values.configmapReload.extraVolumeDirs }}
68 | - --volume-dir={{ . }}
69 | {{- end }}
70 | resources:
71 | {{ toYaml .Values.configmapReload.resources | indent 12 }}
72 | volumeMounts:
73 | - name: config-volume
74 | mountPath: /etc/config
75 | readOnly: true
76 | {{- range .Values.configmapReload.extraConfigmapMounts }}
77 | - name: {{ $.Values.configmapReload.name }}-{{ .name }}
78 | mountPath: {{ .mountPath }}
79 | subPath: {{ .subPath }}
80 | readOnly: {{ .readOnly }}
81 | {{- end }}
82 |
83 | - name: {{ template "prometheus.name" . }}-{{ .Values.server.name }}
84 | image: "{{ .Values.server.image.repository }}:{{ .Values.server.image.tag }}"
85 | imagePullPolicy: "{{ .Values.server.image.pullPolicy }}"
86 | args:
87 | {{- if .Values.server.retention }}
88 | - --storage.tsdb.retention={{ .Values.server.retention }}
89 | {{- end }}
90 | - --config.file=/etc/config/prometheus.yml
91 | - --storage.tsdb.path={{ .Values.server.persistentVolume.mountPath }}
92 | - --web.console.libraries=/etc/prometheus/console_libraries
93 | - --web.console.templates=/etc/prometheus/consoles
94 | - --web.enable-lifecycle
95 | {{- range $key, $value := .Values.server.extraArgs }}
96 | - --{{ $key }}={{ $value }}
97 | {{- end }}
98 | {{- if .Values.server.baseURL }}
99 | - --web.external-url={{ .Values.server.baseURL }}
100 | {{- end }}
101 | {{- if .Values.server.enableAdminApi }}
102 | - --web.enable-admin-api
103 | {{- end }}
104 | ports:
105 | - containerPort: 9090
106 | readinessProbe:
107 | httpGet:
108 | path: {{ .Values.server.prefixURL }}/-/ready
109 | port: 9090
110 | initialDelaySeconds: 30
111 | timeoutSeconds: 30
112 | livenessProbe:
113 | httpGet:
114 | path: {{ .Values.server.prefixURL }}/-/healthy
115 | port: 9090
116 | initialDelaySeconds: 30
117 | timeoutSeconds: 30
118 | resources:
119 | {{ toYaml .Values.server.resources | indent 12 }}
120 | volumeMounts:
121 | - name: rules-volume
122 | mountPath: /etc/prometheus-rules
123 | - name: config-volume
124 | mountPath: /etc/config
125 | - name: storage-volume
126 | mountPath: {{ .Values.server.persistentVolume.mountPath }}
127 | subPath: "{{ .Values.server.persistentVolume.subPath }}"
128 | {{- range .Values.server.extraHostPathMounts }}
129 | - name: {{ .name }}
130 | mountPath: {{ .mountPath }}
131 | subPath: {{ .subPath }}
132 | readOnly: {{ .readOnly }}
133 | {{- end }}
134 | {{- range .Values.server.extraConfigmapMounts }}
135 | - name: {{ $.Values.server.name }}-{{ .name }}
136 | mountPath: {{ .mountPath }}
137 | subPath: {{ .subPath }}
138 | readOnly: {{ .readOnly }}
139 | {{- end }}
140 | {{- range .Values.server.extraSecretMounts }}
141 | - name: {{ .name }}
142 | mountPath: {{ .mountPath }}
143 | subPath: {{ .subPath }}
144 | readOnly: {{ .readOnly }}
145 | {{- end }}
146 | {{- if .Values.imagePullSecrets }}
147 | imagePullSecrets:  # NOTE(review): rendered with "indent 2" below, shallower than sibling keys (indent 8) — verify rendered output
148 | {{ toYaml .Values.imagePullSecrets | indent 2 }}
149 | {{- end }}
150 | {{- if .Values.server.nodeSelector }}
151 | nodeSelector:
152 | {{ toYaml .Values.server.nodeSelector | indent 8 }}
153 | {{- end }}
154 | {{- if .Values.server.securityContext }}
155 | securityContext:
156 | {{ toYaml .Values.server.securityContext | indent 8 }}
157 | {{- end }}
158 | {{- if .Values.server.tolerations }}
159 | tolerations:
160 | {{ toYaml .Values.server.tolerations | indent 8 }}
161 | {{- end }}
162 | {{- if .Values.server.affinity }}
163 | affinity:
164 | {{ toYaml .Values.server.affinity | indent 8 }}
165 | {{- end }}
166 | terminationGracePeriodSeconds: {{ .Values.server.terminationGracePeriodSeconds }}
167 | volumes:
168 | - name: rules-volume
169 | configMap:
170 | name: prometheus-rules  # literal name — must match the ConfigMap in prometheus-rules.yaml
171 | - name: config-volume
172 | configMap:
173 | name: {{ if .Values.server.configMapOverrideName }}{{ .Release.Name }}-{{ .Values.server.configMapOverrideName }}{{- else }}{{ template "prometheus.server.fullname" . }}{{- end }}
174 | - name: storage-volume
175 | {{- if .Values.server.persistentVolume.enabled }}
176 | persistentVolumeClaim:
177 | claimName: {{ if .Values.server.persistentVolume.existingClaim }}{{ .Values.server.persistentVolume.existingClaim }}{{- else }}{{ template "prometheus.server.fullname" . }}{{- end }}
178 | {{- else }}
179 | emptyDir: {}
180 | {{- end -}}
181 | {{- range .Values.server.extraHostPathMounts }}
182 | - name: {{ .name }}
183 | hostPath:
184 | path: {{ .hostPath }}
185 | {{- end }}
186 | {{- range .Values.configmapReload.extraConfigmapMounts }}
187 | - name: {{ $.Values.configmapReload.name }}-{{ .name }}
188 | configMap:
189 | name: {{ .configMap }}
190 | {{- end }}
191 | {{- range .Values.server.extraConfigmapMounts }}
192 | - name: {{ $.Values.server.name }}-{{ .name }}
193 | configMap:
194 | name: {{ .configMap }}
195 | {{- end }}
196 | {{- range .Values.server.extraSecretMounts }}
197 | - name: {{ .name }}
198 | secret:
199 | secretName: {{ .secretName }}
200 | {{- end }}
201 |
--------------------------------------------------------------------------------
/helm/prometheus-chart/templates/prometheus-ingress.yaml:
--------------------------------------------------------------------------------
1 | {{- if .Values.server.ingress.enabled -}}
2 | {{- $releaseName := .Release.Name -}}
3 | {{- $serviceName := include "prometheus.server.fullname" . }}
4 | {{- $servicePort := .Values.server.service.servicePort -}}
5 | apiVersion: extensions/v1beta1  # NOTE(review): removed in k8s 1.22 — networking.k8s.io/v1 is the successor
6 | kind: Ingress
7 | metadata:
8 | {{- if .Values.server.ingress.annotations }}
9 | annotations:
10 | {{ toYaml .Values.server.ingress.annotations | indent 4 }}
11 | {{- end }}
12 | labels:
13 | app: {{ template "prometheus.name" . }}
14 | chart: {{ .Chart.Name }}-{{ .Chart.Version }}
15 | component: "{{ .Values.server.name }}"
16 | heritage: {{ .Release.Service }}
17 | release: {{ .Release.Name }}
18 | {{- range $key, $value := .Values.server.ingress.extraLabels }}
19 | {{ $key }}: {{ $value }}
20 | {{- end }}
21 | name: {{ template "prometheus.server.fullname" . }}
22 | spec:
23 | rules:
24 | {{- range .Values.server.ingress.hosts }}
25 | {{- $url := splitList "/" . }}
26 | - host: {{ first $url }}  # each hosts entry may embed a path ("host/sub/path"); first segment is the host
27 | http:
28 | paths:
29 | - path: /{{ rest $url | join "/" }}  # remaining segments become the backend path
30 | backend:
31 | serviceName: {{ $serviceName }}
32 | servicePort: {{ $servicePort }}
33 | {{- end -}}
34 | {{- if .Values.server.ingress.tls }}
35 | tls:
36 | {{ toYaml .Values.server.ingress.tls | indent 4 }}
37 | {{- end -}}
38 | {{- end -}}
39 |
--------------------------------------------------------------------------------
/helm/prometheus-chart/templates/prometheus-networkpolicy.yaml:
--------------------------------------------------------------------------------
1 | {{- if .Values.networkPolicy.enabled }}
2 | apiVersion: {{ template "prometheus.networkPolicy.apiVersion" . }}
3 | kind: NetworkPolicy
4 | metadata:
5 | name: {{ template "prometheus.server.fullname" . }}
6 | labels:
7 | app: {{ template "prometheus.name" . }}
8 | chart: {{ .Chart.Name }}-{{ .Chart.Version }}
9 | component: "{{ .Values.server.name }}"
10 | heritage: {{ .Release.Service }}
11 | release: {{ .Release.Name }}
12 | spec:
13 | podSelector:
14 | matchLabels:
15 | app: {{ template "prometheus.name" . }}
16 | component: "{{ .Values.server.name }}"
17 | release: {{ .Release.Name }}
18 | ingress:
19 | - ports:  # rule has no "from" clause, so 9090 is admitted from any source; only other ports are blocked
20 | - port: 9090
21 | {{- end }}
22 |
--------------------------------------------------------------------------------
/helm/prometheus-chart/templates/prometheus-pvc.yaml:
--------------------------------------------------------------------------------
{{- /*
PersistentVolumeClaim backing the Prometheus server's data directory.
Rendered only when persistence is enabled AND no pre-existing claim is
supplied via .Values.server.persistentVolume.existingClaim.
*/ -}}
{{- if .Values.server.persistentVolume.enabled -}}
{{- if not .Values.server.persistentVolume.existingClaim -}}
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
{{- if .Values.server.persistentVolume.annotations }}
  annotations:
{{ toYaml .Values.server.persistentVolume.annotations | indent 4 }}
{{- end }}
  labels:
    app: {{ template "prometheus.name" . }}
    chart: {{ .Chart.Name }}-{{ .Chart.Version }}
    component: "{{ .Values.server.name }}"
    heritage: {{ .Release.Service }}
    release: {{ .Release.Name }}
  name: {{ template "prometheus.server.fullname" . }}
spec:
  accessModes:
{{ toYaml .Values.server.persistentVolume.accessModes | indent 4 }}
{{- if .Values.server.persistentVolume.storageClass }}
{{- /* A literal "-" is the conventional sentinel for "explicitly no storage class". */ -}}
{{- if (eq "-" .Values.server.persistentVolume.storageClass) }}
  storageClassName: ""
{{- else }}
  storageClassName: "{{ .Values.server.persistentVolume.storageClass }}"
{{- end }}
{{- end }}
  resources:
    requests:
      storage: "{{ .Values.server.persistentVolume.size }}"
{{- end -}}
{{- end -}}
--------------------------------------------------------------------------------
/helm/prometheus-chart/templates/prometheus-rules.yaml:
--------------------------------------------------------------------------------
{{- /*
ConfigMap bundling every rules file shipped in the chart's
prometheus-alerts/ directory; the Prometheus server mounts it as its
rule_files directory. The name "prometheus-rules" is kept literal because
other templates reference it by that exact name. Standard chart labels
added for consistency with the sibling templates.
*/ -}}
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-rules
  labels:
    app: {{ template "prometheus.name" . }}
    chart: {{ .Chart.Name }}-{{ .Chart.Version }}
    heritage: {{ .Release.Service }}
    release: {{ .Release.Name }}
data:
{{ (.Files.Glob "prometheus-alerts/*").AsConfig | indent 2 }}
--------------------------------------------------------------------------------
/helm/prometheus-chart/templates/prometheus-service.yaml:
--------------------------------------------------------------------------------
{{- /*
Service fronting the Prometheus server (container port 9090).
clusterIP / externalIPs / loadBalancerIP / loadBalancerSourceRanges /
nodePort are all optional and driven by .Values.server.service.*.
*/ -}}
apiVersion: v1
kind: Service
metadata:
{{- if .Values.server.service.annotations }}
  annotations:
{{ toYaml .Values.server.service.annotations | indent 4 }}
{{- end }}
  labels:
    app: {{ template "prometheus.name" . }}
    chart: {{ .Chart.Name }}-{{ .Chart.Version }}
    component: "{{ .Values.server.name }}"
    heritage: {{ .Release.Service }}
    release: {{ .Release.Name }}
{{- if .Values.server.service.labels }}
{{ toYaml .Values.server.service.labels | indent 4 }}
{{- end }}
  name: {{ template "prometheus.server.fullname" . }}
spec:
{{- if .Values.server.service.clusterIP }}
  clusterIP: {{ .Values.server.service.clusterIP }}
{{- end }}
{{- if .Values.server.service.externalIPs }}
  externalIPs:
{{ toYaml .Values.server.service.externalIPs | indent 4 }}
{{- end }}
{{- if .Values.server.service.loadBalancerIP }}
  loadBalancerIP: {{ .Values.server.service.loadBalancerIP }}
{{- end }}
{{- if .Values.server.service.loadBalancerSourceRanges }}
  loadBalancerSourceRanges:
  {{- range $cidr := .Values.server.service.loadBalancerSourceRanges }}
    - {{ $cidr }}
  {{- end }}
{{- end }}
  ports:
    - name: http
      port: {{ .Values.server.service.servicePort }}
      protocol: TCP
      targetPort: 9090
{{- /* nodePort is only honored by Kubernetes when the service type is NodePort/LoadBalancer. */ -}}
{{- if .Values.server.service.nodePort }}
      nodePort: {{ .Values.server.service.nodePort }}
{{- end }}
  selector:
    app: {{ template "prometheus.name" . }}
    component: "{{ .Values.server.name }}"
    release: {{ .Release.Name }}
  type: "{{ .Values.server.service.type }}"
--------------------------------------------------------------------------------
/helm/prometheus-chart/templates/prometheus-serviceaccount.yaml:
--------------------------------------------------------------------------------
{{- /*
ServiceAccount the Prometheus server pods run under.
NOTE(review): the name is hard-coded to "prometheus" rather than derived
from the release fullname, so two releases of this chart in one namespace
would collide — confirm against the clusterrolebinding template before
changing it.
*/ -}}
apiVersion: v1
kind: ServiceAccount
metadata:
  labels:
    app: {{ template "prometheus.name" . }}
    chart: {{ .Chart.Name }}-{{ .Chart.Version }}
    component: "{{ .Values.server.name }}"
    heritage: {{ .Release.Service }}
    release: {{ .Release.Name }}
  name: prometheus
--------------------------------------------------------------------------------
/manifests/0-namespace.yaml:
--------------------------------------------------------------------------------
# Namespace that every other manifest in this directory targets.
# Named "0-" so alphabetical `kubectl apply -f` ordering creates it first.
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring
--------------------------------------------------------------------------------
/manifests/01-rbac.yaml:
--------------------------------------------------------------------------------
---
# Grants the cluster-wide read permissions defined below to the
# prometheus-k8s ServiceAccount the Prometheus server runs under.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
- kind: ServiceAccount
  name: prometheus-k8s
  namespace: monitoring
---
# Read-only access to the objects Prometheus discovers and scrapes
# (kubernetes_sd_configs roles: node, endpoints, service, pod), plus the
# /metrics non-resource URL for scraping the API server itself.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
- apiGroups: [""]
  resources:
  - nodes
  - nodes/proxy
  - services
  - endpoints
  - pods
  verbs: ["get", "list", "watch"]
- apiGroups: [""]
  resources:
  - configmaps
  verbs: ["get"]
- nonResourceURLs: ["/metrics"]
  verbs: ["get"]
---
# Identity used by the Prometheus server pods (see prometheus/deployment.yaml).
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus-k8s
  namespace: monitoring
--------------------------------------------------------------------------------
/manifests/alertmanager/alertmanager-templates.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | data:
3 | default.tmpl: |
4 | {{ define "__alertmanager" }}AlertManager{{ end }}
5 | {{ define "__alertmanagerURL" }}{{ .ExternalURL }}/#/alerts?receiver={{ .Receiver }}{{ end }}
6 |
7 | {{ define "__subject" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}{{ end }}
8 | {{ define "__description" }}{{ end }}
9 |
10 | {{ define "__text_alert_list" }}{{ range . }}Labels:
11 | {{ range .Labels.SortedPairs }} - {{ .Name }} = {{ .Value }}
12 | {{ end }}Annotations:
13 | {{ range .Annotations.SortedPairs }} - {{ .Name }} = {{ .Value }}
14 | {{ end }}Source: {{ .GeneratorURL }}
15 | {{ end }}{{ end }}
16 |
17 |
18 | {{ define "slack.default.title" }}{{ template "__subject" . }}{{ end }}
19 | {{ define "slack.default.username" }}{{ template "__alertmanager" . }}{{ end }}
20 | {{ define "slack.default.fallback" }}{{ template "slack.default.title" . }} | {{ template "slack.default.titlelink" . }}{{ end }}
21 | {{ define "slack.default.pretext" }}{{ end }}
22 | {{ define "slack.default.titlelink" }}{{ template "__alertmanagerURL" . }}{{ end }}
23 | {{ define "slack.default.iconemoji" }}{{ end }}
24 | {{ define "slack.default.iconurl" }}{{ end }}
25 | {{ define "slack.default.text" }}{{ end }}
26 |
27 |
28 | {{ define "hipchat.default.from" }}{{ template "__alertmanager" . }}{{ end }}
29 | {{ define "hipchat.default.message" }}{{ template "__subject" . }}{{ end }}
30 |
31 |
32 | {{ define "pagerduty.default.description" }}{{ template "__subject" . }}{{ end }}
33 | {{ define "pagerduty.default.client" }}{{ template "__alertmanager" . }}{{ end }}
34 | {{ define "pagerduty.default.clientURL" }}{{ template "__alertmanagerURL" . }}{{ end }}
35 | {{ define "pagerduty.default.instances" }}{{ template "__text_alert_list" . }}{{ end }}
36 |
37 |
38 | {{ define "opsgenie.default.message" }}{{ template "__subject" . }}{{ end }}
39 | {{ define "opsgenie.default.description" }}{{ .CommonAnnotations.SortedPairs.Values | join " " }}
40 | {{ if gt (len .Alerts.Firing) 0 -}}
41 | Alerts Firing:
42 | {{ template "__text_alert_list" .Alerts.Firing }}
43 | {{- end }}
44 | {{ if gt (len .Alerts.Resolved) 0 -}}
45 | Alerts Resolved:
46 | {{ template "__text_alert_list" .Alerts.Resolved }}
47 | {{- end }}
48 | {{- end }}
49 | {{ define "opsgenie.default.source" }}{{ template "__alertmanagerURL" . }}{{ end }}
50 |
51 |
52 | {{ define "victorops.default.message" }}{{ template "__subject" . }} | {{ template "__alertmanagerURL" . }}{{ end }}
53 | {{ define "victorops.default.from" }}{{ template "__alertmanager" . }}{{ end }}
54 |
55 |
56 | {{ define "email.default.subject" }}{{ template "__subject" . }}{{ end }}
57 | {{ define "email.default.html" }}
58 |
59 |
85 |
86 |
87 |
88 |
89 | {{ template "__subject" . }}
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 | |
98 |
99 |
100 |
101 |
102 |
103 | {{ .Alerts | len }} alert{{ if gt (len .Alerts) 1 }}s{{ end }} for {{ range .GroupLabels.SortedPairs }}
104 | {{ .Name }}={{ .Value }}
105 | {{ end }}
106 | |
107 |
108 |
109 |
110 |
111 |
112 |
113 | View in {{ template "__alertmanager" . }}
114 | |
115 |
116 | {{ if gt (len .Alerts.Firing) 0 }}
117 |
118 |
119 | [{{ .Alerts.Firing | len }}] Firing
120 | |
121 |
122 | {{ end }}
123 | {{ range .Alerts.Firing }}
124 |
125 |
126 | Labels
127 | {{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }} {{ end }}
128 | {{ if gt (len .Annotations) 0 }}Annotations {{ end }}
129 | {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }} {{ end }}
130 | Source
131 | |
132 |
133 | {{ end }}
134 |
135 | {{ if gt (len .Alerts.Resolved) 0 }}
136 | {{ if gt (len .Alerts.Firing) 0 }}
137 |
138 |
139 |
140 |
141 |
142 | |
143 |
144 | {{ end }}
145 |
146 |
147 | [{{ .Alerts.Resolved | len }}] Resolved
148 | |
149 |
150 | {{ end }}
151 | {{ range .Alerts.Resolved }}
152 |
153 |
154 | Labels
155 | {{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }} {{ end }}
156 | {{ if gt (len .Annotations) 0 }}Annotations {{ end }}
157 | {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }} {{ end }}
158 | Source
159 | |
160 |
161 | {{ end }}
162 |
163 | |
164 |
165 |
166 |
167 |
174 | |
175 | |
176 |
177 |
178 |
179 |
180 |
181 |
182 | {{ end }}
183 |
184 | {{ define "pushover.default.title" }}{{ template "__subject" . }}{{ end }}
185 | {{ define "pushover.default.message" }}{{ .CommonAnnotations.SortedPairs.Values | join " " }}
186 | {{ if gt (len .Alerts.Firing) 0 }}
187 | Alerts Firing:
188 | {{ template "__text_alert_list" .Alerts.Firing }}
189 | {{ end }}
190 | {{ if gt (len .Alerts.Resolved) 0 }}
191 | Alerts Resolved:
192 | {{ template "__text_alert_list" .Alerts.Resolved }}
193 | {{ end }}
194 | {{ end }}
195 | {{ define "pushover.default.url" }}{{ template "__alertmanagerURL" . }}{{ end }}
196 | slack.tmpl: |
197 | {{ define "slack.devops.text" }}
198 | {{range .Alerts}}{{.Annotations.DESCRIPTION}}
199 | {{end}}
200 | {{ end }}
201 | kind: ConfigMap
202 | metadata:
203 | creationTimestamp: null
204 | name: alertmanager-templates
205 | namespace: monitoring
206 |
--------------------------------------------------------------------------------
/manifests/alertmanager/configmap.yaml:
--------------------------------------------------------------------------------
# Alertmanager configuration. The alertmanager deployment mounts this
# ConfigMap at /etc/alertmanager. Everything under `config.yml: |-` is
# Alertmanager's own configuration file — the `#` lines inside it are read
# by Alertmanager, not by Kubernetes.
# NOTE(review): the SMTP and Slack credentials below are placeholders;
# replace them before deploying anywhere real.
kind: ConfigMap
apiVersion: v1
metadata:
  name: alertmanager
  namespace: monitoring
data:
  config.yml: |-
    global:
      # ResolveTimeout is the time after which an alert is declared resolved
      # if it has not been updated.
      resolve_timeout: 5m

      # The smarthost and SMTP sender used for mail notifications.
      smtp_smarthost: 'smtp.gmail.com:587'
      smtp_from: 'foo@bar.com'
      smtp_auth_username: 'foo@bar.com'
      smtp_auth_password: 'barfoo'

      # The API URL to use for Slack notifications.
      slack_api_url: 'https://hooks.slack.com/services/some/api/token'

    # # The directory from which notification templates are read.
    templates:
    - '/etc/alertmanager-templates/*.tmpl'

    # The root route on which each incoming alert enters.
    route:

      # The labels by which incoming alerts are grouped together. For example,
      # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
      # be batched into a single group.

      group_by: ['alertname', 'cluster', 'service']

      # When a new group of alerts is created by an incoming alert, wait at
      # least 'group_wait' to send the initial notification.
      # This way ensures that you get multiple alerts for the same group that start
      # firing shortly after another are batched together on the first
      # notification.

      group_wait: 30s

      # When the first notification was sent, wait 'group_interval' to send a batch
      # of new alerts that started firing for that group.

      group_interval: 5m

      # If an alert has successfully been sent, wait 'repeat_interval' to
      # resend them.

      #repeat_interval: 1m
      repeat_interval: 15m

      # A default receiver

      # If an alert isn't caught by a route, send it to default.
      receiver: default

      # All the above attributes are inherited by all child routes and can
      # overwritten on each.

      # The child route trees.
      routes:
      # Send severity=slack alerts to slack.
      - match:
          severity: slack
        receiver: slack_alert
      # - match:
      #     severity: email
      #   receiver: email_alert

    receivers:
    - name: 'default'
      slack_configs:
      - channel: '#alertmanager-test'
        text: '{{ template "slack.devops.text" . }}'
        send_resolved: true

    - name: 'slack_alert'
      slack_configs:
      - channel: '#alertmanager-test'
        send_resolved: true
--------------------------------------------------------------------------------
/manifests/alertmanager/deployment.yaml:
--------------------------------------------------------------------------------
# Single-replica Alertmanager (v0.7.x). Configuration comes from the
# `alertmanager` ConfigMap and notification templates from the
# `alertmanager-templates` ConfigMap. Silences/notification state live in
# an emptyDir and are therefore lost whenever the pod is rescheduled.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: alertmanager
  namespace: monitoring
spec:
  replicas: 1
  selector:
    matchLabels:
      app: alertmanager
  template:
    metadata:
      name: alertmanager
      labels:
        app: alertmanager
    spec:
      containers:
      - name: alertmanager
        image: quay.io/prometheus/alertmanager:v0.7.1
        # Single-dash flags are the v0.x style; Alertmanager >= 0.13 switched
        # to double-dash flags, so keep flags and image version in sync.
        args:
          - '-config.file=/etc/alertmanager/config.yml'
          - '-storage.path=/alertmanager'
        ports:
        - name: alertmanager
          containerPort: 9093
        volumeMounts:
        - name: config-volume
          mountPath: /etc/alertmanager
        - name: templates-volume
          mountPath: /etc/alertmanager-templates
        - name: alertmanager
          mountPath: /alertmanager
      volumes:
      - name: config-volume
        configMap:
          name: alertmanager
      - name: templates-volume
        configMap:
          name: alertmanager-templates
      # Scratch storage for alert state; not persisted across reschedules.
      - name: alertmanager
        emptyDir: {}
--------------------------------------------------------------------------------
/manifests/alertmanager/service.yaml:
--------------------------------------------------------------------------------
# Exposes the Alertmanager UI/API on port 9093. NodePort makes it reachable
# from outside the cluster without an ingress.
apiVersion: v1
kind: Service
metadata:
  annotations:
    # Opt-in to the kubernetes-endpoints scrape job defined in
    # manifests/prometheus/configmap.yaml.
    prometheus.io/scrape: 'true'
    prometheus.io/path: '/metrics'
  labels:
    name: alertmanager
  name: alertmanager
  namespace: monitoring
spec:
  selector:
    app: alertmanager
  type: NodePort
  ports:
  - name: alertmanager
    protocol: TCP
    port: 9093
    targetPort: 9093
--------------------------------------------------------------------------------
/manifests/grafana/deployment.yaml:
--------------------------------------------------------------------------------
# Single-replica Grafana. Credentials come from the `grafana` Secret;
# dashboards/datasources are loaded afterwards by the
# grafana-import-dashboards Job. Storage is an emptyDir, so manually-created
# dashboards are lost on rescheduling.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana-core
  namespace: monitoring
  labels:
    app: grafana
    component: core
spec:
  replicas: 1
  selector:
    matchLabels:
      app: grafana
      # Include `component` so this Deployment's selector cannot overlap the
      # grafana-import-dashboards Job pods, which also carry app=grafana.
      component: core
  template:
    metadata:
      labels:
        app: grafana
        component: core
    spec:
      containers:
      - image: grafana/grafana:4.2.0
        name: grafana-core
        imagePullPolicy: IfNotPresent
        resources:
          # keep request = limit to keep this container in guaranteed class
          limits:
            cpu: 100m
            memory: 100Mi
          requests:
            cpu: 100m
            memory: 100Mi
        env:
          # The following env variables set up basic auth with the default
          # admin user and admin password from the `grafana` Secret.
          - name: GF_AUTH_BASIC_ENABLED
            value: "true"
          - name: GF_SECURITY_ADMIN_USER
            valueFrom:
              secretKeyRef:
                name: grafana
                key: admin-username
          - name: GF_SECURITY_ADMIN_PASSWORD
            valueFrom:
              secretKeyRef:
                name: grafana
                key: admin-password
          - name: GF_AUTH_ANONYMOUS_ENABLED
            value: "false"
          # - name: GF_AUTH_ANONYMOUS_ORG_ROLE
          #   value: Admin
          # does not really work, because of template variables in exported dashboards:
          # - name: GF_DASHBOARDS_JSON_ENABLED
          #   value: "true"
        readinessProbe:
          httpGet:
            # /login is served unauthenticated, so it doubles as a health check.
            path: /login
            port: 3000
          # initialDelaySeconds: 30
          # timeoutSeconds: 1
        volumeMounts:
          - name: grafana-persistent-storage
            mountPath: /var/lib/grafana
      volumes:
        - name: grafana-persistent-storage
          emptyDir: {}
--------------------------------------------------------------------------------
/manifests/grafana/import-dashboards/job.yaml:
--------------------------------------------------------------------------------
# One-shot Job that waits for Grafana to answer on :3000, then imports every
# *-datasource.json and *-dashboard.json file from the
# grafana-import-dashboards ConfigMap through the Grafana HTTP API.
apiVersion: batch/v1
kind: Job
metadata:
  name: grafana-import-dashboards
  namespace: monitoring
  labels:
    app: grafana
    component: import-dashboards
spec:
  template:
    metadata:
      name: grafana-import-dashboards
      labels:
        app: grafana
        component: import-dashboards
    spec:
      serviceAccountName: prometheus-k8s
      # Block until Grafana returns HTTP 200 so the curl imports below
      # cannot race Grafana's startup.
      initContainers:
      - name: wait-for-grafana
        image: giantswarm/tiny-tools
        args:
        - /bin/sh
        - -c
        - >
          set -x;
          while [ $(curl -Lsw '%{http_code}' "http://grafana:3000" -o /dev/null) -ne 200 ]; do
            echo '.'
            sleep 15;
          done
      containers:
      - name: grafana-import-dashboards
        image: giantswarm/tiny-tools
        command: ["/bin/sh", "-c"]
        workingDir: /opt/grafana-import-dashboards
        args:
        - >
          for file in *-datasource.json ; do
            if [ -e "$file" ] ; then
              echo "importing $file" &&
              curl --silent --fail --show-error \
                --request POST http://${GF_ADMIN_USER}:${GF_ADMIN_PASSWORD}@grafana:3000/api/datasources \
                --header "Content-Type: application/json" \
                --data-binary "@$file" ;
              echo "" ;
            fi
          done ;
          for file in *-dashboard.json ; do
            if [ -e "$file" ] ; then
              echo "importing $file" &&
              ( echo '{"dashboard":'; \
                cat "$file"; \
                echo ',"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]}' ) \
              | jq -c '.' \
              | curl --silent --fail --show-error \
                --request POST http://${GF_ADMIN_USER}:${GF_ADMIN_PASSWORD}@grafana:3000/api/dashboards/import \
                --header "Content-Type: application/json" \
                --data-binary "@-" ;
              echo "" ;
            fi
          done

        # Same admin credentials the grafana deployment reads.
        env:
        - name: GF_ADMIN_USER
          valueFrom:
            secretKeyRef:
              name: grafana
              key: admin-username
        - name: GF_ADMIN_PASSWORD
          valueFrom:
            secretKeyRef:
              name: grafana
              key: admin-password
        volumeMounts:
        - name: config-volume
          mountPath: /opt/grafana-import-dashboards
      restartPolicy: Never
      volumes:
      - name: config-volume
        configMap:
          name: grafana-import-dashboards
--------------------------------------------------------------------------------
/manifests/grafana/ingress.yaml:
--------------------------------------------------------------------------------
# apiVersion: extensions/v1beta1
2 | # kind: Ingress
3 | # metadata:
4 | # name: grafana
5 | # namespace: monitoring
6 | # spec:
7 | # rules:
8 | # - host: ..k8s.gigantic.io
9 | # http:
10 | # paths:
11 | # - path: /
12 | # backend:
13 | # serviceName: grafana
14 | # servicePort: 3000
15 |
--------------------------------------------------------------------------------
/manifests/grafana/secret.yaml:
--------------------------------------------------------------------------------
apiVersion: v1
kind: Secret
# NOTE(review): default Grafana credentials are admin/admin. Secret values
# are only base64-encoded, not encrypted — rotate these before any non-demo
# use and keep real credentials out of version control.
data:
  admin-password: YWRtaW4=  # base64("admin")
  admin-username: YWRtaW4=  # base64("admin")
metadata:
  name: grafana
  namespace: monitoring
type: Opaque
10 |
--------------------------------------------------------------------------------
/manifests/grafana/service.yaml:
--------------------------------------------------------------------------------
# Exposes the Grafana UI on port 3000. NodePort makes it reachable from
# outside the cluster without an ingress (see the commented-out ingress.yaml).
apiVersion: v1
kind: Service
metadata:
  name: grafana
  namespace: monitoring
  labels:
    app: grafana
    component: core
spec:
  type: NodePort
  ports:
    - port: 3000
  selector:
    app: grafana
    component: core
--------------------------------------------------------------------------------
/manifests/prometheus/configmap.yaml:
--------------------------------------------------------------------------------
# Prometheus server configuration, mounted at /etc/prometheus by the
# prometheus-core deployment. The scrape jobs follow the upstream example:
# https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml
# Comments inside the `prometheus.yaml: |` block are read by Prometheus,
# not by Kubernetes.
apiVersion: v1
data:
  prometheus.yaml: |
    global:
      scrape_interval: 10s
      scrape_timeout: 10s
      evaluation_interval: 10s
    rule_files:
      - "/etc/prometheus-rules/*.rules"
    scrape_configs:

      # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L37
      - job_name: 'kubernetes-nodes'
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        kubernetes_sd_configs:
          - role: node
        relabel_configs:
          - source_labels: [__address__]
            regex: '(.*):10250'
            replacement: '${1}:10255'
            target_label: __address__

      # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L79
      - job_name: 'kubernetes-endpoints'
        kubernetes_sd_configs:
          - role: endpoints
        relabel_configs:
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
            action: keep
            regex: true
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
            action: replace
            target_label: __scheme__
            regex: (https?)
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
            action: replace
            target_label: __address__
            # Port in __address__ is optional (upstream form), so the
            # prometheus.io/port annotation also applies to port-less addresses.
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_service_name]
            action: replace
            target_label: kubernetes_name

      # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L119
      - job_name: 'kubernetes-services'
        metrics_path: /probe
        params:
          module: [http_2xx]
        kubernetes_sd_configs:
          - role: service
        relabel_configs:
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
            action: keep
            regex: true
          - source_labels: [__address__]
            target_label: __param_target
          - target_label: __address__
            replacement: blackbox
          - source_labels: [__param_target]
            target_label: instance
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_service_name]
            target_label: kubernetes_name

      # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L156
      - job_name: 'kubernetes-pods'
        kubernetes_sd_configs:
          - role: pod
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            action: keep
            regex: true
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
            action: replace
            # Fixed: was `(.+):(?:\d+);(\d+)`, which only matched addresses that
            # already contained a port; pod addresses are bare IPs, so the
            # prometheus.io/port annotation was silently ignored.
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: ${1}:${2}
            target_label: __address__
          - action: labelmap
            regex: __meta_kubernetes_pod_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_pod_name]
            action: replace
            target_label: kubernetes_pod_name
          - source_labels: [__meta_kubernetes_pod_container_port_number]
            action: keep
            regex: 9\d{3}

      - job_name: 'kubernetes-cadvisor'
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        kubernetes_sd_configs:
          - role: node
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
          - target_label: __address__
            replacement: kubernetes.default.svc:443
          - source_labels: [__meta_kubernetes_node_name]
            regex: (.+)
            target_label: __metrics_path__
            replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
kind: ConfigMap
metadata:
  name: prometheus-core
  namespace: monitoring
--------------------------------------------------------------------------------
/manifests/prometheus/deployment.yaml:
--------------------------------------------------------------------------------
# Prometheus 1.x server. Configuration comes from the prometheus-core
# ConfigMap and rules from prometheus-rules; storage is in-container only
# (no volume for /prometheus), so TSDB data is lost on restart.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus-core
  namespace: monitoring
  labels:
    app: prometheus
    component: core
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheus
      # Include `component` so this Deployment's selector cannot overlap the
      # node-exporter DaemonSet pods, which also carry app=prometheus.
      component: core
  template:
    metadata:
      name: prometheus-main
      labels:
        app: prometheus
        component: core
    spec:
      serviceAccountName: prometheus-k8s
      containers:
      - name: prometheus
        # Prometheus 1.x flags use a single dash; 2.x renamed/removed the
        # storage.local.* flags, so keep flags and image version in sync.
        image: prom/prometheus:v1.7.0
        args:
          - '-storage.local.retention=12h'
          - '-storage.local.memory-chunks=500000'
          - '-config.file=/etc/prometheus/prometheus.yaml'
          - '-alertmanager.url=http://alertmanager:9093/'
        ports:
        - name: webui
          containerPort: 9090
        resources:
          requests:
            cpu: 500m
            memory: 500M
          limits:
            cpu: 500m
            memory: 500M
        volumeMounts:
        - name: config-volume
          mountPath: /etc/prometheus
        - name: rules-volume
          mountPath: /etc/prometheus-rules
      volumes:
      - name: config-volume
        configMap:
          name: prometheus-core
      - name: rules-volume
        configMap:
          name: prometheus-rules
--------------------------------------------------------------------------------
/manifests/prometheus/kube-state-metrics/deployment.yaml:
--------------------------------------------------------------------------------
# kube-state-metrics: exports cluster-object state as Prometheus metrics,
# scraped through the kube-state-metrics Service (prometheus.io/scrape).
apiVersion: apps/v1
kind: Deployment
metadata:
  name: kube-state-metrics
  namespace: monitoring
spec:
  replicas: 1
  # apps/v1 requires an explicit selector; it must match the template labels.
  selector:
    matchLabels:
      app: kube-state-metrics
  template:
    metadata:
      labels:
        app: kube-state-metrics
    spec:
      serviceAccountName: kube-state-metrics
      containers:
      - name: kube-state-metrics
        image: gcr.io/google_containers/kube-state-metrics:v0.5.0
        ports:
        - containerPort: 8080
--------------------------------------------------------------------------------
/manifests/prometheus/kube-state-metrics/rbac.yaml:
--------------------------------------------------------------------------------
# ---
# apiVersion: rbac.authorization.k8s.io/v1
# kind: ClusterRoleBinding
# metadata:
#   name: kube-state-metrics
# roleRef:
#   apiGroup: rbac.authorization.k8s.io
#   kind: ClusterRole
#   name: kube-state-metrics
# subjects:
# - kind: ServiceAccount
#   name: kube-state-metrics
#   namespace: monitoring
# ---
# apiVersion: rbac.authorization.k8s.io/v1
# kind: ClusterRole
# metadata:
#   name: kube-state-metrics
# rules:
# - apiGroups: [""]
#   resources:
#   - nodes
#   - pods
#   - services
#   - resourcequotas
#   - replicationcontrollers
#   - limitranges
#   verbs: ["list", "watch"]
# - apiGroups: ["extensions"]
#   resources:
#   - daemonsets
#   - deployments
#   - replicasets
#   verbs: ["list", "watch"]
# ---
# NOTE(review): only the ServiceAccount below is active — the ClusterRole and
# ClusterRoleBinding above are commented out, so on an RBAC-enabled cluster
# kube-state-metrics will lack list/watch permissions. Confirm this is intended.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: kube-state-metrics
  namespace: monitoring
--------------------------------------------------------------------------------
/manifests/prometheus/kube-state-metrics/service.yaml:
--------------------------------------------------------------------------------
apiVersion: v1
kind: Service
metadata:
  annotations:
    # Scraped by the kubernetes-endpoints job in prometheus/configmap.yaml.
    prometheus.io/scrape: 'true'
  name: kube-state-metrics
  namespace: monitoring
  labels:
    app: kube-state-metrics
spec:
  ports:
  - name: kube-state-metrics
    port: 8080
    protocol: TCP
  selector:
    app: kube-state-metrics
--------------------------------------------------------------------------------
/manifests/prometheus/node-directory-size-metrics/daemonset.yaml:
--------------------------------------------------------------------------------
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-directory-size-metrics
  namespace: monitoring
  labels:
    app: node-directory-size-metrics
  annotations:
    description: |
      This `DaemonSet` provides metrics in Prometheus format about disk usage on the nodes.
      The container `read-du` reads in sizes of all directories below /mnt and writes that to `/tmp/metrics`. It only reports directories larger then `100M` for now.
      The other container `caddy` just hands out the contents of that file on request via `http` on `/metrics` at port `9102` which are the defaults for Prometheus.
      These are scheduled on every node in the Kubernetes cluster.
      To choose directories from the node to check, just mount them on the `read-du` container below `/mnt`.
spec:
  selector:
    matchLabels:
      app: node-directory-size-metrics
  template:
    metadata:
      labels:
        app: node-directory-size-metrics
      annotations:
        prometheus.io/scrape: 'true'
        prometheus.io/port: '9102'
        description: |
          This `Pod` provides metrics in Prometheus format about disk usage on the node.
          The container `read-du` reads in sizes of all directories below /mnt and writes that to `/tmp/metrics`. It only reports directories larger then `100M` for now.
          The other container `caddy` just hands out the contents of that file on request on `/metrics` at port `9102` which are the defaults for Prometheus.
          This `Pod` is scheduled on every node in the Kubernetes cluster.
          To choose directories from the node to check just mount them on `read-du` below `/mnt`.
    spec:
      containers:
      # Rewrites /tmp/metrics every 5 minutes (temp file + mv, so the reader
      # never serves a half-written file).
      - name: read-du
        image: giantswarm/tiny-tools
        imagePullPolicy: Always
        # FIXME threshold via env var
        # The
        command:
        - fish
        - --command
        - |
          touch /tmp/metrics-temp
          while true
            for directory in (du --bytes --separate-dirs --threshold=100M /mnt)
              echo $directory | read size path
              echo "node_directory_size_bytes{path=\"$path\"} $size" \
                >> /tmp/metrics-temp
            end
            mv /tmp/metrics-temp /tmp/metrics
            sleep 300
          end
        volumeMounts:
        - name: host-fs-var
          mountPath: /mnt/var
          readOnly: true
        - name: metrics
          mountPath: /tmp
      # Serves the metrics file over HTTP on :9102 for Prometheus to scrape.
      - name: caddy
        image: dockermuenster/caddy:0.9.3
        command:
        - "caddy"
        - "-port=9102"
        - "-root=/var/www"
        ports:
        - containerPort: 9102
        volumeMounts:
        - name: metrics
          mountPath: /var/www
      volumes:
      - name: host-fs-var
        hostPath:
          path: /var
      # In-memory scratch space shared between read-du and caddy.
      - name: metrics
        emptyDir:
          medium: Memory
--------------------------------------------------------------------------------
/manifests/prometheus/node-exporter/daemonset.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: DaemonSet
3 | metadata:
4 | name: prometheus-node-exporter
5 | namespace: monitoring
6 | labels:
7 | app: prometheus
8 | component: node-exporter
9 | spec:
10 | selector:
11 | matchLabels:
12 | app: prometheus
13 | template:
14 | metadata:
15 | name: prometheus-node-exporter
16 | labels:
17 | app: prometheus
18 | component: node-exporter
19 | spec:
20 | containers:
21 | - image: prom/node-exporter:v0.14.0
22 | name: prometheus-node-exporter
23 | ports:
24 | - name: prom-node-exp
25 | #^ must be an IANA_SVC_NAME (at most 15 characters, ..)
26 | containerPort: 9100
27 | hostPort: 9100
28 | hostNetwork: true
29 | hostPID: true
30 |
--------------------------------------------------------------------------------
/manifests/prometheus/node-exporter/service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 | annotations:
5 | prometheus.io/scrape: 'true'
6 | name: prometheus-node-exporter
7 | namespace: monitoring
8 | labels:
9 | app: prometheus
10 | component: node-exporter
11 | spec:
12 | clusterIP: None
13 | ports:
14 | - name: prometheus-node-exporter
15 | port: 9100
16 | protocol: TCP
17 | selector:
18 | app: prometheus
19 | component: node-exporter
20 | type: ClusterIP
21 |
--------------------------------------------------------------------------------
/manifests/prometheus/prometheus-rules.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | data:
3 | cpu-usage.rules: |
4 | ALERT NodeCPUUsage
5 | IF (100 - (avg by (instance) (irate(node_cpu{name="node-exporter",mode="idle"}[5m])) * 100)) > 75
6 | FOR 2m
7 | LABELS {
8 | severity="page"
9 | }
10 | ANNOTATIONS {
11 | SUMMARY = "{{$labels.instance}}: High CPU usage detected",
12 | DESCRIPTION = "{{$labels.instance}}: CPU usage is above 75% (current value is: {{ $value }})"
13 | }
14 | instance-availability.rules: |
15 | ALERT InstanceDown
16 | IF up == 0
17 | FOR 1m
18 | LABELS { severity = "page" }
19 | ANNOTATIONS {
20 | summary = "Instance {{ $labels.instance }} down",
21 | description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute.",
22 | }
23 | low-disk-space.rules: |
24 | ALERT NodeLowRootDisk
25 | IF ((node_filesystem_size{mountpoint="/root-disk"} - node_filesystem_free{mountpoint="/root-disk"} ) / node_filesystem_size{mountpoint="/root-disk"} * 100) > 75
26 | FOR 2m
27 | LABELS {
28 | severity="page"
29 | }
30 | ANNOTATIONS {
31 | SUMMARY = "{{$labels.instance}}: Low root disk space",
32 | DESCRIPTION = "{{$labels.instance}}: Root disk usage is above 75% (current value is: {{ $value }})"
33 | }
34 |
35 | ALERT NodeLowDataDisk
36 | IF ((node_filesystem_size{mountpoint="/data-disk"} - node_filesystem_free{mountpoint="/data-disk"} ) / node_filesystem_size{mountpoint="/data-disk"} * 100) > 75
37 | FOR 2m
38 | LABELS {
39 | severity="page"
40 | }
41 | ANNOTATIONS {
42 | SUMMARY = "{{$labels.instance}}: Low data disk space",
43 | DESCRIPTION = "{{$labels.instance}}: Data disk usage is above 75% (current value is: {{ $value }})"
44 | }
45 | mem-usage.rules: |
46 | ALERT NodeSwapUsage
47 | IF (((node_memory_SwapTotal-node_memory_SwapFree)/node_memory_SwapTotal)*100) > 75
48 | FOR 2m
49 | LABELS {
50 | severity="page"
51 | }
52 | ANNOTATIONS {
53 | SUMMARY = "{{$labels.instance}}: Swap usage detected",
54 | DESCRIPTION = "{{$labels.instance}}: Swap usage usage is above 75% (current value is: {{ $value }})"
55 | }
56 |
57 | ALERT NodeMemoryUsage
58 | IF (((node_memory_MemTotal-node_memory_MemAvailable)/(node_memory_MemTotal)*100)) > 75
59 | FOR 2m
60 | LABELS {
61 | severity="page"
62 | }
63 | ANNOTATIONS {
64 | SUMMARY = "{{$labels.instance}}: High memory usage detected",
65 | DESCRIPTION = "{{$labels.instance}}: Memory usage is above 75% (current value is: {{ $value }})"
66 | }
67 | kind: ConfigMap
68 | metadata:
69 | creationTimestamp: null
70 | name: prometheus-rules
71 | namespace: monitoring
72 |
--------------------------------------------------------------------------------
/manifests/prometheus/service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 | name: prometheus
5 | namespace: monitoring
6 | labels:
7 | app: prometheus
8 | component: core
9 | annotations:
10 | prometheus.io/scrape: 'true'
11 | spec:
12 | type: NodePort
13 | ports:
14 | - port: 9090
15 | protocol: TCP
16 | name: webui
17 | selector:
18 | app: prometheus
19 | component: core
20 |
--------------------------------------------------------------------------------