├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── bugs.yaml
│   │   └── enhancements.yaml
│   ├── PULL_REQUEST_TEMPLATE
│   │   └── prs.md
│   ├── dependabot.yaml
│   └── workflows
│       ├── check-with-upstream.yaml
│       ├── ci.yaml
│       ├── release.yaml
│       └── stale.yml
├── .gitignore
├── .lint
├── .vale.ini
├── CODEOWNERS
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── DESIGN.md
├── LICENSE
├── Makefile
├── OWNERS
├── README.md
├── SECURITY.md
├── SECURITY_CONTACTS
├── alerts
│   ├── alerts.libsonnet
│   ├── apps_alerts.libsonnet
│   ├── kube_apiserver.libsonnet
│   ├── kube_controller_manager.libsonnet
│   ├── kube_proxy.libsonnet
│   ├── kube_scheduler.libsonnet
│   ├── kubelet.libsonnet
│   ├── resource_alerts.libsonnet
│   ├── storage_alerts.libsonnet
│   └── system_alerts.libsonnet
├── config.libsonnet
├── dashboards
│   ├── apiserver.libsonnet
│   ├── controller-manager.libsonnet
│   ├── dashboards.libsonnet
│   ├── defaults.libsonnet
│   ├── kubelet.libsonnet
│   ├── network-usage
│   │   ├── cluster-total.libsonnet
│   │   ├── namespace-by-pod.libsonnet
│   │   ├── namespace-by-workload.libsonnet
│   │   ├── pod-total.libsonnet
│   │   └── workload-total.libsonnet
│   ├── network.libsonnet
│   ├── persistentvolumesusage.libsonnet
│   ├── proxy.libsonnet
│   ├── resources.libsonnet
│   ├── resources
│   │   ├── cluster.libsonnet
│   │   ├── multi-cluster.libsonnet
│   │   ├── namespace.libsonnet
│   │   ├── node.libsonnet
│   │   ├── pod.libsonnet
│   │   ├── workload-namespace.libsonnet
│   │   └── workload.libsonnet
│   ├── scheduler.libsonnet
│   └── windows.libsonnet
├── jsonnetfile.json
├── lib
│   ├── absent_alert.libsonnet
│   ├── add-runbook-links.libsonnet
│   ├── alerts.jsonnet
│   ├── dashboards.jsonnet
│   ├── rules.jsonnet
│   └── utils.libsonnet
├── mixin.libsonnet
├── rules
│   ├── apps.libsonnet
│   ├── kube_apiserver-availability.libsonnet
│   ├── kube_apiserver-burnrate.libsonnet
│   ├── kube_apiserver-config.libsonnet
│   ├── kube_apiserver-histogram.libsonnet
│   ├── kube_apiserver.libsonnet
│   ├── kube_scheduler.libsonnet
│   ├── kubelet.libsonnet
│   ├── node.libsonnet
│   ├── rules.libsonnet
│   └── windows.libsonnet
├── runbook.md
├── scripts
│   ├── check-selectors-ksm.sh
│   ├── go.mod
│   ├── go.sum
│   └── tools.go
└── tests
    ├── apiserver-availability-test.yaml
    ├── apps_alerts-test.yaml
    ├── rules-pod-owner-test.yaml
    └── tests.yaml
/.github/ISSUE_TEMPLATE/bugs.yaml:
--------------------------------------------------------------------------------
1 | name: Bug Report
2 | description: Report a bug in the existing codebase.
3 | title: '[Bug]: '
4 | labels: ['kind/bug', 'pending-triage']
5 | body:
6 |   - type: markdown
7 |     attributes:
8 |       value: |
9 |         Please use this template while reporting a bug and provide as much information as possible. If the matter is security related, please disclose it privately; see the project [security policy](https://github.com/kubernetes-monitoring/kubernetes-mixin/blob/main/SECURITY.md).
10 |   - type: textarea
11 |     id: cause
12 |     attributes:
13 |       label: What happened?
14 |       description: A clear and concise description of what the bug is. Screenshots and screencasts are highly encouraged and helpful during triage, so please provide them if you can.
15 |       placeholder: Describe the bug you encountered. Please do not paste any snippets here; use the next field instead.
16 |     validations:
17 |       required: true
18 |   - type: textarea
19 |     id: snippet
20 |     attributes:
21 |       label: Please provide any helpful snippets.
22 |       description: If applicable, add code snippet(s) to help explain or reproduce the problem. This will be automatically formatted into code, so no need for backticks. Separate snippets using comments.
23 |       render: jsonnet
24 |   - type: dropdown
25 |     id: contamination
26 |     attributes:
27 |       label: What parts of the codebase are affected?
28 |       description: Select all that apply.
29 |       multiple: true
30 |       options:
31 |         - Alerts
32 |         - Dashboards
33 |         - Rules
34 |         - Other
35 |     validations:
36 |       required: true
37 |   - type: checkboxes
38 |     id: terms
39 |     attributes:
40 |       label: "I agree to the following terms:"
41 |       options:
42 |         - label: I agree to follow this project's [Code of Conduct](../../CODE_OF_CONDUCT.md).
43 |           required: true
44 |         - label: I have filled out all the required information above to the best of my ability.
45 |           required: true
46 |         - label: I have searched the issues of this repository and believe that this is not a duplicate.
47 |           required: true
48 |         - label: I have confirmed this bug exists in the default branch of the repository, as of the latest commit at the time of submission.
49 |           required: true
50 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/enhancements.yaml:
--------------------------------------------------------------------------------
1 | name: Enhancement Proposal
2 | description: Propose an enhancement for the existing codebase.
3 | title: '[Enhancement]: '
4 | labels: ['kind/enhancement', 'pending-triage']
5 | body:
6 |   - type: markdown
7 |     attributes:
8 |       value: |
9 |         Please use this template while proposing an enhancement and provide as much information as possible. If this is a feature request, please ensure that [a consensus has been reached](https://github.com/kubernetes-monitoring/kubernetes-mixin/blob/master/CONTRIBUTING.md?plain=1#L24) before submitting.
10 |   - type: textarea
11 |     id: idea
12 |     attributes:
13 |       label: What's the general idea for the enhancement?
14 |       description: A clear and concise description of the enhancement's targeted problem and its proposed solution. Screenshots and screencasts are highly encouraged and helpful during triage, so please provide them if you can.
15 |       placeholder: Describe the need for this enhancement. Please do not paste any snippets here; use the next field instead.
16 |     validations:
17 |       required: true
18 |   - type: textarea
19 |     id: snippet
20 |     attributes:
21 |       label: Please provide any helpful snippets.
22 |       description: If applicable, add code snippet(s) to help explain or reproduce the problem. This will be automatically formatted into code, so no need for backticks. Separate snippets using comments.
23 |       render: jsonnet
24 |   - type: dropdown
25 |     id: contamination
26 |     attributes:
27 |       label: What parts of the codebase does the enhancement target?
28 |       description: Select all that apply.
29 |       multiple: true
30 |       options:
31 |         - Alerts
32 |         - Dashboards
33 |         - Rules
34 |         - Other
35 |     validations:
36 |       required: true
37 |   - type: textarea
38 |     id: extra
39 |     attributes:
40 |       label: Anything else relevant to the enhancement that would help with the triage process?
41 |       description: Any additional context or information that would be helpful to the maintainers. For example, if you have considered any alternatives or workarounds, please share them here.
42 |       placeholder: Add any additional information here.
43 |   - type: checkboxes
44 |     id: terms
45 |     attributes:
46 |       label: "I agree to the following terms:"
47 |       options:
48 |         - label: I agree to follow this project's [Code of Conduct](../../CODE_OF_CONDUCT.md).
49 |           required: true
50 |         - label: I have filled out all the required information above to the best of my ability.
51 |           required: true
52 |         - label: I have searched the issues of this repository and believe that this is not a duplicate.
53 |           required: true
54 |         - label: I have confirmed this proposal applies to the default branch of the repository, as of the latest commit at the time of submission.
55 |           required: true
56 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE/prs.md:
--------------------------------------------------------------------------------
1 |
6 |
7 | #### What does this PR fix? Please be as descriptive as possible.
8 |
9 | #### Any helpful code snippets or visual aids (before and after this patch, if applicable)?
10 |
11 | Details
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 | Fixes #
20 |
--------------------------------------------------------------------------------
/.github/dependabot.yaml:
--------------------------------------------------------------------------------
1 | # To get started with Dependabot version updates, you'll need to specify which
2 | # package ecosystems to update and where the package manifests are located.
3 | # Please see the documentation for all configuration options:
4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
5 |
6 | version: 2
7 | updates:
8 |   - package-ecosystem: github-actions
9 |     directory: /
10 |     schedule:
11 |       interval: weekly
12 |   - package-ecosystem: gomod
13 |     directory: /scripts
14 |     schedule:
15 |       interval: weekly
16 |
--------------------------------------------------------------------------------
/.github/workflows/check-with-upstream.yaml:
--------------------------------------------------------------------------------
1 | name: check-with-upstream
2 | permissions: {}
3 | # Run every Monday.
4 | on:
5 |   schedule:
6 |     - cron: "0 0 * * 1"
7 | jobs:
8 |   check-selectors-ksm:
9 |     runs-on: ubuntu-latest
10 |     name: Check if KSM selectors are present on applicable metrics.
11 |     steps:
12 |       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
13 |         with:
14 |           persist-credentials: false
15 |       - run: make --always-make check-selectors-ksm
16 |
--------------------------------------------------------------------------------
/.github/workflows/ci.yaml:
--------------------------------------------------------------------------------
1 | name: ci
2 | permissions: {}
3 | on:
4 |   push:
5 |     branches:
6 |       - main
7 |   pull_request:
8 | jobs:
9 |   matrix:
10 |     runs-on: ubuntu-latest
11 |     name: ${{ matrix.name }}
12 |     strategy:
13 |       fail-fast: false
14 |       matrix:
15 |         include:
16 |           - name: Lint Alerts
17 |             run: make --always-make alerts-lint
18 |           - name: Generate YAML
19 |             run: make --always-make generate && git diff --exit-code
20 |           - name: Lint Grafana Dashboards
21 |             run: make --always-make dashboards-lint
22 |           - name: Format JSONNET
23 |             run: make --always-make jsonnet-fmt && git diff --exit-code
24 |           - name: Lint JSONNET
25 |             run: make --always-make jsonnet-lint
26 |           - name: Format MD
27 |             run: make --always-make markdownfmt && git diff --exit-code
28 |           - name: Lint MD
29 |             run: make --always-make vale && git diff --exit-code
30 |           - name: Lint YAML
31 |             run: make --always-make pint-lint
32 |           - name: Run unit tests
33 |             run: make --always-make test
34 |
35 |     steps:
36 |       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
37 |         with:
38 |           persist-credentials: false
39 |       - uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # v5.5.0
40 |         with:
41 |           go-version-file: scripts/go.mod
42 |           cache-dependency-path: scripts/go.sum
43 |       - run: ${{ matrix.run }}
44 |
--------------------------------------------------------------------------------
/.github/workflows/release.yaml:
--------------------------------------------------------------------------------
1 | name: Release
2 |
3 | on:
4 |   push:
5 |     tags:
6 |       - "version-*" # Trigger the workflow on push events to version-* tags
7 |
8 | permissions:
9 |   contents: write
10 |
11 | jobs:
12 |   release:
13 |     name: Release
14 |     runs-on: ubuntu-latest
15 |     steps:
16 |       - name: Create release on kubernetes-mixin
17 |         uses: softprops/action-gh-release@da05d552573ad5aba039eaac05058a918a7bf631 # v2.2.2
18 |         env:
19 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
20 |         with:
21 |           tag_name: ${{ github.ref_name }}
22 |           repository: kubernetes-monitoring/kubernetes-mixin
23 |           generate_release_notes: true
24 |
--------------------------------------------------------------------------------
/.github/workflows/stale.yml:
--------------------------------------------------------------------------------
1 | name: Check whether issues or PRs need attention
2 | on:
3 |   workflow_dispatch: {}
4 |   schedule:
5 |     - cron: "0 0 * * *"
6 | permissions:
7 |   issues: write
8 |   pull-requests: write
9 | jobs:
10 |   stale:
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |       - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
14 |         with:
15 |           days-before-stale: 30
16 |           days-before-close: 7
17 |           stale-issue-message: |
18 |             This issue has not had any activity in the past 30 days, so the
19 |             `stale` label has been added to it.
20 |
21 |             * The `stale` label will be removed if there is new activity
22 |             * The issue will be closed in 7 days if there is no new activity
23 |             * Add the `keepalive` label to exempt this issue from the stale check action
24 |
25 |             Thank you for your contributions!
26 |           stale-pr-message: |
27 |             This PR has been automatically marked as stale because it has not
28 |             had any activity in the past 30 days.
29 |
30 |             The next time this stale check runs, the stale label will be
31 |             removed if there is new activity. The issue will be closed in 7
32 |             days if there is no new activity.
33 |
34 |             Thank you for your contributions!
35 |           stale-issue-label: stale
36 |           stale-pr-label: stale
37 |           exempt-issue-labels: keepalive
38 |           exempt-pr-labels: keepalive
39 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | prometheus_alerts.yaml
2 | prometheus_rules.yaml
3 | dashboards_out
4 | vendor
5 | jsonnetfile.lock.json
6 | tmp
7 | .vale
8 |
--------------------------------------------------------------------------------
/.lint:
--------------------------------------------------------------------------------
1 | exclusions:
2 |   template-job-rule:
3 |   template-instance-rule:
4 |   target-job-rule:
5 |   target-instance-rule:
6 |   panel-title-description-rule:
7 |   panel-units-rule:
8 |   panel-datasource-rule:
9 |     reason: The new Grafonnet promotes the use of datasources at the query level. This should probably end up in the linter as a valid option.
10 |
--------------------------------------------------------------------------------
/.vale.ini:
--------------------------------------------------------------------------------
1 | StylesPath = .vale/styles
2 |
3 | MinAlertLevel = error
4 |
5 | Packages = Readability, write-good, alex
6 |
7 | [*]
8 | BasedOnStyles = Readability, write-good, alex
9 |
--------------------------------------------------------------------------------
/CODEOWNERS:
--------------------------------------------------------------------------------
1 | # https://help.github.com/articles/about-codeowners/
2 |
3 | # These owners will be the default owners for everything in the repo. Unless a
4 | # later match takes precedence, they will be requested for review when someone
5 | # opens a pull request.
6 | * @povilasv @skl
7 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation.
6 |
7 | We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community.
8 |
9 | ## Our Standards
10 |
11 | Examples of behavior that contributes to a positive environment for our community include:
12 |
13 | * Demonstrating empathy and kindness toward other people
14 | * Being respectful of differing opinions, viewpoints, and experiences
15 | * Giving and gracefully accepting constructive feedback
16 | * Accepting responsibility and apologizing to those affected by our mistakes, and learning from the experience
17 | * Focusing on what is best not just for us as individuals, but for the overall community
18 |
19 | Examples of unacceptable behavior include:
20 |
21 | * The use of sexualized language or imagery, and sexual attention or advances of any kind
22 | * Trolling, insulting or derogatory comments, and personal or political attacks
23 | * Public or private harassment
24 | * Publishing others' private information, such as a physical or email address, without their explicit permission
25 | * Other conduct which could reasonably be considered inappropriate in a professional setting
26 |
27 | ## Enforcement Responsibilities
28 |
29 | Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful.
30 |
31 | Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, and will communicate reasons for moderation decisions when appropriate.
32 |
33 | ## Scope
34 |
35 | This Code of Conduct applies within all community spaces, and also applies when an individual is officially representing the community in public spaces. Examples of representing our community include using an official e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event.
36 |
37 | ## Enforcement
38 |
39 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at kubernetes-mixin-security@googlegroups.com. All complaints will be reviewed and investigated promptly and fairly.
40 |
41 | All community leaders are obligated to respect the privacy and security of the reporter of any incident.
42 |
43 | ## Enforcement Guidelines
44 |
45 | Community leaders will follow these Community Impact Guidelines in determining the consequences for any action they deem in violation of this Code of Conduct:
46 |
47 | ### 1. Correction
48 |
49 | **Community Impact**: Use of inappropriate language or other behavior deemed unprofessional or unwelcome in the community.
50 |
51 | **Consequence**: A private, written warning from community leaders, providing clarity around the nature of the violation and an explanation of why the behavior was inappropriate. A public apology may be requested.
52 |
53 | ### 2. Warning
54 |
55 | **Community Impact**: A violation through a single incident or series of actions.
56 |
57 | **Consequence**: A warning with consequences for continued behavior. No interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, for a specified period of time. This includes avoiding interactions in community spaces as well as external channels like social media. Violating these terms may lead to a temporary or permanent ban.
58 |
59 | ### 3. Temporary Ban
60 |
61 | **Community Impact**: A serious violation of community standards, including sustained inappropriate behavior.
62 |
63 | **Consequence**: A temporary ban from any sort of interaction or public communication with the community for a specified period of time. No public or private interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, is allowed during this period. Violating these terms may lead to a permanent ban.
64 |
65 | ### 4. Permanent Ban
66 |
67 | **Community Impact**: Demonstrating a pattern of violation of community standards, including sustained inappropriate behavior, harassment of an individual, or aggression toward or disparagement of classes of individuals.
68 |
69 | **Consequence**: A permanent ban from any sort of public interaction within the community.
70 |
71 | ## Attribution
72 |
73 | This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 2.0, available at https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
74 |
75 | Community Impact Guidelines were inspired by [Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/diversity).
76 |
77 | For answers to common questions about this code of conduct, see the FAQ at https://www.contributor-covenant.org/faq. Translations are available at https://www.contributor-covenant.org/translations.
78 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | Thank you for taking an interest in the project! We welcome all manner of contributions that are within the bounds of the project's [code of conduct](CODE_OF_CONDUCT.md).
4 |
5 | #### **Did you find a bug?**
6 |
7 | * **Do not open up a GitHub issue if the bug is a security vulnerability**; instead, refer to our [security policy](SECURITY.md).
8 |
9 | * **Ensure the bug was not already reported** by searching on GitHub under [Issues](https://github.com/kubernetes-monitoring/kubernetes-mixin/issues).
10 |
11 | * If you're unable to find an open issue addressing the problem, [open a new one](https://github.com/kubernetes-monitoring/kubernetes-mixin/issues/new). Be sure to include a **title and clear description**, as much relevant information as possible, and a **`jsonnet` snippet**, if applicable, as well as an optional **visual sample** demonstrating the expected behavior that is not occurring.
12 |
13 | * Whenever possible, use the relevant bug report templates to create the issue.
14 |
15 | #### **Did you write a patch that fixes a bug?**
16 |
17 | * Open a new GitHub pull request with the patch.
18 |
19 | * Ensure the PR description describes the problem **and** solution. Include the relevant issue number if applicable.
20 |
21 | * Before submitting, please make sure the pull request template is filled out correctly.
22 |
23 | #### **Do you intend to add a new feature or change an existing one?**
24 |
25 | * Suggest your change in [#monitoring-mixins](https://kubernetes.slack.com/archives/CAX9GU941) and start writing code. While doing so, please reflect on:
26 | * Is your feature request related to a problem? Please describe the necessity for the change.
27 | * Describe the solution you're proposing. Please provide any relevant context.
28 | * Add any other context (for example, any workarounds, code snippets, visual aids, etc.), if applicable.
29 |
30 | * Do not open an issue on GitHub until you have collected positive feedback about the change. GitHub issues are primarily intended for bug reports and fixes.
31 |
32 | #### **Do you have questions about the source code?**
33 |
34 | * Ask any question about how to use the `kubernetes-mixin` project in the [#monitoring-mixins](https://kubernetes.slack.com/archives/CAX9GU941) Slack channel.
35 |
36 | ---
37 |
38 | `kubernetes-mixin` is a volunteer effort. We encourage you to pitch in and join [the team](https://github.com/kubernetes-monitoring/kubernetes-mixin/graphs/contributors)!
39 |
--------------------------------------------------------------------------------
/DESIGN.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Prometheus Monitoring Mixins
4 |
5 | ## Using jsonnet to package together dashboards, alerts and exporters.
6 |
7 | Status: Draft. Authors: Tom Wilkie (Grafana Labs) and Frederic Branczyk (Red Hat).
8 |
9 | In this design doc we present a technique for packaging and deploying "Monitoring Mixins" - extensible and customisable combinations of dashboards, alert definitions and exporters.
10 |
11 | ## Problem
12 |
13 | [Prometheus](#Notes) offers powerful open source monitoring and alerting - but that comes with higher degrees of freedom, making pre-configured monitoring configurations hard to build. Simultaneously, it has become accepted wisdom that the developers of a given software package are best placed to operate said software, or at least construct the basic monitoring configuration.
14 |
15 | This work aims to build on Julius Volz' document ["Prometheus Alerting and Dashboard Example Bundles"](#Notes) and subsequent PR ["Add initial node-exporter example bundle"](#Notes). In particular, we support the hypothesis that for Prometheus to gain increased traction we will need to appeal to non-monitoring-experts, and allow for a relatively seamless pre-configured monitoring experience. Where we disagree is around standardization: we do not want to prescribe a given label schema, example deployment or topology. That being said, a lot of the challenges surfaced in that doc are shared here.
16 |
17 | ## Aims
18 |
19 | This solution aims to define a minimal standard for how to package together Prometheus alerts, Prometheus recording rules and [Grafana](#Notes) dashboards in a way that is:
20 |
21 | **Easy to install and use, platform agnostic.** The users of these packages are unlikely to be monitoring experts. These packages must be easily installable with a few commands. And they must be general enough to work in all the environments where Prometheus can work: we're not just trying to build for Kubernetes here. That being said, the experience will be first class on Kubernetes.
22 |
23 | **Hosted alongside the programs which expose Prometheus metrics.** More often than not, the best people to build the alerting rules and dashboards for a given application are the authors of that application. And if that is not the case, then at least users of a given application will look to its source for monitoring best practices. We aim to provide a packaging method which allows the repo hosting the application source to also host the application's monitoring package, so that the two can be versioned alongside each other. For example, we envisage the monitoring mixin for Etcd to live in the etcd repo and the monitoring package for Hashicorp's Consul to live in the [consul_exporter](#Notes) repo.
24 |
25 | **We want the ability to iterate and collaborate on packages.** A challenge with the existing published dashboards and alerts is that they are static: the only way to use them is to copy them into your codebase and edit them to fit your deployment. This makes it hard for users to contribute changes back to the original author; it makes it impossible to download new improved versions and stay up to date with improvements. We want these packages to be constantly evolving; we want to encourage drive-by commits.
26 |
27 | **Packages should be reusable, configurable and extensible.** Users should be able to configure the packages to fit their deployments and labels schema without modifying the packages. Users should be able to extend the packages with extra dashboard panels and extra alerts, without having to copy, paste and modify them. The packages must be configurable so that they support the many different label schemes used today by different organisations.
28 |
29 | ## Proposal
30 |
31 | **Monitoring Mixins.** A monitoring mixin is a package of configuration containing Prometheus alerts, Prometheus recording rules and Grafana dashboards. Mixins will be maintained in version controlled repos (eg git) as a set of files. Versioning of mixins will be provided by the version control system; mixins themselves should not contain multiple versions.
32 |
33 | Mixins are intended just for the combination of Prometheus and Grafana, and not other monitoring or visualisation systems. Mixins are intended to be opinionated about the choice of monitoring technology.
34 |
35 | Mixins should not however be opinionated about how this configuration should be deployed; they should not contain manifests for deploying Prometheus and Grafana on Kubernetes, for instance. Multiple, separate projects can and should exist to help deploy mixins; we will provide examples of how to do this on Kubernetes, and a tool for integrating with traditional config management systems.
36 |
37 | **Jsonnet.** We propose the use of [jsonnet](#Notes), a configuration language from Google, as the basis of our monitoring mixins. Jsonnet has some popularity in this space, as it is used in the [ksonnet](#Notes) project for achieving similar goals for Kubernetes.
38 |
39 | Jsonnet offers the ability to parameterise configuration, allowing for basic customisation. Furthermore, in Jsonnet one can reference another part of the data structure, reducing repetition. For example, with jsonnet one can specify a default job name, and then have all the alerts use that:
40 |
41 | ```
42 | {
43 |   _config+:: {
44 |     kubeStateMetricsSelector: 'job="default/kube-state-metrics"',
45 |
46 |     allowedNotReadyPods: 0,
47 |   },
48 |
49 |   groups+: [
50 |     {
51 |       name: "kubernetes",
52 |       rules: [
53 |         {
54 |           alert: "KubePodNotReady",
55 |           expr: |||
56 |             sum by (namespace, pod) (
57 |               kube_pod_status_phase{%(kubeStateMetricsSelector)s, phase!~"Running|Succeeded"}
58 |             ) > %(allowedNotReadyPods)s
59 |           ||| % $._config,
60 |           "for": "1h",
61 |           labels: {
62 |             severity: "critical",
63 |           },
64 |           annotations: {
65 |             message: "{{ $labels.namespace }}/{{ $labels.pod }} is not ready.",
66 |           },
67 |         },
68 |       ],
69 |     },
70 |   ],
71 | }
72 | ```
73 |
74 | **Configuration.** We'd like to suggest some standardisation of how configuration is supplied to mixins. A top-level `_config` dictionary should be provided, containing various parameters for substitution into alerts and dashboards. In the above example, this is used to specify the selector for the kube-state-metrics pod, and the threshold for the alert; a consumer-side override is sketched below.
75 |
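As a minimal sketch, assuming a vendored `kubernetes-mixin` package, a consumer might override those parameters like so (the import path and the override values are illustrative, not part of the proposal):

```
// Import a mixin and override its _config defaults; the path and the
// values below are illustrative.
local kubernetes = import "kubernetes-mixin/mixin.libsonnet";

kubernetes {
  _config+:: {
    kubeStateMetricsSelector: 'job="monitoring/kube-state-metrics"',
    allowedNotReadyPods: 1,
  },
}
```
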
76 | **Extension.** One of jsonnet's basic operations is to "merge" data structures - this also allows you to extend existing configurations. For example, given an existing dashboard:
77 |
78 | ```
79 | local g = import "klumps/lib/grafana.libsonnet";
80 |
81 | {
82 |   dashboards+:: {
83 |     "foo.json": g.dashboard("Foo")
84 |       .addRow(
85 |         g.row("Foo")
86 |         .addPanel(
87 |           g.panel("Bar") +
88 |           g.queryPanel('irate(foo_bar_total[1m])', 'Foo Bar')
89 |         )
90 |       )
91 |   },
92 | }
93 | ```
94 |
95 | It is relatively easy to import it and add extra rows:
96 |
97 | ```
98 | local g = import "foo.libsonnet";
99 |
100 | {
101 |   dashboards+:: {
102 |     "foo.json"+:
103 |       super.addRow(
104 |         g.row("A new row")
105 |         .addPanel(
106 |           g.panel("A new panel") +
107 |           g.queryPanel('irate(new_total[1m])', 'New')
108 |         )
109 |       )
110 |   },
111 | }
112 | ```
113 |
114 | These abilities offered by jsonnet are key to being able to separate out "upstream" alerts and dashboards from customizations, and keep upstream in sync with the source of the mixin.
115 |
116 | **Higher Order Abstractions.** jsonnet is a functional programming language, and as such allows you to build higher order abstractions over your configuration. For example, you can build functions to generate recording rules for a set of percentiles and label aggregations, given a histogram:
117 |
118 | ```
119 | local histogramRules(metric, labels) =
120 |   local vars = {
121 |     metric: metric,
122 |     labels_underscore: std.join("_", labels),
123 |     labels_comma: std.join(", ", labels),
124 |   };
125 |   [
126 |     {
127 |       record: "%(labels_underscore)s:%(metric)s:99quantile" % vars,
128 |       expr: ("histogram_quantile(0.99, sum(rate(%(metric)s_bucket[5m])) by (le," +
129 |         " %(labels_comma)s))") % vars,
130 |     },
131 |     {
132 |       record: "%(labels_underscore)s:%(metric)s:50quantile" % vars,
133 |       expr: ("histogram_quantile(0.50, sum(rate(%(metric)s_bucket[5m])) by (le," +
134 |         " %(labels_comma)s))") % vars,
135 |     },
136 |     {
137 |       record: "%(labels_underscore)s:%(metric)s:avg" % vars,
138 |       expr: ("sum(rate(%(metric)s_sum[5m])) by (%(labels_comma)s) /" +
139 |         " sum(rate(%(metric)s_count[5m])) by (%(labels_comma)s)") % vars,
140 |     },
141 |   ];
142 |
143 | {
144 |   groups+: [{
145 |     name: "frontend_rules",
146 |     rules:
147 |       histogramRules("frontend_request_duration_seconds", ["job"]) +
148 |       histogramRules("frontend_request_duration_seconds", ["job", "route"]),
149 |   }],
150 | }
151 | ```
152 |
153 | Other potential examples include functions to generate alerts at different thresholds, emitting multiple alerts at warning and critical severity; one such sketch follows.
154 |
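For instance, a small helper along those lines might emit a warning/critical pair from a single definition; the names and thresholds below are hypothetical:

```
// Hypothetical helper: emit a warning and a critical alert for the
// same expression at two thresholds.
local alertPair(name, expr, warning, critical) = [
  {
    alert: name + "Warning",
    expr: "%s > %d" % [expr, warning],
    labels: { severity: "warning" },
  },
  {
    alert: name + "Critical",
    expr: "%s > %d" % [expr, critical],
    labels: { severity: "critical" },
  },
];

{
  groups+: [{
    name: "example_alerts",
    rules: alertPair("HighErrorRate", "sum(rate(errors_total[5m]))", 1, 10),
  }],
}
```
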
155 | **[Grafonnet](#Notes).** An emerging pattern in the jsonnet ecosystem is the existence of libraries of helper functions to generate objects for a given system. For example, ksonnet is a library to generate objects for the Kubernetes object model. Grafonnet is a library for generating Grafana Dashboards using jsonnet. We envisage a series of libraries, such as Grafonnet, to help people build mixins. As such, any system for installing mixins needs to deal with transitive dependencies.
156 |
157 | **Package Management.** The current proofs of concept for mixins (see below) use the new package manager [jsonnet-bundler](#Notes), enabling the following workflow:
158 |
159 | ```
160 | $ jb install github.com/kausalco/public/consul-mixin
161 | ```
162 |
163 | This downloads a copy of the mixin into `vendor/consul-mixin` and allows users to include the mixin in their ksonnet config like so:
164 |
165 | ```
166 | local prometheus = import "prometheus-ksonnet/prometheus-ksonnet.libsonnet";
167 | local consul_mixin = import "consul-mixin/mixin.libsonnet";
168 |
169 | prometheus + consul_mixin {
170 | _config+:: {
171 | namespace: "default",
172 | },
173 | }
174 | ```
175 |
176 | This example also uses the prometheus-ksonnet package from [Kausal](#Notes), which understands the structure of the mixins and manifests alerting rules, recording rules and dashboards as config maps in Kubernetes, mounted into the Kubernetes pods in the correct place.
177 |
178 | However, we think this is a wider problem than just monitoring mixins, and are exploring designs for a generic jsonnet package manager in a [separate design doc](#Notes).
179 |
180 | **Proposed Schema.** To allow multiple tools to utilise mixins, we must agree on some common naming. The proposal is that a mixin is a single dictionary containing three keys:
181 |
182 | - `grafanaDashboards` A dictionary of dashboard file name (foo.json) to dashboard json.
183 | - `prometheusAlerts` A list of Prometheus alert groups.
184 | - `prometheusRules` A list of Prometheus rule groups.
185 |
186 | Each of these values will be expressed as jsonnet objects - not strings. It is the responsibility of the tool consuming the mixin to render these out as JSON or YAML. Jsonnet scripts to do this for you will be provided.
187 |
188 | ```
189 | {
190 |   grafanaDashboards+:: {
191 |     "dashboard-name.json": {...},
192 |   },
193 |   prometheusAlerts+:: [...],
194 |   prometheusRules+:: [...],
195 | }
196 | ```
197 |
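For example, a rendering script in the spirit of this repository's `lib/alerts.jsonnet` can be a one-liner; the file name here is illustrative, and the program is run with `jsonnet -S` so the YAML string is emitted verbatim:

```
// render-alerts.jsonnet (illustrative name): manifest the mixin's
// alert groups as YAML. Run with: jsonnet -S render-alerts.jsonnet
std.manifestYamlDoc((import "mixin.libsonnet").prometheusAlerts)
```
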
198 | **Consuming a mixin.**
199 |
200 | - TODO examples of how we expect people to install, customise and extend mixins.
201 | - TODO Ability to manifest out jsonnet configuration in a variety of formats - YAML, JSON, INI etc
202 | - TODO show how it works with ksonnet but also with something like puppet..
203 |
204 | **Examples & Proof of Concepts.** We will probably put the specification and list of known mixins in a repo somewhere, as a README. For now, these are the known mixins and related projects:
205 |
206 | | Application | Mixin | Author |
207 | |------------------|--------------------|--------------------------------|
208 | | CoreOS Etcd | etcd-mixin | Grapeshot / Tom Wilkie |
209 | | Cassandra | TBD | Grafana Labs |
210 | | Hashicorp Consul | consul-mixin | Kausal |
211 | | Hashicorp Vault | vault_exporter | Grapeshot / Tom Wilkie |
212 | | Kubernetes | kubernetes-mixin | Tom Wilkie & Frederic Branczyk |
213 | | Kubernetes | kubernetes-grafana | Frederic Branczyk |
214 | | Kubernetes | kube-prometheus | Frederic Branczyk |
215 | | Prometheus | prometheus-ksonnet | Kausal |
216 |
217 | **Open Questions**
218 |
219 | - Some systems require exporters; can / should these be packaged as part of the mixin? Hard to do generally, easy to do for kubernetes with ksonnet.
220 | - On the exporter topic, some systems need statsd_exporter mappings to be consistent with alerts and dashboards. Even if we can include statsd_exporter in the mixin, can we include the mappings?
221 | - A lot of questions from Julius' design are still open: how to deal with different aggregation windows, what labels to use on alerts etc.
222 |
223 |
224 | ## Notes
225 |
226 | This was recreated from a [web.archive.org](https://web.archive.org/web/20211021151124/https://docs.google.com/document/d/1A9xvzwqnFVSOZ5fD3blKODXfsat5fg6ZhnKu9LK3lB4/edit) capture of the original document.
227 |
228 | The links in the archive do not work and have not been recreated.
229 |
230 | The license of this file is unknown, but judging by the intent it was meant to be shared freely.
231 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright [yyyy] [name of copyright owner]
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
203 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | BIN_DIR ?= $(shell pwd)/tmp/bin
2 |
3 | JSONNET_VENDOR=vendor
4 | GRAFANA_DASHBOARD_LINTER_BIN=$(BIN_DIR)/dashboard-linter
5 | JB_BIN=$(BIN_DIR)/jb
6 | JSONNET_BIN=$(BIN_DIR)/jsonnet
7 | JSONNETLINT_BIN=$(BIN_DIR)/jsonnet-lint
8 | JSONNETFMT_BIN=$(BIN_DIR)/jsonnetfmt
9 | MD_FILES = $(shell find . \( -type d -name '.vale' -o -type d -name 'vendor' \) -prune -o -type f -name "*.md" -print)
10 | MARKDOWNFMT_BIN=$(BIN_DIR)/markdownfmt
11 | VALE_BIN=$(BIN_DIR)/vale
12 | PROMTOOL_BIN=$(BIN_DIR)/promtool
13 | PINT_BIN=$(BIN_DIR)/pint
14 | TOOLING=$(JB_BIN) $(JSONNETLINT_BIN) $(JSONNET_BIN) $(JSONNETFMT_BIN) $(PROMTOOL_BIN) $(GRAFANA_DASHBOARD_LINTER_BIN) $(MARKDOWNFMT_BIN) $(VALE_BIN) $(PINT_BIN)
15 | JSONNETFMT_ARGS=-n 2 --max-blank-lines 2 --string-style s --comment-style s
16 | SRC_DIR ?=dashboards
17 | OUT_DIR ?=dashboards_out
18 |
19 | .PHONY: all
20 | all: fmt generate lint test
21 |
22 | .PHONY: generate
23 | generate: prometheus_alerts.yaml prometheus_rules.yaml $(OUT_DIR)
24 |
25 | $(JSONNET_VENDOR): $(JB_BIN) jsonnetfile.json
26 | 	$(JB_BIN) install
27 |
28 | .PHONY: fmt
29 | fmt: jsonnet-fmt markdownfmt
30 |
31 | .PHONY: jsonnet-fmt
32 | jsonnet-fmt: $(JSONNETFMT_BIN)
33 | 	@find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
34 | 		xargs -n 1 -- $(JSONNETFMT_BIN) $(JSONNETFMT_ARGS) -i
35 |
36 | .PHONY: markdownfmt
37 | markdownfmt: $(MARKDOWNFMT_BIN)
38 | 	@for file in $(MD_FILES); do $(MARKDOWNFMT_BIN) -w -gofmt $$file; done
39 |
40 | prometheus_alerts.yaml: $(JSONNET_BIN) mixin.libsonnet lib/alerts.jsonnet alerts/*.libsonnet
41 | 	@$(JSONNET_BIN) -J vendor -S lib/alerts.jsonnet > $@
42 |
43 | prometheus_rules.yaml: $(JSONNET_BIN) mixin.libsonnet lib/rules.jsonnet rules/*.libsonnet
44 | 	@$(JSONNET_BIN) -J vendor -S lib/rules.jsonnet > $@
45 |
46 | $(OUT_DIR): $(JSONNET_BIN) $(JSONNET_VENDOR) mixin.libsonnet lib/dashboards.jsonnet $(SRC_DIR)/*.libsonnet
47 | 	@mkdir -p $(OUT_DIR)
48 | 	@$(JSONNET_BIN) -J vendor -m $(OUT_DIR) lib/dashboards.jsonnet
49 |
50 | .PHONY: lint
51 | lint: jsonnet-lint alerts-lint dashboards-lint vale pint-lint
52 |
53 | .PHONY: jsonnet-lint
54 | jsonnet-lint: $(JSONNETLINT_BIN) $(JSONNET_VENDOR)
55 | 	@find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
56 | 		xargs -n 1 -- $(JSONNETLINT_BIN) -J vendor
57 |
58 | .PHONY: alerts-lint
59 | alerts-lint: $(PROMTOOL_BIN) prometheus_alerts.yaml prometheus_rules.yaml
60 | 	@$(PROMTOOL_BIN) check rules prometheus_rules.yaml
61 | 	@$(PROMTOOL_BIN) check rules prometheus_alerts.yaml
62 |
63 | $(OUT_DIR)/.lint: $(OUT_DIR)
64 | 	@cp .lint $@
65 |
66 | .PHONY: dashboards-lint
67 | dashboards-lint: $(GRAFANA_DASHBOARD_LINTER_BIN) $(OUT_DIR)/.lint
68 | 	# Replace $$interval:$$resolution var with $$__rate_interval to make dashboard-linter happy.
69 | 	@sed -i -e 's/$$interval:$$resolution/$$__rate_interval/g' $(OUT_DIR)/*.json
70 | 	@find $(OUT_DIR) -name '*.json' -print0 | xargs -n 1 -0 $(GRAFANA_DASHBOARD_LINTER_BIN) lint --strict
71 |
72 | .PHONY: vale
73 | vale: $(VALE_BIN)
74 | 	@$(VALE_BIN) sync && \
75 | 		$(VALE_BIN) $(MD_FILES)
76 |
77 | .PHONY: pint-lint
78 | pint-lint: generate $(PINT_BIN)
79 | 	@# Pint will not exit with a non-zero status code if there are linting issues.
80 | 	@output=$$($(PINT_BIN) -n -o -l WARN lint prometheus_alerts.yaml prometheus_rules.yaml 2>&1); \
81 | 	if [ -n "$$output" ]; then \
82 | 		echo "\n$$output"; \
83 | 		exit 1; \
84 | 	fi
85 |
86 | .PHONY: clean
87 | clean:
88 | 	# Remove all files and directories ignored by git.
89 | 	git clean -Xfd .
90 |
91 | .PHONY: test
92 | test: $(PROMTOOL_BIN) prometheus_alerts.yaml prometheus_rules.yaml
93 | 	@$(PROMTOOL_BIN) test rules tests/*.yaml
94 |
95 | $(BIN_DIR):
96 | 	mkdir -p $(BIN_DIR)
97 |
98 | $(TOOLING): $(BIN_DIR)
99 | 	@echo Installing tools from scripts/tools.go
100 | 	@cd scripts && go list -e -mod=mod -tags tools -f '{{ range .Imports }}{{ printf "%s\n" .}}{{end}}' ./ | xargs -tI % go build -mod=mod -o $(BIN_DIR) %
101 |
102 | ########################################
103 | # "check-with-upstream" workflow checks.
104 | ########################################
105 |
106 | check-selectors-ksm:
107 | 	@./scripts/check-selectors-ksm.sh
108 |
--------------------------------------------------------------------------------
/OWNERS:
--------------------------------------------------------------------------------
1 | # See the OWNERS docs: https://git.k8s.io/community/contributors/guide/owners.md
2 |
3 | approvers:
4 | - brancz
5 | - csmarchbanks
6 | - metalmatze
7 | - tomwilkie
8 | - s-urbaniak
9 | - povilasv
10 | - paulfantom
11 |
12 | reviewers:
13 | - brancz
14 | - csmarchbanks
15 | - metalmatze
16 | - tomwilkie
17 | - s-urbaniak
18 | - povilasv
19 | - paulfantom
20 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Prometheus Monitoring Mixin for Kubernetes
2 |
3 | [](https://github.com/kubernetes-monitoring/kubernetes-mixin/actions/workflows/ci.yaml)
4 |
5 | > NOTE: This project is in a *pre-release* stage. Flags, configuration, behaviour and design may change significantly in following releases.
6 |
7 | A set of Grafana dashboards and Prometheus alerts for Kubernetes.
8 |
9 | ## Releases
10 |
11 | > Note: Releases up to and including `release-0.12` were made from their own branches. Changelogs are included in releases starting from [version-0.13.0](https://github.com/kubernetes-monitoring/kubernetes-mixin/releases/tag/version-0.13.0).
12 |
13 | | Release branch | Kubernetes Compatibility | Prometheus Compatibility | Kube-state-metrics Compatibility |
14 | |----------------|--------------------------|--------------------------|----------------------------------|
15 | | release-0.1 | v1.13 and before | | |
16 | | release-0.2 | v1.14.1 and before | v2.11.0+ | |
17 | | release-0.3 | v1.17 and before | v2.11.0+ | |
18 | | release-0.4 | v1.18 | v2.11.0+ | |
19 | | release-0.5 | v1.19 | v2.11.0+ | |
20 | | release-0.6 | v1.19+ | v2.11.0+ | |
21 | | release-0.7 | v1.19+ | v2.11.0+ | v1.x |
22 | | release-0.8 | v1.20+ | v2.11.0+ | v2.0+ |
23 | | release-0.9 | v1.20+ | v2.11.0+ | v2.0+ |
24 | | release-0.10 | v1.20+ | v2.11.0+ | v2.0+ |
25 | | release-0.11 | v1.23+ | v2.11.0+ | v2.0+ |
26 | | release-0.12 | v1.23+ | v2.11.0+ | v2.0+ |
27 | | release-0.13 | v1.23+ | v2.11.0+ | v2.0+ |
28 | | master | v1.26+ | v2.11.0+ | v2.0+ |
29 |
30 | A major [metrics overhaul](https://github.com/kubernetes/enhancements/issues/1206) was implemented in Kubernetes 1.14. Therefore, v0.1.x of this repository is the last release to support Kubernetes 1.13 and earlier versions on a best-effort basis.
31 |
32 | Some alerts now use Prometheus filters made available in Prometheus 2.11.0, which makes this version of Prometheus a dependency.
33 |
34 | Warning: this compatibility matrix was initially created based on experience; we do not guarantee compatibility, and the matrix may be updated based on new learnings.
35 |
36 | Warning: By default the expressions will generate *Grafana 7.2+* compatible rules using the *$__rate_interval* variable for rate functions. If you need backward-compatible rules, please set *grafana72: false* in your *_config*.
37 |
38 | ### Release steps
39 |
40 | Maintainers can trigger the [release workflow](.github/workflows/release.yaml) by pushing a git tag that matches the pattern: `version-*`.
41 |
42 | 1. Check out the `master` branch and pull the latest changes.
43 |
44 | ```bash
45 | git checkout master
46 | ```
47 |
48 | 2. Create a tag following semantic versioning and push it to trigger the release.
49 |
50 | ```bash
51 | # replace MAJOR.MINOR.PATCH with e.g. 1.2.3
52 | tag=version-MAJOR.MINOR.PATCH; git tag $tag && git push origin $tag
53 | ```
54 |
55 | #### Decisions on backfilling releases
56 |
57 | We wanted to backfill `release-0.1` to `release-0.12` to have a changelog, but we were not able to use a GitHub action in a newer commit to trigger a release that generates a changelog on older commits. See #489 for full discussion.
58 |
59 | ## Metrics Deprecation
60 |
61 | The following recording rule is marked deprecated. It will be removed in v2.0.0.
62 |
63 | ```bash
64 | node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate
65 | ```
66 |
67 | It will be replaced by the following recording rule, which uses `rate` to preserve data points and adds `5m` to the recording rule name to indicate the range of the rate query.
68 |
69 | ```bash
70 | node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m
71 | ```
72 |
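For orientation, the replacement rule has roughly the following shape in jsonnet. This is a simplified sketch: the actual definition in `rules/` also applies the configured cAdvisor selector and additional label handling.

```jsonnet
// Simplified sketch of the replacement recording rule; the real rule
// in rules/ also filters by the configured cAdvisor selector.
{
  record: 'node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m',
  expr: |||
    sum by (node, namespace, pod, container) (
      rate(container_cpu_usage_seconds_total[5m])
    )
  |||,
}
```
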
73 | ## How to use
74 |
75 | This mixin is designed to be vendored into the repo with your infrastructure config. To do this, use [jsonnet-bundler](https://github.com/jsonnet-bundler/jsonnet-bundler):
76 |
77 | You then have three options for deploying your dashboards:
78 | 1. Generate the config files and deploy them yourself
79 | 2. Use ksonnet to deploy this mixin along with Prometheus and Grafana
80 | 3. Use prometheus-operator to deploy this mixin (TODO)
81 |
82 | ## Generate config files
83 |
84 | You can manually generate the alerts, dashboards and rules files, but first you must install some tools:
85 |
86 | ```
87 | $ go install github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@latest
88 | $ brew install jsonnet
89 | ```
90 |
91 | Then, grab the mixin and its dependencies:
92 |
93 | ```
94 | $ git clone https://github.com/kubernetes-monitoring/kubernetes-mixin
95 | $ cd kubernetes-mixin
96 | $ jb install
97 | ```
98 |
99 | Finally, build the mixin:
100 |
101 | ```
102 | $ make prometheus_alerts.yaml
103 | $ make prometheus_rules.yaml
104 | $ make dashboards_out
105 | ```
106 |
107 | The `prometheus_alerts.yaml` and `prometheus_rules.yaml` files then need to be passed to your Prometheus server, and the files in `dashboards_out` need to be imported into your Grafana server. The exact details will depend on how you deploy your monitoring stack to Kubernetes.
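   | 
   | One common approach is to mount the generated files via ConfigMaps; a sketch, where the `monitoring` namespace and ConfigMap names are illustrative:
   | 
   | ```
   | $ kubectl -n monitoring create configmap prometheus-rules \
   |     --from-file=prometheus_alerts.yaml --from-file=prometheus_rules.yaml
   | $ kubectl -n monitoring create configmap grafana-dashboards --from-file=dashboards_out/
   | ```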
108 |
109 | ### Dashboards for Windows Nodes
110 |
111 | There are separate dashboards for Windows resources.
112 | 1) Compute Resources / Cluster(Windows)
113 | 2) Compute Resources / Namespace(Windows)
114 | 3) Compute Resources / Pod(Windows)
115 | 4) USE Method / Cluster(Windows)
116 | 5) USE Method / Node(Windows)
117 |
118 | These dashboards are based on metrics populated by [windows-exporter](https://github.com/prometheus-community/windows_exporter) from each Windows node.
119 |
120 | ## Running the tests
121 |
122 | ```sh
123 | make test
124 | ```
125 |
126 | ## Using with prometheus-ksonnet
127 |
128 | Alternatively, you can use the mixin with [prometheus-ksonnet](https://github.com/kausalco/public/tree/master/prometheus-ksonnet), a [ksonnet](https://github.com/ksonnet/ksonnet) module to deploy a fully-fledged Prometheus-based monitoring system for Kubernetes:
129 |
130 | Make sure you have ksonnet v0.8.0:
131 |
132 | ```
133 | $ brew install https://raw.githubusercontent.com/ksonnet/homebrew-tap/82ef24cb7b454d1857db40e38671426c18cd8820/ks.rb
134 | $ brew pin ks
135 | $ ks version
136 | ksonnet version: v0.8.0
137 | jsonnet version: v0.9.5
138 | client-go version: v1.6.8-beta.0+$Format:%h$
139 | ```
140 |
141 | In your config repo, if you don't have a ksonnet application yet, make a new one (this will copy credentials from the current context):
142 |
143 | ```
144 | $ ks init <application name>
145 | $ cd <application name>
146 | $ ks env add default
147 | ```
148 |
149 | Grab the prometheus-ksonnet module and its dependencies, which include the kubernetes-mixin:
150 |
151 | ```
152 | $ go get github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb
153 | $ jb init
154 | $ jb install github.com/kausalco/public/prometheus-ksonnet
155 | ```
156 |
157 | Assuming you want to run in the default namespace (an 'environment' in ksonnet parlance), add the following to the file `environments/default/main.jsonnet`:
158 |
159 | ```jsonnet
160 | local prometheus = import "prometheus-ksonnet/prometheus-ksonnet.libsonnet";
161 |
162 | prometheus {
163 | _config+:: {
164 | namespace: "default",
165 | },
166 | }
167 | ```
168 |
169 | Apply your config:
170 |
171 | ```
172 | $ ks apply default
173 | ```
174 |
175 | ## Using prometheus-operator
176 |
177 | TODO
178 |
179 | ## Multi-cluster support
180 |
181 | Kubernetes-mixin can support dashboards across multiple clusters. You need either a multi-cluster [Thanos](https://github.com/improbable-eng/thanos) installation with `external_labels` configured or a [Cortex](https://github.com/cortexproject/cortex) system where a cluster label exists. To enable this feature you need to configure the following:
182 |
183 | ```jsonnet
184 | // Opt-in to multiCluster dashboards by overriding this and the clusterLabel.
185 | showMultiCluster: true,
186 | clusterLabel: '',
187 | ```
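   | 
   | For example, in your own `mixin.libsonnet` (a sketch; the label value must match the external label your Thanos or Cortex setup adds):
   | 
   | ```jsonnet
   | (import 'kubernetes-mixin/mixin.libsonnet') {
   |   _config+:: {
   |     showMultiCluster: true,
   |     clusterLabel: 'cluster',  // the external label identifying each cluster
   |   },
   | }
   | ```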
188 |
189 | ## Customising the mixin
190 |
191 | Kubernetes-mixin allows you to override the selectors used for various jobs to match those used in your Prometheus setup. You can also customise the dashboard names and add Grafana tags.
192 |
193 | In a new directory, add a file `mixin.libsonnet`:
194 |
195 | ```jsonnet
196 | local kubernetes = import "kubernetes-mixin/mixin.libsonnet";
197 |
198 | kubernetes {
199 | _config+:: {
200 | kubeStateMetricsSelector: 'job="kube-state-metrics"',
201 | cadvisorSelector: 'job="kubernetes-cadvisor"',
202 | nodeExporterSelector: 'job="kubernetes-node-exporter"',
203 | kubeletSelector: 'job="kubernetes-kubelet"',
204 | grafanaK8s+:: {
205 | dashboardNamePrefix: 'Mixin / ',
206 | dashboardTags: ['kubernetes', 'infrastructure'],
207 | },
208 | },
209 | }
210 | ```
211 |
212 | Then, install the kubernetes-mixin:
213 |
214 | ```
215 | $ jb init
216 | $ jb install github.com/kubernetes-monitoring/kubernetes-mixin
217 | ```
218 |
219 | Generate the alerts, rules and dashboards:
220 |
221 | ```
222 | $ jsonnet -J vendor -S -e 'std.manifestYamlDoc((import "mixin.libsonnet").prometheusAlerts)' > alerts.yml
223 | $ jsonnet -J vendor -S -e 'std.manifestYamlDoc((import "mixin.libsonnet").prometheusRules)' >files/rules.yml
224 | $ jsonnet -J vendor -m files/dashboards -e '(import "mixin.libsonnet").grafanaDashboards'
225 | ```
226 |
227 | ### Customising alert annotations
228 |
229 | The steps described below extend the existing mixin library without modifying the original git repository. This makes consuming updates to your extended alert definitions easier. These definitions can reside outside of this repository in your own custom location, where you can declare your alert dependencies in your `jsonnetfile.json` and add customisations to the existing definitions.
230 |
231 | In your working directory, create a new file `kubernetes_mixin_override.libsonnet` with the following:
232 |
233 | ```jsonnet
234 | local utils = import 'lib/utils.libsonnet';
235 | (import 'mixin.libsonnet') +
236 | (
237 | {
238 | prometheusAlerts+::
239 | // The specialAlerts can be in any other config file
240 | local slack = 'observability';
241 | local specialAlerts = {
242 | KubePodCrashLooping: { slack_channel: slack },
243 | KubePodNotReady: { slack_channel: slack },
244 | };
245 |
246 | local addExtraAnnotations(rule) = rule {
247 | [if 'alert' in rule then 'annotations']+: {
248 | dashboard: 'https://foo.bar.co',
249 | [if rule.alert in specialAlerts then 'slack_channel']: specialAlerts[rule.alert].slack_channel,
250 | },
251 | };
252 | utils.mapRuleGroups(addExtraAnnotations),
253 | }
254 | )
255 | ```
256 |
257 | Create new file: `lib/kubernetes_customised_alerts.jsonnet` with the following:
258 |
259 | ```jsonnet
260 | std.manifestYamlDoc((import '../kubernetes_mixin_override.libsonnet').prometheusAlerts)
261 | ```
262 |
263 | Running `jsonnet -S lib/kubernetes_customised_alerts.jsonnet` will build the alerts with your customisations.
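   | 
   | For example, to write the result to a file (the output filename is illustrative):
   | 
   | ```
   | $ jsonnet -S lib/kubernetes_customised_alerts.jsonnet > custom_alerts.yml
   | ```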
264 |
265 | The same result can be achieved by modifying the existing `config.libsonnet` with the content of `kubernetes_mixin_override.libsonnet`.
266 |
267 | ## Background
268 |
269 | ### Alert Severities
270 |
271 | While the community has not yet fully agreed on alert severities and how they are to be used, this repository assumes the following paradigms when setting the severities:
272 |
273 | * Critical: An issue that needs to page a person to take instant action.
274 | * Warning: An issue that needs to be worked on, but in the regular work queue or during office hours, rather than paging the on-call.
275 | * Info: Meant to support a troubleshooting process by informing about a non-normal situation for one or more systems, but not worth a page or a ticket on its own.
276 |
277 | ### Architecture and Technical Decisions
278 |
279 | * For more motivation, see "[The RED Method: How to instrument your services](https://kccncna17.sched.com/event/CU8K/the-red-method-how-to-instrument-your-services-b-tom-wilkie-kausal?iframe=no&w=100%&sidebar=yes&bg=no)" talk from CloudNativeCon Austin.
280 | * For more information about monitoring mixins, see this [design doc](DESIGN.md).
281 |
282 | ## Note
283 |
284 | You can use the external tool called [prom-metrics-check](https://github.com/ContainerSolutions/prom-metrics-check) to validate the created dashboards. This tool allows you to check whether the metrics used in Grafana dashboards exist in your Prometheus instance.
285 |
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 | # Security Policy
2 |
3 | ## Reporting a Vulnerability
4 |
5 | If you discover a security issue in this project, please report it to the project's [SECURITY_CONTACTS](SECURITY_CONTACTS). You can also ping the project's maintainers through the project's [Slack](https://kubernetes.slack.com/archives/CAX9GU941), privately.
6 |
--------------------------------------------------------------------------------
/SECURITY_CONTACTS:
--------------------------------------------------------------------------------
1 | # Defined below are the security contacts for this repo.
2 | #
3 | # They are the contact point for the Product Security Committee to reach out
4 | # to for triaging and handling of incoming issues.
5 | #
6 | # The below names agree to abide by the
7 | # [Embargo Policy](https://git.k8s.io/security/private-distributors-list.md#embargo-policy)
8 | # and will be removed and replaced if they violate that agreement.
9 | #
10 | # DO NOT REPORT SECURITY VULNERABILITIES DIRECTLY TO THESE NAMES, FOLLOW THE
11 | # INSTRUCTIONS AT https://kubernetes.io/security/
12 |
13 | brancz
14 | csmarchbanks
15 | metalmatze
16 | tomwilkie
17 |
--------------------------------------------------------------------------------
/alerts/alerts.libsonnet:
--------------------------------------------------------------------------------
1 | (import 'apps_alerts.libsonnet') +
2 | (import 'resource_alerts.libsonnet') +
3 | (import 'storage_alerts.libsonnet') +
4 | (import 'system_alerts.libsonnet') +
5 | (import 'kube_apiserver.libsonnet') +
6 | (import 'kubelet.libsonnet') +
7 | (import 'kube_scheduler.libsonnet') +
8 | (import 'kube_controller_manager.libsonnet') +
9 | (import 'kube_proxy.libsonnet') +
10 | (import '../lib/add-runbook-links.libsonnet')
11 |
--------------------------------------------------------------------------------
/alerts/kube_apiserver.libsonnet:
--------------------------------------------------------------------------------
1 | local utils = import '../lib/utils.libsonnet';
2 |
3 | {
4 | _config+:: {
5 | kubeApiserverSelector: error 'must provide selector for kube-apiserver',
6 |
7 | certExpirationWarningSeconds: 7 * 24 * 3600,
8 | certExpirationCriticalSeconds: 1 * 24 * 3600,
9 | },
10 |
11 | prometheusAlerts+:: {
12 | groups+: [
13 | {
14 | name: 'kube-apiserver-slos',
15 | rules: [
16 | {
17 | alert: 'KubeAPIErrorBudgetBurn',
18 | expr: |||
19 | sum by(%s) (apiserver_request:burnrate%s) > (%.2f * %.5f)
20 | and on(%s)
21 | sum by(%s) (apiserver_request:burnrate%s) > (%.2f * %.5f)
22 | ||| % [
23 | $._config.clusterLabel,
24 | w.long,
25 | w.factor,
26 | (1 - $._config.SLOs.apiserver.target),
27 | $._config.clusterLabel,
28 | $._config.clusterLabel,
29 | w.short,
30 | w.factor,
31 | (1 - $._config.SLOs.apiserver.target),
32 | ],
33 | labels: {
34 | severity: w.severity,
35 | short: '%(short)s' % w,
36 | long: '%(long)s' % w,
37 | },
38 | annotations: {
39 | description: 'The API server is burning too much error budget%s.' % [
40 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
41 | ],
42 | summary: 'The API server is burning too much error budget.',
43 | },
44 | 'for': '%(for)s' % w,
45 | }
46 | for w in $._config.SLOs.apiserver.windows
47 | ],
48 | },
49 | {
50 | name: 'kubernetes-system-apiserver',
51 | rules: [
52 | {
53 | alert: 'KubeClientCertificateExpiration',
54 | expr: |||
55 | histogram_quantile(0.01, sum without (%(namespaceLabel)s, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{%(kubeApiserverSelector)s}[5m]))) < %(certExpirationWarningSeconds)s
56 | and
57 | on(job, %(clusterLabel)s, instance) apiserver_client_certificate_expiration_seconds_count{%(kubeApiserverSelector)s} > 0
58 | ||| % $._config,
59 | 'for': '5m',
60 | labels: {
61 | severity: 'warning',
62 | },
63 | annotations: {
64 | description: 'A client certificate used to authenticate to kubernetes apiserver is expiring in less than %s%s.' % [
65 | (utils.humanizeSeconds($._config.certExpirationWarningSeconds)),
66 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
67 | ],
68 | summary: 'Client certificate is about to expire.',
69 | },
70 | },
71 | {
72 | alert: 'KubeClientCertificateExpiration',
73 | expr: |||
74 | histogram_quantile(0.01, sum without (%(namespaceLabel)s, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{%(kubeApiserverSelector)s}[5m]))) < %(certExpirationCriticalSeconds)s
75 | and
76 | on(job, %(clusterLabel)s, instance) apiserver_client_certificate_expiration_seconds_count{%(kubeApiserverSelector)s} > 0
77 | ||| % $._config,
78 | 'for': '5m',
79 | labels: {
80 | severity: 'critical',
81 | },
82 | annotations: {
83 | description: 'A client certificate used to authenticate to kubernetes apiserver is expiring in less than %s%s.' % [
84 | (utils.humanizeSeconds($._config.certExpirationCriticalSeconds)),
85 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
86 | ],
87 | summary: 'Client certificate is about to expire.',
88 | },
89 | },
90 | {
91 | alert: 'KubeAggregatedAPIErrors',
92 | expr: |||
93 | sum by(%(clusterLabel)s, instance, name, reason)(increase(aggregator_unavailable_apiservice_total{%(kubeApiserverSelector)s}[1m])) > 0
94 | ||| % $._config,
95 | 'for': '10m',
96 | labels: {
97 | severity: 'warning',
98 | },
99 | annotations: {
100 | description: 'Kubernetes aggregated API {{ $labels.instance }}/{{ $labels.name }} has reported {{ $labels.reason }} errors%s.' % [
101 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
102 | ],
103 | summary: 'Kubernetes aggregated API has reported errors.',
104 | },
105 | },
106 | {
107 | alert: 'KubeAggregatedAPIDown',
108 | expr: |||
109 | (1 - max by(name, namespace, %(clusterLabel)s)(avg_over_time(aggregator_unavailable_apiservice{%(kubeApiserverSelector)s}[10m]))) * 100 < 85
110 | ||| % $._config,
111 | 'for': '5m',
112 | labels: {
113 | severity: 'warning',
114 | },
115 | annotations: {
116 | description: 'Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}%% available over the last 10m%s.' % [
117 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
118 | ],
119 | summary: 'Kubernetes aggregated API is down.',
120 | },
121 | },
122 | (import '../lib/absent_alert.libsonnet') {
123 | componentName:: 'KubeAPI',
124 | selector:: $._config.kubeApiserverSelector,
125 | },
126 | {
127 | alert: 'KubeAPITerminatedRequests',
128 | expr: |||
129 | sum by(%(clusterLabel)s) (rate(apiserver_request_terminations_total{%(kubeApiserverSelector)s}[10m])) / ( sum by(%(clusterLabel)s) (rate(apiserver_request_total{%(kubeApiserverSelector)s}[10m])) + sum by(%(clusterLabel)s) (rate(apiserver_request_terminations_total{%(kubeApiserverSelector)s}[10m])) ) > 0.20
130 | ||| % $._config,
131 | labels: {
132 | severity: 'warning',
133 | },
134 | annotations: {
135 | description: 'The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests%s.' % [
136 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
137 | ],
138 | summary: 'The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.',
139 | },
140 | 'for': '5m',
141 | },
142 | ],
143 | },
144 | ],
145 | },
146 | }
147 |
--------------------------------------------------------------------------------
/alerts/kube_controller_manager.libsonnet:
--------------------------------------------------------------------------------
1 | {
2 | _config+:: {
3 | kubeControllerManagerSelector: error 'must provide selector for kube-controller-manager',
4 | },
5 |
6 | prometheusAlerts+:: {
7 | groups+: [
8 | {
9 | name: 'kubernetes-system-controller-manager',
10 | rules: [
11 | (import '../lib/absent_alert.libsonnet') {
12 | componentName:: 'KubeControllerManager',
13 | selector:: $._config.kubeControllerManagerSelector,
14 | },
15 | ],
16 | },
17 | ],
18 | },
19 | }
20 |
--------------------------------------------------------------------------------
/alerts/kube_proxy.libsonnet:
--------------------------------------------------------------------------------
1 | {
2 | _config+:: {
3 | kubeProxySelector: error 'must provide selector for kube-proxy',
4 | },
5 |
6 | prometheusAlerts+:: {
7 | groups+: [
8 | {
9 | name: 'kubernetes-system-kube-proxy',
10 | rules: [
11 | (import '../lib/absent_alert.libsonnet') {
12 | componentName:: 'KubeProxy',
13 | selector:: $._config.kubeProxySelector,
14 | },
15 | ],
16 | },
17 | ],
18 | },
19 | }
20 |
--------------------------------------------------------------------------------
/alerts/kube_scheduler.libsonnet:
--------------------------------------------------------------------------------
1 | {
2 | _config+:: {
3 | kubeSchedulerSelector: 'job="kube-scheduler"',
4 | },
5 |
6 | prometheusAlerts+:: {
7 | groups+: [
8 | {
9 | name: 'kubernetes-system-scheduler',
10 | rules: [
11 | (import '../lib/absent_alert.libsonnet') {
12 | componentName:: 'KubeScheduler',
13 | selector:: $._config.kubeSchedulerSelector,
14 | },
15 | ],
16 | },
17 | ],
18 | },
19 | }
20 |
--------------------------------------------------------------------------------
/alerts/kubelet.libsonnet:
--------------------------------------------------------------------------------
1 | local utils = import '../lib/utils.libsonnet';
2 |
3 | {
4 | _config+:: {
5 | kubeStateMetricsSelector: error 'must provide selector for kube-state-metrics',
6 | kubeletSelector: error 'must provide selector for kubelet',
7 | kubeNodeUnreachableIgnoreKeys: [
8 | 'ToBeDeletedByClusterAutoscaler',
9 | 'cloud.google.com/impending-node-termination',
10 | 'aws-node-termination-handler/spot-itn',
11 | ],
12 |
13 | kubeletCertExpirationWarningSeconds: 7 * 24 * 3600,
14 | kubeletCertExpirationCriticalSeconds: 1 * 24 * 3600,
15 |
16 | // Evictions per second that will trigger an alert. The default value will trigger on any evictions.
17 | KubeNodeEvictionRateThreshold: 0.0,
18 | },
19 |
20 | prometheusAlerts+:: {
21 | groups+: [
22 | {
23 | name: 'kubernetes-system-kubelet',
24 | rules: [
25 | {
26 | expr: |||
27 | kube_node_status_condition{%(kubeStateMetricsSelector)s,condition="Ready",status="true"} == 0
28 | and on (%(clusterLabel)s, node)
29 | kube_node_spec_unschedulable{%(kubeStateMetricsSelector)s} == 0
30 | ||| % $._config,
31 | labels: {
32 | severity: 'warning',
33 | },
34 | annotations: {
35 | description: '{{ $labels.node }} has been unready for more than 15 minutes%s.' % [
36 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
37 | ],
38 | summary: 'Node is not ready.',
39 | },
40 | 'for': '15m',
41 | alert: 'KubeNodeNotReady',
42 | },
43 | {
44 | alert: 'KubeNodePressure',
45 | expr: |||
46 | kube_node_status_condition{%(kubeStateMetricsSelector)s,condition=~"(MemoryPressure|DiskPressure|PIDPressure)",status="true"} == 1
47 | and on (%(clusterLabel)s, node)
48 | kube_node_spec_unschedulable{%(kubeStateMetricsSelector)s} == 0
49 | ||| % $._config,
50 | labels: {
51 | severity: 'info',
52 | },
53 | 'for': '10m',
54 | annotations: {
55 | description: '{{ $labels.node }}%s has active Condition {{ $labels.condition }}. This is caused by resource usage exceeding eviction thresholds.' % [
56 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
57 | ],
58 | summary: 'Node has an active Condition.',
59 | },
60 | },
61 | {
62 | expr: |||
63 | (kube_node_spec_taint{%(kubeStateMetricsSelector)s,key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{%(kubeStateMetricsSelector)s,key=~"%(kubeNodeUnreachableIgnoreKeys)s"}) == 1
64 | ||| % $._config {
65 | kubeNodeUnreachableIgnoreKeys: std.join('|', super.kubeNodeUnreachableIgnoreKeys),
66 | },
67 | labels: {
68 | severity: 'warning',
69 | },
70 | annotations: {
71 | description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled%s.' % [
72 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
73 | ],
74 | summary: 'Node is unreachable.',
75 | },
76 | 'for': '15m',
77 | alert: 'KubeNodeUnreachable',
78 | },
79 | {
80 | alert: 'KubeletTooManyPods',
81 | // Some nodes have a capacity of 1, like AWS Fargate nodes, and only exist while a pod is running on them.
82 | // We have to ignore these special nodes in the KubeletTooManyPods alert.
83 | expr: |||
84 | (
85 | max by (%(clusterLabel)s, instance) (
86 | kubelet_running_pods{%(kubeletSelector)s} > 1
87 | )
88 | * on (%(clusterLabel)s, instance) group_left(node)
89 | max by (%(clusterLabel)s, instance, node) (
90 | kubelet_node_name{%(kubeletSelector)s}
91 | )
92 | )
93 | / on (%(clusterLabel)s, node) group_left()
94 | max by (%(clusterLabel)s, node) (
95 | kube_node_status_capacity{%(kubeStateMetricsSelector)s, resource="pods"} != 1
96 | ) > 0.95
97 | ||| % $._config,
98 | 'for': '15m',
99 | labels: {
100 | severity: 'info',
101 | },
102 | annotations: {
103 | description: "Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity%s." % [
104 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
105 | ],
106 | summary: 'Kubelet is running at capacity.',
107 | },
108 | },
109 | {
110 | alert: 'KubeNodeReadinessFlapping',
111 | expr: |||
112 | sum(changes(kube_node_status_condition{%(kubeStateMetricsSelector)s,status="true",condition="Ready"}[15m])) by (%(clusterLabel)s, node) > 2
113 | and on (%(clusterLabel)s, node)
114 | kube_node_spec_unschedulable{%(kubeStateMetricsSelector)s} == 0
115 | ||| % $._config,
116 | 'for': '15m',
117 | labels: {
118 | severity: 'warning',
119 | },
120 | annotations: {
121 | description: 'The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes%s.' % [
122 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
123 | ],
124 | summary: 'Node readiness status is flapping.',
125 | },
126 | },
127 | {
128 | alert: 'KubeNodeEviction',
129 | expr: |||
130 | sum(rate(kubelet_evictions{%(kubeletSelector)s}[15m])) by(%(clusterLabel)s, eviction_signal, instance)
131 | * on (%(clusterLabel)s, instance) group_left(node)
132 | max by (%(clusterLabel)s, instance, node) (
133 | kubelet_node_name{%(kubeletSelector)s}
134 | )
135 | > %(KubeNodeEvictionRateThreshold)s
136 | ||| % $._config,
137 | labels: {
138 | severity: 'info',
139 | },
140 | 'for': '0s',
141 | annotations: {
142 | description: 'Node {{ $labels.node }}%s is evicting Pods due to {{ $labels.eviction_signal }}. Eviction occurs when eviction thresholds are crossed, typically caused by Pods exceeding RAM/ephemeral-storage limits.' % [
143 | utils.ifShowMultiCluster($._config, ' on {{ $labels.%(clusterLabel)s }}' % $._config),
144 | ],
145 | summary: 'Node is evicting pods.',
146 | },
147 | },
148 | {
149 | alert: 'KubeletPlegDurationHigh',
150 | expr: |||
151 | node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
152 | ||| % $._config,
153 | 'for': '5m',
154 | labels: {
155 | severity: 'warning',
156 | },
157 | annotations: {
158 | description: 'The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}%s.' % [
159 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
160 | ],
161 | summary: 'Kubelet Pod Lifecycle Event Generator is taking too long to relist.',
162 | },
163 | },
164 | {
165 | alert: 'KubeletPodStartUpLatencyHigh',
166 | expr: |||
167 | histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{%(kubeletSelector)s}[5m])) by (%(clusterLabel)s, instance, le)) * on(%(clusterLabel)s, instance) group_left(node) kubelet_node_name{%(kubeletSelector)s} > 60
168 | ||| % $._config,
169 | 'for': '15m',
170 | labels: {
171 | severity: 'warning',
172 | },
173 | annotations: {
174 | description: 'Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}%s.' % [
175 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
176 | ],
177 | summary: 'Kubelet Pod startup latency is too high.',
178 | },
179 | },
180 | {
181 | alert: 'KubeletClientCertificateExpiration',
182 | expr: |||
183 | kubelet_certificate_manager_client_ttl_seconds < %(kubeletCertExpirationWarningSeconds)s
184 | ||| % $._config,
185 | labels: {
186 | severity: 'warning',
187 | },
188 | annotations: {
189 | description: 'Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}%s.' % [
190 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
191 | ],
192 | summary: 'Kubelet client certificate is about to expire.',
193 | },
194 | },
195 | {
196 | alert: 'KubeletClientCertificateExpiration',
197 | expr: |||
198 | kubelet_certificate_manager_client_ttl_seconds < %(kubeletCertExpirationCriticalSeconds)s
199 | ||| % $._config,
200 | labels: {
201 | severity: 'critical',
202 | },
203 | annotations: {
204 | description: 'Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}%s.' % [
205 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
206 | ],
207 | summary: 'Kubelet client certificate is about to expire.',
208 | },
209 | },
210 | {
211 | alert: 'KubeletServerCertificateExpiration',
212 | expr: |||
213 | kubelet_certificate_manager_server_ttl_seconds < %(kubeletCertExpirationWarningSeconds)s
214 | ||| % $._config,
215 | labels: {
216 | severity: 'warning',
217 | },
218 | annotations: {
219 | description: 'Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}%s.' % [
220 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
221 | ],
222 | summary: 'Kubelet server certificate is about to expire.',
223 | },
224 | },
225 | {
226 | alert: 'KubeletServerCertificateExpiration',
227 | expr: |||
228 | kubelet_certificate_manager_server_ttl_seconds < %(kubeletCertExpirationCriticalSeconds)s
229 | ||| % $._config,
230 | labels: {
231 | severity: 'critical',
232 | },
233 | annotations: {
234 | description: 'Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}%s.' % [
235 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
236 | ],
237 | summary: 'Kubelet server certificate is about to expire.',
238 | },
239 | },
240 | {
241 | alert: 'KubeletClientCertificateRenewalErrors',
242 | expr: |||
243 | increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0
244 | ||| % $._config,
245 | labels: {
246 | severity: 'warning',
247 | },
248 | 'for': '15m',
249 | annotations: {
250 | description: 'Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes)%s.' % [
251 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
252 | ],
253 | summary: 'Kubelet has failed to renew its client certificate.',
254 | },
255 | },
256 | {
257 | alert: 'KubeletServerCertificateRenewalErrors',
258 | expr: |||
259 | increase(kubelet_server_expiration_renew_errors[5m]) > 0
260 | ||| % $._config,
261 | labels: {
262 | severity: 'warning',
263 | },
264 | 'for': '15m',
265 | annotations: {
266 | description: 'Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes)%s.' % [
267 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
268 | ],
269 | summary: 'Kubelet has failed to renew its server certificate.',
270 | },
271 | },
272 | (import '../lib/absent_alert.libsonnet') {
273 | componentName:: 'Kubelet',
274 | selector:: $._config.kubeletSelector,
275 | },
276 | ],
277 | },
278 | ],
279 | },
280 | }
281 |
--------------------------------------------------------------------------------
/alerts/resource_alerts.libsonnet:
--------------------------------------------------------------------------------
1 | local utils = import '../lib/utils.libsonnet';
2 |
3 | {
4 | _config+:: {
5 | kubeStateMetricsSelector: error 'must provide selector for kube-state-metrics',
6 | nodeExporterSelector: error 'must provide selector for node-exporter',
7 | namespaceSelector: null,
8 | prefixedNamespaceSelector: if self.namespaceSelector != null then self.namespaceSelector + ',' else '',
9 |
10 | // We alert when the aggregate (CPU, Memory) quota for all namespaces is
11 | // greater than the amount of the resources in the cluster. We do however
12 | // allow you to overcommit if you wish.
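   | // e.g. with the default 1.5, the Kube*QuotaOvercommit alerts only fire once the
   | // summed hard quotas exceed 150% of the cluster's allocatable CPU or memory.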
13 | namespaceOvercommitFactor: 1.5,
14 | cpuThrottlingPercent: 25,
15 | cpuThrottlingSelector: '',
16 | // Set this selector to select namespaces that contain resources used for overprovisioning.
17 | // See https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/FAQ.md#how-can-i-configure-overprovisioning-with-cluster-autoscaler
18 | // for more details.
19 | ignoringOverprovisionedWorkloadSelector: '',
20 | },
21 |
22 | prometheusAlerts+:: {
23 | groups+: [
24 | {
25 | name: 'kubernetes-resources',
26 | rules: [
27 | {
28 | alert: 'KubeCPUOvercommit',
29 | labels: {
30 | severity: 'warning',
31 | },
32 | annotations: {
33 | summary: 'Cluster has overcommitted CPU resource requests.',
34 | },
35 | 'for': '10m',
36 | } +
37 | if $._config.showMultiCluster then {
38 | expr: |||
39 | sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0
40 | and
41 | (sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0
42 | ||| % $._config,
43 | annotations+: {
44 | description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted CPU resource requests for Pods by {{ printf "%%.2f" $value }} CPU shares and cannot tolerate node failure.' % $._config,
45 | },
46 | } else {
47 | expr: |||
48 | sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0
49 | and
50 | (sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0
51 | ||| % $._config,
52 | annotations+: {
53 | description: 'Cluster has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.' % $._config,
54 | },
55 | },
56 | {
57 | alert: 'KubeMemoryOvercommit',
58 | labels: {
59 | severity: 'warning',
60 | },
61 | annotations: {
62 | summary: 'Cluster has overcommitted memory resource requests.',
63 | },
64 | 'for': '10m',
65 | } +
66 | if $._config.showMultiCluster then {
67 | expr: |||
68 | sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0
69 | and
70 | (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0
71 | ||| % $._config,
72 | annotations+: {
73 | description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.' % $._config,
74 | },
75 | } else
76 | {
77 | expr: |||
78 | sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0
79 | and
80 | (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0
81 | ||| % $._config,
82 | annotations+: {
83 | description: 'Cluster has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.',
84 | },
85 | },
86 | {
87 | alert: 'KubeCPUQuotaOvercommit',
88 | labels: {
89 | severity: 'warning',
90 | },
91 | annotations: {
92 | summary: 'Cluster has overcommitted CPU resource requests.',
93 | },
94 | 'for': '5m',
95 | } +
96 | if $._config.showMultiCluster then {
97 | expr: |||
98 | sum(min without(resource) (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource=~"(cpu|requests.cpu)"})) by (%(clusterLabel)s)
99 | /
100 | sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)
101 | > %(namespaceOvercommitFactor)s
102 | ||| % $._config,
103 | annotations+: {
104 | description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted CPU resource requests for Namespaces.' % $._config,
105 | },
106 | } else
107 | {
108 | expr: |||
109 | sum(min without(resource) (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource=~"(cpu|requests.cpu)"}))
110 | /
111 | sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})
112 | > %(namespaceOvercommitFactor)s
113 | ||| % $._config,
114 | annotations+: {
115 | description: 'Cluster has overcommitted CPU resource requests for Namespaces.',
116 | },
117 | },
118 | {
119 | alert: 'KubeMemoryQuotaOvercommit',
120 | labels: {
121 | severity: 'warning',
122 | },
123 | annotations: {
124 | summary: 'Cluster has overcommitted memory resource requests.',
125 | },
126 | 'for': '5m',
127 | } +
128 | if $._config.showMultiCluster then {
129 | expr: |||
130 | sum(min without(resource) (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource=~"(memory|requests.memory)"})) by (%(clusterLabel)s)
131 | /
132 | sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)
133 | > %(namespaceOvercommitFactor)s
134 | ||| % $._config,
135 | annotations+: {
136 | description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted memory resource requests for Namespaces.' % $._config,
137 | },
138 | } else
139 | {
140 | expr: |||
141 | sum(min without(resource) (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource=~"(memory|requests.memory)"}))
142 | /
143 | sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})
144 | > %(namespaceOvercommitFactor)s
145 | ||| % $._config,
146 | annotations+: {
147 | description: 'Cluster has overcommitted memory resource requests for Namespaces.',
148 | },
149 | },
150 | {
151 | alert: 'KubeQuotaAlmostFull',
152 | expr: |||
153 | kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="used"}
154 | / ignoring(instance, job, type)
155 | (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard"} > 0)
156 | > 0.9 < 1
157 | ||| % $._config,
158 | 'for': '15m',
159 | labels: {
160 | severity: 'info',
161 | },
162 | annotations: {
163 | description: 'Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota%s.' % [
164 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
165 | ],
166 | summary: 'Namespace quota is going to be full.',
167 | },
168 | },
169 | {
170 | alert: 'KubeQuotaFullyUsed',
171 | expr: |||
172 | kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="used"}
173 | / ignoring(instance, job, type)
174 | (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard"} > 0)
175 | == 1
176 | ||| % $._config,
177 | 'for': '15m',
178 | labels: {
179 | severity: 'info',
180 | },
181 | annotations: {
182 | description: 'Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota%s.' % [
183 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
184 | ],
185 | summary: 'Namespace quota is fully used.',
186 | },
187 | },
188 | {
189 | alert: 'KubeQuotaExceeded',
190 | expr: |||
191 | kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="used"}
192 | / ignoring(instance, job, type)
193 | (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard"} > 0)
194 | > 1
195 | ||| % $._config,
196 | 'for': '15m',
197 | labels: {
198 | severity: 'warning',
199 | },
200 | annotations: {
201 | description: 'Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota%s.' % [
202 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
203 | ],
204 | summary: 'Namespace quota has exceeded the limits.',
205 | },
206 | },
207 | {
208 | alert: 'CPUThrottlingHigh',
209 | expr: |||
210 | sum(increase(container_cpu_cfs_throttled_periods_total{container!="", %(cadvisorSelector)s, %(cpuThrottlingSelector)s}[5m])) without (id, metrics_path, name, image, endpoint, job, node)
211 | / on (%(clusterLabel)s, %(namespaceLabel)s, pod, container, instance) group_left
212 | sum(increase(container_cpu_cfs_periods_total{%(cadvisorSelector)s, %(cpuThrottlingSelector)s}[5m])) without (id, metrics_path, name, image, endpoint, job, node)
213 | > ( %(cpuThrottlingPercent)s / 100 )
214 | ||| % $._config,
215 | 'for': '15m',
216 | labels: {
217 | severity: 'info',
218 | },
219 | annotations: {
220 | description: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}%s.' % [
221 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
222 | ],
223 | summary: 'Processes experience elevated CPU throttling.',
224 | },
225 | },
226 | ],
227 | },
228 | ],
229 | },
230 | }
231 |
--------------------------------------------------------------------------------
/alerts/storage_alerts.libsonnet:
--------------------------------------------------------------------------------
1 | {
2 | _config+:: {
3 | kubeStateMetricsSelector: error 'must provide selector for kube-state-metrics',
4 | kubeletSelector: error 'must provide selector for kubelet',
5 | namespaceSelector: null,
6 | prefixedNamespaceSelector: if self.namespaceSelector != null then self.namespaceSelector + ',' else '',
7 |
8 | // We alert when a disk is expected to fill up in four days. Depending on
9 | // the dataset it might be useful to change the sampling time for the
10 | // prediction.
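   | // e.g. with '6h', predict_linear extrapolates the last 6 hours of samples
   | // 4 days (4 * 24 * 3600 seconds) ahead and alerts if the volume would be full by then.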
11 | volumeFullPredictionSampleTime: '6h',
12 | },
13 |
14 | prometheusAlerts+:: {
15 | groups+: [
16 | {
17 | name: 'kubernetes-storage',
18 | rules: [
19 | {
20 | alert: 'KubePersistentVolumeFillingUp',
21 | expr: |||
22 | (
23 | kubelet_volume_stats_available_bytes{%(prefixedNamespaceSelector)s%(kubeletSelector)s}
24 | /
25 | kubelet_volume_stats_capacity_bytes{%(prefixedNamespaceSelector)s%(kubeletSelector)s}
26 | ) < 0.03
27 | and
28 | kubelet_volume_stats_used_bytes{%(prefixedNamespaceSelector)s%(kubeletSelector)s} > 0
29 | unless on(%(clusterLabel)s, namespace, persistentvolumeclaim)
30 | kube_persistentvolumeclaim_access_mode{%(prefixedNamespaceSelector)s access_mode="ReadOnlyMany"} == 1
31 | unless on(%(clusterLabel)s, namespace, persistentvolumeclaim)
32 | kube_persistentvolumeclaim_labels{%(prefixedNamespaceSelector)s%(pvExcludedSelector)s} == 1
33 | ||| % $._config,
34 | 'for': '1m',
35 | labels: {
36 | severity: 'critical',
37 | },
38 | annotations: {
39 | description: 'The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} {{ with $labels.%(clusterLabel)s -}} on Cluster {{ . }} {{- end }} is only {{ $value | humanizePercentage }} free.' % $._config,
40 | summary: 'PersistentVolume is filling up.',
41 | },
42 | },
43 | {
44 | alert: 'KubePersistentVolumeFillingUp',
45 | expr: |||
46 | (
47 | kubelet_volume_stats_available_bytes{%(prefixedNamespaceSelector)s%(kubeletSelector)s}
48 | /
49 | kubelet_volume_stats_capacity_bytes{%(prefixedNamespaceSelector)s%(kubeletSelector)s}
50 | ) < 0.15
51 | and
52 | kubelet_volume_stats_used_bytes{%(prefixedNamespaceSelector)s%(kubeletSelector)s} > 0
53 | and
54 | predict_linear(kubelet_volume_stats_available_bytes{%(prefixedNamespaceSelector)s%(kubeletSelector)s}[%(volumeFullPredictionSampleTime)s], 4 * 24 * 3600) < 0
55 | unless on(%(clusterLabel)s, namespace, persistentvolumeclaim)
56 | kube_persistentvolumeclaim_access_mode{%(prefixedNamespaceSelector)s access_mode="ReadOnlyMany"} == 1
57 | unless on(%(clusterLabel)s, namespace, persistentvolumeclaim)
58 | kube_persistentvolumeclaim_labels{%(prefixedNamespaceSelector)s%(pvExcludedSelector)s} == 1
59 | ||| % $._config,
60 | 'for': '1h',
61 | labels: {
62 | severity: 'warning',
63 | },
64 | annotations: {
65 | description: 'Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} {{ with $labels.%(clusterLabel)s -}} on Cluster {{ . }} {{- end }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.' % $._config,
66 | summary: 'PersistentVolume is filling up.',
67 | },
68 | },
69 | {
70 | alert: 'KubePersistentVolumeInodesFillingUp',
71 | expr: |||
72 | (
73 | kubelet_volume_stats_inodes_free{%(prefixedNamespaceSelector)s%(kubeletSelector)s}
74 | /
75 | kubelet_volume_stats_inodes{%(prefixedNamespaceSelector)s%(kubeletSelector)s}
76 | ) < 0.03
77 | and
78 | kubelet_volume_stats_inodes_used{%(prefixedNamespaceSelector)s%(kubeletSelector)s} > 0
79 | unless on(%(clusterLabel)s, namespace, persistentvolumeclaim)
80 | kube_persistentvolumeclaim_access_mode{%(prefixedNamespaceSelector)s access_mode="ReadOnlyMany"} == 1
81 | unless on(%(clusterLabel)s, namespace, persistentvolumeclaim)
82 | kube_persistentvolumeclaim_labels{%(prefixedNamespaceSelector)s%(pvExcludedSelector)s} == 1
83 | ||| % $._config,
84 | 'for': '1m',
85 | labels: {
86 | severity: 'critical',
87 | },
88 | annotations: {
89 | description: 'The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} {{ with $labels.%(clusterLabel)s -}} on Cluster {{ . }} {{- end }} only has {{ $value | humanizePercentage }} free inodes.' % $._config,
90 | summary: 'PersistentVolumeInodes are filling up.',
91 | },
92 | },
93 | {
94 | alert: 'KubePersistentVolumeInodesFillingUp',
95 | expr: |||
96 | (
97 | kubelet_volume_stats_inodes_free{%(prefixedNamespaceSelector)s%(kubeletSelector)s}
98 | /
99 | kubelet_volume_stats_inodes{%(prefixedNamespaceSelector)s%(kubeletSelector)s}
100 | ) < 0.15
101 | and
102 | kubelet_volume_stats_inodes_used{%(prefixedNamespaceSelector)s%(kubeletSelector)s} > 0
103 | and
104 | predict_linear(kubelet_volume_stats_inodes_free{%(prefixedNamespaceSelector)s%(kubeletSelector)s}[%(volumeFullPredictionSampleTime)s], 4 * 24 * 3600) < 0
105 | unless on(%(clusterLabel)s, namespace, persistentvolumeclaim)
106 | kube_persistentvolumeclaim_access_mode{%(prefixedNamespaceSelector)s access_mode="ReadOnlyMany"} == 1
107 | unless on(%(clusterLabel)s, namespace, persistentvolumeclaim)
108 | kube_persistentvolumeclaim_labels{%(prefixedNamespaceSelector)s%(pvExcludedSelector)s} == 1
109 | ||| % $._config,
110 | 'for': '1h',
111 | labels: {
112 | severity: 'warning',
113 | },
114 | annotations: {
115 | description: 'Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} {{ with $labels.%(clusterLabel)s -}} on Cluster {{ . }} {{- end }} is expected to run out of inodes within four days. Currently {{ $value | humanizePercentage }} of its inodes are free.' % $._config,
116 | summary: 'PersistentVolumeInodes are filling up.',
117 | },
118 | },
119 | {
120 | alert: 'KubePersistentVolumeErrors',
121 | expr: |||
122 | kube_persistentvolume_status_phase{phase=~"Failed|Pending",%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s} > 0
123 | ||| % $._config,
124 | 'for': '5m',
125 | labels: {
126 | severity: 'critical',
127 | },
128 | annotations: {
129 | description: 'The persistent volume {{ $labels.persistentvolume }} {{ with $labels.%(clusterLabel)s -}} on Cluster {{ . }} {{- end }} has status {{ $labels.phase }}.' % $._config,
130 | summary: 'PersistentVolume is having issues with provisioning.',
131 | },
132 | },
133 | ],
134 | },
135 | ],
136 | },
137 | }
138 |
--------------------------------------------------------------------------------
/alerts/system_alerts.libsonnet:
--------------------------------------------------------------------------------
1 | local utils = import '../lib/utils.libsonnet';
2 |
3 | {
4 | _config+:: {
5 | notKubeDnsCoreDnsSelector: 'job!~"kube-dns|coredns"',
6 | kubeApiserverSelector: 'job="kube-apiserver"',
7 | },
8 |
9 | prometheusAlerts+:: {
10 | groups+: [
11 | {
12 | name: 'kubernetes-system',
13 | rules: [
14 | {
15 | alert: 'KubeVersionMismatch',
16 | expr: |||
17 | count by (%(clusterLabel)s) (count by (git_version, %(clusterLabel)s) (label_replace(kubernetes_build_info{%(notKubeDnsCoreDnsSelector)s},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
18 | ||| % $._config,
19 | 'for': '15m',
20 | labels: {
21 | severity: 'warning',
22 | },
23 | annotations: {
24 | description: 'There are {{ $value }} different semantic versions of Kubernetes components running%s.' % [
25 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
26 | ],
27 | summary: 'Different semantic versions of Kubernetes components running.',
28 | },
29 | },
30 | {
31 | alert: 'KubeClientErrors',
32 | // Many clients use get requests to check the existence of objects,
33 | // this is normal and an expected error, therefore it should be
34 | // ignored in this alert.
35 | expr: |||
36 | (sum(rate(rest_client_requests_total{%(kubeApiserverSelector)s,code=~"5.."}[5m])) by (%(clusterLabel)s, instance, job, namespace)
37 | /
38 | sum(rate(rest_client_requests_total{%(kubeApiserverSelector)s}[5m])) by (%(clusterLabel)s, instance, job, namespace))
39 | > 0.01
40 | ||| % $._config,
41 | 'for': '15m',
42 | labels: {
43 | severity: 'warning',
44 | },
45 | annotations: {
46 | description: "Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors%s." % [
47 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
48 | ],
49 | summary: 'Kubernetes API server client is experiencing errors.',
50 | },
51 | },
52 | ],
53 | },
54 | ],
55 | },
56 | }
57 |
--------------------------------------------------------------------------------
/config.libsonnet:
--------------------------------------------------------------------------------
1 | {
2 | _config+:: {
3 | SLOs: {
4 | apiserver: {
5 | days: 30, // The number of days we alert on burning too much error budget for.
6 | target: 0.99, // The target percentage of availability between 0-1. (0.99 = 99%, 0.999 = 99.9%)
7 |
8 | // Only change these windows when you really understand multi burn rate errors.
9 | // Even though you can change the days above (which will change availability calculations),
10 | // these windows will alert on a 30-day sliding window. We're looking into basing these windows on the given days too.
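   | // Example: with target 0.99 the error budget is 1%, so the first window (factor 14.4)
   | // fires when more than 14.4% of requests fail over both the 1h and 5m windows,
   | // a rate that would spend the whole 30-day budget in roughly two days.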
11 | windows: [
12 | { severity: 'critical', 'for': '2m', long: '1h', short: '5m', factor: 14.4 },
13 | { severity: 'critical', 'for': '15m', long: '6h', short: '30m', factor: 6 },
14 | { severity: 'warning', 'for': '1h', long: '1d', short: '2h', factor: 3 },
15 | { severity: 'warning', 'for': '3h', long: '3d', short: '6h', factor: 1 },
16 | ],
17 | },
18 | },
19 |
20 | // Selectors are inserted between {} in Prometheus queries.
21 | cadvisorSelector: 'job="cadvisor"',
22 | kubeletSelector: 'job="kubelet"',
23 | kubeStateMetricsSelector: 'job="kube-state-metrics"',
24 | nodeExporterSelector: 'job="node-exporter"',
25 | kubeSchedulerSelector: 'job="kube-scheduler"',
26 | kubeControllerManagerSelector: 'job="kube-controller-manager"',
27 | kubeApiserverSelector: 'job="kube-apiserver"',
28 | kubeProxySelector: 'job="kube-proxy"',
29 | podLabel: 'pod',
30 | hostNetworkInterfaceSelector: 'device!~"veth.+"',
31 | hostMountpointSelector: 'mountpoint="/"',
32 | windowsExporterSelector: 'job="kubernetes-windows-exporter"',
33 | containerfsSelector: 'container!=""',
34 |
35 | // List of labels to join for different types of metrics.
36 | // Only works if your environment has the labels kube_%s_labels (e.g. kube_pod_labels) available.
37 | common_join_labels: [],
38 | pods_join_labels: $._config.common_join_labels,
39 | statefulsets_join_labels: $._config.common_join_labels,
40 | deployments_join_labels: $._config.common_join_labels,
41 | daemonsets_join_labels: $._config.common_join_labels,
42 | horizontalpodautoscalers_join_labels: $._config.common_join_labels,
43 | jobs_join_labels: $._config.common_join_labels,
44 |
45 | // Grafana dashboard IDs are necessary for stable links for dashboards
46 | grafanaDashboardIDs: {
47 | 'apiserver.json': std.md5('apiserver.json'),
48 | 'cluster-total.json': std.md5('cluster-total.json'),
49 | 'controller-manager.json': std.md5('controller-manager.json'),
50 | 'k8s-resources-cluster.json': std.md5('k8s-resources-cluster.json'),
51 | 'k8s-resources-multicluster.json': std.md5('k8s-resources-multicluster.json'),
52 | 'k8s-resources-namespace.json': std.md5('k8s-resources-namespace.json'),
53 | 'k8s-resources-node.json': std.md5('k8s-resources-node.json'),
54 | 'k8s-resources-pod.json': std.md5('k8s-resources-pod.json'),
55 | 'k8s-resources-windows-cluster.json': std.md5('k8s-resources-windows-cluster.json'),
56 | 'k8s-resources-windows-namespace.json': std.md5('k8s-resources-windows-namespace.json'),
57 | 'k8s-resources-windows-pod.json': std.md5('k8s-resources-windows-pod.json'),
58 | 'k8s-resources-workload.json': std.md5('k8s-resources-workload.json'),
59 | 'k8s-resources-workloads-namespace.json': std.md5('k8s-resources-workloads-namespace.json'),
60 | 'k8s-windows-cluster-rsrc-use.json': std.md5('k8s-windows-cluster-rsrc-use.json'),
61 | 'k8s-windows-node-rsrc-use.json': std.md5('k8s-windows-node-rsrc-use.json'),
62 | 'kubelet.json': std.md5('kubelet.json'),
63 | 'namespace-by-pod.json': std.md5('namespace-by-pod.json'),
64 | 'namespace-by-workload.json': std.md5('namespace-by-workload.json'),
65 | 'persistentvolumesusage.json': std.md5('persistentvolumesusage.json'),
66 | 'pod-total.json': std.md5('pod-total.json'),
67 | 'proxy.json': std.md5('proxy.json'),
68 | 'scheduler.json': std.md5('scheduler.json'),
69 | 'workload-total.json': std.md5('workload-total.json'),
70 | },
71 |
72 | // Support for Grafana 7.2+ `$__rate_interval` instead of `$__interval`
73 | grafana72: true,
74 | grafanaIntervalVar: if self.grafana72 then '$__rate_interval' else '$__interval',
75 |
76 | // Config for the Grafana dashboards in the Kubernetes Mixin
77 | grafanaK8s: {
78 | dashboardNamePrefix: 'Kubernetes / ',
79 | dashboardTags: ['kubernetes-mixin'],
80 |
81 | // For links between Grafana dashboards, you need to tell us if your Grafana
82 | // server is under some non-root path.
83 | linkPrefix: '',
84 |
85 | // The default refresh time for all dashboards; defaults to 10s.
86 | refresh: '10s',
87 | minimumTimeInterval: '1m',
88 |
89 | // Timezone for Grafana dashboards: UTC, browser, ...
90 | grafanaTimezone: 'UTC',
91 | },
92 |
93 | // Opt-in to multiCluster dashboards by overriding this and the clusterLabel.
94 | showMultiCluster: false,
95 | clusterLabel: 'cluster',
96 |
97 | namespaceLabel: 'namespace',
98 |
99 | // Default datasource name
100 | datasourceName: 'default',
101 |
102 | // Datasource instance filter regex
103 | datasourceFilterRegex: '',
104 |
105 | // This list of filesystems is referenced in various expressions.
106 | fstypes: ['ext[234]', 'btrfs', 'xfs', 'zfs'],
107 | fstypeSelector: 'fstype=~"%s"' % std.join('|', self.fstypes),
108 |
109 | // This list of disk device names is referenced in various expressions.
110 | diskDevices: ['mmcblk.p.+', 'nvme.+', 'rbd.+', 'sd.+', 'vd.+', 'xvd.+', 'dm-.+', 'dasd.+'],
111 | diskDeviceSelector: 'device=~"(/dev.+)|%s"' % std.join('|', self.diskDevices),
112 |
113 | // Certain workloads (e.g. KubeVirt/CDI) will fully utilise the persistent volume they claim;
114 | // the size of the PV will never grow since they consume the entirety of the volume by design.
115 | // This selector allows an admin to 'pre-mark' the PVC of such a workload (or for any other use case)
116 | // so that specific storage alerts will not fire. With the default selector, adding a label `excluded-from-alerts: 'true'`
117 | // to the PVC will have the desired effect.
118 | pvExcludedSelector: 'label_excluded_from_alerts="true"',
119 |
120 | // Default timeout value for k8s Jobs. The jobs which are active beyond this duration would trigger KubeJobNotCompleted alert.
121 | kubeJobTimeoutDuration: 12 * 60 * 60,
122 | },
123 | }
124 |
--------------------------------------------------------------------------------
/dashboards/controller-manager.libsonnet:
--------------------------------------------------------------------------------
1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
2 | local prometheus = g.query.prometheus;
3 | local stat = g.panel.stat;
4 | local timeSeries = g.panel.timeSeries;
5 | local var = g.dashboard.variable;
6 |
7 | {
8 | local statPanel(title, unit, query) =
9 | stat.new(title)
10 | + stat.options.withColorMode('none')
11 | + stat.standardOptions.withUnit(unit)
12 | + stat.queryOptions.withInterval($._config.grafanaK8s.minimumTimeInterval)
13 | + stat.queryOptions.withTargets([
14 | prometheus.new('${datasource}', query)
15 | + prometheus.withInstant(true),
16 | ]),
17 |
18 | local tsPanel =
19 | timeSeries {
20 | new(title):
21 | timeSeries.new(title)
22 | + timeSeries.options.legend.withShowLegend()
23 | + timeSeries.options.legend.withAsTable()
24 | + timeSeries.options.legend.withDisplayMode('table')
25 | + timeSeries.options.legend.withPlacement('right')
26 | + timeSeries.options.legend.withCalcs(['lastNotNull'])
27 | + timeSeries.options.tooltip.withMode('single')
28 | + timeSeries.fieldConfig.defaults.custom.withShowPoints('never')
29 | + timeSeries.fieldConfig.defaults.custom.withFillOpacity(10)
30 | + timeSeries.fieldConfig.defaults.custom.withSpanNulls(true)
31 | + timeSeries.queryOptions.withInterval($._config.grafanaK8s.minimumTimeInterval),
32 | },
33 |
34 | grafanaDashboards+:: {
35 | 'controller-manager.json':
36 | local variables = {
37 | datasource:
38 | var.datasource.new('datasource', 'prometheus')
39 | + var.datasource.withRegex($._config.datasourceFilterRegex)
40 | + var.datasource.generalOptions.showOnDashboard.withLabelAndValue()
41 | + var.datasource.generalOptions.withLabel('Data source')
42 | + {
43 | current: {
44 | selected: true,
45 | text: $._config.datasourceName,
46 | value: $._config.datasourceName,
47 | },
48 | },
49 |
50 | cluster:
51 | var.query.new('cluster')
52 | + var.query.withDatasourceFromVariable(self.datasource)
53 | + var.query.queryTypes.withLabelValues(
54 | $._config.clusterLabel,
55 | 'up{%(kubeControllerManagerSelector)s}' % $._config,
56 | )
57 | + var.query.generalOptions.withLabel('cluster')
58 | + var.query.refresh.onTime()
59 | + (
60 | if $._config.showMultiCluster
61 | then var.query.generalOptions.showOnDashboard.withLabelAndValue()
62 | else var.query.generalOptions.showOnDashboard.withNothing()
63 | )
64 | + var.query.withSort(type='alphabetical'),
65 |
66 | instance:
67 | var.query.new('instance')
68 | + var.query.withDatasourceFromVariable(self.datasource)
69 | + var.query.queryTypes.withLabelValues(
70 | 'instance',
71 | 'up{%(clusterLabel)s="$cluster", %(kubeControllerManagerSelector)s}' % $._config,
72 | )
73 | + var.query.generalOptions.withLabel('instance')
74 | + var.query.refresh.onTime()
75 | + var.query.generalOptions.showOnDashboard.withLabelAndValue()
76 | + var.query.withSort(type='alphabetical')
77 | + var.query.selectionOptions.withIncludeAll(),
78 | };
79 |
80 | local panels = [
81 | statPanel(
82 | 'Up',
83 | 'none',
84 | 'sum(up{%(clusterLabel)s="$cluster", %(kubeControllerManagerSelector)s})' % $._config
85 | )
86 | + stat.gridPos.withW(4),
87 |
88 | tsPanel.new('Work Queue Add Rate')
89 | + tsPanel.gridPos.withW(20)
90 | + tsPanel.standardOptions.withUnit('ops')
91 | + tsPanel.queryOptions.withTargets([
92 | prometheus.new(
93 | '${datasource}',
94 | 'sum(rate(workqueue_adds_total{%(clusterLabel)s="$cluster", %(kubeControllerManagerSelector)s, instance=~"$instance"}[%(grafanaIntervalVar)s])) by (%(clusterLabel)s, instance, name)' % $._config
95 | )
96 | + prometheus.withLegendFormat('{{%(clusterLabel)s}} {{instance}} {{name}}' % $._config),
97 | ]),
98 |
99 | tsPanel.new('Work Queue Depth')
100 | + tsPanel.standardOptions.withUnit('short')
101 | + tsPanel.queryOptions.withTargets([
102 | prometheus.new(
103 | '${datasource}',
104 | 'sum(rate(workqueue_depth{%(clusterLabel)s="$cluster", %(kubeControllerManagerSelector)s, instance=~"$instance"}[%(grafanaIntervalVar)s])) by (%(clusterLabel)s, instance, name)' % $._config
105 | )
106 | + prometheus.withLegendFormat('{{%(clusterLabel)s}} {{instance}} {{name}}' % $._config),
107 | ]),
108 |
109 | tsPanel.new('Work Queue Latency')
110 | + tsPanel.standardOptions.withUnit('s')
111 | + tsPanel.queryOptions.withTargets([
112 | prometheus.new(
113 | '${datasource}',
114 | 'histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{%(clusterLabel)s="$cluster", %(kubeControllerManagerSelector)s, instance=~"$instance"}[%(grafanaIntervalVar)s])) by (%(clusterLabel)s, instance, name, le))' % $._config
115 | )
116 | + prometheus.withLegendFormat('{{%(clusterLabel)s}} {{instance}} {{name}}' % $._config),
117 | ]),
118 |
119 | tsPanel.new('Kube API Request Rate')
120 | + tsPanel.gridPos.withW(8)
121 | + tsPanel.standardOptions.withUnit('ops')
122 | + tsPanel.queryOptions.withTargets([
123 | prometheus.new(
124 | '${datasource}',
125 |             'sum(rate(rest_client_requests_total{%(clusterLabel)s="$cluster", %(kubeControllerManagerSelector)s, instance=~"$instance",code=~"2.."}[%(grafanaIntervalVar)s]))' % $._config
126 | )
127 | + prometheus.withLegendFormat('2xx'),
128 |
129 | prometheus.new(
130 | '${datasource}',
131 |             'sum(rate(rest_client_requests_total{%(clusterLabel)s="$cluster", %(kubeControllerManagerSelector)s, instance=~"$instance",code=~"3.."}[%(grafanaIntervalVar)s]))' % $._config
132 | )
133 | + prometheus.withLegendFormat('3xx'),
134 |
135 | prometheus.new(
136 | '${datasource}',
137 |             'sum(rate(rest_client_requests_total{%(clusterLabel)s="$cluster", %(kubeControllerManagerSelector)s, instance=~"$instance",code=~"4.."}[%(grafanaIntervalVar)s]))' % $._config
138 | )
139 | + prometheus.withLegendFormat('4xx'),
140 |
141 | prometheus.new(
142 | '${datasource}',
143 |             'sum(rate(rest_client_requests_total{%(clusterLabel)s="$cluster", %(kubeControllerManagerSelector)s, instance=~"$instance",code=~"5.."}[%(grafanaIntervalVar)s]))' % $._config
144 | )
145 | + prometheus.withLegendFormat('5xx'),
146 | ]),
147 |
148 | tsPanel.new('Post Request Latency 99th Quantile')
149 | + tsPanel.gridPos.withW(16)
150 | + tsPanel.standardOptions.withUnit('s')
151 | + tsPanel.queryOptions.withTargets([
152 | prometheus.new(
153 | '${datasource}',
154 | 'histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{%(clusterLabel)s="$cluster", %(kubeControllerManagerSelector)s, instance=~"$instance", verb="POST"}[%(grafanaIntervalVar)s])) by (verb, le))' % $._config
155 | )
156 | + prometheus.withLegendFormat('{{verb}}'),
157 | ]),
158 |
159 | tsPanel.new('Get Request Latency 99th Quantile')
160 | + tsPanel.standardOptions.withUnit('s')
161 | + tsPanel.queryOptions.withTargets([
162 | prometheus.new(
163 | '${datasource}',
164 | 'histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{%(clusterLabel)s="$cluster", %(kubeControllerManagerSelector)s, instance=~"$instance", verb="GET"}[%(grafanaIntervalVar)s])) by (verb, le))' % $._config
165 | )
166 | + prometheus.withLegendFormat('{{verb}}'),
167 | ]),
168 |
169 | tsPanel.new('Memory')
170 | + tsPanel.gridPos.withW(8)
171 | + tsPanel.standardOptions.withUnit('bytes')
172 | + tsPanel.queryOptions.withTargets([
173 | prometheus.new(
174 | '${datasource}',
175 | 'process_resident_memory_bytes{%(clusterLabel)s="$cluster", %(kubeControllerManagerSelector)s,instance=~"$instance"}' % $._config
176 | )
177 | + prometheus.withLegendFormat('{{instance}}'),
178 | ]),
179 |
180 | tsPanel.new('CPU usage')
181 | + tsPanel.gridPos.withW(8)
182 | + tsPanel.standardOptions.withUnit('short')
183 | + tsPanel.queryOptions.withTargets([
184 | prometheus.new(
185 | '${datasource}',
186 | 'rate(process_cpu_seconds_total{%(clusterLabel)s="$cluster", %(kubeControllerManagerSelector)s,instance=~"$instance"}[%(grafanaIntervalVar)s])' % $._config
187 | )
188 | + prometheus.withLegendFormat('{{instance}}'),
189 | ]),
190 |
191 | tsPanel.new('Goroutines')
192 | + tsPanel.gridPos.withW(8)
193 | + tsPanel.standardOptions.withUnit('short')
194 | + tsPanel.queryOptions.withTargets([
195 | prometheus.new(
196 | '${datasource}',
197 | 'go_goroutines{%(clusterLabel)s="$cluster", %(kubeControllerManagerSelector)s,instance=~"$instance"}' % $._config
198 | )
199 | + prometheus.withLegendFormat('{{instance}}'),
200 | ]),
201 | ];
202 |
203 | g.dashboard.new('%(dashboardNamePrefix)sController Manager' % $._config.grafanaK8s)
204 | + g.dashboard.withUid($._config.grafanaDashboardIDs['controller-manager.json'])
205 | + g.dashboard.withTags($._config.grafanaK8s.dashboardTags)
206 | + g.dashboard.withEditable(false)
207 | + g.dashboard.time.withFrom('now-1h')
208 | + g.dashboard.time.withTo('now')
209 | + g.dashboard.withRefresh($._config.grafanaK8s.refresh)
210 | + g.dashboard.withVariables([variables.datasource, variables.cluster, variables.instance])
211 | + g.dashboard.withPanels(g.util.grid.wrapPanels(panels, panelWidth=24, panelHeight=7)),
212 | },
213 | }
214 |
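
The tsPanel wrapper above (repeated across the dashboards in this directory) extends grafonnet's timeSeries object and shadows its new() so every panel picks up the shared legend, tooltip, and interval defaults. A stripped-down sketch of the same pattern, with hypothetical fields:

local base = { new(title): { title: title } };
local tsPanel = base {
  // Shadow new(): build the base panel, then layer shared defaults on top.
  new(title): base.new(title) + { legend: { displayMode: 'table', placement: 'right' } },
};
tsPanel.new('CPU usage')  // => { title: 'CPU usage', legend: { displayMode: 'table', placement: 'right' } }
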
--------------------------------------------------------------------------------
/dashboards/dashboards.libsonnet:
--------------------------------------------------------------------------------
1 | (import 'network.libsonnet') +
2 | (import 'persistentvolumesusage.libsonnet') +
3 | (import 'resources.libsonnet') +
4 | (import 'apiserver.libsonnet') +
5 | (import 'controller-manager.libsonnet') +
6 | (import 'scheduler.libsonnet') +
7 | (import 'proxy.libsonnet') +
8 | (import 'kubelet.libsonnet') +
9 | (import 'defaults.libsonnet') +
10 | (import 'windows.libsonnet')
11 |
--------------------------------------------------------------------------------
/dashboards/defaults.libsonnet:
--------------------------------------------------------------------------------
1 | {
2 | local kubernetesMixin = self,
3 | local grafanaDashboards = super.grafanaDashboards,
4 |
5 |   // Automatically add a uid to each dashboard (from grafanaDashboardIDs, with an
6 |   // md5 hash of the file name as fallback) and apply the configured timezone, refresh and tags.
7 | grafanaDashboards:: {
8 | [filename]: grafanaDashboards[filename] {
9 | uid: std.get(kubernetesMixin._config.grafanaDashboardIDs, filename, default=std.md5(filename)),
10 | timezone: kubernetesMixin._config.grafanaK8s.grafanaTimezone,
11 | refresh: kubernetesMixin._config.grafanaK8s.refresh,
12 | tags: kubernetesMixin._config.grafanaK8s.dashboardTags,
13 | links: [
14 | {
15 | asDropdown: true,
16 | includeVars: true,
17 | keepTime: true,
18 | tags: kubernetesMixin._config.grafanaK8s.dashboardTags,
19 | targetBlank: false,
20 | title: 'Kubernetes',
21 | type: 'dashboards',
22 | },
23 | ],
24 |
25 | [if 'rows' in super then 'rows']: [
26 | row {
27 | panels: [
28 | panel {
29 | // Modify tooltip to only show a single value
30 | tooltip+: {
31 | shared: false,
32 | },
33 | // Modify legend to always show as table on right side
34 | legend+: {
35 | alignAsTable: true,
36 | rightSide: true,
37 | },
38 | // Set minimum time interval for all panels
39 | interval: kubernetesMixin._config.grafanaK8s.minimumTimeInterval,
40 | }
41 | for panel in super.panels
42 | ],
43 | }
44 | for row in super.rows
45 | ],
46 |
47 | }
48 | for filename in std.objectFields(grafanaDashboards)
49 | },
50 | }
51 |
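
The uid logic above prefers an explicitly configured ID and falls back to a deterministic md5 hash of the file name, so generated dashboards keep stable uids across builds. Illustrative behaviour (the `ids` object is hypothetical):

local ids = { 'apiserver.json': 'abc123' };
{
  configured: std.get(ids, 'apiserver.json', default=std.md5('apiserver.json')),  // 'abc123'
  fallback: std.get(ids, 'unknown.json', default=std.md5('unknown.json')),  // md5 hash of the file name
}
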
--------------------------------------------------------------------------------
/dashboards/network-usage/pod-total.libsonnet:
--------------------------------------------------------------------------------
1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
2 | local gauge = g.panel.gauge;
3 | local prometheus = g.query.prometheus;
4 | local timeSeries = g.panel.timeSeries;
5 | local var = g.dashboard.variable;
6 |
7 | {
8 | local tsPanel =
9 | timeSeries {
10 | new(title):
11 | timeSeries.new(title)
12 | + timeSeries.options.legend.withShowLegend()
13 | + timeSeries.options.legend.withAsTable()
14 | + timeSeries.options.legend.withDisplayMode('table')
15 | + timeSeries.options.legend.withPlacement('right')
16 | + timeSeries.options.tooltip.withMode('single')
17 | + timeSeries.fieldConfig.defaults.custom.withShowPoints('never')
18 | + timeSeries.queryOptions.withInterval($._config.grafanaK8s.minimumTimeInterval),
19 | },
20 |
21 | grafanaDashboards+:: {
22 | 'pod-total.json':
23 | local variables = {
24 | datasource:
25 | var.datasource.new('datasource', 'prometheus')
26 | + var.datasource.withRegex($._config.datasourceFilterRegex)
27 | + var.datasource.generalOptions.showOnDashboard.withLabelAndValue()
28 | + var.datasource.generalOptions.withLabel('Data source')
29 | + {
30 | current: {
31 | selected: true,
32 | text: $._config.datasourceName,
33 | value: $._config.datasourceName,
34 | },
35 | },
36 |
37 | cluster:
38 | var.query.new('cluster')
39 | + var.query.withDatasourceFromVariable(self.datasource)
40 | + var.query.queryTypes.withLabelValues(
41 | $._config.clusterLabel,
42 | 'up{%(cadvisorSelector)s}' % $._config,
43 | )
44 | + var.query.generalOptions.withLabel('cluster')
45 | + var.query.refresh.onTime()
46 | + (
47 | if $._config.showMultiCluster
48 | then var.query.generalOptions.showOnDashboard.withLabelAndValue()
49 | else var.query.generalOptions.showOnDashboard.withNothing()
50 | )
51 | + var.query.withSort(type='alphabetical'),
52 |
53 | namespace:
54 | var.query.new('namespace')
55 | + var.query.selectionOptions.withIncludeAll(true, '.+')
56 | + var.query.withDatasourceFromVariable(self.datasource)
57 | + var.query.queryTypes.withLabelValues(
58 | 'namespace',
59 | 'container_network_receive_packets_total{%(clusterLabel)s="$cluster"}' % $._config,
60 | )
61 | + var.query.generalOptions.withCurrent('kube-system')
62 | + var.query.generalOptions.withLabel('namespace')
63 | + var.query.refresh.onTime()
64 | + var.query.generalOptions.showOnDashboard.withLabelAndValue()
65 | + var.query.withSort(type='alphabetical'),
66 |
67 | pod:
68 | var.query.new('pod')
69 | + var.query.withDatasourceFromVariable(self.datasource)
70 | + var.query.queryTypes.withLabelValues(
71 | 'pod',
72 | 'container_network_receive_packets_total{%(clusterLabel)s="$cluster",namespace=~"$namespace"}' % $._config,
73 | )
74 | + var.query.generalOptions.withCurrent('kube-system')
75 | + var.query.generalOptions.withLabel('pod')
76 | + var.query.refresh.onTime()
77 | + var.query.generalOptions.showOnDashboard.withLabelAndValue()
78 | + var.query.withSort(type='alphabetical'),
79 | };
80 |
81 | local panels = [
82 | gauge.new('Current Rate of Bytes Received')
83 | + gauge.standardOptions.withDisplayName('$pod')
84 | + gauge.standardOptions.withUnit('Bps')
85 | + gauge.standardOptions.withMin(0)
86 |         + gauge.standardOptions.withMax(10000000000) // 10 GB/s
87 | + gauge.standardOptions.thresholds.withSteps([
88 | {
89 | color: 'dark-green',
90 | index: 0,
91 |             value: null, // 0 GB/s
92 | },
93 | {
94 | color: 'dark-yellow',
95 | index: 1,
96 |             value: 5000000000, // 5 GB/s
97 | },
98 | {
99 | color: 'dark-red',
100 | index: 2,
101 |             value: 7000000000, // 7 GB/s
102 | },
103 | ])
104 | + gauge.queryOptions.withInterval($._config.grafanaK8s.minimumTimeInterval)
105 | + gauge.queryOptions.withTargets([
106 | prometheus.new(
107 | '${datasource}',
108 | 'sum(rate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster",namespace=~"$namespace", pod=~"$pod"}[%(grafanaIntervalVar)s]))' % $._config
109 | )
110 | + prometheus.withLegendFormat('__auto'),
111 | ]),
112 |
113 | gauge.new('Current Rate of Bytes Transmitted')
114 | + gauge.standardOptions.withDisplayName('$pod')
115 | + gauge.standardOptions.withUnit('Bps')
116 | + gauge.standardOptions.withMin(0)
117 |         + gauge.standardOptions.withMax(10000000000) // 10 GB/s
118 | + gauge.standardOptions.thresholds.withSteps([
119 | {
120 | color: 'dark-green',
121 | index: 0,
122 |             value: null, // 0 GB/s
123 | },
124 | {
125 | color: 'dark-yellow',
126 | index: 1,
127 |             value: 5000000000, // 5 GB/s
128 | },
129 | {
130 | color: 'dark-red',
131 | index: 2,
132 |             value: 7000000000, // 7 GB/s
133 | },
134 | ])
135 | + gauge.queryOptions.withInterval($._config.grafanaK8s.minimumTimeInterval)
136 | + gauge.queryOptions.withTargets([
137 | prometheus.new(
138 | '${datasource}',
139 | 'sum(rate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster",namespace=~"$namespace", pod=~"$pod"}[%(grafanaIntervalVar)s]))' % $._config
140 | )
141 | + prometheus.withLegendFormat('__auto'),
142 | ]),
143 |
144 | tsPanel.new('Receive Bandwidth')
145 | + tsPanel.standardOptions.withUnit('binBps')
146 | + tsPanel.queryOptions.withTargets([
147 | prometheus.new(
148 | '${datasource}',
149 | 'sum(rate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster",namespace=~"$namespace", pod=~"$pod"}[%(grafanaIntervalVar)s])) by (pod)' % $._config
150 | )
151 | + prometheus.withLegendFormat('__auto'),
152 | ]),
153 |
154 | tsPanel.new('Transmit Bandwidth')
155 | + tsPanel.standardOptions.withUnit('binBps')
156 | + tsPanel.queryOptions.withTargets([
157 | prometheus.new(
158 | '${datasource}',
159 | 'sum(rate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster",namespace=~"$namespace", pod=~"$pod"}[%(grafanaIntervalVar)s])) by (pod)' % $._config
160 | )
161 | + prometheus.withLegendFormat('__auto'),
162 | ]),
163 |
164 | tsPanel.new('Rate of Received Packets')
165 | + tsPanel.standardOptions.withUnit('pps')
166 | + tsPanel.queryOptions.withTargets([
167 | prometheus.new('${datasource}', 'sum(rate(container_network_receive_packets_total{%(clusterLabel)s="$cluster",namespace=~"$namespace", pod=~"$pod"}[%(grafanaIntervalVar)s])) by (pod)' % $._config)
168 | + prometheus.withLegendFormat('__auto'),
169 | ]),
170 |
171 | tsPanel.new('Rate of Transmitted Packets')
172 | + tsPanel.standardOptions.withUnit('pps')
173 | + tsPanel.queryOptions.withTargets([
174 | prometheus.new('${datasource}', 'sum(rate(container_network_transmit_packets_total{%(clusterLabel)s="$cluster",namespace=~"$namespace", pod=~"$pod"}[%(grafanaIntervalVar)s])) by (pod)' % $._config)
175 | + prometheus.withLegendFormat('__auto'),
176 | ]),
177 |
178 | tsPanel.new('Rate of Received Packets Dropped')
179 | + tsPanel.standardOptions.withUnit('pps')
180 | + tsPanel.queryOptions.withTargets([
181 | prometheus.new('${datasource}', 'sum(rate(container_network_receive_packets_dropped_total{%(clusterLabel)s="$cluster",namespace=~"$namespace", pod=~"$pod"}[%(grafanaIntervalVar)s])) by (pod)' % $._config)
182 | + prometheus.withLegendFormat('__auto'),
183 | ]),
184 |
185 | tsPanel.new('Rate of Transmitted Packets Dropped')
186 | + tsPanel.standardOptions.withUnit('pps')
187 | + tsPanel.queryOptions.withTargets([
188 | prometheus.new('${datasource}', 'sum(rate(container_network_transmit_packets_dropped_total{%(clusterLabel)s="$cluster",namespace=~"$namespace", pod=~"$pod"}[%(grafanaIntervalVar)s])) by (pod)' % $._config)
189 | + prometheus.withLegendFormat('__auto'),
190 | ]),
191 | ];
192 |
193 | g.dashboard.new('%(dashboardNamePrefix)sNetworking / Pod' % $._config.grafanaK8s)
194 | + g.dashboard.withUid($._config.grafanaDashboardIDs['pod-total.json'])
195 | + g.dashboard.withTags($._config.grafanaK8s.dashboardTags)
196 | + g.dashboard.withEditable(false)
197 | + g.dashboard.time.withFrom('now-1h')
198 | + g.dashboard.time.withTo('now')
199 | + g.dashboard.withRefresh($._config.grafanaK8s.refresh)
200 | + g.dashboard.withVariables([variables.datasource, variables.cluster, variables.namespace, variables.pod])
201 | + g.dashboard.withPanels(g.util.grid.wrapPanels(panels, panelWidth=12, panelHeight=9)),
202 | },
203 | }
204 |
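
In the gauge thresholds above, Grafana treats the first step (value: null) as the base color, applying from negative infinity; each subsequent step applies from its value upward. Read as a sketch, the steps mean:

[
  { color: 'dark-green', index: 0, value: null },  // base: below 5 GB/s
  { color: 'dark-yellow', index: 1, value: 5000000000 },  // from 5 GB/s
  { color: 'dark-red', index: 2, value: 7000000000 },  // from 7 GB/s
]
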
--------------------------------------------------------------------------------
/dashboards/network.libsonnet:
--------------------------------------------------------------------------------
1 | (import 'network-usage/cluster-total.libsonnet') +
2 | (import 'network-usage/namespace-by-workload.libsonnet') +
3 | (import 'network-usage/namespace-by-pod.libsonnet') +
4 | (import 'network-usage/pod-total.libsonnet') +
5 | (import 'network-usage/workload-total.libsonnet')
6 |
--------------------------------------------------------------------------------
/dashboards/persistentvolumesusage.libsonnet:
--------------------------------------------------------------------------------
1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
2 | local prometheus = g.query.prometheus;
3 | local gauge = g.panel.gauge;
4 | local timeSeries = g.panel.timeSeries;
5 | local var = g.dashboard.variable;
6 |
7 | {
8 | local gaugePanel(title, unit, query) =
9 | gauge.new(title)
10 | + gauge.standardOptions.withUnit(unit)
11 | + gauge.queryOptions.withInterval($._config.grafanaK8s.minimumTimeInterval)
12 | + gauge.queryOptions.withTargets([
13 | prometheus.new('${datasource}', query)
14 | + prometheus.withInstant(true),
15 | ]),
16 |
17 | local tsPanel =
18 | timeSeries {
19 | new(title):
20 | timeSeries.new(title)
21 | + timeSeries.options.legend.withShowLegend()
22 | + timeSeries.options.legend.withAsTable()
23 | + timeSeries.options.legend.withDisplayMode('table')
24 | + timeSeries.options.legend.withPlacement('right')
25 | + timeSeries.options.legend.withCalcs(['lastNotNull'])
26 | + timeSeries.options.tooltip.withMode('single')
27 | + timeSeries.fieldConfig.defaults.custom.withShowPoints('never')
28 | + timeSeries.fieldConfig.defaults.custom.withFillOpacity(10)
29 | + timeSeries.fieldConfig.defaults.custom.withSpanNulls(true)
30 | + timeSeries.queryOptions.withInterval($._config.grafanaK8s.minimumTimeInterval),
31 | },
32 |
33 | grafanaDashboards+:: {
34 | 'persistentvolumesusage.json':
35 | local variables = {
36 | datasource:
37 | var.datasource.new('datasource', 'prometheus')
38 | + var.datasource.withRegex($._config.datasourceFilterRegex)
39 | + var.datasource.generalOptions.showOnDashboard.withLabelAndValue()
40 | + var.datasource.generalOptions.withLabel('Data source')
41 | + {
42 | current: {
43 | selected: true,
44 | text: $._config.datasourceName,
45 | value: $._config.datasourceName,
46 | },
47 | },
48 |
49 | cluster:
50 | var.query.new('cluster')
51 | + var.query.withDatasourceFromVariable(self.datasource)
52 | + var.query.queryTypes.withLabelValues(
53 | $._config.clusterLabel,
54 | 'kubelet_volume_stats_capacity_bytes{%(kubeletSelector)s}' % $._config,
55 | )
56 | + var.query.generalOptions.withLabel('cluster')
57 | + var.query.refresh.onTime()
58 | + (
59 | if $._config.showMultiCluster
60 | then var.query.generalOptions.showOnDashboard.withLabelAndValue()
61 | else var.query.generalOptions.showOnDashboard.withNothing()
62 | )
63 | + var.query.withSort(type='alphabetical'),
64 |
65 | namespace:
66 | var.query.new('namespace')
67 | + var.query.withDatasourceFromVariable(self.datasource)
68 | + var.query.queryTypes.withLabelValues(
69 | 'namespace',
70 | 'kubelet_volume_stats_capacity_bytes{%(clusterLabel)s="$cluster", %(kubeletSelector)s}' % $._config,
71 | )
72 | + var.query.generalOptions.withLabel('Namespace')
73 | + var.query.refresh.onTime()
74 | + var.query.generalOptions.showOnDashboard.withLabelAndValue()
75 | + var.query.withSort(type='alphabetical'),
76 |
77 | volume:
78 | var.query.new('volume')
79 | + var.query.withDatasourceFromVariable(self.datasource)
80 | + var.query.queryTypes.withLabelValues(
81 | 'persistentvolumeclaim',
82 | 'kubelet_volume_stats_capacity_bytes{%(clusterLabel)s="$cluster", %(kubeletSelector)s, namespace="$namespace"}' % $._config,
83 | )
84 | + var.query.generalOptions.withLabel('PersistentVolumeClaim')
85 | + var.query.refresh.onTime()
86 | + var.query.generalOptions.showOnDashboard.withLabelAndValue()
87 | + var.query.withSort(type='alphabetical'),
88 | };
89 |
90 | local panels = {
91 | tsUsage:
92 | tsPanel.new('Volume Space Usage')
93 | + tsPanel.standardOptions.withUnit('bytes')
94 | + tsPanel.queryOptions.withTargets([
95 | prometheus.new('${datasource}', |||
96 | (
97 | sum without(instance, node) (topk(1, (kubelet_volume_stats_capacity_bytes{%(clusterLabel)s="$cluster", %(kubeletSelector)s, namespace="$namespace", persistentvolumeclaim="$volume"})))
98 | -
99 | sum without(instance, node) (topk(1, (kubelet_volume_stats_available_bytes{%(clusterLabel)s="$cluster", %(kubeletSelector)s, namespace="$namespace", persistentvolumeclaim="$volume"})))
100 | )
101 | ||| % $._config)
102 | + prometheus.withLegendFormat('Used Space'),
103 |
104 | prometheus.new('${datasource}', |||
105 | sum without(instance, node) (topk(1, (kubelet_volume_stats_available_bytes{%(clusterLabel)s="$cluster", %(kubeletSelector)s, namespace="$namespace", persistentvolumeclaim="$volume"})))
106 | ||| % $._config)
107 | + prometheus.withLegendFormat('Free Space'),
108 | ]),
109 | gaugeUsage:
110 | gaugePanel(
111 | 'Volume Space Usage',
112 | 'percent',
113 | |||
114 | max without(instance,node) (
115 | (
116 | topk(1, kubelet_volume_stats_capacity_bytes{%(clusterLabel)s="$cluster", %(kubeletSelector)s, namespace="$namespace", persistentvolumeclaim="$volume"})
117 | -
118 | topk(1, kubelet_volume_stats_available_bytes{%(clusterLabel)s="$cluster", %(kubeletSelector)s, namespace="$namespace", persistentvolumeclaim="$volume"})
119 | )
120 | /
121 | topk(1, kubelet_volume_stats_capacity_bytes{%(clusterLabel)s="$cluster", %(kubeletSelector)s, namespace="$namespace", persistentvolumeclaim="$volume"})
122 | * 100)
123 | ||| % $._config
124 | )
125 | + gauge.standardOptions.withMin(0)
126 | + gauge.standardOptions.withMax(100)
127 | + gauge.standardOptions.color.withMode('thresholds')
128 | + gauge.standardOptions.thresholds.withMode('absolute')
129 | + gauge.standardOptions.thresholds.withSteps(
130 | [
131 | gauge.thresholdStep.withColor('green')
132 | + gauge.thresholdStep.withValue(0),
133 |
134 | gauge.thresholdStep.withColor('orange')
135 | + gauge.thresholdStep.withValue(80),
136 |
137 | gauge.thresholdStep.withColor('red')
138 | + gauge.thresholdStep.withValue(90),
139 | ]
140 | ),
141 |
142 | tsInodes:
143 | tsPanel.new('Volume inodes Usage')
144 | + tsPanel.standardOptions.withUnit('none')
145 | + tsPanel.queryOptions.withTargets([
146 | prometheus.new('${datasource}', 'sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes_used{%(clusterLabel)s="$cluster", %(kubeletSelector)s, namespace="$namespace", persistentvolumeclaim="$volume"})))' % $._config)
147 | + prometheus.withLegendFormat('Used inodes'),
148 |
149 | prometheus.new('${datasource}', |||
150 | (
151 | sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes{%(clusterLabel)s="$cluster", %(kubeletSelector)s, namespace="$namespace", persistentvolumeclaim="$volume"})))
152 | -
153 | sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes_used{%(clusterLabel)s="$cluster", %(kubeletSelector)s, namespace="$namespace", persistentvolumeclaim="$volume"})))
154 | )
155 | ||| % $._config)
156 | + prometheus.withLegendFormat('Free inodes'),
157 | ]),
158 | gaugeInodes:
159 | gaugePanel(
160 | 'Volume inodes Usage',
161 | 'percent',
162 | |||
163 | max without(instance,node) (
164 | topk(1, kubelet_volume_stats_inodes_used{%(clusterLabel)s="$cluster", %(kubeletSelector)s, namespace="$namespace", persistentvolumeclaim="$volume"})
165 | /
166 | topk(1, kubelet_volume_stats_inodes{%(clusterLabel)s="$cluster", %(kubeletSelector)s, namespace="$namespace", persistentvolumeclaim="$volume"})
167 | * 100)
168 | ||| % $._config
169 | )
170 | + gauge.standardOptions.withMin(0)
171 | + gauge.standardOptions.withMax(100)
172 | + gauge.standardOptions.color.withMode('thresholds')
173 | + gauge.standardOptions.thresholds.withMode('absolute')
174 | + gauge.standardOptions.thresholds.withSteps(
175 | [
176 | gauge.thresholdStep.withColor('green')
177 | + gauge.thresholdStep.withValue(0),
178 |
179 | gauge.thresholdStep.withColor('orange')
180 | + gauge.thresholdStep.withValue(80),
181 |
182 | gauge.thresholdStep.withColor('red')
183 | + gauge.thresholdStep.withValue(90),
184 | ]
185 | ),
186 | };
187 |
188 | g.dashboard.new('%(dashboardNamePrefix)sPersistent Volumes' % $._config.grafanaK8s)
189 | + g.dashboard.withUid($._config.grafanaDashboardIDs['persistentvolumesusage.json'])
190 | + g.dashboard.withTags($._config.grafanaK8s.dashboardTags)
191 | + g.dashboard.withEditable(false)
192 | + g.dashboard.time.withFrom('now-1h')
193 | + g.dashboard.time.withTo('now')
194 | + g.dashboard.withRefresh($._config.grafanaK8s.refresh)
195 | + g.dashboard.withVariables([variables.datasource, variables.cluster, variables.namespace, variables.volume])
196 | + g.dashboard.withPanels([
197 | panels.tsUsage { gridPos+: { w: 18, h: 7, y: 0 } },
198 | panels.gaugeUsage { gridPos+: { w: 6, h: 7, x: 18, y: 0 } },
199 | panels.tsInodes { gridPos+: { w: 18, h: 7, y: 7 } },
200 | panels.gaugeInodes { gridPos+: { w: 6, h: 7, x: 18, y: 7 } },
201 | ]),
202 | },
203 | }
204 |
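
As a worked example of the usage gauge above: with 100 GiB capacity and 25 GiB available, the expression yields (100 - 25) / 100 * 100 = 75 percent. The same arithmetic in jsonnet, with illustrative numbers:

local capacityGiB = 100, availableGiB = 25;
(capacityGiB - availableGiB) / capacityGiB * 100  // => 75
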
--------------------------------------------------------------------------------
/dashboards/proxy.libsonnet:
--------------------------------------------------------------------------------
1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
2 |
3 | local prometheus = g.query.prometheus;
4 | local stat = g.panel.stat;
5 | local timeSeries = g.panel.timeSeries;
6 | local var = g.dashboard.variable;
7 |
8 | {
9 | local statPanel(title, unit, query) =
10 | stat.new(title)
11 | + stat.options.withColorMode('none')
12 | + stat.standardOptions.withUnit(unit)
13 | + stat.queryOptions.withInterval($._config.grafanaK8s.minimumTimeInterval)
14 | + stat.queryOptions.withTargets([
15 | prometheus.new('${datasource}', query)
16 | + prometheus.withInstant(true),
17 | ]),
18 |
19 | local tsPanel =
20 | timeSeries {
21 | new(title):
22 | timeSeries.new(title)
23 | + timeSeries.options.legend.withShowLegend()
24 | + timeSeries.options.legend.withAsTable()
25 | + timeSeries.options.legend.withDisplayMode('table')
26 | + timeSeries.options.legend.withPlacement('right')
27 | + timeSeries.options.legend.withCalcs(['lastNotNull'])
28 | + timeSeries.options.tooltip.withMode('single')
29 | + timeSeries.fieldConfig.defaults.custom.withShowPoints('never')
30 | + timeSeries.fieldConfig.defaults.custom.withFillOpacity(10)
31 | + timeSeries.fieldConfig.defaults.custom.withSpanNulls(true)
32 | + timeSeries.queryOptions.withInterval($._config.grafanaK8s.minimumTimeInterval),
33 | },
34 |
35 | grafanaDashboards+:: {
36 | 'proxy.json':
37 | local variables = {
38 | datasource:
39 | var.datasource.new('datasource', 'prometheus')
40 | + var.datasource.withRegex($._config.datasourceFilterRegex)
41 | + var.datasource.generalOptions.showOnDashboard.withLabelAndValue()
42 | + var.datasource.generalOptions.withLabel('Data source')
43 | + {
44 | current: {
45 | selected: true,
46 | text: $._config.datasourceName,
47 | value: $._config.datasourceName,
48 | },
49 | },
50 |
51 | cluster:
52 | var.query.new('cluster')
53 | + var.query.withDatasourceFromVariable(self.datasource)
54 | + var.query.queryTypes.withLabelValues(
55 | $._config.clusterLabel,
56 | 'up{%(kubeProxySelector)s}' % $._config
57 | )
58 | + var.query.generalOptions.withLabel('cluster')
59 | + var.query.refresh.onTime()
60 | + (
61 | if $._config.showMultiCluster
62 | then var.query.generalOptions.showOnDashboard.withLabelAndValue()
63 | else var.query.generalOptions.showOnDashboard.withNothing()
64 | )
65 | + var.query.withSort(type='alphabetical'),
66 |
67 | instance:
68 | var.query.new('instance')
69 | + var.query.withDatasourceFromVariable(self.datasource)
70 | + var.query.queryTypes.withLabelValues(
71 | 'instance',
72 |             'up{%(clusterLabel)s="$cluster", %(kubeProxySelector)s}' % $._config,
73 | )
74 | + var.query.generalOptions.withLabel('instance')
75 | + var.query.refresh.onTime()
76 | + var.query.generalOptions.showOnDashboard.withLabelAndValue()
77 | + var.query.selectionOptions.withIncludeAll(true, '.+'),
78 | };
79 |
80 | local panels = [
81 | statPanel('Up', 'none', 'sum(up{%(clusterLabel)s="$cluster", %(kubeProxySelector)s})' % $._config)
82 | + stat.gridPos.withW(4),
83 |
84 | tsPanel.new('Rules Sync Rate')
85 | + tsPanel.gridPos.withW(10)
86 | + tsPanel.standardOptions.withUnit('ops')
87 | + tsPanel.queryOptions.withTargets([
88 | prometheus.new('${datasource}', 'sum(rate(kubeproxy_sync_proxy_rules_duration_seconds_count{%(clusterLabel)s="$cluster", %(kubeProxySelector)s, instance=~"$instance"}[%(grafanaIntervalVar)s]))' % $._config)
89 | + prometheus.withLegendFormat('rate'),
90 | ]),
91 |
92 | tsPanel.new('Rules Sync Latency 99th Quantile')
93 | + tsPanel.gridPos.withW(10)
94 | + tsPanel.standardOptions.withUnit('s')
95 | + tsPanel.queryOptions.withTargets([
96 | prometheus.new('${datasource}', 'histogram_quantile(0.99,rate(kubeproxy_sync_proxy_rules_duration_seconds_bucket{%(clusterLabel)s="$cluster", %(kubeProxySelector)s, instance=~"$instance"}[%(grafanaIntervalVar)s]))' % $._config)
97 | + prometheus.withLegendFormat('{{instance}}'),
98 | ]),
99 |
100 | tsPanel.new('Network Programming Rate')
101 | + tsPanel.standardOptions.withUnit('ops')
102 | + tsPanel.queryOptions.withTargets([
103 | prometheus.new('${datasource}', 'sum(rate(kubeproxy_network_programming_duration_seconds_count{%(clusterLabel)s="$cluster", %(kubeProxySelector)s, instance=~"$instance"}[%(grafanaIntervalVar)s]))' % $._config)
104 | + prometheus.withLegendFormat('rate'),
105 | ]),
106 |
107 | tsPanel.new('Network Programming Latency 99th Quantile')
108 | + tsPanel.standardOptions.withUnit('s')
109 | + tsPanel.queryOptions.withTargets([
110 | prometheus.new('${datasource}', 'histogram_quantile(0.99, sum(rate(kubeproxy_network_programming_duration_seconds_bucket{%(clusterLabel)s="$cluster", %(kubeProxySelector)s, instance=~"$instance"}[%(grafanaIntervalVar)s])) by (instance, le))' % $._config)
111 | + prometheus.withLegendFormat('{{instance}}'),
112 | ]),
113 |
114 | tsPanel.new('Kube API Request Rate')
115 | + tsPanel.gridPos.withW(8)
116 | + tsPanel.standardOptions.withUnit('ops')
117 | + tsPanel.queryOptions.withTargets([
118 | prometheus.new('${datasource}', 'sum(rate(rest_client_requests_total{%(clusterLabel)s="$cluster",%(kubeProxySelector)s, instance=~"$instance",code=~"2.."}[%(grafanaIntervalVar)s]))' % $._config)
119 | + prometheus.withLegendFormat('2xx'),
120 |
121 | prometheus.new('${datasource}', 'sum(rate(rest_client_requests_total{%(clusterLabel)s="$cluster",%(kubeProxySelector)s, instance=~"$instance",code=~"3.."}[%(grafanaIntervalVar)s]))' % $._config)
122 | + prometheus.withLegendFormat('3xx'),
123 |
124 | prometheus.new('${datasource}', 'sum(rate(rest_client_requests_total{%(clusterLabel)s="$cluster",%(kubeProxySelector)s, instance=~"$instance",code=~"4.."}[%(grafanaIntervalVar)s]))' % $._config)
125 | + prometheus.withLegendFormat('4xx'),
126 |
127 | prometheus.new('${datasource}', 'sum(rate(rest_client_requests_total{%(clusterLabel)s="$cluster",%(kubeProxySelector)s, instance=~"$instance",code=~"5.."}[%(grafanaIntervalVar)s]))' % $._config)
128 | + prometheus.withLegendFormat('5xx'),
129 | ]),
130 |
131 | tsPanel.new('Post Request Latency 99th Quantile')
132 | + tsPanel.gridPos.withW(16)
133 |         + tsPanel.standardOptions.withUnit('s')
134 | + tsPanel.queryOptions.withTargets([
135 | prometheus.new('${datasource}', 'histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{%(clusterLabel)s="$cluster", %(kubeProxySelector)s,instance=~"$instance",verb="POST"}[%(grafanaIntervalVar)s])) by (verb, le))' % $._config)
136 | + prometheus.withLegendFormat('{{verb}}'),
137 | ]),
138 |
139 | tsPanel.new('Get Request Latency 99th Quantile')
140 | + tsPanel.gridPos.withW(24)
141 | + tsPanel.standardOptions.withUnit('s')
142 | + tsPanel.queryOptions.withTargets([
143 | prometheus.new('${datasource}', 'histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{%(clusterLabel)s="$cluster", %(kubeProxySelector)s, instance=~"$instance", verb="GET"}[%(grafanaIntervalVar)s])) by (verb, le))' % $._config)
144 | + prometheus.withLegendFormat('{{verb}}'),
145 | ]),
146 |
147 |
148 | tsPanel.new('Memory')
149 | + tsPanel.gridPos.withW(8)
150 | + tsPanel.standardOptions.withUnit('bytes')
151 | + tsPanel.queryOptions.withTargets([
152 | prometheus.new('${datasource}', 'process_resident_memory_bytes{%(clusterLabel)s="$cluster", %(kubeProxySelector)s,instance=~"$instance"}' % $._config)
153 | + prometheus.withLegendFormat('{{instance}}'),
154 | ]),
155 |
156 | tsPanel.new('CPU usage')
157 | + tsPanel.gridPos.withW(8)
158 | + tsPanel.standardOptions.withUnit('short')
159 | + tsPanel.queryOptions.withTargets([
160 | prometheus.new('${datasource}', 'rate(process_cpu_seconds_total{%(clusterLabel)s="$cluster", %(kubeProxySelector)s,instance=~"$instance"}[%(grafanaIntervalVar)s])' % $._config)
161 | + prometheus.withLegendFormat('{{instance}}'),
162 | ]),
163 |
164 | tsPanel.new('Goroutines')
165 | + tsPanel.gridPos.withW(8)
166 | + tsPanel.standardOptions.withUnit('short')
167 | + tsPanel.queryOptions.withTargets([
168 | prometheus.new('${datasource}', 'go_goroutines{%(clusterLabel)s="$cluster", %(kubeProxySelector)s,instance=~"$instance"}' % $._config)
169 | + prometheus.withLegendFormat('{{instance}}'),
170 | ]),
171 | ];
172 |
173 | g.dashboard.new('%(dashboardNamePrefix)sProxy' % $._config.grafanaK8s)
174 | + g.dashboard.withUid($._config.grafanaDashboardIDs['proxy.json'])
175 | + g.dashboard.withTags($._config.grafanaK8s.dashboardTags)
176 | + g.dashboard.withEditable(false)
177 | + g.dashboard.time.withFrom('now-1h')
178 | + g.dashboard.time.withTo('now')
179 | + g.dashboard.withRefresh($._config.grafanaK8s.refresh)
180 | + g.dashboard.withVariables([variables.datasource, variables.cluster, variables.instance])
181 | + g.dashboard.withPanels(g.util.grid.wrapPanels(panels, panelWidth=12, panelHeight=7)),
182 | },
183 | }
184 |
--------------------------------------------------------------------------------
/dashboards/resources.libsonnet:
--------------------------------------------------------------------------------
1 | (import 'resources/cluster.libsonnet') +
2 | (import 'resources/multi-cluster.libsonnet') +
3 | (import 'resources/namespace.libsonnet') +
4 | (import 'resources/node.libsonnet') +
5 | (import 'resources/pod.libsonnet') +
6 | (import 'resources/workload-namespace.libsonnet') +
7 | (import 'resources/workload.libsonnet')
8 |
--------------------------------------------------------------------------------
/dashboards/scheduler.libsonnet:
--------------------------------------------------------------------------------
1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
2 |
3 | local prometheus = g.query.prometheus;
4 | local stat = g.panel.stat;
5 | local timeSeries = g.panel.timeSeries;
6 | local var = g.dashboard.variable;
7 |
8 | {
9 | local statPanel(title, unit, query) =
10 | stat.new(title)
11 | + stat.options.withColorMode('none')
12 | + stat.standardOptions.withUnit(unit)
13 | + stat.queryOptions.withInterval($._config.grafanaK8s.minimumTimeInterval)
14 | + stat.queryOptions.withTargets([
15 | prometheus.new('${datasource}', query)
16 | + prometheus.withInstant(true),
17 | ]),
18 |
19 | local tsPanel =
20 | timeSeries {
21 | new(title):
22 | timeSeries.new(title)
23 | + timeSeries.options.legend.withShowLegend()
24 | + timeSeries.options.legend.withAsTable()
25 | + timeSeries.options.legend.withDisplayMode('table')
26 | + timeSeries.options.legend.withPlacement('right')
27 | + timeSeries.options.legend.withCalcs(['lastNotNull'])
28 | + timeSeries.options.tooltip.withMode('single')
29 | + timeSeries.fieldConfig.defaults.custom.withShowPoints('never')
30 | + timeSeries.fieldConfig.defaults.custom.withFillOpacity(10)
31 | + timeSeries.fieldConfig.defaults.custom.withSpanNulls(true)
32 | + timeSeries.queryOptions.withInterval($._config.grafanaK8s.minimumTimeInterval),
33 | },
34 |
35 | grafanaDashboards+:: {
36 | 'scheduler.json':
37 |
38 | local variables = {
39 | datasource:
40 | var.datasource.new('datasource', 'prometheus')
41 | + var.datasource.withRegex($._config.datasourceFilterRegex)
42 | + var.datasource.generalOptions.showOnDashboard.withLabelAndValue()
43 | + var.datasource.generalOptions.withLabel('Data source')
44 | + {
45 | current: {
46 | selected: true,
47 | text: $._config.datasourceName,
48 | value: $._config.datasourceName,
49 | },
50 | },
51 |
52 | cluster:
53 | var.query.new('cluster')
54 | + var.query.withDatasourceFromVariable(self.datasource)
55 | + var.query.queryTypes.withLabelValues(
56 | $._config.clusterLabel,
57 | 'up{%(kubeSchedulerSelector)s}' % $._config
58 | )
59 | + var.query.generalOptions.withLabel('cluster')
60 | + var.query.refresh.onTime()
61 | + (
62 | if $._config.showMultiCluster
63 | then var.query.generalOptions.showOnDashboard.withLabelAndValue()
64 | else var.query.generalOptions.showOnDashboard.withNothing()
65 | )
66 | + var.query.withSort(type='alphabetical'),
67 |
68 | instance:
69 | var.query.new('instance')
70 | + var.query.withDatasourceFromVariable(self.datasource)
71 | + var.query.queryTypes.withLabelValues(
72 | 'instance',
73 | 'up{%(kubeSchedulerSelector)s, %(clusterLabel)s="$cluster"}' % $._config,
74 | )
75 | + var.query.generalOptions.withLabel('instance')
76 | + var.query.refresh.onTime()
77 | + var.query.generalOptions.showOnDashboard.withLabelAndValue()
78 | + var.query.selectionOptions.withIncludeAll(true, '.+'),
79 | };
80 |
81 | local panels = [
82 | statPanel('Up', 'none', 'sum(up{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s})' % $._config)
83 | + stat.gridPos.withW(4),
84 |
85 | tsPanel.new('Scheduling Rate')
86 | + tsPanel.gridPos.withW(10)
87 | + tsPanel.standardOptions.withUnit('ops')
88 | + tsPanel.queryOptions.withTargets([
89 | prometheus.new('${datasource}', 'sum(rate(scheduler_e2e_scheduling_duration_seconds_count{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s, instance=~"$instance"}[%(grafanaIntervalVar)s])) by (%(clusterLabel)s, instance)' % $._config)
90 | + prometheus.withLegendFormat('{{%(clusterLabel)s}} {{instance}} e2e' % $._config),
91 |
92 | prometheus.new('${datasource}', 'sum(rate(scheduler_binding_duration_seconds_count{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s, instance=~"$instance"}[%(grafanaIntervalVar)s])) by (%(clusterLabel)s, instance)' % $._config)
93 | + prometheus.withLegendFormat('{{%(clusterLabel)s}} {{instance}} binding' % $._config),
94 |
95 | prometheus.new('${datasource}', 'sum(rate(scheduler_scheduling_algorithm_duration_seconds_count{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s, instance=~"$instance"}[%(grafanaIntervalVar)s])) by (%(clusterLabel)s, instance)' % $._config)
96 | + prometheus.withLegendFormat('{{%(clusterLabel)s}} {{instance}} scheduling algorithm' % $._config),
97 |
98 | prometheus.new('${datasource}', 'sum(rate(scheduler_volume_scheduling_duration_seconds_count{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s, instance=~"$instance"}[%(grafanaIntervalVar)s])) by (%(clusterLabel)s, instance)' % $._config)
99 | + prometheus.withLegendFormat('{{%(clusterLabel)s}} {{instance}} volume' % $._config),
100 | ]),
101 |
102 | tsPanel.new('Scheduling latency 99th Quantile')
103 | + tsPanel.gridPos.withW(10)
104 | + tsPanel.standardOptions.withUnit('s')
105 | + tsPanel.queryOptions.withTargets([
106 | prometheus.new('${datasource}', 'histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s,instance=~"$instance"}[%(grafanaIntervalVar)s])) by (%(clusterLabel)s, instance, le))' % $._config)
107 | + prometheus.withLegendFormat('{{%(clusterLabel)s}} {{instance}} e2e' % $._config),
108 |
109 | prometheus.new('${datasource}', 'histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s,instance=~"$instance"}[%(grafanaIntervalVar)s])) by (%(clusterLabel)s, instance, le))' % $._config)
110 | + prometheus.withLegendFormat('{{%(clusterLabel)s}} {{instance}} binding' % $._config),
111 |
112 | prometheus.new('${datasource}', 'histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s,instance=~"$instance"}[%(grafanaIntervalVar)s])) by (%(clusterLabel)s, instance, le))' % $._config)
113 | + prometheus.withLegendFormat('{{%(clusterLabel)s}} {{instance}} scheduling algorithm' % $._config),
114 |
115 | prometheus.new('${datasource}', 'histogram_quantile(0.99, sum(rate(scheduler_volume_scheduling_duration_seconds_bucket{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s,instance=~"$instance"}[%(grafanaIntervalVar)s])) by (%(clusterLabel)s, instance, le))' % $._config)
116 | + prometheus.withLegendFormat('{{%(clusterLabel)s}} {{instance}} volume' % $._config),
117 | ]),
118 |
119 | tsPanel.new('Kube API Request Rate')
120 | + tsPanel.gridPos.withW(8)
121 | + tsPanel.standardOptions.withUnit('ops')
122 | + tsPanel.queryOptions.withTargets([
123 | prometheus.new('${datasource}', 'sum(rate(rest_client_requests_total{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s, instance=~"$instance",code=~"2.."}[%(grafanaIntervalVar)s]))' % $._config)
124 | + prometheus.withLegendFormat('2xx'),
125 |
126 | prometheus.new('${datasource}', 'sum(rate(rest_client_requests_total{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s, instance=~"$instance",code=~"3.."}[%(grafanaIntervalVar)s]))' % $._config)
127 | + prometheus.withLegendFormat('3xx'),
128 |
129 | prometheus.new('${datasource}', 'sum(rate(rest_client_requests_total{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s, instance=~"$instance",code=~"4.."}[%(grafanaIntervalVar)s]))' % $._config)
130 | + prometheus.withLegendFormat('4xx'),
131 |
132 | prometheus.new('${datasource}', 'sum(rate(rest_client_requests_total{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s, instance=~"$instance",code=~"5.."}[%(grafanaIntervalVar)s]))' % $._config)
133 | + prometheus.withLegendFormat('5xx'),
134 | ]),
135 |
136 | tsPanel.new('Post Request Latency 99th Quantile')
137 | + tsPanel.gridPos.withW(16)
138 |         + tsPanel.standardOptions.withUnit('s')
139 | + tsPanel.queryOptions.withTargets([
140 | prometheus.new('${datasource}', 'histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s, instance=~"$instance", verb="POST"}[%(grafanaIntervalVar)s])) by (verb, le))' % $._config)
141 | + prometheus.withLegendFormat('{{verb}}'),
142 | ]),
143 |
144 | tsPanel.new('Get Request Latency 99th Quantile')
145 | + tsPanel.gridPos.withW(24)
146 | + tsPanel.standardOptions.withUnit('s')
147 | + tsPanel.queryOptions.withTargets([
148 | prometheus.new('${datasource}', 'histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s, instance=~"$instance", verb="GET"}[%(grafanaIntervalVar)s])) by (verb, le))' % $._config)
149 | + prometheus.withLegendFormat('{{verb}}'),
150 | ]),
151 |
152 |
153 | tsPanel.new('Memory')
154 | + tsPanel.gridPos.withW(8)
155 | + tsPanel.standardOptions.withUnit('bytes')
156 | + tsPanel.queryOptions.withTargets([
157 | prometheus.new('${datasource}', 'process_resident_memory_bytes{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s, instance=~"$instance"}' % $._config)
158 | + prometheus.withLegendFormat('{{instance}}'),
159 | ]),
160 |
161 | tsPanel.new('CPU usage')
162 | + tsPanel.gridPos.withW(8)
163 | + tsPanel.standardOptions.withUnit('short')
164 | + tsPanel.queryOptions.withTargets([
165 | prometheus.new('${datasource}', 'rate(process_cpu_seconds_total{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s, instance=~"$instance"}[%(grafanaIntervalVar)s])' % $._config)
166 | + prometheus.withLegendFormat('{{instance}}'),
167 | ]),
168 |
169 | tsPanel.new('Goroutines')
170 | + tsPanel.gridPos.withW(8)
171 | + tsPanel.standardOptions.withUnit('short')
172 | + tsPanel.queryOptions.withTargets([
173 | prometheus.new('${datasource}', 'go_goroutines{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s,instance=~"$instance"}' % $._config)
174 | + prometheus.withLegendFormat('{{instance}}'),
175 | ]),
176 | ];
177 |
178 | g.dashboard.new('%(dashboardNamePrefix)sScheduler' % $._config.grafanaK8s)
179 | + g.dashboard.withUid($._config.grafanaDashboardIDs['scheduler.json'])
180 | + g.dashboard.withTags($._config.grafanaK8s.dashboardTags)
181 | + g.dashboard.withEditable(false)
182 | + g.dashboard.time.withFrom('now-1h')
183 | + g.dashboard.time.withTo('now')
184 | + g.dashboard.withRefresh($._config.grafanaK8s.refresh)
185 | + g.dashboard.withVariables([variables.datasource, variables.cluster, variables.instance])
186 | + g.dashboard.withPanels(g.util.grid.wrapPanels(panels, panelWidth=12, panelHeight=7)),
187 | },
188 | }
189 |
--------------------------------------------------------------------------------
/jsonnetfile.json:
--------------------------------------------------------------------------------
1 | {
2 | "version": 1,
3 | "dependencies": [
4 | {
5 | "source": {
6 | "git": {
7 | "remote": "https://github.com/grafana/grafonnet.git",
8 | "subdir": "gen/grafonnet-latest"
9 | }
10 | },
11 | "version": "main"
12 | }
13 | ],
14 | "legacyImports": false
15 | }
16 |
--------------------------------------------------------------------------------
/lib/absent_alert.libsonnet:
--------------------------------------------------------------------------------
1 | {
2 | local absentAlert = self,
3 | componentName:: error 'must provide component name',
4 | selector:: error 'must provide selector for component',
5 |
6 | alert: '%sDown' % absentAlert.componentName,
7 | expr: |||
8 | absent(up{%s} == 1)
9 | ||| % absentAlert.selector,
10 | 'for': '15m',
11 | labels: {
12 | severity: 'critical',
13 | },
14 | annotations: {
15 | description: '%s has disappeared from Prometheus target discovery.' % absentAlert.componentName,
16 | summary: 'Target disappeared from Prometheus target discovery.',
17 | },
18 | }
19 |
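
A typical instantiation overrides the two required hidden fields; the alert name, expression, and annotations are derived from them. Hypothetical usage:

(import 'absent_alert.libsonnet') {
  componentName:: 'KubeScheduler',
  selector:: 'job="kube-scheduler"',
}
// => alert 'KubeSchedulerDown' with expr: absent(up{job="kube-scheduler"} == 1)
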
--------------------------------------------------------------------------------
/lib/add-runbook-links.libsonnet:
--------------------------------------------------------------------------------
1 | local utils = import 'utils.libsonnet';
2 |
3 | local lower(x) =
4 | local cp(c) = std.codepoint(c);
5 | local lowerLetter(c) =
6 | if cp(c) >= 65 && cp(c) < 91
7 | then std.char(cp(c) + 32)
8 | else c;
9 | std.join('', std.map(lowerLetter, std.stringChars(x)));
10 |
11 | {
12 | _config+:: {
13 | runbookURLPattern: 'https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-%s',
14 | },
15 |
16 | prometheusAlerts+::
17 | local addRunbookURL(rule) = rule {
18 | [if 'alert' in rule && !('runbook_url' in rule.annotations) then 'annotations']+: {
19 | runbook_url: $._config.runbookURLPattern % lower(rule.alert),
20 | },
21 | };
22 | utils.mapRuleGroups(addRunbookURL),
23 | }
24 |
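
addRunbookURL annotates every alert that does not already carry a runbook_url. For a hypothetical alert named 'KubePodCrashLooping', the pattern above lower-cases the name and yields:

{
  annotations+: {
    runbook_url: 'https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping',
  },
}
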
--------------------------------------------------------------------------------
/lib/alerts.jsonnet:
--------------------------------------------------------------------------------
1 | std.manifestYamlDoc((import '../mixin.libsonnet').prometheusAlerts)
2 |
--------------------------------------------------------------------------------
/lib/dashboards.jsonnet:
--------------------------------------------------------------------------------
1 | local dashboards = (import '../mixin.libsonnet').grafanaDashboards;
2 |
3 | {
4 | [name]: dashboards[name]
5 | for name in std.objectFields(dashboards)
6 | }
7 |
--------------------------------------------------------------------------------
/lib/rules.jsonnet:
--------------------------------------------------------------------------------
1 | std.manifestYamlDoc((import '../mixin.libsonnet').prometheusRules)
2 |
--------------------------------------------------------------------------------
/lib/utils.libsonnet:
--------------------------------------------------------------------------------
1 | {
2 | mapRuleGroups(f): {
3 | groups: [
4 | group {
5 | rules: [
6 | f(rule)
7 | for rule in super.rules
8 | ],
9 | }
10 | for group in super.groups
11 | ],
12 | },
13 |
14 | humanizeSeconds(s)::
15 | if s > 60 * 60 * 24
16 | then '%.1f days' % (s / 60 / 60 / 24)
17 | else '%.1f hours' % (s / 60 / 60),
18 |
19 |   // Handles adding a group_left join of extra labels into a rule by wrapping its expression in (<expr>) * on(<kind>,...) group_left(<join labels>) kube_<kind>_labels.
20 |   // If the rule's kind is not given, try to detect it from the alert name.
21 | wrap_rule_for_labels(rule, config):
22 |     // Detect the kind of rule from the alert name, unless a hidden `kind` field is passed in the rule.
23 | local kind =
24 | if 'kind' in rule then rule.kind
25 | // Handle Alerts
26 | else if std.objectHas(rule, 'alert') then
27 | if std.startsWith(rule.alert, 'KubePod') then 'pod'
28 | else if std.startsWith(rule.alert, 'KubeContainer') then 'pod'
29 | else if std.startsWith(rule.alert, 'KubeStateful') then 'statefulset'
30 | else if std.startsWith(rule.alert, 'KubeDeploy') then 'deployment'
31 | else if std.startsWith(rule.alert, 'KubeDaemon') then 'daemonset'
32 | else if std.startsWith(rule.alert, 'KubeHpa') then 'horizontalpodautoscaler'
33 | else if std.startsWith(rule.alert, 'KubeJob') then 'job'
34 | else 'none'
35 | else 'none';
36 |
37 | local labels = {
38 | join_labels: config['%ss_join_labels' % kind],
39 | // since the label 'job' is reserved, the resource with kind Job uses the label 'job_name' instead
40 |       on_labels: [(if kind == 'job' then 'job_name' else kind), config.namespaceLabel, config.clusterLabel],
41 | metric: 'kube_%s_labels' % kind,
42 | };
43 |
44 | // Failed to identify kind - return raw rule
45 | if kind == 'none' then rule
46 | // No join labels passed in the config - return raw rule
47 | else if std.length(labels.join_labels) == 0 then rule
48 | // Wrap expr with join group left
49 | else
50 | rule {
51 | local expr = super.expr,
52 | expr: '(%(expr)s) * on (%(on)s) group_left(%(join)s) %(metric)s' % {
53 | expr: expr,
54 | on: std.join(',', labels.on_labels),
55 | join: std.join(',', labels.join_labels),
56 | metric: labels.metric,
57 | },
58 | },
59 |
60 | // if showMultiCluster is true in config, return the string, otherwise return an empty string
61 | ifShowMultiCluster(config, string)::
62 | if config.showMultiCluster then string else '',
63 | }
64 |
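
As a sketch of what wrap_rule_for_labels produces: an alert whose name starts with 'KubePod' is detected as kind 'pod', and with pods_join_labels set (illustrative config below) its expression is wrapped in a group_left join against kube_pod_labels:

local utils = import 'utils.libsonnet';
utils.wrap_rule_for_labels(
  { alert: 'KubePodCrashLooping', expr: 'rate(kube_pod_container_status_restarts_total[5m]) > 0' },
  { pods_join_labels: ['label_team'], namespaceLabel: 'namespace', clusterLabel: 'cluster' },
)
// => expr: '(rate(kube_pod_container_status_restarts_total[5m]) > 0) * on (pod,namespace,cluster) group_left(label_team) kube_pod_labels'
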
--------------------------------------------------------------------------------
/mixin.libsonnet:
--------------------------------------------------------------------------------
1 | (import 'alerts/alerts.libsonnet') +
2 | (import 'dashboards/dashboards.libsonnet') +
3 | (import 'rules/rules.libsonnet') +
4 | (import 'config.libsonnet')
5 |
--------------------------------------------------------------------------------
/rules/kube_apiserver-availability.libsonnet:
--------------------------------------------------------------------------------
1 | {
2 | prometheusRules+:: {
3 | local SLODays = $._config.SLOs.apiserver.days + 'd',
4 | local verbs = [
5 | { type: 'read', selector: $._config.kubeApiserverReadSelector },
6 | { type: 'write', selector: $._config.kubeApiserverWriteSelector },
7 | ],
8 |
9 | groups+: [
10 | {
11 | name: 'kube-apiserver-availability.rules',
12 | interval: '3m',
13 | rules: [
14 | {
15 | record: 'code_verb:apiserver_request_total:increase%s' % SLODays,
16 | expr: |||
17 | avg_over_time(code_verb:apiserver_request_total:increase1h[%s]) * 24 * %d
18 | ||| % [SLODays, $._config.SLOs.apiserver.days],
19 | },
20 | ] + [
21 | {
22 | record: 'code:apiserver_request_total:increase%s' % SLODays,
23 | expr: |||
24 | sum by (%s, code) (code_verb:apiserver_request_total:increase%s{%s})
25 | ||| % [$._config.clusterLabel, SLODays, verb.selector],
26 | labels: {
27 | verb: verb.type,
28 | },
29 | }
30 | for verb in verbs
31 | ] + [
32 | {
33 | record: 'cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h',
34 | expr: |||
35 | sum by (%(clusterLabel)s, verb, scope, le) (increase(apiserver_request_sli_duration_seconds_bucket[1h]))
36 | ||| % $._config,
37 | },
38 | {
39 | record: 'cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%s' % SLODays,
40 | expr: |||
41 | sum by (%s, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h[%s]) * 24 * %s)
42 | ||| % [$._config.clusterLabel, SLODays, $._config.SLOs.apiserver.days],
43 | },
44 | {
45 | record: 'cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h',
46 | expr: |||
47 | sum by (%(clusterLabel)s, verb, scope) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h{le="+Inf"})
48 | ||| % $._config,
49 | },
50 | {
51 | record: 'cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase%s' % SLODays,
52 | expr: |||
53 | sum by (%s, verb, scope) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%s{le="+Inf"})
54 | ||| % [$._config.clusterLabel, SLODays],
55 | },
56 | {
57 | record: 'apiserver_request:availability%s' % SLODays,
58 | expr: |||
59 | 1 - (
60 | (
61 | # write too slow
62 | sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverWriteSelector)s})
63 | -
64 | sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverWriteSelector)s,le=~"%(kubeApiserverWriteLatency)s"} or vector(0))
65 | ) +
66 | (
67 | # read too slow
68 | sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverReadSelector)s})
69 | -
70 | (
71 | sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope=~"resource|",le=~"%(kubeApiserverReadResourceLatency)s"} or vector(0))
72 | +
73 | sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="namespace",le=~"%(kubeApiserverReadNamespaceLatency)s"} or vector(0))
74 | +
75 | sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="cluster",le=~"%(kubeApiserverReadClusterLatency)s"} or vector(0))
76 | )
77 | ) +
78 | # errors
79 | sum by (%(clusterLabel)s) (code:apiserver_request_total:increase%(SLODays)s{code=~"5.."} or vector(0))
80 | )
81 | /
82 | sum by (%(clusterLabel)s) (code:apiserver_request_total:increase%(SLODays)s)
83 | ||| % ($._config { SLODays: SLODays }),
84 | labels: {
85 | verb: 'all',
86 | },
87 | },
88 | {
89 | record: 'apiserver_request:availability%s' % SLODays,
90 | expr: |||
91 | 1 - (
92 | sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverReadSelector)s})
93 | -
94 | (
95 | # too slow
96 | sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope=~"resource|",le=~"%(kubeApiserverReadResourceLatency)s"} or vector(0))
97 | +
98 | sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="namespace",le=~"%(kubeApiserverReadNamespaceLatency)s"} or vector(0))
99 | +
100 | sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="cluster",le=~"%(kubeApiserverReadClusterLatency)s"} or vector(0))
101 | )
102 | +
103 | # errors
104 | sum by (%(clusterLabel)s) (code:apiserver_request_total:increase%(SLODays)s{verb="read",code=~"5.."} or vector(0))
105 | )
106 | /
107 | sum by (%(clusterLabel)s) (code:apiserver_request_total:increase%(SLODays)s{verb="read"})
108 | ||| % ($._config { SLODays: SLODays, days: $._config.SLOs.apiserver.days }),
109 | labels: {
110 | verb: 'read',
111 | },
112 | },
113 | {
114 | record: 'apiserver_request:availability%s' % SLODays,
115 | expr: |||
116 | 1 - (
117 | (
118 | # too slow
119 | sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverWriteSelector)s})
120 | -
121 | sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverWriteSelector)s,le=~"%(kubeApiserverWriteLatency)s"} or vector(0))
122 | )
123 | +
124 | # errors
125 | sum by (%(clusterLabel)s) (code:apiserver_request_total:increase%(SLODays)s{verb="write",code=~"5.."} or vector(0))
126 | )
127 | /
128 | sum by (%(clusterLabel)s) (code:apiserver_request_total:increase%(SLODays)s{verb="write"})
129 | ||| % ($._config { SLODays: SLODays, days: $._config.SLOs.apiserver.days }),
130 | labels: {
131 | verb: 'write',
132 | },
133 | },
134 | ] + [
135 | {
136 | record: 'code_resource:apiserver_request_total:rate5m',
137 | expr: |||
138 | sum by (%s,code,resource) (rate(apiserver_request_total{%s}[5m]))
139 | ||| % [$._config.clusterLabel, std.join(',', [$._config.kubeApiserverSelector, verb.selector])],
140 | labels: {
141 | verb: verb.type,
142 | },
143 | }
144 | for verb in verbs
145 | ] + [
146 | {
147 | record: 'code_verb:apiserver_request_total:increase1h',
148 | expr: |||
149 | sum by (%s, code, verb) (increase(apiserver_request_total{%s,verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"%s"}[1h]))
150 | ||| % [$._config.clusterLabel, $._config.kubeApiserverSelector, code],
151 | }
152 | for code in ['2..', '3..', '4..', '5..']
153 | ],
154 | },
155 | ],
156 | },
157 | }
158 |
--------------------------------------------------------------------------------
/rules/kube_apiserver-burnrate.libsonnet:
--------------------------------------------------------------------------------
1 | {
2 | prometheusRules+:: {
3 | groups+: [
4 | {
5 | name: 'kube-apiserver-burnrate.rules',
6 | rules: [
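    | // Burn rate = (too-slow requests + 5xx requests) / all requests over a
    | // given window, i.e. the rate at which the error budget is being spent.
    | // One rule is emitted per unique short/long window defined in
    | // $._config.SLOs.apiserver.windows, for reads and writes separately.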
7 | {
8 | record: 'apiserver_request:burnrate%(window)s' % w,
9 | expr: |||
10 | (
11 | (
12 | # too slow
13 | sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_count{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s}[%(window)s]))
14 | -
15 | (
16 | (
17 | sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope=~"resource|",le=~"%(kubeApiserverReadResourceLatency)s"}[%(window)s]))
18 | or
19 | vector(0)
20 | )
21 | +
22 | sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope="namespace",le=~"%(kubeApiserverReadNamespaceLatency)s"}[%(window)s]))
23 | +
24 | sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope="cluster",le=~"%(kubeApiserverReadClusterLatency)s"}[%(window)s]))
25 | )
26 | )
27 | +
28 | # errors
29 | sum by (%(clusterLabel)s) (rate(apiserver_request_total{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,code=~"5.."}[%(window)s]))
30 | )
31 | /
32 | sum by (%(clusterLabel)s) (rate(apiserver_request_total{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s}[%(window)s]))
33 | ||| % {
34 | clusterLabel: $._config.clusterLabel,
35 | window: w,
36 | kubeApiserverSelector: $._config.kubeApiserverSelector,
37 | kubeApiserverReadSelector: $._config.kubeApiserverReadSelector,
38 | kubeApiserverNonStreamingSelector: $._config.kubeApiserverNonStreamingSelector,
39 | kubeApiserverReadResourceLatency: $._config.kubeApiserverReadResourceLatency,
40 | kubeApiserverReadNamespaceLatency: $._config.kubeApiserverReadNamespaceLatency,
41 | kubeApiserverReadClusterLatency: $._config.kubeApiserverReadClusterLatency,
42 | },
43 | labels: {
44 | verb: 'read',
45 | },
46 | }
47 | for w in std.set([ // Get the unique array of short and long window rates
48 | w.short
49 | for w in $._config.SLOs.apiserver.windows
50 | ] + [
51 | w.long
52 | for w in $._config.SLOs.apiserver.windows
53 | ])
54 | ] + [
55 | {
56 | record: 'apiserver_request:burnrate%(window)s' % w,
57 | expr: |||
58 | (
59 | (
60 | # too slow
61 | sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_count{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,%(kubeApiserverNonStreamingSelector)s}[%(window)s]))
62 | -
63 | sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,%(kubeApiserverNonStreamingSelector)s,le=~"%(kubeApiserverWriteLatency)s"}[%(window)s]))
64 | )
65 | +
66 | sum by (%(clusterLabel)s) (rate(apiserver_request_total{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,code=~"5.."}[%(window)s]))
67 | )
68 | /
69 | sum by (%(clusterLabel)s) (rate(apiserver_request_total{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s}[%(window)s]))
70 | ||| % {
71 | clusterLabel: $._config.clusterLabel,
72 | window: w,
73 | kubeApiserverSelector: $._config.kubeApiserverSelector,
74 | kubeApiserverWriteSelector: $._config.kubeApiserverWriteSelector,
75 | kubeApiserverNonStreamingSelector: $._config.kubeApiserverNonStreamingSelector,
76 | kubeApiserverWriteLatency: $._config.kubeApiserverWriteLatency,
77 | },
78 | labels: {
79 | verb: 'write',
80 | },
81 | }
82 | for w in std.set([ // Get the unique array of short and long window rates
83 | w.short
84 | for w in $._config.SLOs.apiserver.windows
85 | ] + [
86 | w.long
87 | for w in $._config.SLOs.apiserver.windows
88 | ])
89 | ],
90 | },
91 | ],
92 | },
93 | }
94 |
--------------------------------------------------------------------------------
/rules/kube_apiserver-config.libsonnet:
--------------------------------------------------------------------------------
1 | {
2 | _config+:: {
3 | kubeApiserverSelector: 'job="kube-apiserver"',
4 | podLabel: 'pod',
5 | kubeApiserverReadSelector: 'verb=~"LIST|GET"',
6 | kubeApiserverWriteSelector: 'verb=~"POST|PUT|PATCH|DELETE"',
7 | kubeApiserverNonStreamingSelector: 'subresource!~"proxy|attach|log|exec|portforward"',
8 | // These are buckets that exist on the apiserver_request_sli_duration_seconds_bucket histogram.
9 | // They are what Kubernetes SIG Scalability uses to measure the availability of Kubernetes clusters.
10 | // If you want to change these, make sure the "le" buckets exist on the histogram!
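    | // Escaping note: each '\\\\' below is two backslashes in the Jsonnet
    | // string, rendered as '\\' in the rule text; PromQL's double-quoted
    | // matcher unescapes that to '\', so the effective regex is, e.g.,
    | // 1(\.0)? -- matching both the le="1" and le="1.0" bucket boundaries.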
11 | kubeApiserverReadResourceLatency: '1(\\\\.0)?',
12 | kubeApiserverReadNamespaceLatency: '5(\\\\.0)?',
13 | kubeApiserverReadClusterLatency: '30(\\\\.0)?',
14 | kubeApiserverWriteLatency: '1(\\\\.0)?',
15 | },
16 | }
17 |
--------------------------------------------------------------------------------
/rules/kube_apiserver-histogram.libsonnet:
--------------------------------------------------------------------------------
1 | {
2 | prometheusRules+:: {
3 | local verbs = [
4 | { type: 'read', selector: $._config.kubeApiserverReadSelector },
5 | { type: 'write', selector: $._config.kubeApiserverWriteSelector },
6 | ],
7 |
8 | groups+: [
9 | {
10 | name: 'kube-apiserver-histogram.rules',
11 | rules:
12 | [
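    | // Record the p99 request latency per verb and resource. The trailing
    | // "> 0" drops NaN results from empty buckets, so quantiles are only
    | // recorded where traffic was actually observed.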
13 | {
14 | record: 'cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile',
15 | expr: |||
16 | histogram_quantile(0.99, sum by (%s, le, resource) (rate(apiserver_request_sli_duration_seconds_bucket{%s}[5m]))) > 0
17 | ||| % [$._config.clusterLabel, std.join(',', [$._config.kubeApiserverSelector, verb.selector, $._config.kubeApiserverNonStreamingSelector])],
18 | labels: {
19 | verb: verb.type,
20 | quantile: '0.99',
21 | },
22 | }
23 | for verb in verbs
24 | ],
25 | },
26 | ],
27 | },
28 | }
29 |
--------------------------------------------------------------------------------
/rules/kube_apiserver.libsonnet:
--------------------------------------------------------------------------------
1 | (import 'kube_apiserver-config.libsonnet') +
2 | (import 'kube_apiserver-availability.libsonnet') +
3 | (import 'kube_apiserver-burnrate.libsonnet') +
4 | (import 'kube_apiserver-histogram.libsonnet')
5 |
--------------------------------------------------------------------------------
/rules/kube_scheduler.libsonnet:
--------------------------------------------------------------------------------
1 | {
2 | _config+:: {
3 | kubeSchedulerSelector: 'job="kube-scheduler"',
4 | podLabel: 'pod',
5 | },
6 |
7 | prometheusRules+:: {
8 | groups+: [
9 | {
10 | name: 'kube-scheduler.rules',
11 | rules: [
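    | // The nested comprehension below emits one rule per (quantile, metric)
    | // pair -- nine in total -- e.g. the record
    | // 'cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile'
    | // with labels { quantile: '0.5' }.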
12 | {
13 | record: 'cluster_quantile:%s:histogram_quantile' % metric,
14 | expr: |||
15 | histogram_quantile(%(quantile)s, sum(rate(%(metric)s_bucket{%(kubeSchedulerSelector)s}[5m])) without(instance, %(podLabel)s))
16 | ||| % ({ quantile: quantile, metric: metric } + $._config),
17 | labels: {
18 | quantile: quantile,
19 | },
20 | }
21 | for quantile in ['0.99', '0.9', '0.5']
22 | for metric in [
23 | 'scheduler_e2e_scheduling_duration_seconds',
24 | 'scheduler_scheduling_algorithm_duration_seconds',
25 | 'scheduler_binding_duration_seconds',
26 | ]
27 | ],
28 | },
29 | ],
30 | },
31 | }
32 |
--------------------------------------------------------------------------------
/rules/kubelet.libsonnet:
--------------------------------------------------------------------------------
1 | {
2 | _config+:: {
3 | kubeletSelector: 'job="kubelet"',
4 | },
5 |
6 | prometheusRules+:: {
7 | groups+: [
8 | {
9 | name: 'kubelet.rules',
10 | rules: [
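    | // The group_left join with kubelet_node_name copies the stable node
    | // label onto each quantile, keying the result by node rather than by
    | // the kubelet's scrape instance.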
11 | {
12 | record: 'node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile',
13 | expr: |||
14 | histogram_quantile(%(quantile)s, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{%(kubeletSelector)s}[5m])) by (%(clusterLabel)s, instance, le) * on(%(clusterLabel)s, instance) group_left(node) kubelet_node_name{%(kubeletSelector)s})
15 | ||| % ({ quantile: quantile } + $._config),
16 | labels: {
17 | quantile: quantile,
18 | },
19 | }
20 | for quantile in ['0.99', '0.9', '0.5']
21 | ],
22 | },
23 | ],
24 | },
25 | }
26 |
--------------------------------------------------------------------------------
/rules/node.libsonnet:
--------------------------------------------------------------------------------
1 | {
2 | _config+:: {
3 | kubeStateMetricsSelector: 'job="kube-state-metrics"',
4 | nodeExporterSelector: 'job="node-exporter"',
5 | podLabel: 'pod',
6 | },
7 |
8 | prometheusRules+:: {
9 | groups+: [
10 | {
11 | name: 'node.rules',
12 | rules: [
13 | {
14 | // This rule results in the tuples (node, namespace, instance) => 1.
15 | // It is used to calculate per-node metrics, given namespace & instance.
16 | // We use the topk() aggregator to ensure that each (namespace,
17 | // instance) tuple is only associated to one node and thus avoid
18 | // "many-to-many matching not allowed" errors when joining with
19 | // other timeseries on (namespace, instance). See node:node_num_cpu:sum
20 | // below for an example.
21 | record: 'node_namespace_pod:kube_pod_info:',
22 | expr: |||
23 | topk by(%(clusterLabel)s, namespace, %(podLabel)s) (1,
24 | max by (%(clusterLabel)s, node, namespace, %(podLabel)s) (
25 | label_replace(kube_pod_info{%(kubeStateMetricsSelector)s,node!=""}, "%(podLabel)s", "$1", "pod", "(.*)")
26 | ))
27 | ||| % $._config,
28 | },
29 | {
30 | // This rule gives the number of CPUs per node.
31 | record: 'node:node_num_cpu:sum',
32 | expr: |||
33 | count by (%(clusterLabel)s, node) (
34 | node_cpu_seconds_total{mode="idle",%(nodeExporterSelector)s}
35 | * on (%(clusterLabel)s, namespace, %(podLabel)s) group_left(node)
36 | topk by(%(clusterLabel)s, namespace, %(podLabel)s) (1, node_namespace_pod:kube_pod_info:)
37 | )
38 | ||| % $._config,
39 | },
40 | // Add separate rules for Available memory, so we can aggregate across clusters in dashboards.
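    | // node_memory_MemAvailable_bytes only exists on Linux kernels >= 3.14;
    | // the "or" branch falls back to Buffers + Cached + MemFree + Slab as an
    | // approximation on older kernels.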
41 | {
42 | record: ':node_memory_MemAvailable_bytes:sum',
43 | expr: |||
44 | sum(
45 | node_memory_MemAvailable_bytes{%(nodeExporterSelector)s} or
46 | (
47 | node_memory_Buffers_bytes{%(nodeExporterSelector)s} +
48 | node_memory_Cached_bytes{%(nodeExporterSelector)s} +
49 | node_memory_MemFree_bytes{%(nodeExporterSelector)s} +
50 | node_memory_Slab_bytes{%(nodeExporterSelector)s}
51 | )
52 | ) by (%(clusterLabel)s)
53 | ||| % $._config,
54 | },
55 | {
56 | // This rule gives cpu utilization per node.
57 | record: 'node:node_cpu_utilization:ratio_rate5m',
58 | expr: |||
59 | avg by (%(clusterLabel)s, node) (
60 | sum without (mode) (
61 | rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal",%(nodeExporterSelector)s}[5m])
62 | )
63 | )
64 | ||| % $._config,
65 | },
66 | {
67 | // This rule gives cpu utilization per cluster
68 | record: 'cluster:node_cpu:ratio_rate5m',
69 | expr: |||
70 | avg by (%(clusterLabel)s) (
71 | node:node_cpu_utilization:ratio_rate5m
72 | )
73 | ||| % $._config,
74 | },
75 | ],
76 | },
77 | ],
78 | },
79 | }
80 |
--------------------------------------------------------------------------------
/rules/rules.libsonnet:
--------------------------------------------------------------------------------
1 | (import 'kube_apiserver.libsonnet') +
2 | (import 'apps.libsonnet') +
3 | (import 'kube_scheduler.libsonnet') +
4 | (import 'node.libsonnet') +
5 | (import 'kubelet.libsonnet')
6 |
--------------------------------------------------------------------------------
/rules/windows.libsonnet:
--------------------------------------------------------------------------------
1 | {
2 | prometheusRules+:: {
3 | groups+: [
4 | {
5 | name: 'windows.node.rules',
6 | rules: [
7 | {
8 | // This rule gives the number of windows nodes
9 | record: 'node:windows_node:sum',
10 | expr: |||
11 | count by (%(clusterLabel)s) (
12 | windows_system_boot_time_timestamp_seconds{%(windowsExporterSelector)s}
13 | )
14 | ||| % $._config,
15 | },
16 | {
17 | // This rule gives the number of CPUs per node.
18 | record: 'node:windows_node_num_cpu:sum',
19 | expr: |||
20 | count by (%(clusterLabel)s, instance) (sum by (%(clusterLabel)s, instance, core) (
21 | windows_cpu_time_total{%(windowsExporterSelector)s}
22 | ))
23 | ||| % $._config,
24 | },
25 | {
26 | // CPU utilisation is the percentage of CPU time that is not idle.
27 | record: ':windows_node_cpu_utilisation:avg1m',
28 | expr: |||
29 | 1 - avg by (%(clusterLabel)s) (rate(windows_cpu_time_total{%(windowsExporterSelector)s,mode="idle"}[1m]))
30 | ||| % $._config,
31 | },
32 | {
33 | // CPU utilisation is the percentage of CPU time that is not idle.
34 | record: 'node:windows_node_cpu_utilisation:avg1m',
35 | expr: |||
36 | 1 - avg by (%(clusterLabel)s, instance) (
37 | rate(windows_cpu_time_total{%(windowsExporterSelector)s,mode="idle"}[1m])
38 | )
39 | ||| % $._config,
40 | },
41 | {
42 | record: ':windows_node_memory_utilisation:',
43 | expr: |||
44 | 1 -
45 | sum by (%(clusterLabel)s) (windows_memory_available_bytes{%(windowsExporterSelector)s})
46 | /
47 | sum by (%(clusterLabel)s) (windows_os_visible_memory_bytes{%(windowsExporterSelector)s})
48 | ||| % $._config,
49 | },
50 | // Add separate rules for Free & Total, so we can aggregate across clusters
51 | // in dashboards.
52 | {
53 | record: ':windows_node_memory_MemFreeCached_bytes:sum',
54 | expr: |||
55 | sum by (%(clusterLabel)s) (windows_memory_available_bytes{%(windowsExporterSelector)s} + windows_memory_cache_bytes{%(windowsExporterSelector)s})
56 | ||| % $._config,
57 | },
58 | {
59 | record: 'node:windows_node_memory_totalCached_bytes:sum',
60 | expr: |||
61 | (windows_memory_cache_bytes{%(windowsExporterSelector)s} + windows_memory_modified_page_list_bytes{%(windowsExporterSelector)s} + windows_memory_standby_cache_core_bytes{%(windowsExporterSelector)s} + windows_memory_standby_cache_normal_priority_bytes{%(windowsExporterSelector)s} + windows_memory_standby_cache_reserve_bytes{%(windowsExporterSelector)s})
62 | ||| % $._config,
63 | },
64 | {
65 | record: ':windows_node_memory_MemTotal_bytes:sum',
66 | expr: |||
67 | sum by (%(clusterLabel)s) (windows_os_visible_memory_bytes{%(windowsExporterSelector)s})
68 | ||| % $._config,
69 | },
70 | {
71 | // Available memory per node
72 | // SINCE 2018-02-08
73 | record: 'node:windows_node_memory_bytes_available:sum',
74 | expr: |||
75 | sum by (%(clusterLabel)s, instance) (
76 | (windows_memory_available_bytes{%(windowsExporterSelector)s})
77 | )
78 | ||| % $._config,
79 | },
80 | {
81 | // Total memory per node
82 | record: 'node:windows_node_memory_bytes_total:sum',
83 | expr: |||
84 | sum by (%(clusterLabel)s, instance) (
85 | windows_os_visible_memory_bytes{%(windowsExporterSelector)s}
86 | )
87 | ||| % $._config,
88 | },
89 | {
90 | // Memory utilisation per node, normalized by the cluster-wide total memory
91 | record: 'node:windows_node_memory_utilisation:ratio',
92 | expr: |||
93 | (node:windows_node_memory_bytes_total:sum - node:windows_node_memory_bytes_available:sum)
94 | /
95 | scalar(sum(node:windows_node_memory_bytes_total:sum))
96 | |||,
97 | },
98 | {
99 | record: 'node:windows_node_memory_utilisation:',
100 | expr: |||
101 | 1 - (node:windows_node_memory_bytes_available:sum / node:windows_node_memory_bytes_total:sum)
102 | ||| % $._config,
103 | },
104 | {
105 | record: 'node:windows_node_memory_swap_io_pages:irate',
106 | expr: |||
107 | irate(windows_memory_swap_page_operations_total{%(windowsExporterSelector)s}[5m])
108 | ||| % $._config,
109 | },
110 | {
111 | // Disk utilisation (seconds spent on I/O; as a per-second rate it is bounded by 1)
112 | record: ':windows_node_disk_utilisation:avg_irate',
113 | expr: |||
114 | avg by (%(clusterLabel)s) (irate(windows_logical_disk_read_seconds_total{%(windowsExporterSelector)s}[1m]) +
115 | irate(windows_logical_disk_write_seconds_total{%(windowsExporterSelector)s}[1m])
116 | )
117 | ||| % $._config,
118 | },
119 | {
120 | // Disk utilisation (seconds spent on I/O; as a per-second rate it is bounded by 1)
121 | record: 'node:windows_node_disk_utilisation:avg_irate',
122 | expr: |||
123 | avg by (%(clusterLabel)s, instance) (
124 | (irate(windows_logical_disk_read_seconds_total{%(windowsExporterSelector)s}[1m]) +
125 | irate(windows_logical_disk_write_seconds_total{%(windowsExporterSelector)s}[1m]))
126 | )
127 | ||| % $._config,
128 | },
129 | {
130 | record: 'node:windows_node_filesystem_usage:',
131 | expr: |||
132 | max by (%(clusterLabel)s,instance,volume)(
133 | (windows_logical_disk_size_bytes{%(windowsExporterSelector)s}
134 | - windows_logical_disk_free_bytes{%(windowsExporterSelector)s})
135 | / windows_logical_disk_size_bytes{%(windowsExporterSelector)s}
136 | )
137 | ||| % $._config,
138 | },
139 | {
140 | record: 'node:windows_node_filesystem_avail:',
141 | expr: |||
142 | max by (%(clusterLabel)s, instance, volume) (windows_logical_disk_free_bytes{%(windowsExporterSelector)s} / windows_logical_disk_size_bytes{%(windowsExporterSelector)s})
143 | ||| % $._config,
144 | },
145 | {
146 | record: ':windows_node_net_utilisation:sum_irate',
147 | expr: |||
148 | sum by (%(clusterLabel)s) (irate(windows_net_bytes_total{%(windowsExporterSelector)s}[1m]))
149 | ||| % $._config,
150 | },
151 | {
152 | record: 'node:windows_node_net_utilisation:sum_irate',
153 | expr: |||
154 | sum by (%(clusterLabel)s, instance) (
155 | (irate(windows_net_bytes_total{%(windowsExporterSelector)s}[1m]))
156 | )
157 | ||| % $._config,
158 | },
159 | {
160 | record: ':windows_node_net_saturation:sum_irate',
161 | expr: |||
162 | sum by (%(clusterLabel)s) (irate(windows_net_packets_received_discarded_total{%(windowsExporterSelector)s}[1m])) +
163 | sum by (%(clusterLabel)s) (irate(windows_net_packets_outbound_discarded_total{%(windowsExporterSelector)s}[1m]))
164 | ||| % $._config,
165 | },
166 | {
167 | record: 'node:windows_node_net_saturation:sum_irate',
168 | expr: |||
169 | sum by (%(clusterLabel)s, instance) (
170 | (irate(windows_net_packets_received_discarded_total{%(windowsExporterSelector)s}[1m]) +
171 | irate(windows_net_packets_outbound_discarded_total{%(windowsExporterSelector)s}[1m]))
172 | )
173 | ||| % $._config,
174 | },
175 | ],
176 | },
177 | {
178 | name: 'windows.pod.rules',
179 | rules: [
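    | // windows_container_* series only carry a container_id label, so each
    | // rule below joins against kube_pod_container_info on (container_id,
    | // cluster label) to pick up the container, pod, and namespace labels.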
180 | {
181 | record: 'windows_pod_container_available',
182 | expr: |||
183 | windows_container_available{%(windowsExporterSelector)s, container_id != ""} * on(container_id, %(clusterLabel)s) group_left(container, pod, namespace) max(kube_pod_container_info{%(kubeStateMetricsSelector)s, container_id != ""}) by(container, container_id, pod, namespace, %(clusterLabel)s)
184 | ||| % $._config,
185 | },
186 | {
187 | record: 'windows_container_total_runtime',
188 | expr: |||
189 | windows_container_cpu_usage_seconds_total{%(windowsExporterSelector)s, container_id != ""} * on(container_id, %(clusterLabel)s) group_left(container, pod, namespace) max(kube_pod_container_info{%(kubeStateMetricsSelector)s, container_id != ""}) by(container, container_id, pod, namespace, %(clusterLabel)s)
190 | ||| % $._config,
191 | },
192 | {
193 | record: 'windows_container_memory_usage',
194 | expr: |||
195 | windows_container_memory_usage_commit_bytes{%(windowsExporterSelector)s, container_id != ""} * on(container_id, %(clusterLabel)s) group_left(container, pod, namespace) max(kube_pod_container_info{%(kubeStateMetricsSelector)s, container_id != ""}) by(container, container_id, pod, namespace, %(clusterLabel)s)
196 | ||| % $._config,
197 | },
198 | {
199 | record: 'windows_container_private_working_set_usage',
200 | expr: |||
201 | windows_container_memory_usage_private_working_set_bytes{%(windowsExporterSelector)s, container_id != ""} * on(container_id, %(clusterLabel)s) group_left(container, pod, namespace) max(kube_pod_container_info{%(kubeStateMetricsSelector)s, container_id != ""}) by(container, container_id, pod, namespace, %(clusterLabel)s)
202 | ||| % $._config,
203 | },
204 | {
205 | record: 'windows_container_network_received_bytes_total',
206 | expr: |||
207 | windows_container_network_receive_bytes_total{%(windowsExporterSelector)s, container_id != ""} * on(container_id, %(clusterLabel)s) group_left(container, pod, namespace) max(kube_pod_container_info{%(kubeStateMetricsSelector)s, container_id != ""}) by(container, container_id, pod, namespace, %(clusterLabel)s)
208 | ||| % $._config,
209 | },
210 | {
211 | record: 'windows_container_network_transmitted_bytes_total',
212 | expr: |||
213 | windows_container_network_transmit_bytes_total{%(windowsExporterSelector)s, container_id != ""} * on(container_id, %(clusterLabel)s) group_left(container, pod, namespace) max(kube_pod_container_info{%(kubeStateMetricsSelector)s, container_id != ""}) by(container, container_id, pod, namespace, %(clusterLabel)s)
214 | ||| % $._config,
215 | },
216 | {
217 | record: 'kube_pod_windows_container_resource_memory_request',
218 | expr: |||
219 | max by (%(clusterLabel)s, namespace, pod, container) (
220 | kube_pod_container_resource_requests{resource="memory",%(kubeStateMetricsSelector)s}
221 | ) * on(container,pod,namespace,%(clusterLabel)s) (windows_pod_container_available)
222 | ||| % $._config,
223 | },
224 | {
225 | record: 'kube_pod_windows_container_resource_memory_limit',
226 | expr: |||
227 | kube_pod_container_resource_limits{resource="memory",%(kubeStateMetricsSelector)s} * on(container,pod,namespace,%(clusterLabel)s) (windows_pod_container_available)
228 | ||| % $._config,
229 | },
230 | {
231 | record: 'kube_pod_windows_container_resource_cpu_cores_request',
232 | expr: |||
233 | max by (%(clusterLabel)s, namespace, pod, container) (
234 | kube_pod_container_resource_requests{resource="cpu",%(kubeStateMetricsSelector)s}
235 | ) * on(container,pod,namespace,%(clusterLabel)s) (windows_pod_container_available)
236 | ||| % $._config,
237 | },
238 | {
239 | record: 'kube_pod_windows_container_resource_cpu_cores_limit',
240 | expr: |||
241 | kube_pod_container_resource_limits{resource="cpu",%(kubeStateMetricsSelector)s} * on(container,pod,namespace,%(clusterLabel)s) (windows_pod_container_available)
242 | ||| % $._config,
243 | },
244 | {
245 | record: 'namespace_pod_container:windows_container_cpu_usage_seconds_total:sum_rate',
246 | expr: |||
247 | sum by (%(clusterLabel)s, namespace, pod, container) (
248 | rate(windows_container_total_runtime{}[5m])
249 | )
250 | ||| % $._config,
251 | },
252 | ],
253 | },
254 | ],
255 | },
256 | }
257 |
--------------------------------------------------------------------------------
/scripts/check-selectors-ksm.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # Set -u to error out if we use an unset variable.
4 | # Set -o pipefail to propagate errors in a pipeline.
5 | set -uo pipefail
6 |
7 | # Remove kube-state-metrics directory if it exists.
8 | rm -rf kube-state-metrics
9 |
10 | # Clone kube-state-metrics repository.
11 | git clone https://github.com/kubernetes/kube-state-metrics --depth 1
12 |
13 | # Set the repository root.
14 | repository_root=$(git rev-parse --show-toplevel)
15 |
16 | # Change directory to kube-state-metrics.
17 | cd kube-state-metrics || exit
18 |
19 | # Extract all kube_-prefixed metric names from the kube-state-metrics sources.
20 | find internal/store -type f -not -name '*_test.go' -exec sed -nE 's/.*"(kube_[^"]+)".*/\1/p' {} \; | sort -u > metrics.txt
21 |
22 | # Set the KSM selector specifier.
23 | ksm_selector="kubeStateMetricsSelector"
24 |
25 | # Set the paths to the alerts, lib and rules directories.
26 | alerts_path="$repository_root/alerts"
27 | lib_path="$repository_root/lib"
28 | rules_path="$repository_root/rules"
29 |
30 | # Read metrics.txt line by line.
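    | # For each metric, flag usages that carry a label selector ({...}) but do
    | # not include the kubeStateMetricsSelector placeholder.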
31 | while IFS= read -r metric; do
32 | selector_misses=$(\
33 | grep --only-matching --color=always --line-number "$metric{[^}]*}" --directories=recurse "$alerts_path" "$lib_path" "$rules_path" |\
34 | grep --invert-match "$ksm_selector" \
35 | )
36 | if [ -n "$selector_misses" ]; then
37 | echo "The following $metric metrics are missing the $ksm_selector specifier:"
38 | echo "$selector_misses"
39 | fi
40 | done < metrics.txt
41 |
42 | # Clean artefacts.
43 | rm metrics.txt
44 | cd .. || exit
45 | rm -rf kube-state-metrics
46 |
47 | # TODO: Currently, there are only two possible states the workflow can report: success or failure.
48 | # We could benefit from a third "warning" state, for cases where we observe an overlap of selectors for the same metric.
49 | # Ref: https://docs.github.com/en/actions/creating-actions/setting-exit-codes-for-actions#about-exit-codes
50 |
--------------------------------------------------------------------------------
/scripts/tools.go:
--------------------------------------------------------------------------------
1 | //go:build tools
2 | // +build tools
3 |
4 | // Package tools tracks dependencies for tools that are used in the build process.
5 | // See https://github.com/golang/go/issues/25922
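    | //
    | // The blank imports pin these tools in go.mod, so their versions are
    | // tracked and updated like any other dependency without being compiled
    | // into the module's packages.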
6 | package tools
7 |
8 | import (
9 | _ "github.com/Kunde21/markdownfmt/v3/cmd/markdownfmt"
10 | _ "github.com/cloudflare/pint/cmd/pint"
11 | _ "github.com/errata-ai/vale/v3/cmd/vale"
12 | _ "github.com/google/go-jsonnet/cmd/jsonnet"
13 | _ "github.com/google/go-jsonnet/cmd/jsonnet-lint"
14 | _ "github.com/google/go-jsonnet/cmd/jsonnetfmt"
15 | _ "github.com/grafana/dashboard-linter"
16 | _ "github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb"
17 | _ "github.com/prometheus/prometheus/cmd/promtool"
18 | )
19 |
--------------------------------------------------------------------------------
/tests/apiserver-availability-test.yaml:
--------------------------------------------------------------------------------
1 | rule_files:
2 | - ../prometheus_alerts.yaml
3 | - ../prometheus_rules.yaml
4 |
5 | evaluation_interval: 1m
6 |
7 | tests:
8 | - name: calculate apiserver request total increase 1h rate
9 | interval: 1m
10 | input_series:
11 | # 100 requests in the overall interval, 99 successful and 1 error
12 | - series: 'apiserver_request_total{job="kube-apiserver",verb="GET",code="200"}'
13 | values: '0 10 20 50 90 99'
14 | - series: 'apiserver_request_total{job="kube-apiserver",verb="GET",code="500"}'
15 | values: '0x2 1x2'
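    | # '0x2 1x2' expands to 0 0 0 1 1 1: a single error over the interval.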
16 |
17 | promql_expr_test:
18 | - eval_time: 5m
19 | expr: code_verb:apiserver_request_total:increase1h{verb="GET"}
20 | exp_samples:
21 | - labels: 'code_verb:apiserver_request_total:increase1h{code="200", verb="GET"}'
22 | value: 99.0
23 | - labels: 'code_verb:apiserver_request_total:increase1h{code="500", verb="GET"}'
24 | value: 1.0
25 |
26 | - name: calculate apiserver request total increase 30d rate
27 | interval: 1m
28 | input_series:
29 | - series: code_verb:apiserver_request_total:increase1h{verb="GET",code="200"}
30 | values: '10+10x9'
31 | - series: code_verb:apiserver_request_total:increase1h{verb="GET",code="500"}
32 | values: '0+1x9'
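    | # promtool series notation: '10+10x9' expands to 10 20 ... 100 and
    | # '0+1x9' to 0 1 ... 9, so the averages over the window are 55 and 4.5.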
33 |
34 | promql_expr_test:
35 | - eval_time: 10m
36 | expr: code_verb:apiserver_request_total:increase30d{verb="GET"}
37 | exp_samples:
38 | - labels: 'code_verb:apiserver_request_total:increase30d{code="200", verb="GET"}'
39 | value: 3.96e+4 # average of the input series values times 24 (hours) times 30 (days)
40 | - labels: 'code_verb:apiserver_request_total:increase30d{code="500", verb="GET"}'
41 | value: 3.24e+3
42 |
--------------------------------------------------------------------------------
/tests/apps_alerts-test.yaml:
--------------------------------------------------------------------------------
1 | rule_files:
2 | - ../prometheus_alerts.yaml
3 |
4 | tests:
5 | - interval: 1m
6 | name: KubePdbNotEnoughHealthyPods fires when current healthy pods are fewer than desired
7 | input_series:
8 | - series: 'kube_poddisruptionbudget_status_desired_healthy{cluster="cluster1", namespace="ns1", poddisruptionbudget="pdb1", job="kube-state-metrics"}'
9 | values: '4x15'
10 | - series: 'kube_poddisruptionbudget_status_current_healthy{cluster="cluster1", namespace="ns1", poddisruptionbudget="pdb1", job="kube-state-metrics"}'
11 | values: '3x15'
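    | # desired (4) - current (3) = 1 unhealthy pod for the whole series, so
    | # the alert is still pending at 14m and fires once the 15m "for" clause
    | # is satisfied.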
12 | alert_rule_test:
13 | - eval_time: 14m
14 | alertname: KubePdbNotEnoughHealthyPods
15 | - eval_time: 15m
16 | alertname: KubePdbNotEnoughHealthyPods
17 | exp_alerts:
18 | - exp_labels:
19 | severity: "warning"
20 | cluster: "cluster1"
21 | namespace: "ns1"
22 | poddisruptionbudget: "pdb1"
23 | job: "kube-state-metrics"
24 | exp_annotations:
25 | description: "PDB ns1/pdb1 expects 1 more healthy pods. The desired number of healthy pods has not been met for at least 15m."
26 | runbook_url: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepdbnotenoughhealthypods"
27 | summary: "PDB does not have enough healthy pods."
28 |
29 | - interval: 1m
30 | name: KubeStatefulSetUpdateNotRolledOut still fires even if another label (e.g. instance) is present
31 | input_series:
32 | - series: 'kube_statefulset_status_current_revision{job="kube-state-metrics", cluster="c1", namespace="ns1", statefulset="ss1", revision="foo", instance="custom"}'
33 | values: '1x15'
34 | - series: 'kube_statefulset_status_update_revision{job="kube-state-metrics", cluster="c1", namespace="ns1", statefulset="ss1", revision="bar", instance="custom"}'
35 | values: '1x15'
36 | - series: 'kube_statefulset_replicas{job="kube-state-metrics", cluster="c1", namespace="ns1", statefulset="ss1", instance="custom"}'
37 | values: '5x15'
38 | - series: 'kube_statefulset_status_replicas_updated{job="kube-state-metrics", cluster="c1", namespace="ns1", statefulset="ss1", instance="custom"}'
39 | values: '1x15'
40 | alert_rule_test:
41 | - eval_time: 14m
42 | alertname: KubeStatefulSetUpdateNotRolledOut
43 | - eval_time: 15m
44 | alertname: KubeStatefulSetUpdateNotRolledOut
45 | exp_alerts:
46 | - exp_labels:
47 | cluster: "c1"
48 | job: "kube-state-metrics"
49 | namespace: "ns1"
50 | severity: "warning"
51 | statefulset: "ss1"
52 | exp_annotations:
53 | description: "StatefulSet ns1/ss1 update has not been rolled out."
54 | runbook_url: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout"
55 | summary: "StatefulSet update has not been rolled out."
56 |
--------------------------------------------------------------------------------