├── .github ├── ISSUE_TEMPLATE │ ├── bugs.yaml │ └── enhancements.yaml ├── PULL_REQUEST_TEMPLATE │ └── prs.md ├── dependabot.yaml └── workflows │ ├── check-with-upstream.yaml │ ├── ci.yaml │ ├── release.yaml │ └── stale.yml ├── .gitignore ├── .lint ├── .vale.ini ├── CODEOWNERS ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── DESIGN.md ├── LICENSE ├── Makefile ├── OWNERS ├── README.md ├── SECURITY.md ├── SECURITY_CONTACTS ├── alerts ├── alerts.libsonnet ├── apps_alerts.libsonnet ├── kube_apiserver.libsonnet ├── kube_controller_manager.libsonnet ├── kube_proxy.libsonnet ├── kube_scheduler.libsonnet ├── kubelet.libsonnet ├── resource_alerts.libsonnet ├── storage_alerts.libsonnet └── system_alerts.libsonnet ├── config.libsonnet ├── dashboards ├── apiserver.libsonnet ├── controller-manager.libsonnet ├── dashboards.libsonnet ├── defaults.libsonnet ├── kubelet.libsonnet ├── network-usage │ ├── cluster-total.libsonnet │ ├── namespace-by-pod.libsonnet │ ├── namespace-by-workload.libsonnet │ ├── pod-total.libsonnet │ └── workload-total.libsonnet ├── network.libsonnet ├── persistentvolumesusage.libsonnet ├── proxy.libsonnet ├── resources.libsonnet ├── resources │ ├── cluster.libsonnet │ ├── multi-cluster.libsonnet │ ├── namespace.libsonnet │ ├── node.libsonnet │ ├── pod.libsonnet │ ├── workload-namespace.libsonnet │ └── workload.libsonnet ├── scheduler.libsonnet └── windows.libsonnet ├── jsonnetfile.json ├── lib ├── absent_alert.libsonnet ├── add-runbook-links.libsonnet ├── alerts.jsonnet ├── dashboards.jsonnet ├── rules.jsonnet └── utils.libsonnet ├── mixin.libsonnet ├── rules ├── apps.libsonnet ├── kube_apiserver-availability.libsonnet ├── kube_apiserver-burnrate.libsonnet ├── kube_apiserver-config.libsonnet ├── kube_apiserver-histogram.libsonnet ├── kube_apiserver.libsonnet ├── kube_scheduler.libsonnet ├── kubelet.libsonnet ├── node.libsonnet ├── rules.libsonnet └── windows.libsonnet ├── runbook.md ├── scripts ├── check-selectors-ksm.sh ├── go.mod ├── go.sum └── tools.go └── tests ├── apiserver-availability-test.yaml ├── apps_alerts-test.yaml ├── rules-pod-owner-test.yaml └── tests.yaml /.github/ISSUE_TEMPLATE/bugs.yaml: -------------------------------------------------------------------------------- 1 | name: Bug Report 2 | description: Report a bug in the existing codebase. 3 | title: '[Bug]: ' 4 | labels: ['kind/bug', 'pending-triage'] 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | Please use this template while reporting a bug and provide as much information as possible. If the matter is security related, please disclose it privately, see the project [security policy](https://github.com/kubernetes-monitoring/kubernetes-mixin/blob/main/SECURITY.md). 10 | - type: textarea 11 | id: cause 12 | attributes: 13 | label: What happened? 14 | description: A clear and concise description of what the bug is. Screenshots and screencasts are highly encouraged and helpful during triage, so please provide them if you can. 15 | placeholder: Describe the bug you encountered. Please do not paste any snippets here, use the next field instead. 16 | validations: 17 | required: true 18 | - type: textarea 19 | id: snippet 20 | attributes: 21 | label: Please provide any helpful snippets. 22 | description: If applicable, add code snippet(s) to help explain or reproduce the problem. This will be automatically formatted into code, so no need for backticks. Separate snippets using comments. 
23 | render: jsonnet 24 | - type: dropdown 25 | id: contamination 26 | attributes: 27 | label: What parts of the codebase are affected? 28 | description: Select all that apply. 29 | multiple: true 30 | options: 31 | - Alerts 32 | - Dashboards 33 | - Rules 34 | - Other 35 | validations: 36 | required: true 37 | - type: checkboxes 38 | id: terms 39 | attributes: 40 | label: "I agree to the following terms:" 41 | options: 42 | - label: I agree to follow this project's [Code of Conduct](../../CODE_OF_CONDUCT.md). 43 | required: true 44 | - label: I have filled out all the required information above to the best of my ability. 45 | required: true 46 | - label: I have searched the issues of this repository and believe that this is not a duplicate. 47 | required: true 48 | - label: I have confirmed this bug exists in the default branch of the repository, as of the latest commit at the time of submission. 49 | required: true 50 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/enhancements.yaml: -------------------------------------------------------------------------------- 1 | name: Enhancement Proposal 2 | description: Propose an enhancement for the existing codebase. 3 | title: '[Enhancement]: ' 4 | labels: ['kind/enhancement', 'pending-triage'] 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | Please use this template while proposing an enhancement and provide as much information as possible. If this is a feature request, please ensure that [a consensus has been reached](https://github.com/kubernetes-monitoring/kubernetes-mixin/blob/master/CONTRIBUTING.md?plain=1#L24) before submitting. 10 | - type: textarea 11 | id: idea 12 | attributes: 13 | label: What's the general idea for the enhancement? 14 | description: A clear and concise description of the enhancement's targeted problem and its proposed solution. Screenshots and screencasts are highly encouraged and helpful during triage, so please provide them if you can. 15 | placeholder: Describe the need for this enhancement. Please do not paste any snippets here, use the next field instead. 16 | validations: 17 | required: true 18 | - type: textarea 19 | id: snippet 20 | attributes: 21 | label: Please provide any helpful snippets. 22 | description: If applicable, add code snippet(s) to help explain or reproduce the problem. This will be automatically formatted into code, so no need for backticks. Separate snippets using comments. 23 | render: jsonnet 24 | - type: dropdown 25 | id: contamination 26 | attributes: 27 | label: What parts of the codebase does the enhancement target? 28 | description: Select all that apply. 29 | multiple: true 30 | options: 31 | - Alerts 32 | - Dashboards 33 | - Rules 34 | - Other 35 | validations: 36 | required: true 37 | - type: textarea 38 | id: extra 39 | attributes: 40 | label: Anything else relevant to the enhancement that would help with the triage process? 41 | description: Any additional context or information that would be helpful to the maintainers. For example, if you have considered any alternatives or workarounds, please share them here. 42 | placeholder: Add any additional information here. 43 | - type: checkboxes 44 | id: terms 45 | attributes: 46 | label: "I agree to the following terms:" 47 | options: 48 | - label: I agree to follow this project's [Code of Conduct](../../CODE_OF_CONDUCT.md). 49 | required: true 50 | - label: I have filled out all the required information above to the best of my ability. 
51 | required: true 52 | - label: I have searched the issues of this repository and believe that this is not a duplicate. 53 | required: true 54 | - label: I have confirmed this proposal applies to the default branch of the repository, as of the latest commit at the time of submission. 55 | required: true 56 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE/prs.md: -------------------------------------------------------------------------------- 1 | 6 | 7 | #### What does this PR fix? Please be as descriptive as possible. 8 | 9 | #### Any helpful code snippets or visual aids (before and after this patch, if applicable)? 10 |
11 | <details> 12 | <summary>Details</summary> 13 | 14 | </details> 15 |
16 | 17 | 18 | 19 | Fixes # 20 | -------------------------------------------------------------------------------- /.github/dependabot.yaml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: github-actions 9 | directory: / 10 | schedule: 11 | interval: weekly 12 | - package-ecosystem: gomod 13 | directory: /scripts 14 | schedule: 15 | interval: weekly 16 | -------------------------------------------------------------------------------- /.github/workflows/check-with-upstream.yaml: -------------------------------------------------------------------------------- 1 | name: check-with-upstream 2 | permissions: {} 3 | # Run every Monday. 4 | on: 5 | schedule: 6 | - cron: "0 0 * * 1" 7 | jobs: 8 | check-selectors-ksm: 9 | runs-on: ubuntu-latest 10 | name: Check if KSM selectors are present on applicable metrics. 11 | steps: 12 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 13 | with: 14 | persist-credentials: false 15 | - run: make --always-make check-selectors-ksm 16 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: ci 2 | permissions: {} 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | jobs: 9 | matrix: 10 | runs-on: ubuntu-latest 11 | name: ${{ matrix.name }} 12 | strategy: 13 | fail-fast: false 14 | matrix: 15 | include: 16 | - name: Lint Alerts 17 | run: make --always-make alerts-lint 18 | - name: Generate YAML 19 | run: make --always-make generate && git diff --exit-code 20 | - name: Lint Grafana Dashboards 21 | run: make --always-make dashboards-lint 22 | - name: Format JSONNET 23 | run: make --always-make jsonnet-fmt && git diff --exit-code 24 | - name: Lint JSONNET 25 | run: make --always-make jsonnet-lint 26 | - name: Format MD 27 | run: make --always-make markdownfmt && git diff --exit-code 28 | - name: Lint MD 29 | run: make --always-make vale && git diff --exit-code 30 | - name: Lint YAML 31 | run: make --always-make pint-lint 32 | - name: Run unit tests 33 | run: make --always-make test 34 | 35 | steps: 36 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 37 | with: 38 | persist-credentials: false 39 | - uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # v5.5.0 40 | with: 41 | go-version-file: scripts/go.mod 42 | cache-dependency-path: scripts/go.sum 43 | - run: ${{ matrix.run }} 44 | -------------------------------------------------------------------------------- /.github/workflows/release.yaml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - "version-*" # Trigger the workflow on push events to version-* tags 7 | 8 | permissions: 9 | contents: write 10 | 11 | jobs: 12 | release: 13 | name: Release 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Create release on kubernetes-mixin 17 | uses: softprops/action-gh-release@da05d552573ad5aba039eaac05058a918a7bf631 # v2.2.2 18 | env: 19 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 20 | with: 21 | 
tag_name: ${{ github.ref_name }} 22 | repository: kubernetes-monitoring/kubernetes-mixin 23 | generate_release_notes: true 24 | -------------------------------------------------------------------------------- /.github/workflows/stale.yml: -------------------------------------------------------------------------------- 1 | name: Check whether issues or PRs need attention 2 | on: 3 | workflow_dispatch: {} 4 | schedule: 5 | - cron: "0 0 * * *" 6 | permissions: 7 | issues: write 8 | pull-requests: write 9 | jobs: 10 | stale: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0 14 | with: 15 | days-before-stale: 30 16 | days-before-close: 7 17 | stale-issue-message: | 18 | This issue has not had any activity in the past 30 days, so the 19 | `stale` label has been added to it. 20 | 21 | * The `stale` label will be removed if there is new activity 22 | * The issue will be closed in 7 days if there is no new activity 23 | * Add the `keepalive` label to exempt this issue from the stale check action 24 | 25 | Thank you for your contributions! 26 | stale-pr-message: | 27 | This PR has been automatically marked as stale because it has not 28 | had any activity in the past 30 days. 29 | 30 | The next time this stale check runs, the stale label will be 31 | removed if there is new activity. The issue will be closed in 7 32 | days if there is no new activity. 33 | 34 | Thank you for your contributions! 35 | stale-issue-label: stale 36 | stale-pr-label: stale 37 | exempt-issue-labels: keepalive 38 | exempt-pr-labels: keepalive 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | prometheus_alerts.yaml 2 | prometheus_rules.yaml 3 | dashboards_out 4 | vendor 5 | jsonnetfile.lock.json 6 | tmp 7 | .vale 8 | -------------------------------------------------------------------------------- /.lint: -------------------------------------------------------------------------------- 1 | exclusions: 2 | template-job-rule: 3 | template-instance-rule: 4 | target-job-rule: 5 | target-instance-rule: 6 | panel-title-description-rule: 7 | panel-units-rule: 8 | panel-datasource-rule: 9 | reason: The new Grafonnet promotes the use of datasources at the query level. This should probably end up in the linter as a valid option. 10 | -------------------------------------------------------------------------------- /.vale.ini: -------------------------------------------------------------------------------- 1 | StylesPath = .vale/styles 2 | 3 | MinAlertLevel = error 4 | 5 | Packages = Readability, write-good, alex 6 | 7 | [*] 8 | BasedOnStyles = Readability, write-good, alex 9 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | # https://help.github.com/articles/about-codeowners/ 2 | 3 | # These owners will be the default owners for everything in the repo. Unless a 4 | # later match takes precedence, they will be requested for review when someone 5 | # opens a pull request. 
6 | * @povilasv @skl 7 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. 6 | 7 | We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community. 8 | 9 | ## Our Standards 10 | 11 | Examples of behavior that contributes to a positive environment for our community include: 12 | 13 | * Demonstrating empathy and kindness toward other people 14 | * Being respectful of differing opinions, viewpoints, and experiences 15 | * Giving and gracefully accepting constructive feedback 16 | * Accepting responsibility and apologizing to those affected by our mistakes, and learning from the experience 17 | * Focusing on what is best not just for us as individuals, but for the overall community 18 | 19 | Examples of unacceptable behavior include: 20 | 21 | * The use of sexualized language or imagery, and sexual attention or advances of any kind 22 | * Trolling, insulting or derogatory comments, and personal or political attacks 23 | * Public or private harassment 24 | * Publishing others' private information, such as a physical or email address, without their explicit permission 25 | * Other conduct which could reasonably be considered inappropriate in a professional setting 26 | 27 | ## Enforcement Responsibilities 28 | 29 | Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, and will communicate reasons for moderation decisions when appropriate. 32 | 33 | ## Scope 34 | 35 | This Code of Conduct applies within all community spaces, and also applies when an individual is officially representing the community in public spaces. Examples of representing our community include using an official e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. 36 | 37 | ## Enforcement 38 | 39 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at kubernetes-mixin-security@googlegroups.com. All complaints will be reviewed and investigated promptly and fairly. 40 | 41 | All community leaders are obligated to respect the privacy and security of the reporter of any incident. 42 | 43 | ## Enforcement Guidelines 44 | 45 | Community leaders will follow these Community Impact Guidelines in determining the consequences for any action they deem in violation of this Code of Conduct: 46 | 47 | ### 1. 
Correction 48 | 49 | **Community Impact**: Use of inappropriate language or other behavior deemed unprofessional or unwelcome in the community. 50 | 51 | **Consequence**: A private, written warning from community leaders, providing clarity around the nature of the violation and an explanation of why the behavior was inappropriate. A public apology may be requested. 52 | 53 | ### 2. Warning 54 | 55 | **Community Impact**: A violation through a single incident or series of actions. 56 | 57 | **Consequence**: A warning with consequences for continued behavior. No interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, for a specified period of time. This includes avoiding interactions in community spaces as well as external channels like social media. Violating these terms may lead to a temporary or permanent ban. 58 | 59 | ### 3. Temporary Ban 60 | 61 | **Community Impact**: A serious violation of community standards, including sustained inappropriate behavior. 62 | 63 | **Consequence**: A temporary ban from any sort of interaction or public communication with the community for a specified period of time. No public or private interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, is allowed during this period. Violating these terms may lead to a permanent ban. 64 | 65 | ### 4. Permanent Ban 66 | 67 | **Community Impact**: Demonstrating a pattern of violation of community standards, including sustained inappropriate behavior, harassment of an individual, or aggression toward or disparagement of classes of individuals. 68 | 69 | **Consequence**: A permanent ban from any sort of public interaction within the community. 70 | 71 | ## Attribution 72 | 73 | This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 2.0, available at https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 74 | 75 | Community Impact Guidelines were inspired by [Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/diversity). 76 | 77 | For answers to common questions about this code of conduct, see the FAQ at https://www.contributor-covenant.org/faq. Translations are available at https://www.contributor-covenant.org/translations. 78 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Thank you for taking an interest in the project! We welcome all manner of contributions that are within the bounds of the project's [code of conduct](CODE_OF_CONDUCT.md). 4 | 5 | #### **Did you find a bug?** 6 | 7 | * **Do not open up a GitHub issue if the bug is a security vulnerability**; instead, refer to our [security policy](SECURITY.md). 8 | 9 | * **Ensure the bug was not already reported** by searching on GitHub under [Issues](https://github.com/kubernetes-monitoring/kubernetes-mixin/issues). 10 | 11 | * If you're unable to find an open issue addressing the problem, [open a new one](https://github.com/kubernetes-monitoring/kubernetes-mixin/issues/new). Be sure to include a **title and clear description**, as much relevant information as possible, and a **`jsonnet` snippet**, if applicable, as well as an optional **visual sample** demonstrating the expected behavior that is not occurring.
12 | 13 | * Whenever possible, use the relevant bug report templates to create the issue. 14 | 15 | #### **Did you write a patch that fixes a bug?** 16 | 17 | * Open a new GitHub pull request with the patch. 18 | 19 | * Ensure the PR description describes the problem **and** solution. Include the relevant issue number if applicable. 20 | 21 | * Before submitting, please make sure the pull request template is filled out correctly. 22 | 23 | #### **Do you intend to add a new feature or change an existing one?** 24 | 25 | * Suggest your change in [#monitoring-mixins](https://kubernetes.slack.com/archives/CAX9GU941) and start writing code. While doing so, please reflect on: 26 | * Is your feature request related to a problem? Please describe the necessity for the change. 27 | * Describe the solution you're proposing. Please provide any relevant context. 28 | * Add any other context (for example, any workarounds, code snippets, visual aids, etc.), if applicable. 29 | 30 | * Do not open an issue on GitHub until you have collected positive feedback about the change. GitHub issues are primarily intended for bug reports and fixes. 31 | 32 | #### **Do you have questions about the source code?** 33 | 34 | * Ask any question about how to use the `kubernetes-mixin` project in the [#monitoring-mixins](https://kubernetes.slack.com/archives/CAX9GU941) channel. 35 | 36 | --- 37 | 38 | `kubernetes-mixin` is a volunteer effort. We encourage you to pitch in and join [the team](https://github.com/kubernetes-monitoring/kubernetes-mixin/graphs/contributors)! 39 | -------------------------------------------------------------------------------- /DESIGN.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Prometheus Monitoring Mixins 4 | 5 | ## Using jsonnet to package together dashboards, alerts and exporters. 6 | 7 | Status: Draft. Authors: Tom Wilkie, Grafana Labs; Frederic Branczyk, Red Hat. 8 | 9 | In this design doc we present a technique for packaging and deploying "Monitoring Mixins" - extensible and customisable combinations of dashboards, alert definitions and exporters. 10 | 11 | ## Problem 12 | 13 | [Prometheus](#Notes) offers powerful open source monitoring and alerting - but that comes with higher degrees of freedom, making pre-configured monitoring configurations hard to build. Simultaneously, it has become accepted wisdom that the developers of a given software package are best placed to operate said software, or at least construct the basic monitoring configuration. 14 | 15 | This work aims to build on Julius Volz' document ["Prometheus Alerting and Dashboard Example Bundles"](#Notes) and subsequent PR ["Add initial node-exporter example bundle"](#Notes). In particular, we support the hypothesis that for Prometheus to gain increased traction we will need to appeal to non-monitoring-experts, and allow for a relatively seamless pre-configured monitoring experience. Where we disagree is around standardization: we do not want to prescribe a given label schema, example deployment or topology. That being said, a lot of the challenges surfaced in that doc are shared here. 16 | 17 | ## Aims 18 | 19 | This solution aims to define a minimal standard for how to package together Prometheus alerts, Prometheus recording rules and [Grafana](#Notes) dashboards in a way that is: 20 | 21 | **Easy to install and use, platform agnostic.** The users of these packages are unlikely to be monitoring experts. These packages must be easily installable with a few commands.
And they must be general enough to work in all the environments where Prometheus can work: we're not just trying to build for Kubernetes here. That being said, the experience will be first class on Kubernetes. 22 | 23 | **Hosted alongside the programs which expose Prometheus metrics.** More often than not, the best people to build the alerting rules and dashboards for a given application are the authors of that application. And if that is not the case, then at least users of a given application will look to its source for monitoring best practices. We aim to provide a packaging method which allows the repo hosting the application source to also host the application's monitoring package, allowing them to be versioned alongside the application. For example, we envisage the monitoring mixin for Etcd to live in the etcd repo and the monitoring package for Hashicorp's Consul to live in the [consul_exporter](#Notes) repo. 24 | 25 | **We want the ability to iterate and collaborate on packages.** A challenge with the existing published dashboards and alerts is that they are static: the only way to use them is to copy them into your codebase and edit them to make them fit your deployment. This makes it hard for users to contribute changes back to the original author, and it makes it impossible to download new, improved versions and stay up to date with improvements. We want these packages to be constantly evolving; we want to encourage drive-by commits. 26 | 27 | **Packages should be reusable, configurable and extensible.** Users should be able to configure the packages to fit their deployments and label schema without modifying the packages. Users should be able to extend the packages with extra dashboard panels and extra alerts, without having to copy, paste and modify them. The packages must be configurable so that they support the many different label schemes used today by different organisations. 28 | 29 | ## Proposal 30 | 31 | **Monitoring Mixins.** A monitoring mixin is a package of configuration containing Prometheus alerts, Prometheus recording rules and Grafana dashboards. Mixins will be maintained in version controlled repos (e.g. git) as a set of files. Versioning of mixins will be provided by the version control system; mixins themselves should not contain multiple versions. 32 | 33 | Mixins are intended just for the combination of Prometheus and Grafana, and not other monitoring or visualisation systems. Mixins are intended to be opinionated about the choice of monitoring technology. 34 | 35 | Mixins should not, however, be opinionated about how this configuration should be deployed; they should not contain manifests for deploying Prometheus and Grafana on Kubernetes, for instance. Multiple, separate projects can and should exist to help deploy mixins; we will provide an example of how to do this on Kubernetes, and a tool for integrating with traditional config management systems. 36 | 37 | **Jsonnet.** We propose the use of [jsonnet](#Notes), a configuration language from Google, as the basis of our monitoring mixins. Jsonnet has some popularity in this space, as it is used in the [ksonnet](#Notes) project for achieving similar goals for Kubernetes. 38 | 39 | Jsonnet offers the ability to parameterise configuration, allowing for basic customisation. Furthermore, in Jsonnet one can reference another part of the data structure, reducing repetition.
For example, with jsonnet one can specify a default job name, and then have all the alerts use that: 40 | 41 | ``` 42 | { 43 | _config+:: { 44 | kubeStateMetricsSelector: 'job="default/kube-state-metrics"', 45 | 46 | allowedNotReadyPods: 0, 47 | }, 48 | 49 | groups+: [ 50 | { 51 | name: "kubernetes", 52 | rules: [ 53 | { 54 | alert: "KubePodNotReady", 55 | expr: ||| 56 | sum by (namespace, pod) ( 57 | kube_pod_status_phase{%(kubeStateMetricsSelector)s, phase!~"Running|Succeeded"} 58 | ) > %(allowedNotReadyPods)s 59 | ||| % $._config, 60 | "for": "1h", 61 | labels: { 62 | severity: "critical", 63 | }, 64 | annotations: { 65 | message: "{{ $labels.namespace }}/{{ $labels.pod }} is not ready.", 66 | }, 67 | }, 68 | ], 69 | }, 70 | ], 71 | } 72 | ``` 73 | 74 | **Configuration.** We'd like to suggest some standardisation of how configuration is supplied to mixins. A top level `_config` dictionary should be provided, containing various parameters for substitution into alerts and dashboards. In the above example, this is used to specify the selector for the kube-state-metrics pod, and the threshold for the alert. 75 | 76 | **Extension.** One of jsonnet's basic operations is to "merge" data structures - this also allows you to extend existing configurations. For example, given an existing dashboard: 77 | 78 | ``` 79 | local g = import "klumps/lib/grafana.libsonnet"; 80 | 81 | { 82 | dashboards+:: { 83 | "foo.json": g.dashboard("Foo") 84 | .addRow( 85 | g.row("Foo") 86 | .addPanel( 87 | g.panel("Bar") + 88 | g.queryPanel('irate(foo_bar_total[1m])', 'Foo Bar') 89 | ) 90 | ) 91 | }, 92 | } 93 | ``` 94 | 95 | It is relatively easy to import it and add extra rows: 96 | 97 | ``` 98 | local g = import "foo.libsonnet"; 99 | 100 | { 101 | dashboards+:: { 102 | "foo.json"+: 103 | super.addRow( 104 | g.row("A new row") 105 | .addPanel( 106 | g.panel("A new panel") + 107 | g.queryPanel('irate(new_total[1m])', 'New') 108 | ) 109 | ) 110 | }, 111 | } 112 | ``` 113 | 114 | These abilities offered by jsonnet are key to being able to separate out "upstream" alerts and dashboards from customizations, and keep upstream in sync with the source of the mixin. 115 | 116 | **Higher Order Abstractions.** jsonnet is a functional programming language, and as such allows you to build higher order abstractions over your configuration.
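For instance, a small helper can stamp out the same alert at several thresholds with different severities. The following is a hedged sketch in that spirit; the `alertsAtThresholds` helper, its metric and its thresholds are purely illustrative and not part of any existing mixin:

```
// Generate one alerting rule per (severity, threshold) pair from a single
// expression template containing a %(threshold)s placeholder.
local alertsAtThresholds(name, expr, thresholds) = [
  {
    alert: name,
    // Substitute the severity-specific threshold into the expression.
    expr: expr % { threshold: t.threshold },
    "for": "15m",
    labels: { severity: t.severity },
  }
  for t in thresholds
];

{
  groups+: [{
    name: "node_disk_alerts",
    rules: alertsAtThresholds(
      "NodeDiskRunningFull",
      "node_filesystem_avail_bytes / node_filesystem_size_bytes < %(threshold)s",
      [
        { severity: "warning", threshold: 0.2 },
        { severity: "critical", threshold: 0.1 },
      ]
    ),
  }],
}
```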
Similarly, you can build functions to generate recording rules for a set of percentiles and label aggregations, given a histogram: 117 | 118 | ``` 119 | local histogramRules(metric, labels) = 120 | local vars = { 121 | metric: metric, 122 | labels_underscore: std.join("_", labels), 123 | labels_comma: std.join(", ", labels), 124 | }; 125 | [ 126 | { 127 | record: "%(labels_underscore)s:%(metric)s:99quantile" % vars, 128 | expr: "histogram_quantile(0.99, sum(rate(%(metric)s_bucket[5m])) by (le, 129 | %(labels_comma)s))" % vars, 130 | }, 131 | { 132 | record: "%(labels_underscore)s:%(metric)s:50quantile" % vars, 133 | expr: "histogram_quantile(0.50, sum(rate(%(metric)s_bucket[5m])) by (le, 134 | %(labels_comma)s))" % vars, 135 | }, 136 | { 137 | record: "%(labels_underscore)s:%(metric)s:avg" % vars, 138 | expr: "sum(rate(%(metric)s_sum[5m])) by (%(labels_comma)s) / 139 | sum(rate(%(metric)s_count[5m])) by (%(labels_comma)s)" % vars, 140 | }, 141 | ]; 142 | 143 | { 144 | groups+: [{ 145 | name: "frontend_rules", 146 | rules: 147 | histogramRules("frontend_request_duration_seconds", ["job"]) + 148 | histogramRules("frontend_request_duration_seconds", ["job", "route"]), 149 | }], 150 | } 151 | ``` 152 | 153 | Other potential examples include functions like the one sketched above, which generate alerts at different thresholds, emitting both warning and critical severities. 154 | 155 | **[Grafonnet](#Notes).** An emerging pattern in the jsonnet ecosystem is the existence of libraries of helper functions to generate objects for a given system. For example, ksonnet is a library to generate objects for the Kubernetes object model. Grafonnet is a library for generating Grafana Dashboards using jsonnet. We envisage a series of libraries, such as Grafonnet, to help people build mixins. As such, any system for installing mixins needs to deal with transitive dependencies. 156 | 157 | **Package Management.** The current proofs of concept for mixins (see below) use the new package manager [jsonnet-bundler](#Notes), enabling the following workflow: 158 | 159 | ``` 160 | $ jb install kausal github.com/kausalco/public/consul-mixin 161 | ``` 162 | 163 | This downloads a copy of the mixin into `vendor/consul-mixin` and allows users to include the mixin in their ksonnet config like so: 164 | 165 | ``` 166 | local prometheus = import "prometheus-ksonnet/prometheus-ksonnet.libsonnet"; 167 | local consul_mixin = import "consul-mixin/mixin.libsonnet"; 168 | 169 | prometheus + consul_mixin { 170 | _config+:: { 171 | namespace: "default", 172 | }, 173 | } 174 | ``` 175 | 176 | This example also uses the prometheus-ksonnet package from [Kausal](#Notes), which understands the structure of the mixins and manifests alerting rules, recording rules and dashboards as config maps in Kubernetes, mounted into the Kubernetes pods in the correct place. 177 | 178 | However, we think this is a wider problem than just monitoring mixins, and are exploring designs for a generic jsonnet package manager in a [separate design doc](#Notes). 179 | 180 | **Proposed Schema.** To allow multiple tools to utilise mixins, we must agree on some common naming. The proposal is that a mixin is a single dictionary containing three keys: 181 | 182 | - `grafanaDashboards` A dictionary of dashboard file name (`foo.json`) to dashboard JSON. 183 | - `prometheusAlerts` A list of Prometheus alert groups. 184 | - `prometheusRules` A list of Prometheus rule groups. 185 | 186 | Each of these values will be expressed as jsonnet objects - not strings. It is the responsibility of the tool consuming the mixin to render these out as JSON or YAML. Jsonnet scripts to do this for you will be provided.
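For illustration, such a rendering script might look roughly like the following (a hedged sketch, assuming a `mixin.libsonnet` entry point that exposes the keys described above; it is not the tooling the project will ship):

```
// Render the mixin's alert groups as a YAML rule file for Prometheus.
// Evaluate with string output, e.g.: jsonnet -S render-alerts.jsonnet
local mixin = import "mixin.libsonnet";

std.manifestYamlDoc({ groups: mixin.prometheusAlerts })
```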
The proposed structure itself looks like this: 187 | 188 | ``` 189 | { 190 | grafanaDashboards+:: { 191 | "dashboard-name.json": {...}, 192 | }, 193 | prometheusAlerts+:: [...], 194 | prometheusRules+:: [...], 195 | } 196 | ``` 197 | 198 | **Consuming a mixin.** 199 | 200 | - TODO examples of how we expect people to install, customise and extend mixins. 201 | - TODO Ability to manifest out jsonnet configuration in a variety of formats - YAML, JSON, INI, etc. 202 | - TODO show how it works with ksonnet but also with something like Puppet. 203 | 204 | **Examples & Proofs of Concept.** We will probably put the specification and list of known mixins in a repo somewhere, as a readme. For now, these are the known mixins and related projects: 205 | 206 | | Application | Mixin | Author | 207 | |------------------|--------------------|--------------------------------| 208 | | CoreOS Etcd | etcd-mixin | Grapeshot / Tom Wilkie | 209 | | Cassandra | TBD | Grafana Labs | 210 | | Hashicorp Consul | consul-mixin | Kausal | 211 | | Hashicorp Vault | vault_exporter | Grapeshot / Tom Wilkie | 212 | | Kubernetes | kubernetes-mixin | Tom Wilkie & Frederic Branczyk | 213 | | Kubernetes | kubernetes-grafana | Frederic Branczyk | 214 | | Kubernetes | kube-prometheus | Frederic Branczyk | 215 | | Prometheus | prometheus-ksonnet | Kausal | 216 | 217 | **Open Questions** 218 | 219 | - Some systems require exporters; can/should these be packaged as part of the mixin? Hard to do generally, easy to do for Kubernetes with ksonnet. 220 | - On the exporter topic, some systems need statsd_exporter mappings to be consistent with alerts and dashboards. Even if we can include statsd_exporter in the mixin, can we include the mappings? 221 | - A lot of questions from Julius' design are still open: how to deal with different aggregation windows, what labels to use on alerts, etc. 222 | 223 | 224 | ## Notes 225 | 226 | This was recreated from a [web.archive.org](https://web.archive.org/web/20211021151124/https://docs.google.com/document/d/1A9xvzwqnFVSOZ5fD3blKODXfsat5fg6ZhnKu9LK3lB4/edit) capture of the original document. 227 | 228 | The links in the archive do not work and have not been recreated. 229 | 230 | The license of this file is unknown, but judging by the intent it was meant to be shared freely. 231 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity.
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 
135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 
194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | BIN_DIR ?= $(shell pwd)/tmp/bin 2 | 3 | JSONNET_VENDOR=vendor 4 | GRAFANA_DASHBOARD_LINTER_BIN=$(BIN_DIR)/dashboard-linter 5 | JB_BIN=$(BIN_DIR)/jb 6 | JSONNET_BIN=$(BIN_DIR)/jsonnet 7 | JSONNETLINT_BIN=$(BIN_DIR)/jsonnet-lint 8 | JSONNETFMT_BIN=$(BIN_DIR)/jsonnetfmt 9 | MD_FILES = $(shell find . \( -type d -name '.vale' -o -type d -name 'vendor' \) -prune -o -type f -name "*.md" -print) 10 | MARKDOWNFMT_BIN=$(BIN_DIR)/markdownfmt 11 | VALE_BIN=$(BIN_DIR)/vale 12 | PROMTOOL_BIN=$(BIN_DIR)/promtool 13 | PINT_BIN=$(BIN_DIR)/pint 14 | TOOLING=$(JB_BIN) $(JSONNETLINT_BIN) $(JSONNET_BIN) $(JSONNETFMT_BIN) $(PROMTOOL_BIN) $(GRAFANA_DASHBOARD_LINTER_BIN) $(MARKDOWNFMT_BIN) $(VALE_BIN) $(PINT_BIN) 15 | JSONNETFMT_ARGS=-n 2 --max-blank-lines 2 --string-style s --comment-style s 16 | SRC_DIR ?=dashboards 17 | OUT_DIR ?=dashboards_out 18 | 19 | .PHONY: all 20 | all: fmt generate lint test 21 | 22 | .PHONY: generate 23 | generate: prometheus_alerts.yaml prometheus_rules.yaml $(OUT_DIR) 24 | 25 | $(JSONNET_VENDOR): $(JB_BIN) jsonnetfile.json 26 | $(JB_BIN) install 27 | 28 | .PHONY: fmt 29 | fmt: jsonnet-fmt markdownfmt 30 | 31 | .PHONY: jsonnet-fmt 32 | jsonnet-fmt: $(JSONNETFMT_BIN) 33 | @find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \ 34 | xargs -n 1 -- $(JSONNETFMT_BIN) $(JSONNETFMT_ARGS) -i 35 | 36 | .PHONY: markdownfmt 37 | markdownfmt: $(MARKDOWNFMT_BIN) 38 | @for file in $(MD_FILES); do $(MARKDOWNFMT_BIN) -w -gofmt $$file; done 39 | 40 | prometheus_alerts.yaml: $(JSONNET_BIN) mixin.libsonnet lib/alerts.jsonnet alerts/*.libsonnet 41 | @$(JSONNET_BIN) -J vendor -S lib/alerts.jsonnet > $@ 42 | 43 | prometheus_rules.yaml: $(JSONNET_BIN) mixin.libsonnet lib/rules.jsonnet rules/*.libsonnet 44 | @$(JSONNET_BIN) -J vendor -S lib/rules.jsonnet > $@ 45 | 46 | $(OUT_DIR): $(JSONNET_BIN) $(JSONNET_VENDOR) mixin.libsonnet lib/dashboards.jsonnet $(SRC_DIR)/*.libsonnet 47 | @mkdir -p $(OUT_DIR) 48 | @$(JSONNET_BIN) -J vendor -m $(OUT_DIR) lib/dashboards.jsonnet 49 | 50 | .PHONY: lint 51 | lint: jsonnet-lint alerts-lint dashboards-lint vale pint-lint 52 | 53 | .PHONY: jsonnet-lint 54 | jsonnet-lint: $(JSONNETLINT_BIN) $(JSONNET_VENDOR) 55 | @find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \ 56 | xargs -n 1 -- $(JSONNETLINT_BIN) -J vendor 57 | 58 | .PHONY: alerts-lint 59 | alerts-lint: $(PROMTOOL_BIN) prometheus_alerts.yaml prometheus_rules.yaml 60 | @$(PROMTOOL_BIN) check rules prometheus_rules.yaml 61 | @$(PROMTOOL_BIN) check rules prometheus_alerts.yaml 62 | 63 | $(OUT_DIR)/.lint: $(OUT_DIR) 64 | @cp .lint $@ 65 | 66 | .PHONY: dashboards-lint 67 | dashboards-lint: $(GRAFANA_DASHBOARD_LINTER_BIN) $(OUT_DIR)/.lint 68 | # Replace $$interval:$$resolution var with $$__rate_interval to make dashboard-linter happy. 
69 | @sed -i -e 's/$$interval:$$resolution/$$__rate_interval/g' $(OUT_DIR)/*.json 70 | @find $(OUT_DIR) -name '*.json' -print0 | xargs -n 1 -0 $(GRAFANA_DASHBOARD_LINTER_BIN) lint --strict 71 | 72 | .PHONY: vale 73 | vale: $(VALE_BIN) 74 | @$(VALE_BIN) sync && \ 75 | $(VALE_BIN) $(MD_FILES) 76 | 77 | .PHONY: pint-lint 78 | pint-lint: generate $(PINT_BIN) 79 | @# Pint will not exit with a non-zero status code if there are linting issues. 80 | @output=$$($(PINT_BIN) -n -o -l WARN lint prometheus_alerts.yaml prometheus_rules.yaml 2>&1); \ 81 | if [ -n "$$output" ]; then \ 82 | echo "\n$$output"; \ 83 | exit 1; \ 84 | fi 85 | 86 | .PHONY: clean 87 | clean: 88 | # Remove all files and directories ignored by git. 89 | git clean -Xfd . 90 | 91 | .PHONY: test 92 | test: $(PROMTOOL_BIN) prometheus_alerts.yaml prometheus_rules.yaml 93 | @$(PROMTOOL_BIN) test rules tests/*.yaml 94 | 95 | $(BIN_DIR): 96 | mkdir -p $(BIN_DIR) 97 | 98 | $(TOOLING): $(BIN_DIR) 99 | @echo Installing tools from scripts/tools.go 100 | @cd scripts && go list -e -mod=mod -tags tools -f '{{ range .Imports }}{{ printf "%s\n" .}}{{end}}' ./ | xargs -tI % go build -mod=mod -o $(BIN_DIR) % 101 | 102 | ######################################## 103 | # "check-with-upstream" workflow checks. 104 | ######################################## 105 | 106 | check-selectors-ksm: 107 | @./scripts/check-selectors-ksm.sh 108 | -------------------------------------------------------------------------------- /OWNERS: -------------------------------------------------------------------------------- 1 | # See the OWNERS docs: https://git.k8s.io/community/contributors/guide/owners.md 2 | 3 | approvers: 4 | - brancz 5 | - csmarchbanks 6 | - metalmatze 7 | - tomwilkie 8 | - s-urbaniak 9 | - povilasv 10 | - paulfantom 11 | 12 | reviewers: 13 | - brancz 14 | - csmarchbanks 15 | - metalmatze 16 | - tomwilkie 17 | - s-urbaniak 18 | - povilasv 19 | - paulfantom 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Prometheus Monitoring Mixin for Kubernetes 2 | 3 | [![ci](https://github.com/kubernetes-monitoring/kubernetes-mixin/actions/workflows/ci.yaml/badge.svg)](https://github.com/kubernetes-monitoring/kubernetes-mixin/actions/workflows/ci.yaml) 4 | 5 | > NOTE: This project is in a *pre-release* stage. Flags, configuration, behaviour and design may change significantly in following releases. 6 | 7 | A set of Grafana dashboards and Prometheus alerts for Kubernetes. 8 | 9 | ## Releases 10 | 11 | > Note: Releases up until `release-0.12` are maintained in their own branches. Changelogs are included in releases starting from [version-0.13.0](https://github.com/kubernetes-monitoring/kubernetes-mixin/releases/tag/version-0.13.0).
12 | 13 | | Release branch | Kubernetes Compatibility | Prometheus Compatibility | Kube-state-metrics Compatibility | 14 | |----------------|--------------------------|--------------------------|----------------------------------| 15 | | release-0.1 | v1.13 and before | | | 16 | | release-0.2 | v1.14.1 and before | v2.11.0+ | | 17 | | release-0.3 | v1.17 and before | v2.11.0+ | | 18 | | release-0.4 | v1.18 | v2.11.0+ | | 19 | | release-0.5 | v1.19 | v2.11.0+ | | 20 | | release-0.6 | v1.19+ | v2.11.0+ | | 21 | | release-0.7 | v1.19+ | v2.11.0+ | v1.x | 22 | | release-0.8 | v1.20+ | v2.11.0+ | v2.0+ | 23 | | release-0.9 | v1.20+ | v2.11.0+ | v2.0+ | 24 | | release-0.10 | v1.20+ | v2.11.0+ | v2.0+ | 25 | | release-0.11 | v1.23+ | v2.11.0+ | v2.0+ | 26 | | release-0.12 | v1.23+ | v2.11.0+ | v2.0+ | 27 | | release-0.13 | v1.23+ | v2.11.0+ | v2.0+ | 28 | | master | v1.26+ | v2.11.0+ | v2.0+ | 29 | 30 | Kubernetes 1.14 implemented a major [metrics overhaul](https://github.com/kubernetes/enhancements/issues/1206). Therefore v0.1.x of this repository is the last release to support Kubernetes 1.13 and previous versions on a best-effort basis. 31 | 32 | Some alerts now use Prometheus filters made available in Prometheus 2.11.0, which makes this version of Prometheus a dependency. 33 | 34 | Warning: This compatibility matrix was initially created based on experience; we do not guarantee compatibility, and the matrix may be updated based on new learnings. 35 | 36 | Warning: By default the expressions will generate *grafana 7.2+* compatible rules using the *$__rate_interval* variable for rate functions. If you need backward-compatible rules, please set *grafana72: false* in your *_config*. 37 | 38 | ### Release steps 39 | 40 | Maintainers can trigger the [release workflow](.github/workflows/release.yaml) by pushing a git tag that matches the pattern: `version-*`. 41 | 42 | 1. Check out the `master` branch and pull the latest changes. 43 | 44 | ```bash 45 | git checkout master 46 | ``` 47 | 48 | 2. Create a tag for the version following semantic versioning, and push it to trigger the release. 49 | 50 | ```bash 51 | # replace MAJOR.MINOR.PATCH with e.g. 1.2.3 52 | tag=version-MAJOR.MINOR.PATCH; git tag $tag && git push origin $tag 53 | ``` 54 | 55 | #### Decisions on backfilling releases 56 | 57 | We wanted to backfill `release-0.1` to `release-0.12` to have a changelog, but we were not able to use a GitHub action in a newer commit to trigger a release that generates a changelog on older commits. See #489 for the full discussion. 58 | 59 | ## Metrics Deprecation 60 | 61 | The following recording rule is marked deprecated. It will be removed in v2.0.0. 62 | 63 | ```bash 64 | node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate 65 | ``` 66 | 67 | It will be replaced by the following recording rule to preserve data points using `rate` and add `5m` to indicate the range of the rate query in the recording rule name. 68 | 69 | ```bash 70 | node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m 71 | ``` 72 | 73 | ## How to use 74 | 75 | This mixin is designed to be vendored into the repo with your infrastructure config. To do this, use [jsonnet-bundler](https://github.com/jsonnet-bundler/jsonnet-bundler). 76 | 77 | You then have three options for deploying your dashboards: 78 | 1. Generate the config files and deploy them yourself 79 | 2. Use ksonnet to deploy this mixin along with Prometheus and Grafana 80 | 3. Use prometheus-operator to deploy this mixin (TODO)
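Whichever option you pick, customisation follows the same pattern: import the mixin and override fields in `_config`. A minimal sketch, assuming the mixin has been vendored under `vendor/kubernetes-mixin` (the selector value is illustrative; see `config.libsonnet` for the available options):

```jsonnet
local kubernetes = import "kubernetes-mixin/mixin.libsonnet";

kubernetes {
  _config+:: {
    // Match the label scheme of your kube-state-metrics deployment.
    kubeStateMetricsSelector: 'job="kube-state-metrics"',
  },
}
```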
81 | 82 | ## Generate config files 83 | 84 | You can manually generate the alerts, dashboards and rules files, but first you must install some tools: 85 | 86 | ``` 87 | $ go install github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@latest 88 | $ brew install jsonnet 89 | ``` 90 | 91 | Then, grab the mixin and its dependencies: 92 | 93 | ``` 94 | $ git clone https://github.com/kubernetes-monitoring/kubernetes-mixin 95 | $ cd kubernetes-mixin 96 | $ jb install 97 | ``` 98 | 99 | Finally, build the mixin: 100 | 101 | ``` 102 | $ make prometheus_alerts.yaml 103 | $ make prometheus_rules.yaml 104 | $ make dashboards_out 105 | ``` 106 | 107 | The `prometheus_alerts.yaml` and `prometheus_rules.yaml` files then need to be passed to your Prometheus server, and the files in `dashboards_out` need to be imported into your Grafana server. The exact details will depend on how you deploy your monitoring stack to Kubernetes. 108 | 109 | ### Dashboards for Windows Nodes 110 | 111 | There are separate dashboards for Windows resources. 112 | 1) Compute Resources / Cluster(Windows) 113 | 2) Compute Resources / Namespace(Windows) 114 | 3) Compute Resources / Pod(Windows) 115 | 4) USE Method / Cluster(Windows) 116 | 5) USE Method / Node(Windows) 117 | 118 | These dashboards are based on metrics populated by [windows-exporter](https://github.com/prometheus-community/windows_exporter) from each Windows node. 119 | 120 | ## Running the tests 121 | 122 | ```sh 123 | make test 124 | ``` 125 | 126 | ## Using with prometheus-ksonnet 127 | 128 | Alternatively you can also use the mixin with [prometheus-ksonnet](https://github.com/kausalco/public/tree/master/prometheus-ksonnet), a [ksonnet](https://github.com/ksonnet/ksonnet) module to deploy a fully-fledged Prometheus-based monitoring system for Kubernetes: 129 | 130 | Make sure you have ksonnet v0.8.0: 131 | 132 | ``` 133 | $ brew install https://raw.githubusercontent.com/ksonnet/homebrew-tap/82ef24cb7b454d1857db40e38671426c18cd8820/ks.rb 134 | $ brew pin ks 135 | $ ks version 136 | ksonnet version: v0.8.0 137 | jsonnet version: v0.9.5 138 | client-go version: v1.6.8-beta.0+$Format:%h$ 139 | ``` 140 | 141 | In your config repo, if you don't have a ksonnet application, make a new one (this will copy credentials from the current context): 142 | 143 | ``` 144 | $ ks init 145 | $ cd 146 | $ ks env add default 147 | ``` 148 | 149 | Grab the prometheus-ksonnet module and its dependencies, which include the kubernetes-mixin: 150 | 151 | ``` 152 | $ go get github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb 153 | $ jb init 154 | $ jb install github.com/kausalco/public/prometheus-ksonnet 155 | ``` 156 | 157 | Assuming you want to run in the default namespace ('environment' in ksonnet parlance), add the following to the file `environments/default/main.jsonnet`: 158 | 159 | ```jsonnet 160 | local prometheus = import "prometheus-ksonnet/prometheus-ksonnet.libsonnet"; 161 | 162 | prometheus { 163 | _config+:: { 164 | namespace: "default", 165 | }, 166 | } 167 | ``` 168 | 169 | Apply your config: 170 | 171 | ``` 172 | $ ks apply default 173 | ``` 174 | 175 | ## Using prometheus-operator 176 | 177 | TODO 178 | 179 | ## Multi-cluster support 180 | 181 | Kubernetes-mixin can support dashboards across multiple clusters. You need either a multi-cluster [Thanos](https://github.com/improbable-eng/thanos) installation with `external_labels` configured or a [Cortex](https://github.com/cortexproject/cortex) system where a cluster label exists.
To enable this feature, you need to configure the following: 182 | 183 | ```jsonnet 184 | // Opt-in to multiCluster dashboards by overriding this and the clusterLabel. 185 | showMultiCluster: true, 186 | clusterLabel: '', 187 | ``` 188 | 189 | ## Customising the mixin 190 | 191 | Kubernetes-mixin allows you to override the selectors used for various jobs, to match those used in your Prometheus setup. You can also customise the dashboard names and add Grafana tags. 192 | 193 | In a new directory, add a file `mixin.libsonnet`: 194 | 195 | ```jsonnet 196 | local kubernetes = import "kubernetes-mixin/mixin.libsonnet"; 197 | 198 | kubernetes { 199 | _config+:: { 200 | kubeStateMetricsSelector: 'job="kube-state-metrics"', 201 | cadvisorSelector: 'job="kubernetes-cadvisor"', 202 | nodeExporterSelector: 'job="kubernetes-node-exporter"', 203 | kubeletSelector: 'job="kubernetes-kubelet"', 204 | grafanaK8s+:: { 205 | dashboardNamePrefix: 'Mixin / ', 206 | dashboardTags: ['kubernetes', 'infrastructure'], 207 | }, 208 | }, 209 | } 210 | ``` 211 | 212 | Then, install the kubernetes-mixin: 213 | 214 | ``` 215 | $ jb init 216 | $ jb install github.com/kubernetes-monitoring/kubernetes-mixin 217 | ``` 218 | 219 | Generate the alerts, rules and dashboards: 220 | 221 | ``` 222 | $ jsonnet -J vendor -S -e 'std.manifestYamlDoc((import "mixin.libsonnet").prometheusAlerts)' > alerts.yml 223 | $ jsonnet -J vendor -S -e 'std.manifestYamlDoc((import "mixin.libsonnet").prometheusRules)' > files/rules.yml 224 | $ jsonnet -J vendor -m files/dashboards -e '(import "mixin.libsonnet").grafanaDashboards' 225 | ``` 226 | 227 | ### Customising alert annotations 228 | 229 | The steps described below extend the existing mixin library without modifying the original git repository. This makes consuming updates to your extended alert definitions easier. These definitions can reside outside of this repository in your own custom location, where you can define your alert dependencies in your `jsonnetfile.json` and add customisations to the existing definitions. 230 | 231 | In your working directory, create a new file `kubernetes_mixin_override.libsonnet` with the following: 232 | 233 | ```jsonnet 234 | local utils = import 'lib/utils.libsonnet'; 235 | (import 'mixin.libsonnet') + 236 | ( 237 | { 238 | prometheusAlerts+:: 239 | // The specialAlerts can be in any other config file 240 | local slack = 'observability'; 241 | local specialAlerts = { 242 | KubePodCrashLooping: { slack_channel: slack }, 243 | KubePodNotReady: { slack_channel: slack }, 244 | }; 245 | 246 | local addExtraAnnotations(rule) = rule { 247 | [if 'alert' in rule then 'annotations']+: { 248 | dashboard: 'https://foo.bar.co', 249 | [if rule.alert in specialAlerts then 'slack_channel']: specialAlerts[rule.alert].slack_channel, 250 | }, 251 | }; 252 | utils.mapRuleGroups(addExtraAnnotations), 253 | } 254 | ) 255 | ``` 256 | 257 | Create a new file `lib/kubernetes_customised_alerts.jsonnet` with the following: 258 | 259 | ```jsonnet 260 | std.manifestYamlDoc((import '../kubernetes_mixin_override.libsonnet').prometheusAlerts) 261 | ``` 262 | 263 | Running `jsonnet -S lib/kubernetes_customised_alerts.jsonnet` will build the alerts with your customisations. 264 | 265 | The same result can be achieved by modifying the existing `config.libsonnet` with the content of `kubernetes_mixin_override.libsonnet`.
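For illustration, with the override above applied, one of the `specialAlerts` would render with annotations along these lines (a sketch of the expected YAML output; the `dashboard` URL and Slack channel are the example values from the snippet above):

```yaml
# Sketch of a rendered alert after the override is applied.
- alert: KubePodCrashLooping
  annotations:
    dashboard: https://foo.bar.co    # added to every alerting rule
    slack_channel: observability     # added only to alerts listed in specialAlerts
    # ...the alert's original annotations (description, summary, ...) are kept.
```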
266 | 267 | ## Background 268 | 269 | ### Alert Severities 270 | 271 | While the community has not yet fully agreed on alert severities and how they are to be used, this repository assumes the following paradigms when setting the severities: 272 | 273 | * Critical: An issue that needs to page a person to take instant action 274 | * Warning: An issue that needs to be worked on, but in the regular work queue or during office hours, rather than paging the on-call 275 | * Info: Is meant to support a troubleshooting process by informing about a non-normal situation for one or more systems, but not worth a page or ticket on its own. 276 | 277 | ### Architecture and Technical Decisions 278 | 279 | * For more motivation, see the "[The RED Method: How to instrument your services](https://kccncna17.sched.com/event/CU8K/the-red-method-how-to-instrument-your-services-b-tom-wilkie-kausal?iframe=no&w=100%&sidebar=yes&bg=no)" talk from CloudNativeCon Austin. 280 | * For more information about monitoring mixins, see this [design doc](DESIGN.md). 281 | 282 | ## Note 283 | 284 | You can use the external tool called [prom-metrics-check](https://github.com/ContainerSolutions/prom-metrics-check) to validate the created dashboards. This tool allows you to check if the metrics installed and used in Grafana dashboards exist in the Prometheus instance. 285 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Reporting a Vulnerability 4 | 5 | If you discover a security issue in this project, please report it to the project's [SECURITY_CONTACTS](SECURITY_CONTACTS). You can also privately ping the project's maintainers through the [Slack](https://kubernetes.slack.com/archives/CAX9GU941) channel. 6 | -------------------------------------------------------------------------------- /SECURITY_CONTACTS: -------------------------------------------------------------------------------- 1 | # Defined below are the security contacts for this repo. 2 | # 3 | # They are the contact point for the Product Security Committee to reach out 4 | # to for triaging and handling of incoming issues. 5 | # 6 | # The below names agree to abide by the 7 | # [Embargo Policy](https://git.k8s.io/security/private-distributors-list.md#embargo-policy) 8 | # and will be removed and replaced if they violate that agreement.
9 | # 10 | # DO NOT REPORT SECURITY VULNERABILITIES DIRECTLY TO THESE NAMES, FOLLOW THE 11 | # INSTRUCTIONS AT https://kubernetes.io/security/ 12 | 13 | brancz 14 | csmarchbanks 15 | metalmatze 16 | tomwilkie 17 | -------------------------------------------------------------------------------- /alerts/alerts.libsonnet: -------------------------------------------------------------------------------- 1 | (import 'apps_alerts.libsonnet') + 2 | (import 'resource_alerts.libsonnet') + 3 | (import 'storage_alerts.libsonnet') + 4 | (import 'system_alerts.libsonnet') + 5 | (import 'kube_apiserver.libsonnet') + 6 | (import 'kubelet.libsonnet') + 7 | (import 'kube_scheduler.libsonnet') + 8 | (import 'kube_controller_manager.libsonnet') + 9 | (import 'kube_proxy.libsonnet') + 10 | (import '../lib/add-runbook-links.libsonnet') 11 | -------------------------------------------------------------------------------- /alerts/kube_apiserver.libsonnet: -------------------------------------------------------------------------------- 1 | local utils = import '../lib/utils.libsonnet'; 2 | 3 | { 4 | _config+:: { 5 | kubeApiserverSelector: error 'must provide selector for kube-apiserver', 6 | 7 | certExpirationWarningSeconds: 7 * 24 * 3600, 8 | certExpirationCriticalSeconds: 1 * 24 * 3600, 9 | }, 10 | 11 | prometheusAlerts+:: { 12 | groups+: [ 13 | { 14 | name: 'kube-apiserver-slos', 15 | rules: [ 16 | { 17 | alert: 'KubeAPIErrorBudgetBurn', 18 | expr: ||| 19 | sum by(%s) (apiserver_request:burnrate%s) > (%.2f * %.5f) 20 | and on(%s) 21 | sum by(%s) (apiserver_request:burnrate%s) > (%.2f * %.5f) 22 | ||| % [ 23 | $._config.clusterLabel, 24 | w.long, 25 | w.factor, 26 | (1 - $._config.SLOs.apiserver.target), 27 | $._config.clusterLabel, 28 | $._config.clusterLabel, 29 | w.short, 30 | w.factor, 31 | (1 - $._config.SLOs.apiserver.target), 32 | ], 33 | labels: { 34 | severity: w.severity, 35 | short: '%(short)s' % w, 36 | long: '%(long)s' % w, 37 | }, 38 | annotations: { 39 | description: 'The API server is burning too much error budget%s.' % [ 40 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), 41 | ], 42 | summary: 'The API server is burning too much error budget.', 43 | }, 44 | 'for': '%(for)s' % w, 45 | } 46 | for w in $._config.SLOs.apiserver.windows 47 | ], 48 | }, 49 | { 50 | name: 'kubernetes-system-apiserver', 51 | rules: [ 52 | { 53 | alert: 'KubeClientCertificateExpiration', 54 | expr: ||| 55 | histogram_quantile(0.01, sum without (%(namespaceLabel)s, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{%(kubeApiserverSelector)s}[5m]))) < %(certExpirationWarningSeconds)s 56 | and 57 | on(job, %(clusterLabel)s, instance) apiserver_client_certificate_expiration_seconds_count{%(kubeApiserverSelector)s} > 0 58 | ||| % $._config, 59 | 'for': '5m', 60 | labels: { 61 | severity: 'warning', 62 | }, 63 | annotations: { 64 | description: 'A client certificate used to authenticate to kubernetes apiserver is expiring in less than %s%s.' 
% [ 65 | (utils.humanizeSeconds($._config.certExpirationWarningSeconds)), 66 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), 67 | ], 68 | summary: 'Client certificate is about to expire.', 69 | }, 70 | }, 71 | { 72 | alert: 'KubeClientCertificateExpiration', 73 | expr: ||| 74 | histogram_quantile(0.01, sum without (%(namespaceLabel)s, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{%(kubeApiserverSelector)s}[5m]))) < %(certExpirationCriticalSeconds)s 75 | and 76 | on(job, %(clusterLabel)s, instance) apiserver_client_certificate_expiration_seconds_count{%(kubeApiserverSelector)s} > 0 77 | ||| % $._config, 78 | 'for': '5m', 79 | labels: { 80 | severity: 'critical', 81 | }, 82 | annotations: { 83 | description: 'A client certificate used to authenticate to kubernetes apiserver is expiring in less than %s%s.' % [ 84 | (utils.humanizeSeconds($._config.certExpirationCriticalSeconds)), 85 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), 86 | ], 87 | summary: 'Client certificate is about to expire.', 88 | }, 89 | }, 90 | { 91 | alert: 'KubeAggregatedAPIErrors', 92 | expr: ||| 93 | sum by(%(clusterLabel)s, instance, name, reason)(increase(aggregator_unavailable_apiservice_total{%(kubeApiserverSelector)s}[1m])) > 0 94 | ||| % $._config, 95 | 'for': '10m', 96 | labels: { 97 | severity: 'warning', 98 | }, 99 | annotations: { 100 | description: 'Kubernetes aggregated API {{ $labels.instance }}/{{ $labels.name }} has reported {{ $labels.reason }} errors%s.' % [ 101 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), 102 | ], 103 | summary: 'Kubernetes aggregated API has reported errors.', 104 | }, 105 | }, 106 | { 107 | alert: 'KubeAggregatedAPIDown', 108 | expr: ||| 109 | (1 - max by(name, namespace, %(clusterLabel)s)(avg_over_time(aggregator_unavailable_apiservice{%(kubeApiserverSelector)s}[10m]))) * 100 < 85 110 | ||| % $._config, 111 | 'for': '5m', 112 | labels: { 113 | severity: 'warning', 114 | }, 115 | annotations: { 116 | description: 'Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}%% available over the last 10m%s.' % [ 117 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), 118 | ], 119 | summary: 'Kubernetes aggregated API is down.', 120 | }, 121 | }, 122 | (import '../lib/absent_alert.libsonnet') { 123 | componentName:: 'KubeAPI', 124 | selector:: $._config.kubeApiserverSelector, 125 | }, 126 | { 127 | alert: 'KubeAPITerminatedRequests', 128 | expr: ||| 129 | sum by(%(clusterLabel)s) (rate(apiserver_request_terminations_total{%(kubeApiserverSelector)s}[10m])) / ( sum by(%(clusterLabel)s) (rate(apiserver_request_total{%(kubeApiserverSelector)s}[10m])) + sum by(%(clusterLabel)s) (rate(apiserver_request_terminations_total{%(kubeApiserverSelector)s}[10m])) ) > 0.20 130 | ||| % $._config, 131 | labels: { 132 | severity: 'warning', 133 | }, 134 | annotations: { 135 | description: 'The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests%s.' 
% [ 136 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), 137 | ], 138 | summary: 'The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.', 139 | }, 140 | 'for': '5m', 141 | }, 142 | ], 143 | }, 144 | ], 145 | }, 146 | } 147 | -------------------------------------------------------------------------------- /alerts/kube_controller_manager.libsonnet: -------------------------------------------------------------------------------- 1 | { 2 | _config+:: { 3 | kubeControllerManagerSelector: error 'must provide selector for kube-controller-manager', 4 | }, 5 | 6 | prometheusAlerts+:: { 7 | groups+: [ 8 | { 9 | name: 'kubernetes-system-controller-manager', 10 | rules: [ 11 | (import '../lib/absent_alert.libsonnet') { 12 | componentName:: 'KubeControllerManager', 13 | selector:: $._config.kubeControllerManagerSelector, 14 | }, 15 | ], 16 | }, 17 | ], 18 | }, 19 | } 20 | -------------------------------------------------------------------------------- /alerts/kube_proxy.libsonnet: -------------------------------------------------------------------------------- 1 | { 2 | _config+:: { 3 | kubeProxySelector: error 'must provide selector for kube-proxy', 4 | }, 5 | 6 | prometheusAlerts+:: { 7 | groups+: [ 8 | { 9 | name: 'kubernetes-system-kube-proxy', 10 | rules: [ 11 | (import '../lib/absent_alert.libsonnet') { 12 | componentName:: 'KubeProxy', 13 | selector:: $._config.kubeProxySelector, 14 | }, 15 | ], 16 | }, 17 | ], 18 | }, 19 | } 20 | -------------------------------------------------------------------------------- /alerts/kube_scheduler.libsonnet: -------------------------------------------------------------------------------- 1 | { 2 | _config+:: { 3 | kubeSchedulerSelector: 'job="kube-scheduler"', 4 | }, 5 | 6 | prometheusAlerts+:: { 7 | groups+: [ 8 | { 9 | name: 'kubernetes-system-scheduler', 10 | rules: [ 11 | (import '../lib/absent_alert.libsonnet') { 12 | componentName:: 'KubeScheduler', 13 | selector:: $._config.kubeSchedulerSelector, 14 | }, 15 | ], 16 | }, 17 | ], 18 | }, 19 | } 20 | -------------------------------------------------------------------------------- /alerts/kubelet.libsonnet: -------------------------------------------------------------------------------- 1 | local utils = import '../lib/utils.libsonnet'; 2 | 3 | { 4 | _config+:: { 5 | kubeStateMetricsSelector: error 'must provide selector for kube-state-metrics', 6 | kubeletSelector: error 'must provide selector for kubelet', 7 | kubeNodeUnreachableIgnoreKeys: [ 8 | 'ToBeDeletedByClusterAutoscaler', 9 | 'cloud.google.com/impending-node-termination', 10 | 'aws-node-termination-handler/spot-itn', 11 | ], 12 | 13 | kubeletCertExpirationWarningSeconds: 7 * 24 * 3600, 14 | kubeletCertExpirationCriticalSeconds: 1 * 24 * 3600, 15 | 16 | // Evictions per second that will trigger an alert. The default value will trigger on any evictions. 17 | KubeNodeEvictionRateThreshold: 0.0, 18 | }, 19 | 20 | prometheusAlerts+:: { 21 | groups+: [ 22 | { 23 | name: 'kubernetes-system-kubelet', 24 | rules: [ 25 | { 26 | expr: ||| 27 | kube_node_status_condition{%(kubeStateMetricsSelector)s,condition="Ready",status="true"} == 0 28 | and on (%(clusterLabel)s, node) 29 | kube_node_spec_unschedulable{%(kubeStateMetricsSelector)s} == 0 30 | ||| % $._config, 31 | labels: { 32 | severity: 'warning', 33 | }, 34 | annotations: { 35 | description: '{{ $labels.node }} has been unready for more than 15 minutes%s.' 
% [ 36 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), 37 | ], 38 | summary: 'Node is not ready.', 39 | }, 40 | 'for': '15m', 41 | alert: 'KubeNodeNotReady', 42 | }, 43 | { 44 | alert: 'KubeNodePressure', 45 | expr: ||| 46 | kube_node_status_condition{%(kubeStateMetricsSelector)s,condition=~"(MemoryPressure|DiskPressure|PIDPressure)",status="true"} == 1 47 | and on (%(clusterLabel)s, node) 48 | kube_node_spec_unschedulable{%(kubeStateMetricsSelector)s} == 0 49 | ||| % $._config, 50 | labels: { 51 | severity: 'info', 52 | }, 53 | 'for': '10m', 54 | annotations: { 55 | description: '{{ $labels.node }}%s has active Condition {{ $labels.condition }}. This is caused by resource usage exceeding eviction thresholds.' % [ 56 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), 57 | ], 58 | summary: 'Node has an active Condition.', 59 | }, 60 | }, 61 | { 62 | expr: ||| 63 | (kube_node_spec_taint{%(kubeStateMetricsSelector)s,key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{%(kubeStateMetricsSelector)s,key=~"%(kubeNodeUnreachableIgnoreKeys)s"}) == 1 64 | ||| % $._config { 65 | kubeNodeUnreachableIgnoreKeys: std.join('|', super.kubeNodeUnreachableIgnoreKeys), 66 | }, 67 | labels: { 68 | severity: 'warning', 69 | }, 70 | annotations: { 71 | description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled%s.' % [ 72 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), 73 | ], 74 | summary: 'Node is unreachable.', 75 | }, 76 | 'for': '15m', 77 | alert: 'KubeNodeUnreachable', 78 | }, 79 | { 80 | alert: 'KubeletTooManyPods', 81 | // Some nodes have a capacity of 1 (like AWS's Fargate) and only exist while a pod is running on them. 82 | // We have to ignore this special node in the KubeletTooManyPods alert. 83 | expr: ||| 84 | ( 85 | max by (%(clusterLabel)s, instance) ( 86 | kubelet_running_pods{%(kubeletSelector)s} > 1 87 | ) 88 | * on (%(clusterLabel)s, instance) group_left(node) 89 | max by (%(clusterLabel)s, instance, node) ( 90 | kubelet_node_name{%(kubeletSelector)s} 91 | ) 92 | ) 93 | / on (%(clusterLabel)s, node) group_left() 94 | max by (%(clusterLabel)s, node) ( 95 | kube_node_status_capacity{%(kubeStateMetricsSelector)s, resource="pods"} != 1 96 | ) > 0.95 97 | ||| % $._config, 98 | 'for': '15m', 99 | labels: { 100 | severity: 'info', 101 | }, 102 | annotations: { 103 | description: "Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity%s." % [ 104 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), 105 | ], 106 | summary: 'Kubelet is running at capacity.', 107 | }, 108 | }, 109 | { 110 | alert: 'KubeNodeReadinessFlapping', 111 | expr: ||| 112 | sum(changes(kube_node_status_condition{%(kubeStateMetricsSelector)s,status="true",condition="Ready"}[15m])) by (%(clusterLabel)s, node) > 2 113 | and on (%(clusterLabel)s, node) 114 | kube_node_spec_unschedulable{%(kubeStateMetricsSelector)s} == 0 115 | ||| % $._config, 116 | 'for': '15m', 117 | labels: { 118 | severity: 'warning', 119 | }, 120 | annotations: { 121 | description: 'The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes%s.'
% [ 122 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), 123 | ], 124 | summary: 'Node readiness status is flapping.', 125 | }, 126 | }, 127 | { 128 | alert: 'KubeNodeEviction', 129 | expr: ||| 130 | sum(rate(kubelet_evictions{%(kubeletSelector)s}[15m])) by(%(clusterLabel)s, eviction_signal, instance) 131 | * on (%(clusterLabel)s, instance) group_left(node) 132 | max by (%(clusterLabel)s, instance, node) ( 133 | kubelet_node_name{%(kubeletSelector)s} 134 | ) 135 | > %(KubeNodeEvictionRateThreshold)s 136 | ||| % $._config, 137 | labels: { 138 | severity: 'info', 139 | }, 140 | 'for': '0s', 141 | annotations: { 142 | description: 'Node {{ $labels.node }}%s is evicting Pods due to {{ $labels.eviction_signal }}. Eviction occurs when eviction thresholds are crossed, typically caused by Pods exceeding RAM/ephemeral-storage limits.' % [ 143 | utils.ifShowMultiCluster($._config, ' on {{ $labels.%(clusterLabel)s }}' % $._config), 144 | ], 145 | summary: 'Node is evicting pods.', 146 | }, 147 | }, 148 | { 149 | alert: 'KubeletPlegDurationHigh', 150 | expr: ||| 151 | node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 152 | ||| % $._config, 153 | 'for': '5m', 154 | labels: { 155 | severity: 'warning', 156 | }, 157 | annotations: { 158 | description: 'The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}%s.' % [ 159 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), 160 | ], 161 | summary: 'Kubelet Pod Lifecycle Event Generator is taking too long to relist.', 162 | }, 163 | }, 164 | { 165 | alert: 'KubeletPodStartUpLatencyHigh', 166 | expr: ||| 167 | histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{%(kubeletSelector)s}[5m])) by (%(clusterLabel)s, instance, le)) * on(%(clusterLabel)s, instance) group_left(node) kubelet_node_name{%(kubeletSelector)s} > 60 168 | ||| % $._config, 169 | 'for': '15m', 170 | labels: { 171 | severity: 'warning', 172 | }, 173 | annotations: { 174 | description: 'Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}%s.' % [ 175 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), 176 | ], 177 | summary: 'Kubelet Pod startup latency is too high.', 178 | }, 179 | }, 180 | { 181 | alert: 'KubeletClientCertificateExpiration', 182 | expr: ||| 183 | kubelet_certificate_manager_client_ttl_seconds < %(kubeletCertExpirationWarningSeconds)s 184 | ||| % $._config, 185 | labels: { 186 | severity: 'warning', 187 | }, 188 | annotations: { 189 | description: 'Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}%s.' % [ 190 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), 191 | ], 192 | summary: 'Kubelet client certificate is about to expire.', 193 | }, 194 | }, 195 | { 196 | alert: 'KubeletClientCertificateExpiration', 197 | expr: ||| 198 | kubelet_certificate_manager_client_ttl_seconds < %(kubeletCertExpirationCriticalSeconds)s 199 | ||| % $._config, 200 | labels: { 201 | severity: 'critical', 202 | }, 203 | annotations: { 204 | description: 'Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}%s.' 
% [ 205 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), 206 | ], 207 | summary: 'Kubelet client certificate is about to expire.', 208 | }, 209 | }, 210 | { 211 | alert: 'KubeletServerCertificateExpiration', 212 | expr: ||| 213 | kubelet_certificate_manager_server_ttl_seconds < %(kubeletCertExpirationWarningSeconds)s 214 | ||| % $._config, 215 | labels: { 216 | severity: 'warning', 217 | }, 218 | annotations: { 219 | description: 'Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}%s.' % [ 220 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), 221 | ], 222 | summary: 'Kubelet server certificate is about to expire.', 223 | }, 224 | }, 225 | { 226 | alert: 'KubeletServerCertificateExpiration', 227 | expr: ||| 228 | kubelet_certificate_manager_server_ttl_seconds < %(kubeletCertExpirationCriticalSeconds)s 229 | ||| % $._config, 230 | labels: { 231 | severity: 'critical', 232 | }, 233 | annotations: { 234 | description: 'Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}%s.' % [ 235 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), 236 | ], 237 | summary: 'Kubelet server certificate is about to expire.', 238 | }, 239 | }, 240 | { 241 | alert: 'KubeletClientCertificateRenewalErrors', 242 | expr: ||| 243 | increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0 244 | ||| % $._config, 245 | labels: { 246 | severity: 'warning', 247 | }, 248 | 'for': '15m', 249 | annotations: { 250 | description: 'Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes)%s.' % [ 251 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), 252 | ], 253 | summary: 'Kubelet has failed to renew its client certificate.', 254 | }, 255 | }, 256 | { 257 | alert: 'KubeletServerCertificateRenewalErrors', 258 | expr: ||| 259 | increase(kubelet_server_expiration_renew_errors[5m]) > 0 260 | ||| % $._config, 261 | labels: { 262 | severity: 'warning', 263 | }, 264 | 'for': '15m', 265 | annotations: { 266 | description: 'Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes)%s.' % [ 267 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), 268 | ], 269 | summary: 'Kubelet has failed to renew its server certificate.', 270 | }, 271 | }, 272 | (import '../lib/absent_alert.libsonnet') { 273 | componentName:: 'Kubelet', 274 | selector:: $._config.kubeletSelector, 275 | }, 276 | ], 277 | }, 278 | ], 279 | }, 280 | } 281 | -------------------------------------------------------------------------------- /alerts/resource_alerts.libsonnet: -------------------------------------------------------------------------------- 1 | local utils = import '../lib/utils.libsonnet'; 2 | 3 | { 4 | _config+:: { 5 | kubeStateMetricsSelector: error 'must provide selector for kube-state-metrics', 6 | nodeExporterSelector: error 'must provide selector for node-exporter', 7 | namespaceSelector: null, 8 | prefixedNamespaceSelector: if self.namespaceSelector != null then self.namespaceSelector + ',' else '', 9 | 10 | // We alert when the aggregate (CPU, Memory) quota for all namespaces is 11 | // greater than the amount of the resources in the cluster.
We do however 12 | // allow you to overcommit if you wish. 13 | namespaceOvercommitFactor: 1.5, 14 | cpuThrottlingPercent: 25, 15 | cpuThrottlingSelector: '', 16 | // Set this selector for selecting namespaces that contain resources used for overprovisioning 17 | // See https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/FAQ.md#how-can-i-configure-overprovisioning-with-cluster-autoscaler 18 | // for more details. 19 | ignoringOverprovisionedWorkloadSelector: '', 20 | }, 21 | 22 | prometheusAlerts+:: { 23 | groups+: [ 24 | { 25 | name: 'kubernetes-resources', 26 | rules: [ 27 | { 28 | alert: 'KubeCPUOvercommit', 29 | labels: { 30 | severity: 'warning', 31 | }, 32 | annotations: { 33 | summary: 'Cluster has overcommitted CPU resource requests.', 34 | }, 35 | 'for': '10m', 36 | } + 37 | if $._config.showMultiCluster then { 38 | expr: ||| 39 | sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0 40 | and 41 | (sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0 42 | ||| % $._config, 43 | annotations+: { 44 | description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted CPU resource requests for Pods by {{ printf "%%.2f" $value }} CPU shares and cannot tolerate node failure.' % $._config, 45 | }, 46 | } else { 47 | expr: ||| 48 | sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0 49 | and 50 | (sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0 51 | ||| % $._config, 52 | annotations+: { 53 | description: 'Cluster has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.' % $._config, 54 | }, 55 | }, 56 | { 57 | alert: 'KubeMemoryOvercommit', 58 | labels: { 59 | severity: 'warning', 60 | }, 61 | annotations: { 62 | summary: 'Cluster has overcommitted memory resource requests.', 63 | }, 64 | 'for': '10m', 65 | } + 66 | if $._config.showMultiCluster then { 67 | expr: ||| 68 | sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0 69 | and 70 | (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0 71 | ||| % $._config, 72 | annotations+: { 73 | description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.'
% $._config, 74 | }, 75 | } else 76 | { 77 | expr: ||| 78 | sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0 79 | and 80 | (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0 81 | ||| % $._config, 82 | annotations+: { 83 | description: 'Cluster has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.', 84 | }, 85 | }, 86 | { 87 | alert: 'KubeCPUQuotaOvercommit', 88 | labels: { 89 | severity: 'warning', 90 | }, 91 | annotations: { 92 | summary: 'Cluster has overcommitted CPU resource requests.', 93 | }, 94 | 'for': '5m', 95 | } + 96 | if $._config.showMultiCluster then { 97 | expr: ||| 98 | sum(min without(resource) (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource=~"(cpu|requests.cpu)"})) by (%(clusterLabel)s) 99 | / 100 | sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) 101 | > %(namespaceOvercommitFactor)s 102 | ||| % $._config, 103 | annotations+: { 104 | description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted CPU resource requests for Namespaces.' % $._config, 105 | }, 106 | } else 107 | { 108 | expr: ||| 109 | sum(min without(resource) (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource=~"(cpu|requests.cpu)"})) 110 | / 111 | sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) 112 | > %(namespaceOvercommitFactor)s 113 | ||| % $._config, 114 | annotations+: { 115 | description: 'Cluster has overcommitted CPU resource requests for Namespaces.', 116 | }, 117 | }, 118 | { 119 | alert: 'KubeMemoryQuotaOvercommit', 120 | labels: { 121 | severity: 'warning', 122 | }, 123 | annotations: { 124 | summary: 'Cluster has overcommitted memory resource requests.', 125 | }, 126 | 'for': '5m', 127 | } + 128 | if $._config.showMultiCluster then { 129 | expr: ||| 130 | sum(min without(resource) (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource=~"(memory|requests.memory)"})) by (%(clusterLabel)s) 131 | / 132 | sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) 133 | > %(namespaceOvercommitFactor)s 134 | ||| % $._config, 135 | annotations+: { 136 | description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted memory resource requests for Namespaces.' 
% $._config, 137 | }, 138 | } else 139 | { 140 | expr: ||| 141 | sum(min without(resource) (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource=~"(memory|requests.memory)"})) 142 | / 143 | sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) 144 | > %(namespaceOvercommitFactor)s 145 | ||| % $._config, 146 | annotations+: { 147 | description: 'Cluster has overcommitted memory resource requests for Namespaces.', 148 | }, 149 | }, 150 | { 151 | alert: 'KubeQuotaAlmostFull', 152 | expr: ||| 153 | kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="used"} 154 | / ignoring(instance, job, type) 155 | (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard"} > 0) 156 | > 0.9 < 1 157 | ||| % $._config, 158 | 'for': '15m', 159 | labels: { 160 | severity: 'info', 161 | }, 162 | annotations: { 163 | description: 'Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota%s.' % [ 164 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), 165 | ], 166 | summary: 'Namespace quota is going to be full.', 167 | }, 168 | }, 169 | { 170 | alert: 'KubeQuotaFullyUsed', 171 | expr: ||| 172 | kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="used"} 173 | / ignoring(instance, job, type) 174 | (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard"} > 0) 175 | == 1 176 | ||| % $._config, 177 | 'for': '15m', 178 | labels: { 179 | severity: 'info', 180 | }, 181 | annotations: { 182 | description: 'Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota%s.' % [ 183 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), 184 | ], 185 | summary: 'Namespace quota is fully used.', 186 | }, 187 | }, 188 | { 189 | alert: 'KubeQuotaExceeded', 190 | expr: ||| 191 | kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="used"} 192 | / ignoring(instance, job, type) 193 | (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard"} > 0) 194 | > 1 195 | ||| % $._config, 196 | 'for': '15m', 197 | labels: { 198 | severity: 'warning', 199 | }, 200 | annotations: { 201 | description: 'Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota%s.' 
% [ 202 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), 203 | ], 204 | summary: 'Namespace quota has exceeded the limits.', 205 | }, 206 | }, 207 | { 208 | alert: 'CPUThrottlingHigh', 209 | expr: ||| 210 | sum(increase(container_cpu_cfs_throttled_periods_total{container!="", %(cadvisorSelector)s, %(cpuThrottlingSelector)s}[5m])) without (id, metrics_path, name, image, endpoint, job, node) 211 | / on (%(clusterLabel)s, %(namespaceLabel)s, pod, container, instance) group_left 212 | sum(increase(container_cpu_cfs_periods_total{%(cadvisorSelector)s, %(cpuThrottlingSelector)s}[5m])) without (id, metrics_path, name, image, endpoint, job, node) 213 | > ( %(cpuThrottlingPercent)s / 100 ) 214 | ||| % $._config, 215 | 'for': '15m', 216 | labels: { 217 | severity: 'info', 218 | }, 219 | annotations: { 220 | description: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}%s.' % [ 221 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), 222 | ], 223 | summary: 'Processes experience elevated CPU throttling.', 224 | }, 225 | }, 226 | ], 227 | }, 228 | ], 229 | }, 230 | } 231 | -------------------------------------------------------------------------------- /alerts/storage_alerts.libsonnet: -------------------------------------------------------------------------------- 1 | { 2 | _config+:: { 3 | kubeStateMetricsSelector: error 'must provide selector for kube-state-metrics', 4 | kubeletSelector: error 'must provide selector for kubelet', 5 | namespaceSelector: null, 6 | prefixedNamespaceSelector: if self.namespaceSelector != null then self.namespaceSelector + ',' else '', 7 | 8 | // We alert when a disk is expected to fill up in four days. Depending on 9 | // the data-set it might be useful to change the sampling-time for the 10 | // prediction 11 | volumeFullPredictionSampleTime: '6h', 12 | }, 13 | 14 | prometheusAlerts+:: { 15 | groups+: [ 16 | { 17 | name: 'kubernetes-storage', 18 | rules: [ 19 | { 20 | alert: 'KubePersistentVolumeFillingUp', 21 | expr: ||| 22 | ( 23 | kubelet_volume_stats_available_bytes{%(prefixedNamespaceSelector)s%(kubeletSelector)s} 24 | / 25 | kubelet_volume_stats_capacity_bytes{%(prefixedNamespaceSelector)s%(kubeletSelector)s} 26 | ) < 0.03 27 | and 28 | kubelet_volume_stats_used_bytes{%(prefixedNamespaceSelector)s%(kubeletSelector)s} > 0 29 | unless on(%(clusterLabel)s, namespace, persistentvolumeclaim) 30 | kube_persistentvolumeclaim_access_mode{%(prefixedNamespaceSelector)s access_mode="ReadOnlyMany"} == 1 31 | unless on(%(clusterLabel)s, namespace, persistentvolumeclaim) 32 | kube_persistentvolumeclaim_labels{%(prefixedNamespaceSelector)s%(pvExcludedSelector)s} == 1 33 | ||| % $._config, 34 | 'for': '1m', 35 | labels: { 36 | severity: 'critical', 37 | }, 38 | annotations: { 39 | description: 'The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} {{ with $labels.%(clusterLabel)s -}} on Cluster {{ . }} {{- end }} is only {{ $value | humanizePercentage }} free.' 
% $._config, 40 | summary: 'PersistentVolume is filling up.', 41 | }, 42 | }, 43 | { 44 | alert: 'KubePersistentVolumeFillingUp', 45 | expr: ||| 46 | ( 47 | kubelet_volume_stats_available_bytes{%(prefixedNamespaceSelector)s%(kubeletSelector)s} 48 | / 49 | kubelet_volume_stats_capacity_bytes{%(prefixedNamespaceSelector)s%(kubeletSelector)s} 50 | ) < 0.15 51 | and 52 | kubelet_volume_stats_used_bytes{%(prefixedNamespaceSelector)s%(kubeletSelector)s} > 0 53 | and 54 | predict_linear(kubelet_volume_stats_available_bytes{%(prefixedNamespaceSelector)s%(kubeletSelector)s}[%(volumeFullPredictionSampleTime)s], 4 * 24 * 3600) < 0 55 | unless on(%(clusterLabel)s, namespace, persistentvolumeclaim) 56 | kube_persistentvolumeclaim_access_mode{%(prefixedNamespaceSelector)s access_mode="ReadOnlyMany"} == 1 57 | unless on(%(clusterLabel)s, namespace, persistentvolumeclaim) 58 | kube_persistentvolumeclaim_labels{%(prefixedNamespaceSelector)s%(pvExcludedSelector)s} == 1 59 | ||| % $._config, 60 | 'for': '1h', 61 | labels: { 62 | severity: 'warning', 63 | }, 64 | annotations: { 65 | description: 'Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} {{ with $labels.%(clusterLabel)s -}} on Cluster {{ . }} {{- end }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.' % $._config, 66 | summary: 'PersistentVolume is filling up.', 67 | }, 68 | }, 69 | { 70 | alert: 'KubePersistentVolumeInodesFillingUp', 71 | expr: ||| 72 | ( 73 | kubelet_volume_stats_inodes_free{%(prefixedNamespaceSelector)s%(kubeletSelector)s} 74 | / 75 | kubelet_volume_stats_inodes{%(prefixedNamespaceSelector)s%(kubeletSelector)s} 76 | ) < 0.03 77 | and 78 | kubelet_volume_stats_inodes_used{%(prefixedNamespaceSelector)s%(kubeletSelector)s} > 0 79 | unless on(%(clusterLabel)s, namespace, persistentvolumeclaim) 80 | kube_persistentvolumeclaim_access_mode{%(prefixedNamespaceSelector)s access_mode="ReadOnlyMany"} == 1 81 | unless on(%(clusterLabel)s, namespace, persistentvolumeclaim) 82 | kube_persistentvolumeclaim_labels{%(prefixedNamespaceSelector)s%(pvExcludedSelector)s} == 1 83 | ||| % $._config, 84 | 'for': '1m', 85 | labels: { 86 | severity: 'critical', 87 | }, 88 | annotations: { 89 | description: 'The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} {{ with $labels.%(clusterLabel)s -}} on Cluster {{ . }} {{- end }} only has {{ $value | humanizePercentage }} free inodes.' 
% $._config, 90 | summary: 'PersistentVolumeInodes are filling up.', 91 | }, 92 | }, 93 | { 94 | alert: 'KubePersistentVolumeInodesFillingUp', 95 | expr: ||| 96 | ( 97 | kubelet_volume_stats_inodes_free{%(prefixedNamespaceSelector)s%(kubeletSelector)s} 98 | / 99 | kubelet_volume_stats_inodes{%(prefixedNamespaceSelector)s%(kubeletSelector)s} 100 | ) < 0.15 101 | and 102 | kubelet_volume_stats_inodes_used{%(prefixedNamespaceSelector)s%(kubeletSelector)s} > 0 103 | and 104 | predict_linear(kubelet_volume_stats_inodes_free{%(prefixedNamespaceSelector)s%(kubeletSelector)s}[%(volumeFullPredictionSampleTime)s], 4 * 24 * 3600) < 0 105 | unless on(%(clusterLabel)s, namespace, persistentvolumeclaim) 106 | kube_persistentvolumeclaim_access_mode{%(prefixedNamespaceSelector)s access_mode="ReadOnlyMany"} == 1 107 | unless on(%(clusterLabel)s, namespace, persistentvolumeclaim) 108 | kube_persistentvolumeclaim_labels{%(prefixedNamespaceSelector)s%(pvExcludedSelector)s} == 1 109 | ||| % $._config, 110 | 'for': '1h', 111 | labels: { 112 | severity: 'warning', 113 | }, 114 | annotations: { 115 | description: 'Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} {{ with $labels.%(clusterLabel)s -}} on Cluster {{ . }} {{- end }} is expected to run out of inodes within four days. Currently {{ $value | humanizePercentage }} of its inodes are free.' % $._config, 116 | summary: 'PersistentVolumeInodes are filling up.', 117 | }, 118 | }, 119 | { 120 | alert: 'KubePersistentVolumeErrors', 121 | expr: ||| 122 | kube_persistentvolume_status_phase{phase=~"Failed|Pending",%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s} > 0 123 | ||| % $._config, 124 | 'for': '5m', 125 | labels: { 126 | severity: 'critical', 127 | }, 128 | annotations: { 129 | description: 'The persistent volume {{ $labels.persistentvolume }} {{ with $labels.%(clusterLabel)s -}} on Cluster {{ . }} {{- end }} has status {{ $labels.phase }}.' % $._config, 130 | summary: 'PersistentVolume is having issues with provisioning.', 131 | }, 132 | }, 133 | ], 134 | }, 135 | ], 136 | }, 137 | } 138 | -------------------------------------------------------------------------------- /alerts/system_alerts.libsonnet: -------------------------------------------------------------------------------- 1 | local utils = import '../lib/utils.libsonnet'; 2 | 3 | { 4 | _config+:: { 5 | notKubeDnsCoreDnsSelector: 'job!~"kube-dns|coredns"', 6 | kubeApiserverSelector: 'job="kube-apiserver"', 7 | }, 8 | 9 | prometheusAlerts+:: { 10 | groups+: [ 11 | { 12 | name: 'kubernetes-system', 13 | rules: [ 14 | { 15 | alert: 'KubeVersionMismatch', 16 | expr: ||| 17 | count by (%(clusterLabel)s) (count by (git_version, %(clusterLabel)s) (label_replace(kubernetes_build_info{%(notKubeDnsCoreDnsSelector)s},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1 18 | ||| % $._config, 19 | 'for': '15m', 20 | labels: { 21 | severity: 'warning', 22 | }, 23 | annotations: { 24 | description: 'There are {{ $value }} different semantic versions of Kubernetes components running%s.' % [ 25 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), 26 | ], 27 | summary: 'Different semantic versions of Kubernetes components running.', 28 | }, 29 | }, 30 | { 31 | alert: 'KubeClientErrors', 32 | // Many clients use get requests to check the existence of objects, 33 | // this is normal and an expected error, therefore it should be 34 | // ignored in this alert. 
35 | expr: ||| 36 | (sum(rate(rest_client_requests_total{%(kubeApiserverSelector)s,code=~"5.."}[5m])) by (%(clusterLabel)s, instance, job, namespace) 37 | / 38 | sum(rate(rest_client_requests_total{%(kubeApiserverSelector)s}[5m])) by (%(clusterLabel)s, instance, job, namespace)) 39 | > 0.01 40 | ||| % $._config, 41 | 'for': '15m', 42 | labels: { 43 | severity: 'warning', 44 | }, 45 | annotations: { 46 | description: "Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors%s." % [ 47 | utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), 48 | ], 49 | summary: 'Kubernetes API server client is experiencing errors.', 50 | }, 51 | }, 52 | ], 53 | }, 54 | ], 55 | }, 56 | } 57 | -------------------------------------------------------------------------------- /config.libsonnet: -------------------------------------------------------------------------------- 1 | { 2 | _config+:: { 3 | SLOs: { 4 | apiserver: { 5 | days: 30, // The number of days we alert on burning too much error budget for. 6 | target: 0.99, // The target percentage of availability between 0-1. (0.99 = 99%, 0.999 = 99.9%) 7 | 8 | // Only change these windows when you really understand multi-burn-rate errors. 9 | // Even though you can change the days above (which will change availability calculations) 10 | // these windows will alert on a 30-day sliding window. We're looking into basing these windows on the given days too. 11 | windows: [ 12 | { severity: 'critical', 'for': '2m', long: '1h', short: '5m', factor: 14.4 }, 13 | { severity: 'critical', 'for': '15m', long: '6h', short: '30m', factor: 6 }, 14 | { severity: 'warning', 'for': '1h', long: '1d', short: '2h', factor: 3 }, 15 | { severity: 'warning', 'for': '3h', long: '3d', short: '6h', factor: 1 }, 16 | ], 17 | }, 18 | }, 19 | 20 | // Selectors are inserted between {} in Prometheus queries. 21 | cadvisorSelector: 'job="cadvisor"', 22 | kubeletSelector: 'job="kubelet"', 23 | kubeStateMetricsSelector: 'job="kube-state-metrics"', 24 | nodeExporterSelector: 'job="node-exporter"', 25 | kubeSchedulerSelector: 'job="kube-scheduler"', 26 | kubeControllerManagerSelector: 'job="kube-controller-manager"', 27 | kubeApiserverSelector: 'job="kube-apiserver"', 28 | kubeProxySelector: 'job="kube-proxy"', 29 | podLabel: 'pod', 30 | hostNetworkInterfaceSelector: 'device!~"veth.+"', 31 | hostMountpointSelector: 'mountpoint="/"', 32 | windowsExporterSelector: 'job="kubernetes-windows-exporter"', 33 | containerfsSelector: 'container!=""', 34 | 35 | // List of labels to join for different types of metrics 36 | // Only works if your environment has the labels kube_%s_labels (e.g. kube_pod_labels) available.
37 | common_join_labels: [], 38 | pods_join_labels: $._config.common_join_labels, 39 | statefulsets_join_labels: $._config.common_join_labels, 40 | deployments_join_labels: $._config.common_join_labels, 41 | daemonsets_join_labels: $._config.common_join_labels, 42 | horizontalpodautoscalers_join_labels: $._config.common_join_labels, 43 | jobs_join_labels: $._config.common_join_labels, 44 | 45 | // Grafana dashboard IDs are necessary for stable links for dashboards 46 | grafanaDashboardIDs: { 47 | 'apiserver.json': std.md5('apiserver.json'), 48 | 'cluster-total.json': std.md5('cluster-total.json'), 49 | 'controller-manager.json': std.md5('controller-manager.json'), 50 | 'k8s-resources-cluster.json': std.md5('k8s-resources-cluster.json'), 51 | 'k8s-resources-multicluster.json': std.md5('k8s-resources-multicluster.json'), 52 | 'k8s-resources-namespace.json': std.md5('k8s-resources-namespace.json'), 53 | 'k8s-resources-node.json': std.md5('k8s-resources-node.json'), 54 | 'k8s-resources-pod.json': std.md5('k8s-resources-pod.json'), 55 | 'k8s-resources-windows-cluster.json': std.md5('k8s-resources-windows-cluster.json'), 56 | 'k8s-resources-windows-namespace.json': std.md5('k8s-resources-windows-namespace.json'), 57 | 'k8s-resources-windows-pod.json': std.md5('k8s-resources-windows-pod.json'), 58 | 'k8s-resources-workload.json': std.md5('k8s-resources-workload.json'), 59 | 'k8s-resources-workloads-namespace.json': std.md5('k8s-resources-workloads-namespace.json'), 60 | 'k8s-windows-cluster-rsrc-use.json': std.md5('k8s-windows-cluster-rsrc-use.json'), 61 | 'k8s-windows-node-rsrc-use.json': std.md5('k8s-windows-node-rsrc-use.json'), 62 | 'kubelet.json': std.md5('kubelet.json'), 63 | 'namespace-by-pod.json': std.md5('namespace-by-pod.json'), 64 | 'namespace-by-workload.json': std.md5('namespace-by-workload.json'), 65 | 'persistentvolumesusage.json': std.md5('persistentvolumesusage.json'), 66 | 'pod-total.json': std.md5('pod-total.json'), 67 | 'proxy.json': std.md5('proxy.json'), 68 | 'scheduler.json': std.md5('scheduler.json'), 69 | 'workload-total.json': std.md5('workload-total.json'), 70 | }, 71 | 72 | // Support for Grafana 7.2+ `$__rate_interval` instead of `$__interval` 73 | grafana72: true, 74 | grafanaIntervalVar: if self.grafana72 then '$__rate_interval' else '$__interval', 75 | 76 | // Config for the Grafana dashboards in the Kubernetes Mixin 77 | grafanaK8s: { 78 | dashboardNamePrefix: 'Kubernetes / ', 79 | dashboardTags: ['kubernetes-mixin'], 80 | 81 | // For links between grafana dashboards, you need to tell us if your grafana 82 | // server is served under some non-root path. 83 | linkPrefix: '', 84 | 85 | // The default refresh time for all dashboards; defaults to 10s 86 | refresh: '10s', 87 | minimumTimeInterval: '1m', 88 | 89 | // Timezone for Grafana dashboards: UTC, browser, ... 90 | grafanaTimezone: 'UTC', 91 | }, 92 | 93 | // Opt-in to multiCluster dashboards by overriding this and the clusterLabel. 94 | showMultiCluster: false, 95 | clusterLabel: 'cluster', 96 | 97 | namespaceLabel: 'namespace', 98 | 99 | // Default datasource name 100 | datasourceName: 'default', 101 | 102 | // Datasource instance filter regex 103 | datasourceFilterRegex: '', 104 | 105 | // This list of filesystems is referenced in various expressions. 106 | fstypes: ['ext[234]', 'btrfs', 'xfs', 'zfs'], 107 | fstypeSelector: 'fstype=~"%s"' % std.join('|', self.fstypes), 108 | 109 | // This list of disk device names is referenced in various expressions.
110 | diskDevices: ['mmcblk.p.+', 'nvme.+', 'rbd.+', 'sd.+', 'vd.+', 'xvd.+', 'dm-.+', 'dasd.+'], 111 | diskDeviceSelector: 'device=~"(/dev.+)|%s"' % std.join('|', self.diskDevices), 112 | 113 | // Certain workloads (e.g. KubeVirt/CDI) will fully utilise the persistent volume they claim; 114 | // the size of the PV will never grow, since they consume the entirety of the volume by design. 115 | // This selector allows an admin to 'pre-mark' the PVC of such a workload (or for any other use case) 116 | // so that specific storage alerts will not fire. With the default selector, adding a label `excluded-from-alerts: 'true'` 117 | // to the PVC will have the desired effect. 118 | pvExcludedSelector: 'label_excluded_from_alerts="true"', 119 | 120 | // Default timeout value for k8s Jobs. Jobs that are active beyond this duration will trigger the KubeJobNotCompleted alert. 121 | kubeJobTimeoutDuration: 12 * 60 * 60, 122 | }, 123 | } 124 | -------------------------------------------------------------------------------- /dashboards/controller-manager.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | local prometheus = g.query.prometheus; 3 | local stat = g.panel.stat; 4 | local timeSeries = g.panel.timeSeries; 5 | local var = g.dashboard.variable; 6 | 7 | { 8 | local statPanel(title, unit, query) = 9 | stat.new(title) 10 | + stat.options.withColorMode('none') 11 | + stat.standardOptions.withUnit(unit) 12 | + stat.queryOptions.withInterval($._config.grafanaK8s.minimumTimeInterval) 13 | + stat.queryOptions.withTargets([ 14 | prometheus.new('${datasource}', query) 15 | + prometheus.withInstant(true), 16 | ]), 17 | 18 | local tsPanel = 19 | timeSeries { 20 | new(title): 21 | timeSeries.new(title) 22 | + timeSeries.options.legend.withShowLegend() 23 | + timeSeries.options.legend.withAsTable() 24 | + timeSeries.options.legend.withDisplayMode('table') 25 | + timeSeries.options.legend.withPlacement('right') 26 | + timeSeries.options.legend.withCalcs(['lastNotNull']) 27 | + timeSeries.options.tooltip.withMode('single') 28 | + timeSeries.fieldConfig.defaults.custom.withShowPoints('never') 29 | + timeSeries.fieldConfig.defaults.custom.withFillOpacity(10) 30 | + timeSeries.fieldConfig.defaults.custom.withSpanNulls(true) 31 | + timeSeries.queryOptions.withInterval($._config.grafanaK8s.minimumTimeInterval), 32 | }, 33 | 34 | grafanaDashboards+:: { 35 | 'controller-manager.json': 36 | local variables = { 37 | datasource: 38 | var.datasource.new('datasource', 'prometheus') 39 | + var.datasource.withRegex($._config.datasourceFilterRegex) 40 | + var.datasource.generalOptions.showOnDashboard.withLabelAndValue() 41 | + var.datasource.generalOptions.withLabel('Data source') 42 | + { 43 | current: { 44 | selected: true, 45 | text: $._config.datasourceName, 46 | value: $._config.datasourceName, 47 | }, 48 | }, 49 | 50 | cluster: 51 | var.query.new('cluster') 52 | + var.query.withDatasourceFromVariable(self.datasource) 53 | + var.query.queryTypes.withLabelValues( 54 | $._config.clusterLabel, 55 | 'up{%(kubeControllerManagerSelector)s}' % $._config, 56 | ) 57 | + var.query.generalOptions.withLabel('cluster') 58 | + var.query.refresh.onTime() 59 | + ( 60 | if $._config.showMultiCluster 61 | then var.query.generalOptions.showOnDashboard.withLabelAndValue() 62 | else var.query.generalOptions.showOnDashboard.withNothing() 63 | ) 64 | + var.query.withSort(type='alphabetical'), 65 | 66 | instance:
67 | var.query.new('instance') 68 | + var.query.withDatasourceFromVariable(self.datasource) 69 | + var.query.queryTypes.withLabelValues( 70 | 'instance', 71 | 'up{%(clusterLabel)s="$cluster", %(kubeControllerManagerSelector)s}' % $._config, 72 | ) 73 | + var.query.generalOptions.withLabel('instance') 74 | + var.query.refresh.onTime() 75 | + var.query.generalOptions.showOnDashboard.withLabelAndValue() 76 | + var.query.withSort(type='alphabetical') 77 | + var.query.selectionOptions.withIncludeAll(), 78 | }; 79 | 80 | local panels = [ 81 | statPanel( 82 | 'Up', 83 | 'none', 84 | 'sum(up{%(clusterLabel)s="$cluster", %(kubeControllerManagerSelector)s})' % $._config 85 | ) 86 | + stat.gridPos.withW(4), 87 | 88 | tsPanel.new('Work Queue Add Rate') 89 | + tsPanel.gridPos.withW(20) 90 | + tsPanel.standardOptions.withUnit('ops') 91 | + tsPanel.queryOptions.withTargets([ 92 | prometheus.new( 93 | '${datasource}', 94 | 'sum(rate(workqueue_adds_total{%(clusterLabel)s="$cluster", %(kubeControllerManagerSelector)s, instance=~"$instance"}[%(grafanaIntervalVar)s])) by (%(clusterLabel)s, instance, name)' % $._config 95 | ) 96 | + prometheus.withLegendFormat('{{%(clusterLabel)s}} {{instance}} {{name}}' % $._config), 97 | ]), 98 | 99 | tsPanel.new('Work Queue Depth') 100 | + tsPanel.standardOptions.withUnit('short') 101 | + tsPanel.queryOptions.withTargets([ 102 | prometheus.new( 103 | '${datasource}', 104 | 'sum(rate(workqueue_depth{%(clusterLabel)s="$cluster", %(kubeControllerManagerSelector)s, instance=~"$instance"}[%(grafanaIntervalVar)s])) by (%(clusterLabel)s, instance, name)' % $._config 105 | ) 106 | + prometheus.withLegendFormat('{{%(clusterLabel)s}} {{instance}} {{name}}' % $._config), 107 | ]), 108 | 109 | tsPanel.new('Work Queue Latency') 110 | + tsPanel.standardOptions.withUnit('s') 111 | + tsPanel.queryOptions.withTargets([ 112 | prometheus.new( 113 | '${datasource}', 114 | 'histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{%(clusterLabel)s="$cluster", %(kubeControllerManagerSelector)s, instance=~"$instance"}[%(grafanaIntervalVar)s])) by (%(clusterLabel)s, instance, name, le))' % $._config 115 | ) 116 | + prometheus.withLegendFormat('{{%(clusterLabel)s}} {{instance}} {{name}}' % $._config), 117 | ]), 118 | 119 | tsPanel.new('Kube API Request Rate') 120 | + tsPanel.gridPos.withW(8) 121 | + tsPanel.standardOptions.withUnit('ops') 122 | + tsPanel.queryOptions.withTargets([ 123 | prometheus.new( 124 | '${datasource}', 125 | 'sum(rate(rest_client_requests_total{%(clusterLabel)s="$cluster", %(kubeControllerManagerSelector)s, instance=~"$instance",code=~"2.."}[%(grafanaIntervalVar)s]))' % $._config 126 | ) 127 | + prometheus.withLegendFormat('2xx'), 128 | 129 | prometheus.new( 130 | '${datasource}', 131 | 'sum(rate(rest_client_requests_total{%(clusterLabel)s="$cluster", %(kubeControllerManagerSelector)s, instance=~"$instance",code=~"3.."}[%(grafanaIntervalVar)s]))' % $._config 132 | ) 133 | + prometheus.withLegendFormat('3xx'), 134 | 135 | prometheus.new( 136 | '${datasource}', 137 | 'sum(rate(rest_client_requests_total{%(clusterLabel)s="$cluster", %(kubeControllerManagerSelector)s, instance=~"$instance",code=~"4.."}[%(grafanaIntervalVar)s]))' % $._config 138 | ) 139 | + prometheus.withLegendFormat('4xx'), 140 | 141 | prometheus.new( 142 | '${datasource}', 143 | 'sum(rate(rest_client_requests_total{%(clusterLabel)s="$cluster", %(kubeControllerManagerSelector)s, instance=~"$instance",code=~"5.."}[%(grafanaIntervalVar)s]))' % $._config 144 | ) 145 | + prometheus.withLegendFormat('5xx'), 146 | ]), 147 | 148 | tsPanel.new('Post Request Latency 99th Quantile') 149 | + tsPanel.gridPos.withW(16) 150
| + tsPanel.standardOptions.withUnit('s') 151 | + tsPanel.queryOptions.withTargets([ 152 | prometheus.new( 153 | '${datasource}', 154 | 'histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{%(clusterLabel)s="$cluster", %(kubeControllerManagerSelector)s, instance=~"$instance", verb="POST"}[%(grafanaIntervalVar)s])) by (verb, le))' % $._config 155 | ) 156 | + prometheus.withLegendFormat('{{verb}}'), 157 | ]), 158 | 159 | tsPanel.new('Get Request Latency 99th Quantile') 160 | + tsPanel.standardOptions.withUnit('s') 161 | + tsPanel.queryOptions.withTargets([ 162 | prometheus.new( 163 | '${datasource}', 164 | 'histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{%(clusterLabel)s="$cluster", %(kubeControllerManagerSelector)s, instance=~"$instance", verb="GET"}[%(grafanaIntervalVar)s])) by (verb, le))' % $._config 165 | ) 166 | + prometheus.withLegendFormat('{{verb}}'), 167 | ]), 168 | 169 | tsPanel.new('Memory') 170 | + tsPanel.gridPos.withW(8) 171 | + tsPanel.standardOptions.withUnit('bytes') 172 | + tsPanel.queryOptions.withTargets([ 173 | prometheus.new( 174 | '${datasource}', 175 | 'process_resident_memory_bytes{%(clusterLabel)s="$cluster", %(kubeControllerManagerSelector)s,instance=~"$instance"}' % $._config 176 | ) 177 | + prometheus.withLegendFormat('{{instance}}'), 178 | ]), 179 | 180 | tsPanel.new('CPU usage') 181 | + tsPanel.gridPos.withW(8) 182 | + tsPanel.standardOptions.withUnit('short') 183 | + tsPanel.queryOptions.withTargets([ 184 | prometheus.new( 185 | '${datasource}', 186 | 'rate(process_cpu_seconds_total{%(clusterLabel)s="$cluster", %(kubeControllerManagerSelector)s,instance=~"$instance"}[%(grafanaIntervalVar)s])' % $._config 187 | ) 188 | + prometheus.withLegendFormat('{{instance}}'), 189 | ]), 190 | 191 | tsPanel.new('Goroutines') 192 | + tsPanel.gridPos.withW(8) 193 | + tsPanel.standardOptions.withUnit('short') 194 | + tsPanel.queryOptions.withTargets([ 195 | prometheus.new( 196 | '${datasource}', 197 | 'go_goroutines{%(clusterLabel)s="$cluster", %(kubeControllerManagerSelector)s,instance=~"$instance"}' % $._config 198 | ) 199 | + prometheus.withLegendFormat('{{instance}}'), 200 | ]), 201 | ]; 202 | 203 | g.dashboard.new('%(dashboardNamePrefix)sController Manager' % $._config.grafanaK8s) 204 | + g.dashboard.withUid($._config.grafanaDashboardIDs['controller-manager.json']) 205 | + g.dashboard.withTags($._config.grafanaK8s.dashboardTags) 206 | + g.dashboard.withEditable(false) 207 | + g.dashboard.time.withFrom('now-1h') 208 | + g.dashboard.time.withTo('now') 209 | + g.dashboard.withRefresh($._config.grafanaK8s.refresh) 210 | + g.dashboard.withVariables([variables.datasource, variables.cluster, variables.instance]) 211 | + g.dashboard.withPanels(g.util.grid.wrapPanels(panels, panelWidth=24, panelHeight=7)), 212 | }, 213 | } 214 | -------------------------------------------------------------------------------- /dashboards/dashboards.libsonnet: -------------------------------------------------------------------------------- 1 | (import 'network.libsonnet') + 2 | (import 'persistentvolumesusage.libsonnet') + 3 | (import 'resources.libsonnet') + 4 | (import 'apiserver.libsonnet') + 5 | (import 'controller-manager.libsonnet') + 6 | (import 'scheduler.libsonnet') + 7 | (import 'proxy.libsonnet') + 8 | (import 'kubelet.libsonnet') + 9 | (import 'defaults.libsonnet') + 10 | (import 'windows.libsonnet') 11 | -------------------------------------------------------------------------------- /dashboards/defaults.libsonnet: 
-------------------------------------------------------------------------------- 1 | { 2 | local kubernetesMixin = self, 3 | local grafanaDashboards = super.grafanaDashboards, 4 | 5 | // Automatically add a uid to each dashboard (an explicit entry from grafanaDashboardIDs when present, otherwise the md5 hash 6 | // of the file name) and apply the timezone, refresh interval, tags, and links from the mixin config. 7 | grafanaDashboards:: { 8 | [filename]: grafanaDashboards[filename] { 9 | uid: std.get(kubernetesMixin._config.grafanaDashboardIDs, filename, default=std.md5(filename)), 10 | timezone: kubernetesMixin._config.grafanaK8s.grafanaTimezone, 11 | refresh: kubernetesMixin._config.grafanaK8s.refresh, 12 | tags: kubernetesMixin._config.grafanaK8s.dashboardTags, 13 | links: [ 14 | { 15 | asDropdown: true, 16 | includeVars: true, 17 | keepTime: true, 18 | tags: kubernetesMixin._config.grafanaK8s.dashboardTags, 19 | targetBlank: false, 20 | title: 'Kubernetes', 21 | type: 'dashboards', 22 | }, 23 | ], 24 | 25 | [if 'rows' in super then 'rows']: [ 26 | row { 27 | panels: [ 28 | panel { 29 | // Modify tooltip to only show a single value 30 | tooltip+: { 31 | shared: false, 32 | }, 33 | // Modify legend to always show as table on right side 34 | legend+: { 35 | alignAsTable: true, 36 | rightSide: true, 37 | }, 38 | // Set minimum time interval for all panels 39 | interval: kubernetesMixin._config.grafanaK8s.minimumTimeInterval, 40 | } 41 | for panel in super.panels 42 | ], 43 | } 44 | for row in super.rows 45 | ], 46 | 47 | } 48 | for filename in std.objectFields(grafanaDashboards) 49 | }, 50 | } 51 | -------------------------------------------------------------------------------- /dashboards/network-usage/pod-total.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | local gauge = g.panel.gauge; 3 | local prometheus = g.query.prometheus; 4 | local timeSeries = g.panel.timeSeries; 5 | local var = g.dashboard.variable; 6 | 7 | { 8 | local tsPanel = 9 | timeSeries { 10 | new(title): 11 | timeSeries.new(title) 12 | + timeSeries.options.legend.withShowLegend() 13 | + timeSeries.options.legend.withAsTable() 14 | + timeSeries.options.legend.withDisplayMode('table') 15 | + timeSeries.options.legend.withPlacement('right') 16 | + timeSeries.options.tooltip.withMode('single') 17 | + timeSeries.fieldConfig.defaults.custom.withShowPoints('never') 18 | + timeSeries.queryOptions.withInterval($._config.grafanaK8s.minimumTimeInterval), 19 | }, 20 | 21 | grafanaDashboards+:: { 22 | 'pod-total.json': 23 | local variables = { 24 | datasource: 25 | var.datasource.new('datasource', 'prometheus') 26 | + var.datasource.withRegex($._config.datasourceFilterRegex) 27 | + var.datasource.generalOptions.showOnDashboard.withLabelAndValue() 28 | + var.datasource.generalOptions.withLabel('Data source') 29 | + { 30 | current: { 31 | selected: true, 32 | text: $._config.datasourceName, 33 | value: $._config.datasourceName, 34 | }, 35 | }, 36 | 37 | cluster: 38 | var.query.new('cluster') 39 | + var.query.withDatasourceFromVariable(self.datasource) 40 | + var.query.queryTypes.withLabelValues( 41 | $._config.clusterLabel, 42 | 'up{%(cadvisorSelector)s}' % $._config, 43 | ) 44 | + var.query.generalOptions.withLabel('cluster') 45 | + var.query.refresh.onTime() 46 | + ( 47 | if $._config.showMultiCluster 48 | then var.query.generalOptions.showOnDashboard.withLabelAndValue() 49 | else var.query.generalOptions.showOnDashboard.withNothing() 50 | ) 51 | + var.query.withSort(type='alphabetical'), 52 |
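// Note: the template variables below chain off each other: $namespace is resolved from container_network_receive_packets_total scoped to the selected $cluster, and $pod is in turn scoped to the selected cluster and namespace, so narrowing one selection narrows the options for the next.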
53 | namespace: 54 | var.query.new('namespace') 55 | + var.query.selectionOptions.withIncludeAll(true, '.+') 56 | + var.query.withDatasourceFromVariable(self.datasource) 57 | + var.query.queryTypes.withLabelValues( 58 | 'namespace', 59 | 'container_network_receive_packets_total{%(clusterLabel)s="$cluster"}' % $._config, 60 | ) 61 | + var.query.generalOptions.withCurrent('kube-system') 62 | + var.query.generalOptions.withLabel('namespace') 63 | + var.query.refresh.onTime() 64 | + var.query.generalOptions.showOnDashboard.withLabelAndValue() 65 | + var.query.withSort(type='alphabetical'), 66 | 67 | pod: 68 | var.query.new('pod') 69 | + var.query.withDatasourceFromVariable(self.datasource) 70 | + var.query.queryTypes.withLabelValues( 71 | 'pod', 72 | 'container_network_receive_packets_total{%(clusterLabel)s="$cluster",namespace=~"$namespace"}' % $._config, 73 | ) 74 | + var.query.generalOptions.withCurrent('kube-system') 75 | + var.query.generalOptions.withLabel('pod') 76 | + var.query.refresh.onTime() 77 | + var.query.generalOptions.showOnDashboard.withLabelAndValue() 78 | + var.query.withSort(type='alphabetical'), 79 | }; 80 | 81 | local panels = [ 82 | gauge.new('Current Rate of Bytes Received') 83 | + gauge.standardOptions.withDisplayName('$pod') 84 | + gauge.standardOptions.withUnit('Bps') 85 | + gauge.standardOptions.withMin(0) 86 | + gauge.standardOptions.withMax(10000000000) // 10 GB/s 87 | + gauge.standardOptions.thresholds.withSteps([ 88 | { 89 | color: 'dark-green', 90 | index: 0, 91 | value: null, // 0 GB/s 92 | }, 93 | { 94 | color: 'dark-yellow', 95 | index: 1, 96 | value: 5000000000, // 5 GB/s 97 | }, 98 | { 99 | color: 'dark-red', 100 | index: 2, 101 | value: 7000000000, // 7 GB/s 102 | }, 103 | ]) 104 | + gauge.queryOptions.withInterval($._config.grafanaK8s.minimumTimeInterval) 105 | + gauge.queryOptions.withTargets([ 106 | prometheus.new( 107 | '${datasource}', 108 | 'sum(rate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster",namespace=~"$namespace", pod=~"$pod"}[%(grafanaIntervalVar)s]))' % $._config 109 | ) 110 | + prometheus.withLegendFormat('__auto'), 111 | ]), 112 | 113 | gauge.new('Current Rate of Bytes Transmitted') 114 | + gauge.standardOptions.withDisplayName('$pod') 115 | + gauge.standardOptions.withUnit('Bps') 116 | + gauge.standardOptions.withMin(0) 117 | + gauge.standardOptions.withMax(10000000000) // 10 GB/s 118 | + gauge.standardOptions.thresholds.withSteps([ 119 | { 120 | color: 'dark-green', 121 | index: 0, 122 | value: null, // 0 GB/s 123 | }, 124 | { 125 | color: 'dark-yellow', 126 | index: 1, 127 | value: 5000000000, // 5 GB/s 128 | }, 129 | { 130 | color: 'dark-red', 131 | index: 2, 132 | value: 7000000000, // 7 GB/s 133 | }, 134 | ]) 135 | + gauge.queryOptions.withInterval($._config.grafanaK8s.minimumTimeInterval) 136 | + gauge.queryOptions.withTargets([ 137 | prometheus.new( 138 | '${datasource}', 139 | 'sum(rate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster",namespace=~"$namespace", pod=~"$pod"}[%(grafanaIntervalVar)s]))' % $._config 140 | ) 141 | + prometheus.withLegendFormat('__auto'), 142 | ]), 143 | 144 | tsPanel.new('Receive Bandwidth') 145 | + tsPanel.standardOptions.withUnit('binBps') 146 | + tsPanel.queryOptions.withTargets([ 147 | prometheus.new( 148 | '${datasource}', 149 | 'sum(rate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster",namespace=~"$namespace", pod=~"$pod"}[%(grafanaIntervalVar)s])) by (pod)' % $._config 150 | ) 151 | + prometheus.withLegendFormat('__auto'), 152 | ]), 153 | 154 |
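// The transmit-side panels below mirror the receive-side queries one-for-one, swapping the container_network_receive_* metrics for their container_network_transmit_* counterparts while keeping the same $cluster/$namespace/$pod scoping and per-pod aggregation.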
tsPanel.new('Transmit Bandwidth') 155 | + tsPanel.standardOptions.withUnit('binBps') 156 | + tsPanel.queryOptions.withTargets([ 157 | prometheus.new( 158 | '${datasource}', 159 | 'sum(rate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster",namespace=~"$namespace", pod=~"$pod"}[%(grafanaIntervalVar)s])) by (pod)' % $._config 160 | ) 161 | + prometheus.withLegendFormat('__auto'), 162 | ]), 163 | 164 | tsPanel.new('Rate of Received Packets') 165 | + tsPanel.standardOptions.withUnit('pps') 166 | + tsPanel.queryOptions.withTargets([ 167 | prometheus.new('${datasource}', 'sum(rate(container_network_receive_packets_total{%(clusterLabel)s="$cluster",namespace=~"$namespace", pod=~"$pod"}[%(grafanaIntervalVar)s])) by (pod)' % $._config) 168 | + prometheus.withLegendFormat('__auto'), 169 | ]), 170 | 171 | tsPanel.new('Rate of Transmitted Packets') 172 | + tsPanel.standardOptions.withUnit('pps') 173 | + tsPanel.queryOptions.withTargets([ 174 | prometheus.new('${datasource}', 'sum(rate(container_network_transmit_packets_total{%(clusterLabel)s="$cluster",namespace=~"$namespace", pod=~"$pod"}[%(grafanaIntervalVar)s])) by (pod)' % $._config) 175 | + prometheus.withLegendFormat('__auto'), 176 | ]), 177 | 178 | tsPanel.new('Rate of Received Packets Dropped') 179 | + tsPanel.standardOptions.withUnit('pps') 180 | + tsPanel.queryOptions.withTargets([ 181 | prometheus.new('${datasource}', 'sum(rate(container_network_receive_packets_dropped_total{%(clusterLabel)s="$cluster",namespace=~"$namespace", pod=~"$pod"}[%(grafanaIntervalVar)s])) by (pod)' % $._config) 182 | + prometheus.withLegendFormat('__auto'), 183 | ]), 184 | 185 | tsPanel.new('Rate of Transmitted Packets Dropped') 186 | + tsPanel.standardOptions.withUnit('pps') 187 | + tsPanel.queryOptions.withTargets([ 188 | prometheus.new('${datasource}', 'sum(rate(container_network_transmit_packets_dropped_total{%(clusterLabel)s="$cluster",namespace=~"$namespace", pod=~"$pod"}[%(grafanaIntervalVar)s])) by (pod)' % $._config) 189 | + prometheus.withLegendFormat('__auto'), 190 | ]), 191 | ]; 192 | 193 | g.dashboard.new('%(dashboardNamePrefix)sNetworking / Pod' % $._config.grafanaK8s) 194 | + g.dashboard.withUid($._config.grafanaDashboardIDs['pod-total.json']) 195 | + g.dashboard.withTags($._config.grafanaK8s.dashboardTags) 196 | + g.dashboard.withEditable(false) 197 | + g.dashboard.time.withFrom('now-1h') 198 | + g.dashboard.time.withTo('now') 199 | + g.dashboard.withRefresh($._config.grafanaK8s.refresh) 200 | + g.dashboard.withVariables([variables.datasource, variables.cluster, variables.namespace, variables.pod]) 201 | + g.dashboard.withPanels(g.util.grid.wrapPanels(panels, panelWidth=12, panelHeight=9)), 202 | }, 203 | } 204 | -------------------------------------------------------------------------------- /dashboards/network.libsonnet: -------------------------------------------------------------------------------- 1 | (import 'network-usage/cluster-total.libsonnet') + 2 | (import 'network-usage/namespace-by-workload.libsonnet') + 3 | (import 'network-usage/namespace-by-pod.libsonnet') + 4 | (import 'network-usage/pod-total.libsonnet') + 5 | (import 'network-usage/workload-total.libsonnet') 6 | -------------------------------------------------------------------------------- /dashboards/persistentvolumesusage.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | local prometheus = g.query.prometheus; 3 | local gauge = 
g.panel.gauge; 4 | local timeSeries = g.panel.timeSeries; 5 | local var = g.dashboard.variable; 6 | 7 | { 8 | local gaugePanel(title, unit, query) = 9 | gauge.new(title) 10 | + gauge.standardOptions.withUnit(unit) 11 | + gauge.queryOptions.withInterval($._config.grafanaK8s.minimumTimeInterval) 12 | + gauge.queryOptions.withTargets([ 13 | prometheus.new('${datasource}', query) 14 | + prometheus.withInstant(true), 15 | ]), 16 | 17 | local tsPanel = 18 | timeSeries { 19 | new(title): 20 | timeSeries.new(title) 21 | + timeSeries.options.legend.withShowLegend() 22 | + timeSeries.options.legend.withAsTable() 23 | + timeSeries.options.legend.withDisplayMode('table') 24 | + timeSeries.options.legend.withPlacement('right') 25 | + timeSeries.options.legend.withCalcs(['lastNotNull']) 26 | + timeSeries.options.tooltip.withMode('single') 27 | + timeSeries.fieldConfig.defaults.custom.withShowPoints('never') 28 | + timeSeries.fieldConfig.defaults.custom.withFillOpacity(10) 29 | + timeSeries.fieldConfig.defaults.custom.withSpanNulls(true) 30 | + timeSeries.queryOptions.withInterval($._config.grafanaK8s.minimumTimeInterval), 31 | }, 32 | 33 | grafanaDashboards+:: { 34 | 'persistentvolumesusage.json': 35 | local variables = { 36 | datasource: 37 | var.datasource.new('datasource', 'prometheus') 38 | + var.datasource.withRegex($._config.datasourceFilterRegex) 39 | + var.datasource.generalOptions.showOnDashboard.withLabelAndValue() 40 | + var.datasource.generalOptions.withLabel('Data source') 41 | + { 42 | current: { 43 | selected: true, 44 | text: $._config.datasourceName, 45 | value: $._config.datasourceName, 46 | }, 47 | }, 48 | 49 | cluster: 50 | var.query.new('cluster') 51 | + var.query.withDatasourceFromVariable(self.datasource) 52 | + var.query.queryTypes.withLabelValues( 53 | $._config.clusterLabel, 54 | 'kubelet_volume_stats_capacity_bytes{%(kubeletSelector)s}' % $._config, 55 | ) 56 | + var.query.generalOptions.withLabel('cluster') 57 | + var.query.refresh.onTime() 58 | + ( 59 | if $._config.showMultiCluster 60 | then var.query.generalOptions.showOnDashboard.withLabelAndValue() 61 | else var.query.generalOptions.showOnDashboard.withNothing() 62 | ) 63 | + var.query.withSort(type='alphabetical'), 64 | 65 | namespace: 66 | var.query.new('namespace') 67 | + var.query.withDatasourceFromVariable(self.datasource) 68 | + var.query.queryTypes.withLabelValues( 69 | 'namespace', 70 | 'kubelet_volume_stats_capacity_bytes{%(clusterLabel)s="$cluster", %(kubeletSelector)s}' % $._config, 71 | ) 72 | + var.query.generalOptions.withLabel('Namespace') 73 | + var.query.refresh.onTime() 74 | + var.query.generalOptions.showOnDashboard.withLabelAndValue() 75 | + var.query.withSort(type='alphabetical'), 76 | 77 | volume: 78 | var.query.new('volume') 79 | + var.query.withDatasourceFromVariable(self.datasource) 80 | + var.query.queryTypes.withLabelValues( 81 | 'persistentvolumeclaim', 82 | 'kubelet_volume_stats_capacity_bytes{%(clusterLabel)s="$cluster", %(kubeletSelector)s, namespace="$namespace"}' % $._config, 83 | ) 84 | + var.query.generalOptions.withLabel('PersistentVolumeClaim') 85 | + var.query.refresh.onTime() 86 | + var.query.generalOptions.showOnDashboard.withLabelAndValue() 87 | + var.query.withSort(type='alphabetical'), 88 | }; 89 | 90 | local panels = { 91 | tsUsage: 92 | tsPanel.new('Volume Space Usage') 93 | + tsPanel.standardOptions.withUnit('bytes') 94 | + tsPanel.queryOptions.withTargets([ 95 | prometheus.new('${datasource}', ||| 96 | ( 97 | sum without(instance, node) (topk(1, 
(kubelet_volume_stats_capacity_bytes{%(clusterLabel)s="$cluster", %(kubeletSelector)s, namespace="$namespace", persistentvolumeclaim="$volume"}))) 98 | - 99 | sum without(instance, node) (topk(1, (kubelet_volume_stats_available_bytes{%(clusterLabel)s="$cluster", %(kubeletSelector)s, namespace="$namespace", persistentvolumeclaim="$volume"}))) 100 | ) 101 | ||| % $._config) 102 | + prometheus.withLegendFormat('Used Space'), 103 | 104 | prometheus.new('${datasource}', ||| 105 | sum without(instance, node) (topk(1, (kubelet_volume_stats_available_bytes{%(clusterLabel)s="$cluster", %(kubeletSelector)s, namespace="$namespace", persistentvolumeclaim="$volume"}))) 106 | ||| % $._config) 107 | + prometheus.withLegendFormat('Free Space'), 108 | ]), 109 | gaugeUsage: 110 | gaugePanel( 111 | 'Volume Space Usage', 112 | 'percent', 113 | ||| 114 | max without(instance,node) ( 115 | ( 116 | topk(1, kubelet_volume_stats_capacity_bytes{%(clusterLabel)s="$cluster", %(kubeletSelector)s, namespace="$namespace", persistentvolumeclaim="$volume"}) 117 | - 118 | topk(1, kubelet_volume_stats_available_bytes{%(clusterLabel)s="$cluster", %(kubeletSelector)s, namespace="$namespace", persistentvolumeclaim="$volume"}) 119 | ) 120 | / 121 | topk(1, kubelet_volume_stats_capacity_bytes{%(clusterLabel)s="$cluster", %(kubeletSelector)s, namespace="$namespace", persistentvolumeclaim="$volume"}) 122 | * 100) 123 | ||| % $._config 124 | ) 125 | + gauge.standardOptions.withMin(0) 126 | + gauge.standardOptions.withMax(100) 127 | + gauge.standardOptions.color.withMode('thresholds') 128 | + gauge.standardOptions.thresholds.withMode('absolute') 129 | + gauge.standardOptions.thresholds.withSteps( 130 | [ 131 | gauge.thresholdStep.withColor('green') 132 | + gauge.thresholdStep.withValue(0), 133 | 134 | gauge.thresholdStep.withColor('orange') 135 | + gauge.thresholdStep.withValue(80), 136 | 137 | gauge.thresholdStep.withColor('red') 138 | + gauge.thresholdStep.withValue(90), 139 | ] 140 | ), 141 | 142 | tsInodes: 143 | tsPanel.new('Volume inodes Usage') 144 | + tsPanel.standardOptions.withUnit('none') 145 | + tsPanel.queryOptions.withTargets([ 146 | prometheus.new('${datasource}', 'sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes_used{%(clusterLabel)s="$cluster", %(kubeletSelector)s, namespace="$namespace", persistentvolumeclaim="$volume"})))' % $._config) 147 | + prometheus.withLegendFormat('Used inodes'), 148 | 149 | prometheus.new('${datasource}', ||| 150 | ( 151 | sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes{%(clusterLabel)s="$cluster", %(kubeletSelector)s, namespace="$namespace", persistentvolumeclaim="$volume"}))) 152 | - 153 | sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes_used{%(clusterLabel)s="$cluster", %(kubeletSelector)s, namespace="$namespace", persistentvolumeclaim="$volume"}))) 154 | ) 155 | ||| % $._config) 156 | + prometheus.withLegendFormat('Free inodes'), 157 | ]), 158 | gaugeInodes: 159 | gaugePanel( 160 | 'Volume inodes Usage', 161 | 'percent', 162 | ||| 163 | max without(instance,node) ( 164 | topk(1, kubelet_volume_stats_inodes_used{%(clusterLabel)s="$cluster", %(kubeletSelector)s, namespace="$namespace", persistentvolumeclaim="$volume"}) 165 | / 166 | topk(1, kubelet_volume_stats_inodes{%(clusterLabel)s="$cluster", %(kubeletSelector)s, namespace="$namespace", persistentvolumeclaim="$volume"}) 167 | * 100) 168 | ||| % $._config 169 | ) 170 | + gauge.standardOptions.withMin(0) 171 | + gauge.standardOptions.withMax(100) 172 | + 
gauge.standardOptions.color.withMode('thresholds') 173 | + gauge.standardOptions.thresholds.withMode('absolute') 174 | + gauge.standardOptions.thresholds.withSteps( 175 | [ 176 | gauge.thresholdStep.withColor('green') 177 | + gauge.thresholdStep.withValue(0), 178 | 179 | gauge.thresholdStep.withColor('orange') 180 | + gauge.thresholdStep.withValue(80), 181 | 182 | gauge.thresholdStep.withColor('red') 183 | + gauge.thresholdStep.withValue(90), 184 | ] 185 | ), 186 | }; 187 | 188 | g.dashboard.new('%(dashboardNamePrefix)sPersistent Volumes' % $._config.grafanaK8s) 189 | + g.dashboard.withUid($._config.grafanaDashboardIDs['persistentvolumesusage.json']) 190 | + g.dashboard.withTags($._config.grafanaK8s.dashboardTags) 191 | + g.dashboard.withEditable(false) 192 | + g.dashboard.time.withFrom('now-1h') 193 | + g.dashboard.time.withTo('now') 194 | + g.dashboard.withRefresh($._config.grafanaK8s.refresh) 195 | + g.dashboard.withVariables([variables.datasource, variables.cluster, variables.namespace, variables.volume]) 196 | + g.dashboard.withPanels([ 197 | panels.tsUsage { gridPos+: { w: 18, h: 7, y: 0 } }, 198 | panels.gaugeUsage { gridPos+: { w: 6, h: 7, x: 18, y: 0 } }, 199 | panels.tsInodes { gridPos+: { w: 18, h: 7, y: 7 } }, 200 | panels.gaugeInodes { gridPos+: { w: 6, h: 7, x: 18, y: 7 } }, 201 | ]), 202 | }, 203 | } 204 | -------------------------------------------------------------------------------- /dashboards/proxy.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | 3 | local prometheus = g.query.prometheus; 4 | local stat = g.panel.stat; 5 | local timeSeries = g.panel.timeSeries; 6 | local var = g.dashboard.variable; 7 | 8 | { 9 | local statPanel(title, unit, query) = 10 | stat.new(title) 11 | + stat.options.withColorMode('none') 12 | + stat.standardOptions.withUnit(unit) 13 | + stat.queryOptions.withInterval($._config.grafanaK8s.minimumTimeInterval) 14 | + stat.queryOptions.withTargets([ 15 | prometheus.new('${datasource}', query) 16 | + prometheus.withInstant(true), 17 | ]), 18 | 19 | local tsPanel = 20 | timeSeries { 21 | new(title): 22 | timeSeries.new(title) 23 | + timeSeries.options.legend.withShowLegend() 24 | + timeSeries.options.legend.withAsTable() 25 | + timeSeries.options.legend.withDisplayMode('table') 26 | + timeSeries.options.legend.withPlacement('right') 27 | + timeSeries.options.legend.withCalcs(['lastNotNull']) 28 | + timeSeries.options.tooltip.withMode('single') 29 | + timeSeries.fieldConfig.defaults.custom.withShowPoints('never') 30 | + timeSeries.fieldConfig.defaults.custom.withFillOpacity(10) 31 | + timeSeries.fieldConfig.defaults.custom.withSpanNulls(true) 32 | + timeSeries.queryOptions.withInterval($._config.grafanaK8s.minimumTimeInterval), 33 | }, 34 | 35 | grafanaDashboards+:: { 36 | 'proxy.json': 37 | local variables = { 38 | datasource: 39 | var.datasource.new('datasource', 'prometheus') 40 | + var.datasource.withRegex($._config.datasourceFilterRegex) 41 | + var.datasource.generalOptions.showOnDashboard.withLabelAndValue() 42 | + var.datasource.generalOptions.withLabel('Data source') 43 | + { 44 | current: { 45 | selected: true, 46 | text: $._config.datasourceName, 47 | value: $._config.datasourceName, 48 | }, 49 | }, 50 | 51 | cluster: 52 | var.query.new('cluster') 53 | + var.query.withDatasourceFromVariable(self.datasource) 54 | + var.query.queryTypes.withLabelValues( 55 | $._config.clusterLabel, 56 | 
'up{%(kubeProxySelector)s}' % $._config 57 | ) 58 | + var.query.generalOptions.withLabel('cluster') 59 | + var.query.refresh.onTime() 60 | + ( 61 | if $._config.showMultiCluster 62 | then var.query.generalOptions.showOnDashboard.withLabelAndValue() 63 | else var.query.generalOptions.showOnDashboard.withNothing() 64 | ) 65 | + var.query.withSort(type='alphabetical'), 66 | 67 | instance: 68 | var.query.new('instance') 69 | + var.query.withDatasourceFromVariable(self.datasource) 70 | + var.query.queryTypes.withLabelValues( 71 | 'instance', 72 | 'up{%(clusterLabel)s="$cluster", %(kubeProxySelector)s}' % $._config, 73 | ) 74 | + var.query.generalOptions.withLabel('instance') 75 | + var.query.refresh.onTime() 76 | + var.query.generalOptions.showOnDashboard.withLabelAndValue() 77 | + var.query.selectionOptions.withIncludeAll(true, '.+'), 78 | }; 79 | 80 | local panels = [ 81 | statPanel('Up', 'none', 'sum(up{%(clusterLabel)s="$cluster", %(kubeProxySelector)s})' % $._config) 82 | + stat.gridPos.withW(4), 83 | 84 | tsPanel.new('Rules Sync Rate') 85 | + tsPanel.gridPos.withW(10) 86 | + tsPanel.standardOptions.withUnit('ops') 87 | + tsPanel.queryOptions.withTargets([ 88 | prometheus.new('${datasource}', 'sum(rate(kubeproxy_sync_proxy_rules_duration_seconds_count{%(clusterLabel)s="$cluster", %(kubeProxySelector)s, instance=~"$instance"}[%(grafanaIntervalVar)s]))' % $._config) 89 | + prometheus.withLegendFormat('rate'), 90 | ]), 91 | 92 | tsPanel.new('Rules Sync Latency 99th Quantile') 93 | + tsPanel.gridPos.withW(10) 94 | + tsPanel.standardOptions.withUnit('s') 95 | + tsPanel.queryOptions.withTargets([ 96 | prometheus.new('${datasource}', 'histogram_quantile(0.99,rate(kubeproxy_sync_proxy_rules_duration_seconds_bucket{%(clusterLabel)s="$cluster", %(kubeProxySelector)s, instance=~"$instance"}[%(grafanaIntervalVar)s]))' % $._config) 97 | + prometheus.withLegendFormat('{{instance}}'), 98 | ]), 99 | 100 | tsPanel.new('Network Programming Rate') 101 | + tsPanel.standardOptions.withUnit('ops') 102 | + tsPanel.queryOptions.withTargets([ 103 | prometheus.new('${datasource}', 'sum(rate(kubeproxy_network_programming_duration_seconds_count{%(clusterLabel)s="$cluster", %(kubeProxySelector)s, instance=~"$instance"}[%(grafanaIntervalVar)s]))' % $._config) 104 | + prometheus.withLegendFormat('rate'), 105 | ]), 106 | 107 | tsPanel.new('Network Programming Latency 99th Quantile') 108 | + tsPanel.standardOptions.withUnit('s') 109 | + tsPanel.queryOptions.withTargets([ 110 | prometheus.new('${datasource}', 'histogram_quantile(0.99, sum(rate(kubeproxy_network_programming_duration_seconds_bucket{%(clusterLabel)s="$cluster", %(kubeProxySelector)s, instance=~"$instance"}[%(grafanaIntervalVar)s])) by (instance, le))' % $._config) 111 | + prometheus.withLegendFormat('{{instance}}'), 112 | ]), 113 | 114 | tsPanel.new('Kube API Request Rate') 115 | + tsPanel.gridPos.withW(8) 116 | + tsPanel.standardOptions.withUnit('ops') 117 | + tsPanel.queryOptions.withTargets([ 118 | prometheus.new('${datasource}', 'sum(rate(rest_client_requests_total{%(clusterLabel)s="$cluster",%(kubeProxySelector)s, instance=~"$instance",code=~"2.."}[%(grafanaIntervalVar)s]))' % $._config) 119 | + prometheus.withLegendFormat('2xx'), 120 | 121 | prometheus.new('${datasource}', 'sum(rate(rest_client_requests_total{%(clusterLabel)s="$cluster",%(kubeProxySelector)s, instance=~"$instance",code=~"3.."}[%(grafanaIntervalVar)s]))' % $._config) 122 | + prometheus.withLegendFormat('3xx'), 123 | 124 | prometheus.new('${datasource}',
'sum(rate(rest_client_requests_total{%(clusterLabel)s="$cluster",%(kubeProxySelector)s, instance=~"$instance",code=~"4.."}[%(grafanaIntervalVar)s]))' % $._config) 125 | + prometheus.withLegendFormat('4xx'), 126 | 127 | prometheus.new('${datasource}', 'sum(rate(rest_client_requests_total{%(clusterLabel)s="$cluster",%(kubeProxySelector)s, instance=~"$instance",code=~"5.."}[%(grafanaIntervalVar)s]))' % $._config) 128 | + prometheus.withLegendFormat('5xx'), 129 | ]), 130 | 131 | tsPanel.new('Post Request Latency 99th Quantile') 132 | + tsPanel.gridPos.withW(16) 133 | + tsPanel.standardOptions.withUnit('s') 134 | + tsPanel.queryOptions.withTargets([ 135 | prometheus.new('${datasource}', 'histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{%(clusterLabel)s="$cluster", %(kubeProxySelector)s,instance=~"$instance",verb="POST"}[%(grafanaIntervalVar)s])) by (verb, le))' % $._config) 136 | + prometheus.withLegendFormat('{{verb}}'), 137 | ]), 138 | 139 | tsPanel.new('Get Request Latency 99th Quantile') 140 | + tsPanel.gridPos.withW(24) 141 | + tsPanel.standardOptions.withUnit('s') 142 | + tsPanel.queryOptions.withTargets([ 143 | prometheus.new('${datasource}', 'histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{%(clusterLabel)s="$cluster", %(kubeProxySelector)s, instance=~"$instance", verb="GET"}[%(grafanaIntervalVar)s])) by (verb, le))' % $._config) 144 | + prometheus.withLegendFormat('{{verb}}'), 145 | ]), 146 | 147 | 148 | tsPanel.new('Memory') 149 | + tsPanel.gridPos.withW(8) 150 | + tsPanel.standardOptions.withUnit('bytes') 151 | + tsPanel.queryOptions.withTargets([ 152 | prometheus.new('${datasource}', 'process_resident_memory_bytes{%(clusterLabel)s="$cluster", %(kubeProxySelector)s,instance=~"$instance"}' % $._config) 153 | + prometheus.withLegendFormat('{{instance}}'), 154 | ]), 155 | 156 | tsPanel.new('CPU usage') 157 | + tsPanel.gridPos.withW(8) 158 | + tsPanel.standardOptions.withUnit('short') 159 | + tsPanel.queryOptions.withTargets([ 160 | prometheus.new('${datasource}', 'rate(process_cpu_seconds_total{%(clusterLabel)s="$cluster", %(kubeProxySelector)s,instance=~"$instance"}[%(grafanaIntervalVar)s])' % $._config) 161 | + prometheus.withLegendFormat('{{instance}}'), 162 | ]), 163 | 164 | tsPanel.new('Goroutines') 165 | + tsPanel.gridPos.withW(8) 166 | + tsPanel.standardOptions.withUnit('short') 167 | + tsPanel.queryOptions.withTargets([ 168 | prometheus.new('${datasource}', 'go_goroutines{%(clusterLabel)s="$cluster", %(kubeProxySelector)s,instance=~"$instance"}' % $._config) 169 | + prometheus.withLegendFormat('{{instance}}'), 170 | ]), 171 | ]; 172 | 173 | g.dashboard.new('%(dashboardNamePrefix)sProxy' % $._config.grafanaK8s) 174 | + g.dashboard.withUid($._config.grafanaDashboardIDs['proxy.json']) 175 | + g.dashboard.withTags($._config.grafanaK8s.dashboardTags) 176 | + g.dashboard.withEditable(false) 177 | + g.dashboard.time.withFrom('now-1h') 178 | + g.dashboard.time.withTo('now') 179 | + g.dashboard.withRefresh($._config.grafanaK8s.refresh) 180 | + g.dashboard.withVariables([variables.datasource, variables.cluster, variables.instance]) 181 | + g.dashboard.withPanels(g.util.grid.wrapPanels(panels, panelWidth=12, panelHeight=7)), 182 | }, 183 | } 184 | -------------------------------------------------------------------------------- /dashboards/resources.libsonnet: -------------------------------------------------------------------------------- 1 | (import 'resources/cluster.libsonnet') + 2 | (import 'resources/multi-cluster.libsonnet')
+ 3 | (import 'resources/namespace.libsonnet') + 4 | (import 'resources/node.libsonnet') + 5 | (import 'resources/pod.libsonnet') + 6 | (import 'resources/workload-namespace.libsonnet') + 7 | (import 'resources/workload.libsonnet') 8 | -------------------------------------------------------------------------------- /dashboards/scheduler.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | 3 | local prometheus = g.query.prometheus; 4 | local stat = g.panel.stat; 5 | local timeSeries = g.panel.timeSeries; 6 | local var = g.dashboard.variable; 7 | 8 | { 9 | local statPanel(title, unit, query) = 10 | stat.new(title) 11 | + stat.options.withColorMode('none') 12 | + stat.standardOptions.withUnit(unit) 13 | + stat.queryOptions.withInterval($._config.grafanaK8s.minimumTimeInterval) 14 | + stat.queryOptions.withTargets([ 15 | prometheus.new('${datasource}', query) 16 | + prometheus.withInstant(true), 17 | ]), 18 | 19 | local tsPanel = 20 | timeSeries { 21 | new(title): 22 | timeSeries.new(title) 23 | + timeSeries.options.legend.withShowLegend() 24 | + timeSeries.options.legend.withAsTable() 25 | + timeSeries.options.legend.withDisplayMode('table') 26 | + timeSeries.options.legend.withPlacement('right') 27 | + timeSeries.options.legend.withCalcs(['lastNotNull']) 28 | + timeSeries.options.tooltip.withMode('single') 29 | + timeSeries.fieldConfig.defaults.custom.withShowPoints('never') 30 | + timeSeries.fieldConfig.defaults.custom.withFillOpacity(10) 31 | + timeSeries.fieldConfig.defaults.custom.withSpanNulls(true) 32 | + timeSeries.queryOptions.withInterval($._config.grafanaK8s.minimumTimeInterval), 33 | }, 34 | 35 | grafanaDashboards+:: { 36 | 'scheduler.json': 37 | 38 | local variables = { 39 | datasource: 40 | var.datasource.new('datasource', 'prometheus') 41 | + var.datasource.withRegex($._config.datasourceFilterRegex) 42 | + var.datasource.generalOptions.showOnDashboard.withLabelAndValue() 43 | + var.datasource.generalOptions.withLabel('Data source') 44 | + { 45 | current: { 46 | selected: true, 47 | text: $._config.datasourceName, 48 | value: $._config.datasourceName, 49 | }, 50 | }, 51 | 52 | cluster: 53 | var.query.new('cluster') 54 | + var.query.withDatasourceFromVariable(self.datasource) 55 | + var.query.queryTypes.withLabelValues( 56 | $._config.clusterLabel, 57 | 'up{%(kubeSchedulerSelector)s}' % $._config 58 | ) 59 | + var.query.generalOptions.withLabel('cluster') 60 | + var.query.refresh.onTime() 61 | + ( 62 | if $._config.showMultiCluster 63 | then var.query.generalOptions.showOnDashboard.withLabelAndValue() 64 | else var.query.generalOptions.showOnDashboard.withNothing() 65 | ) 66 | + var.query.withSort(type='alphabetical'), 67 | 68 | instance: 69 | var.query.new('instance') 70 | + var.query.withDatasourceFromVariable(self.datasource) 71 | + var.query.queryTypes.withLabelValues( 72 | 'instance', 73 | 'up{%(kubeSchedulerSelector)s, %(clusterLabel)s="$cluster"}' % $._config, 74 | ) 75 | + var.query.generalOptions.withLabel('instance') 76 | + var.query.refresh.onTime() 77 | + var.query.generalOptions.showOnDashboard.withLabelAndValue() 78 | + var.query.selectionOptions.withIncludeAll(true, '.+'), 79 | }; 80 | 81 | local panels = [ 82 | statPanel('Up', 'none', 'sum(up{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s})' % $._config) 83 | + stat.gridPos.withW(4), 84 | 85 | tsPanel.new('Scheduling Rate') 86 | + tsPanel.gridPos.withW(10) 87 | + 
tsPanel.standardOptions.withUnit('ops') 88 | + tsPanel.queryOptions.withTargets([ 89 | prometheus.new('${datasource}', 'sum(rate(scheduler_e2e_scheduling_duration_seconds_count{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s, instance=~"$instance"}[%(grafanaIntervalVar)s])) by (%(clusterLabel)s, instance)' % $._config) 90 | + prometheus.withLegendFormat('{{%(clusterLabel)s}} {{instance}} e2e' % $._config), 91 | 92 | prometheus.new('${datasource}', 'sum(rate(scheduler_binding_duration_seconds_count{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s, instance=~"$instance"}[%(grafanaIntervalVar)s])) by (%(clusterLabel)s, instance)' % $._config) 93 | + prometheus.withLegendFormat('{{%(clusterLabel)s}} {{instance}} binding' % $._config), 94 | 95 | prometheus.new('${datasource}', 'sum(rate(scheduler_scheduling_algorithm_duration_seconds_count{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s, instance=~"$instance"}[%(grafanaIntervalVar)s])) by (%(clusterLabel)s, instance)' % $._config) 96 | + prometheus.withLegendFormat('{{%(clusterLabel)s}} {{instance}} scheduling algorithm' % $._config), 97 | 98 | prometheus.new('${datasource}', 'sum(rate(scheduler_volume_scheduling_duration_seconds_count{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s, instance=~"$instance"}[%(grafanaIntervalVar)s])) by (%(clusterLabel)s, instance)' % $._config) 99 | + prometheus.withLegendFormat('{{%(clusterLabel)s}} {{instance}} volume' % $._config), 100 | ]), 101 | 102 | tsPanel.new('Scheduling latency 99th Quantile') 103 | + tsPanel.gridPos.withW(10) 104 | + tsPanel.standardOptions.withUnit('s') 105 | + tsPanel.queryOptions.withTargets([ 106 | prometheus.new('${datasource}', 'histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s,instance=~"$instance"}[%(grafanaIntervalVar)s])) by (%(clusterLabel)s, instance, le))' % $._config) 107 | + prometheus.withLegendFormat('{{%(clusterLabel)s}} {{instance}} e2e' % $._config), 108 | 109 | prometheus.new('${datasource}', 'histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s,instance=~"$instance"}[%(grafanaIntervalVar)s])) by (%(clusterLabel)s, instance, le))' % $._config) 110 | + prometheus.withLegendFormat('{{%(clusterLabel)s}} {{instance}} binding' % $._config), 111 | 112 | prometheus.new('${datasource}', 'histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s,instance=~"$instance"}[%(grafanaIntervalVar)s])) by (%(clusterLabel)s, instance, le))' % $._config) 113 | + prometheus.withLegendFormat('{{%(clusterLabel)s}} {{instance}} scheduling algorithm' % $._config), 114 | 115 | prometheus.new('${datasource}', 'histogram_quantile(0.99, sum(rate(scheduler_volume_scheduling_duration_seconds_bucket{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s,instance=~"$instance"}[%(grafanaIntervalVar)s])) by (%(clusterLabel)s, instance, le))' % $._config) 116 | + prometheus.withLegendFormat('{{%(clusterLabel)s}} {{instance}} volume' % $._config), 117 | ]), 118 | 119 | tsPanel.new('Kube API Request Rate') 120 | + tsPanel.gridPos.withW(8) 121 | + tsPanel.standardOptions.withUnit('ops') 122 | + tsPanel.queryOptions.withTargets([ 123 | prometheus.new('${datasource}', 'sum(rate(rest_client_requests_total{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s, instance=~"$instance",code=~"2.."}[%(grafanaIntervalVar)s]))' % 
$._config) 124 | + prometheus.withLegendFormat('2xx'), 125 | 126 | prometheus.new('${datasource}', 'sum(rate(rest_client_requests_total{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s, instance=~"$instance",code=~"3.."}[%(grafanaIntervalVar)s]))' % $._config) 127 | + prometheus.withLegendFormat('3xx'), 128 | 129 | prometheus.new('${datasource}', 'sum(rate(rest_client_requests_total{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s, instance=~"$instance",code=~"4.."}[%(grafanaIntervalVar)s]))' % $._config) 130 | + prometheus.withLegendFormat('4xx'), 131 | 132 | prometheus.new('${datasource}', 'sum(rate(rest_client_requests_total{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s, instance=~"$instance",code=~"5.."}[%(grafanaIntervalVar)s]))' % $._config) 133 | + prometheus.withLegendFormat('5xx'), 134 | ]), 135 | 136 | tsPanel.new('Post Request Latency 99th Quantile') 137 | + tsPanel.gridPos.withW(16) 138 | + tsPanel.standardOptions.withUnit('s') 139 | + tsPanel.queryOptions.withTargets([ 140 | prometheus.new('${datasource}', 'histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s, instance=~"$instance", verb="POST"}[%(grafanaIntervalVar)s])) by (verb, le))' % $._config) 141 | + prometheus.withLegendFormat('{{verb}}'), 142 | ]), 143 | 144 | tsPanel.new('Get Request Latency 99th Quantile') 145 | + tsPanel.gridPos.withW(24) 146 | + tsPanel.standardOptions.withUnit('s') 147 | + tsPanel.queryOptions.withTargets([ 148 | prometheus.new('${datasource}', 'histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s, instance=~"$instance", verb="GET"}[%(grafanaIntervalVar)s])) by (verb, le))' % $._config) 149 | + prometheus.withLegendFormat('{{verb}}'), 150 | ]), 151 | 152 | 153 | tsPanel.new('Memory') 154 | + tsPanel.gridPos.withW(8) 155 | + tsPanel.standardOptions.withUnit('bytes') 156 | + tsPanel.queryOptions.withTargets([ 157 | prometheus.new('${datasource}', 'process_resident_memory_bytes{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s, instance=~"$instance"}' % $._config) 158 | + prometheus.withLegendFormat('{{instance}}'), 159 | ]), 160 | 161 | tsPanel.new('CPU usage') 162 | + tsPanel.gridPos.withW(8) 163 | + tsPanel.standardOptions.withUnit('short') 164 | + tsPanel.queryOptions.withTargets([ 165 | prometheus.new('${datasource}', 'rate(process_cpu_seconds_total{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s, instance=~"$instance"}[%(grafanaIntervalVar)s])' % $._config) 166 | + prometheus.withLegendFormat('{{instance}}'), 167 | ]), 168 | 169 | tsPanel.new('Goroutines') 170 | + tsPanel.gridPos.withW(8) 171 | + tsPanel.standardOptions.withUnit('short') 172 | + tsPanel.queryOptions.withTargets([ 173 | prometheus.new('${datasource}', 'go_goroutines{%(clusterLabel)s="$cluster", %(kubeSchedulerSelector)s,instance=~"$instance"}' % $._config) 174 | + prometheus.withLegendFormat('{{instance}}'), 175 | ]), 176 | ]; 177 | 178 | g.dashboard.new('%(dashboardNamePrefix)sScheduler' % $._config.grafanaK8s) 179 | + g.dashboard.withUid($._config.grafanaDashboardIDs['scheduler.json']) 180 | + g.dashboard.withTags($._config.grafanaK8s.dashboardTags) 181 | + g.dashboard.withEditable(false) 182 | + g.dashboard.time.withFrom('now-1h') 183 | + g.dashboard.time.withTo('now') 184 | + g.dashboard.withRefresh($._config.grafanaK8s.refresh) 185 | + g.dashboard.withVariables([variables.datasource, variables.cluster, variables.instance]) 186 | +
g.dashboard.withPanels(g.util.grid.wrapPanels(panels, panelWidth=12, panelHeight=7)), 187 | }, 188 | } 189 | -------------------------------------------------------------------------------- /jsonnetfile.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dependencies": [ 4 | { 5 | "source": { 6 | "git": { 7 | "remote": "https://github.com/grafana/grafonnet.git", 8 | "subdir": "gen/grafonnet-latest" 9 | } 10 | }, 11 | "version": "main" 12 | } 13 | ], 14 | "legacyImports": false 15 | } 16 | -------------------------------------------------------------------------------- /lib/absent_alert.libsonnet: -------------------------------------------------------------------------------- 1 | { 2 | local absentAlert = self, 3 | componentName:: error 'must provide component name', 4 | selector:: error 'must provide selector for component', 5 | 6 | alert: '%sDown' % absentAlert.componentName, 7 | expr: ||| 8 | absent(up{%s} == 1) 9 | ||| % absentAlert.selector, 10 | 'for': '15m', 11 | labels: { 12 | severity: 'critical', 13 | }, 14 | annotations: { 15 | description: '%s has disappeared from Prometheus target discovery.' % absentAlert.componentName, 16 | summary: 'Target disappeared from Prometheus target discovery.', 17 | }, 18 | } 19 | -------------------------------------------------------------------------------- /lib/add-runbook-links.libsonnet: -------------------------------------------------------------------------------- 1 | local utils = import 'utils.libsonnet'; 2 | 3 | local lower(x) = 4 | local cp(c) = std.codepoint(c); 5 | local lowerLetter(c) = 6 | if cp(c) >= 65 && cp(c) < 91 7 | then std.char(cp(c) + 32) 8 | else c; 9 | std.join('', std.map(lowerLetter, std.stringChars(x))); 10 | 11 | { 12 | _config+:: { 13 | runbookURLPattern: 'https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-%s', 14 | }, 15 | 16 | prometheusAlerts+:: 17 | local addRunbookURL(rule) = rule { 18 | [if 'alert' in rule && !('runbook_url' in rule.annotations) then 'annotations']+: { 19 | runbook_url: $._config.runbookURLPattern % lower(rule.alert), 20 | }, 21 | }; 22 | utils.mapRuleGroups(addRunbookURL), 23 | } 24 | -------------------------------------------------------------------------------- /lib/alerts.jsonnet: -------------------------------------------------------------------------------- 1 | std.manifestYamlDoc((import '../mixin.libsonnet').prometheusAlerts) 2 | -------------------------------------------------------------------------------- /lib/dashboards.jsonnet: -------------------------------------------------------------------------------- 1 | local dashboards = (import '../mixin.libsonnet').grafanaDashboards; 2 | 3 | { 4 | [name]: dashboards[name] 5 | for name in std.objectFields(dashboards) 6 | } 7 | -------------------------------------------------------------------------------- /lib/rules.jsonnet: -------------------------------------------------------------------------------- 1 | std.manifestYamlDoc((import '../mixin.libsonnet').prometheusRules) 2 | -------------------------------------------------------------------------------- /lib/utils.libsonnet: -------------------------------------------------------------------------------- 1 | { 2 | mapRuleGroups(f): { 3 | groups: [ 4 | group { 5 | rules: [ 6 | f(rule) 7 | for rule in super.rules 8 | ], 9 | } 10 | for group in super.groups 11 | ], 12 | }, 13 | 14 | humanizeSeconds(s):: 15 | if s > 60 * 60 * 24 16 | then '%.1f days' % (s / 60 / 60 / 24) 17 | else 
'%.1f hours' % (s / 60 / 60), 18 | 19 | // Handle adding `group_left` to join labels into a rule by wrapping its expression in (expr) * on(xxx) group_left(xxx) kube_xxx_labels. 20 | // If the kind of the rule is not defined, try to detect the rule type from the alert name. 21 | wrap_rule_for_labels(rule, config): 22 | // Detect the kind of rule from its name, unless a hidden `kind` field is passed in the rule 23 | local kind = 24 | if 'kind' in rule then rule.kind 25 | // Handle Alerts 26 | else if std.objectHas(rule, 'alert') then 27 | if std.startsWith(rule.alert, 'KubePod') then 'pod' 28 | else if std.startsWith(rule.alert, 'KubeContainer') then 'pod' 29 | else if std.startsWith(rule.alert, 'KubeStateful') then 'statefulset' 30 | else if std.startsWith(rule.alert, 'KubeDeploy') then 'deployment' 31 | else if std.startsWith(rule.alert, 'KubeDaemon') then 'daemonset' 32 | else if std.startsWith(rule.alert, 'KubeHpa') then 'horizontalpodautoscaler' 33 | else if std.startsWith(rule.alert, 'KubeJob') then 'job' 34 | else 'none' 35 | else 'none'; 36 | 37 | local labels = { 38 | join_labels: config['%ss_join_labels' % kind], 39 | // since the label 'job' is reserved, the resource with kind Job uses the label 'job_name' instead 40 | on_labels: ['%s' % (if kind == 'job' then 'job_name' else kind), '%s' % config.namespaceLabel, '%s' % config.clusterLabel], 41 | metric: 'kube_%s_labels' % kind, 42 | }; 43 | 44 | // Failed to identify kind - return raw rule 45 | if kind == 'none' then rule 46 | // No join labels passed in the config - return raw rule 47 | else if std.length(labels.join_labels) == 0 then rule 48 | // Wrap expr with the group_left join 49 | else 50 | rule { 51 | local expr = super.expr, 52 | expr: '(%(expr)s) * on (%(on)s) group_left(%(join)s) %(metric)s' % { 53 | expr: expr, 54 | on: std.join(',', labels.on_labels), 55 | join: std.join(',', labels.join_labels), 56 | metric: labels.metric, 57 | }, 58 | }, 59 | 60 | // if showMultiCluster is true in config, return the string, otherwise return an empty string 61 | ifShowMultiCluster(config, string):: 62 | if config.showMultiCluster then string else '', 63 | } 64 | -------------------------------------------------------------------------------- /mixin.libsonnet: -------------------------------------------------------------------------------- 1 | (import 'alerts/alerts.libsonnet') + 2 | (import 'dashboards/dashboards.libsonnet') + 3 | (import 'rules/rules.libsonnet') + 4 | (import 'config.libsonnet') 5 | -------------------------------------------------------------------------------- /rules/kube_apiserver-availability.libsonnet: -------------------------------------------------------------------------------- 1 | { 2 | prometheusRules+:: { 3 | local SLODays = $._config.SLOs.apiserver.days + 'd', 4 | local verbs = [ 5 | { type: 'read', selector: $._config.kubeApiserverReadSelector }, 6 | { type: 'write', selector: $._config.kubeApiserverWriteSelector }, 7 | ], 8 | 9 | groups+: [ 10 | { 11 | name: 'kube-apiserver-availability.rules', 12 | interval: '3m', 13 | rules: [ 14 | { 15 | record: 'code_verb:apiserver_request_total:increase%s' % SLODays, 16 | expr: ||| 17 | avg_over_time(code_verb:apiserver_request_total:increase1h[%s]) * 24 * %d 18 | ||| % [SLODays, $._config.SLOs.apiserver.days], 19 | }, 20 | ] + [ 21 | { 22 | record: 'code:apiserver_request_total:increase%s' % SLODays, 23 | expr: ||| 24 | sum by (%s, code) (code_verb:apiserver_request_total:increase%s{%s}) 25 | ||| % [$._config.clusterLabel, SLODays, verb.selector], 26 | labels: { 27 | verb: verb.type, 28 | }, 29 | } 30 | for
verb in verbs 31 | ] + [ 32 | { 33 | record: 'cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h', 34 | expr: ||| 35 | sum by (%(clusterLabel)s, verb, scope, le) (increase(apiserver_request_sli_duration_seconds_bucket[1h])) 36 | ||| % $._config, 37 | }, 38 | { 39 | record: 'cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%s' % SLODays, 40 | expr: ||| 41 | sum by (%s, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h[%s]) * 24 * %s) 42 | ||| % [$._config.clusterLabel, SLODays, $._config.SLOs.apiserver.days], 43 | }, 44 | { 45 | record: 'cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h', 46 | expr: ||| 47 | sum by (%(clusterLabel)s, verb, scope) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h{le="+Inf"}) 48 | ||| % $._config, 49 | }, 50 | { 51 | record: 'cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase%s' % SLODays, 52 | expr: ||| 53 | sum by (%s, verb, scope) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%s{le="+Inf"}) 54 | ||| % [$._config.clusterLabel, SLODays], 55 | }, 56 | { 57 | record: 'apiserver_request:availability%s' % SLODays, 58 | expr: ||| 59 | 1 - ( 60 | ( 61 | # write too slow 62 | sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverWriteSelector)s}) 63 | - 64 | sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverWriteSelector)s,le=~"%(kubeApiserverWriteLatency)s"} or vector(0)) 65 | ) + 66 | ( 67 | # read too slow 68 | sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverReadSelector)s}) 69 | - 70 | ( 71 | sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope=~"resource|",le=~"%(kubeApiserverReadResourceLatency)s"} or vector(0)) 72 | + 73 | sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="namespace",le=~"%(kubeApiserverReadNamespaceLatency)s"} or vector(0)) 74 | + 75 | sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="cluster",le=~"%(kubeApiserverReadClusterLatency)s"} or vector(0)) 76 | ) 77 | ) + 78 | # errors 79 | sum by (%(clusterLabel)s) (code:apiserver_request_total:increase%(SLODays)s{code=~"5.."} or vector(0)) 80 | ) 81 | / 82 | sum by (%(clusterLabel)s) (code:apiserver_request_total:increase%(SLODays)s) 83 | ||| % ($._config { SLODays: SLODays }), 84 | labels: { 85 | verb: 'all', 86 | }, 87 | }, 88 | { 89 | record: 'apiserver_request:availability%s' % SLODays, 90 | expr: ||| 91 | 1 - ( 92 | sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverReadSelector)s}) 93 | - 94 | ( 95 | # too slow 96 | sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope=~"resource|",le=~"%(kubeApiserverReadResourceLatency)s"} or vector(0)) 97 | + 98 | sum by (%(clusterLabel)s) 
(cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="namespace",le=~"%(kubeApiserverReadNamespaceLatency)s"} or vector(0)) 99 | + 100 | sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="cluster",le=~"%(kubeApiserverReadClusterLatency)s"} or vector(0)) 101 | ) 102 | + 103 | # errors 104 | sum by (%(clusterLabel)s) (code:apiserver_request_total:increase%(SLODays)s{verb="read",code=~"5.."} or vector(0)) 105 | ) 106 | / 107 | sum by (%(clusterLabel)s) (code:apiserver_request_total:increase%(SLODays)s{verb="read"}) 108 | ||| % ($._config { SLODays: SLODays, days: $._config.SLOs.apiserver.days }), 109 | labels: { 110 | verb: 'read', 111 | }, 112 | }, 113 | { 114 | record: 'apiserver_request:availability%s' % SLODays, 115 | expr: ||| 116 | 1 - ( 117 | ( 118 | # too slow 119 | sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverWriteSelector)s}) 120 | - 121 | sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverWriteSelector)s,le=~"%(kubeApiserverWriteLatency)s"} or vector(0)) 122 | ) 123 | + 124 | # errors 125 | sum by (%(clusterLabel)s) (code:apiserver_request_total:increase%(SLODays)s{verb="write",code=~"5.."} or vector(0)) 126 | ) 127 | / 128 | sum by (%(clusterLabel)s) (code:apiserver_request_total:increase%(SLODays)s{verb="write"}) 129 | ||| % ($._config { SLODays: SLODays, days: $._config.SLOs.apiserver.days }), 130 | labels: { 131 | verb: 'write', 132 | }, 133 | }, 134 | ] + [ 135 | { 136 | record: 'code_resource:apiserver_request_total:rate5m', 137 | expr: ||| 138 | sum by (%s,code,resource) (rate(apiserver_request_total{%s}[5m])) 139 | ||| % [$._config.clusterLabel, std.join(',', [$._config.kubeApiserverSelector, verb.selector])], 140 | labels: { 141 | verb: verb.type, 142 | }, 143 | } 144 | for verb in verbs 145 | ] + [ 146 | { 147 | record: 'code_verb:apiserver_request_total:increase1h', 148 | expr: ||| 149 | sum by (%s, code, verb) (increase(apiserver_request_total{%s,verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"%s"}[1h])) 150 | ||| % [$._config.clusterLabel, $._config.kubeApiserverSelector, code], 151 | } 152 | for code in ['2..', '3..', '4..', '5..'] 153 | ], 154 | }, 155 | ], 156 | }, 157 | } 158 | -------------------------------------------------------------------------------- /rules/kube_apiserver-burnrate.libsonnet: -------------------------------------------------------------------------------- 1 | { 2 | prometheusRules+:: { 3 | groups+: [ 4 | { 5 | name: 'kube-apiserver-burnrate.rules', 6 | rules: [ 7 | { 8 | record: 'apiserver_request:burnrate%(window)s' % w, 9 | expr: ||| 10 | ( 11 | ( 12 | # too slow 13 | sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_count{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s}[%(window)s])) 14 | - 15 | ( 16 | ( 17 | sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope=~"resource|",le=~"%(kubeApiserverReadResourceLatency)s"}[%(window)s])) 18 | or 19 | vector(0) 20 | ) 21 | + 22 | sum by (%(clusterLabel)s) 
(rate(apiserver_request_sli_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope="namespace",le=~"%(kubeApiserverReadNamespaceLatency)s"}[%(window)s])) 23 | + 24 | sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope="cluster",le=~"%(kubeApiserverReadClusterLatency)s"}[%(window)s])) 25 | ) 26 | ) 27 | + 28 | # errors 29 | sum by (%(clusterLabel)s) (rate(apiserver_request_total{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,code=~"5.."}[%(window)s])) 30 | ) 31 | / 32 | sum by (%(clusterLabel)s) (rate(apiserver_request_total{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s}[%(window)s])) 33 | ||| % { 34 | clusterLabel: $._config.clusterLabel, 35 | window: w, 36 | kubeApiserverSelector: $._config.kubeApiserverSelector, 37 | kubeApiserverReadSelector: $._config.kubeApiserverReadSelector, 38 | kubeApiserverNonStreamingSelector: $._config.kubeApiserverNonStreamingSelector, 39 | kubeApiserverReadResourceLatency: $._config.kubeApiserverReadResourceLatency, 40 | kubeApiserverReadNamespaceLatency: $._config.kubeApiserverReadNamespaceLatency, 41 | kubeApiserverReadClusterLatency: $._config.kubeApiserverReadClusterLatency, 42 | }, 43 | labels: { 44 | verb: 'read', 45 | }, 46 | } 47 | for w in std.set([ // Get the unique array of short and long window rates 48 | w.short 49 | for w in $._config.SLOs.apiserver.windows 50 | ] + [ 51 | w.long 52 | for w in $._config.SLOs.apiserver.windows 53 | ]) 54 | ] + [ 55 | { 56 | record: 'apiserver_request:burnrate%(window)s' % w, 57 | expr: ||| 58 | ( 59 | ( 60 | # too slow 61 | sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_count{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,%(kubeApiserverNonStreamingSelector)s}[%(window)s])) 62 | - 63 | sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,%(kubeApiserverNonStreamingSelector)s,le=~"%(kubeApiserverWriteLatency)s"}[%(window)s])) 64 | ) 65 | + 66 | sum by (%(clusterLabel)s) (rate(apiserver_request_total{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,code=~"5.."}[%(window)s])) 67 | ) 68 | / 69 | sum by (%(clusterLabel)s) (rate(apiserver_request_total{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s}[%(window)s])) 70 | ||| % { 71 | clusterLabel: $._config.clusterLabel, 72 | window: w, 73 | kubeApiserverSelector: $._config.kubeApiserverSelector, 74 | kubeApiserverWriteSelector: $._config.kubeApiserverWriteSelector, 75 | kubeApiserverNonStreamingSelector: $._config.kubeApiserverNonStreamingSelector, 76 | kubeApiserverWriteLatency: $._config.kubeApiserverWriteLatency, 77 | }, 78 | labels: { 79 | verb: 'write', 80 | }, 81 | } 82 | for w in std.set([ // Get the unique array of short and long window rates 83 | w.short 84 | for w in $._config.SLOs.apiserver.windows 85 | ] + [ 86 | w.long 87 | for w in $._config.SLOs.apiserver.windows 88 | ]) 89 | ], 90 | }, 91 | ], 92 | }, 93 | } 94 | -------------------------------------------------------------------------------- /rules/kube_apiserver-config.libsonnet: -------------------------------------------------------------------------------- 1 | { 2 | _config+:: { 3 | kubeApiserverSelector: 'job="kube-apiserver"', 4 | podLabel: 'pod', 5 | kubeApiserverReadSelector: 'verb=~"LIST|GET"', 6 | kubeApiserverWriteSelector: 
'verb=~"POST|PUT|PATCH|DELETE"', 7 | kubeApiserverNonStreamingSelector: 'subresource!~"proxy|attach|log|exec|portforward"', 8 | // These are buckets that exist on the apiserver_request_sli_duration_seconds_bucket histogram. 9 | // They are what the Kubernetes SIG Scalability is using to measure availability of Kubernetes clusters. 10 | // If you want to change these, make sure the "le" buckets exist on the histogram! 11 | kubeApiserverReadResourceLatency: '1(\\\\.0)?', 12 | kubeApiserverReadNamespaceLatency: '5(\\\\.0)?', 13 | kubeApiserverReadClusterLatency: '30(\\\\.0)?', 14 | kubeApiserverWriteLatency: '1(\\\\.0)?', 15 | }, 16 | } 17 | -------------------------------------------------------------------------------- /rules/kube_apiserver-histogram.libsonnet: -------------------------------------------------------------------------------- 1 | { 2 | prometheusRules+:: { 3 | local verbs = [ 4 | { type: 'read', selector: $._config.kubeApiserverReadSelector }, 5 | { type: 'write', selector: $._config.kubeApiserverWriteSelector }, 6 | ], 7 | 8 | groups+: [ 9 | { 10 | name: 'kube-apiserver-histogram.rules', 11 | rules: 12 | [ 13 | { 14 | record: 'cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile', 15 | expr: ||| 16 | histogram_quantile(0.99, sum by (%s, le, resource) (rate(apiserver_request_sli_duration_seconds_bucket{%s}[5m]))) > 0 17 | ||| % [$._config.clusterLabel, std.join(',', [$._config.kubeApiserverSelector, verb.selector, $._config.kubeApiserverNonStreamingSelector])], 18 | labels: { 19 | verb: verb.type, 20 | quantile: '0.99', 21 | }, 22 | } 23 | for verb in verbs 24 | ], 25 | }, 26 | ], 27 | }, 28 | } 29 | -------------------------------------------------------------------------------- /rules/kube_apiserver.libsonnet: -------------------------------------------------------------------------------- 1 | (import 'kube_apiserver-config.libsonnet') + 2 | (import 'kube_apiserver-availability.libsonnet') + 3 | (import 'kube_apiserver-burnrate.libsonnet') + 4 | (import 'kube_apiserver-histogram.libsonnet') 5 | -------------------------------------------------------------------------------- /rules/kube_scheduler.libsonnet: -------------------------------------------------------------------------------- 1 | { 2 | _config+:: { 3 | kubeSchedulerSelector: 'job="kube-scheduler"', 4 | podLabel: 'pod', 5 | }, 6 | 7 | prometheusRules+:: { 8 | groups+: [ 9 | { 10 | name: 'kube-scheduler.rules', 11 | rules: [ 12 | { 13 | record: 'cluster_quantile:%s:histogram_quantile' % metric, 14 | expr: ||| 15 | histogram_quantile(%(quantile)s, sum(rate(%(metric)s_bucket{%(kubeSchedulerSelector)s}[5m])) without(instance, %(podLabel)s)) 16 | ||| % ({ quantile: quantile, metric: metric } + $._config), 17 | labels: { 18 | quantile: quantile, 19 | }, 20 | } 21 | for quantile in ['0.99', '0.9', '0.5'] 22 | for metric in [ 23 | 'scheduler_e2e_scheduling_duration_seconds', 24 | 'scheduler_scheduling_algorithm_duration_seconds', 25 | 'scheduler_binding_duration_seconds', 26 | ] 27 | ], 28 | }, 29 | ], 30 | }, 31 | } 32 | -------------------------------------------------------------------------------- /rules/kubelet.libsonnet: -------------------------------------------------------------------------------- 1 | { 2 | _config+:: { 3 | kubeletSelector: 'job="kubelet"', 4 | }, 5 | 6 | prometheusRules+:: { 7 | groups+: [ 8 | { 9 | name: 'kubelet.rules', 10 | rules: [ 11 | { 12 | record: 'node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile', 13 | expr: ||| 14 | 
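# Per-node PLEG relist latency quantiles; the group_left(node) join onto kubelet_node_name attaches the node label to each result.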
histogram_quantile(%(quantile)s, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{%(kubeletSelector)s}[5m])) by (%(clusterLabel)s, instance, le) * on(%(clusterLabel)s, instance) group_left(node) kubelet_node_name{%(kubeletSelector)s}) 15 | ||| % ({ quantile: quantile } + $._config), 16 | labels: { 17 | quantile: quantile, 18 | }, 19 | } 20 | for quantile in ['0.99', '0.9', '0.5'] 21 | ], 22 | }, 23 | ], 24 | }, 25 | } 26 | -------------------------------------------------------------------------------- /rules/node.libsonnet: -------------------------------------------------------------------------------- 1 | { 2 | _config+:: { 3 | kubeStateMetricsSelector: 'job="kube-state-metrics"', 4 | nodeExporterSelector: 'job="node-exporter"', 5 | podLabel: 'pod', 6 | }, 7 | 8 | prometheusRules+:: { 9 | groups+: [ 10 | { 11 | name: 'node.rules', 12 | rules: [ 13 | { 14 | // This rule results in the tuples (node, namespace, instance) => 1. 15 | // It is used to calculate per-node metrics, given namespace & instance. 16 | // We use the topk() aggregator to ensure that each (namespace, 17 | // instance) tuple is only associated to one node and thus avoid 18 | // "many-to-many matching not allowed" errors when joining with 19 | // other timeseries on (namespace, instance). See node:node_num_cpu:sum 20 | // below for instance. 21 | record: 'node_namespace_pod:kube_pod_info:', 22 | expr: ||| 23 | topk by(%(clusterLabel)s, namespace, %(podLabel)s) (1, 24 | max by (%(clusterLabel)s, node, namespace, %(podLabel)s) ( 25 | label_replace(kube_pod_info{%(kubeStateMetricsSelector)s,node!=""}, "%(podLabel)s", "$1", "pod", "(.*)") 26 | )) 27 | ||| % $._config, 28 | }, 29 | { 30 | // This rule gives the number of CPUs per node. 31 | record: 'node:node_num_cpu:sum', 32 | expr: ||| 33 | count by (%(clusterLabel)s, node) ( 34 | node_cpu_seconds_total{mode="idle",%(nodeExporterSelector)s} 35 | * on (%(clusterLabel)s, namespace, %(podLabel)s) group_left(node) 36 | topk by(%(clusterLabel)s, namespace, %(podLabel)s) (1, node_namespace_pod:kube_pod_info:) 37 | ) 38 | ||| % $._config, 39 | }, 40 | // Add separate rules for Available memory, so we can aggregate across clusters in dashboards. 41 | { 42 | record: ':node_memory_MemAvailable_bytes:sum', 43 | expr: ||| 44 | sum( 45 | node_memory_MemAvailable_bytes{%(nodeExporterSelector)s} or 46 | ( 47 | node_memory_Buffers_bytes{%(nodeExporterSelector)s} + 48 | node_memory_Cached_bytes{%(nodeExporterSelector)s} + 49 | node_memory_MemFree_bytes{%(nodeExporterSelector)s} + 50 | node_memory_Slab_bytes{%(nodeExporterSelector)s} 51 | ) 52 | ) by (%(clusterLabel)s) 53 | ||| % $._config, 54 | }, 55 | { 56 | // This rule gives cpu utilization per node. 
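// It averages, across each node's CPUs, the 5m rate of CPU time spent outside the idle, iowait and steal modes, yielding a ratio between 0 and 1.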
57 | record: 'node:node_cpu_utilization:ratio_rate5m', 58 | expr: ||| 59 | avg by (%(clusterLabel)s, node) ( 60 | sum without (mode) ( 61 | rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal",%(nodeExporterSelector)s}[5m]) 62 | ) 63 | ) 64 | ||| % $._config, 65 | }, 66 | { 67 | // This rule gives cpu utilization per cluster. 68 | record: 'cluster:node_cpu:ratio_rate5m', 69 | expr: ||| 70 | avg by (%(clusterLabel)s) ( 71 | node:node_cpu_utilization:ratio_rate5m 72 | ) 73 | ||| % $._config, 74 | }, 75 | ], 76 | }, 77 | ], 78 | }, 79 | } 80 | -------------------------------------------------------------------------------- /rules/rules.libsonnet: -------------------------------------------------------------------------------- 1 | (import 'kube_apiserver.libsonnet') + 2 | (import 'apps.libsonnet') + 3 | (import 'kube_scheduler.libsonnet') + 4 | (import 'node.libsonnet') + 5 | (import 'kubelet.libsonnet') 6 | -------------------------------------------------------------------------------- /rules/windows.libsonnet: -------------------------------------------------------------------------------- 1 | { 2 | prometheusRules+:: { 3 | groups+: [ 4 | { 5 | name: 'windows.node.rules', 6 | rules: [ 7 | { 8 | // This rule gives the number of Windows nodes. 9 | record: 'node:windows_node:sum', 10 | expr: ||| 11 | count by (%(clusterLabel)s) ( 12 | windows_system_boot_time_timestamp_seconds{%(windowsExporterSelector)s} 13 | ) 14 | ||| % $._config, 15 | }, 16 | { 17 | // This rule gives the number of CPUs per node. 18 | record: 'node:windows_node_num_cpu:sum', 19 | expr: ||| 20 | count by (%(clusterLabel)s, instance) (sum by (%(clusterLabel)s, instance, core) ( 21 | windows_cpu_time_total{%(windowsExporterSelector)s} 22 | )) 23 | ||| % $._config, 24 | }, 25 | { 26 | // CPU utilisation is the percentage of CPU time that is not idle. 27 | record: ':windows_node_cpu_utilisation:avg1m', 28 | expr: ||| 29 | 1 - avg by (%(clusterLabel)s) (rate(windows_cpu_time_total{%(windowsExporterSelector)s,mode="idle"}[1m])) 30 | ||| % $._config, 31 | }, 32 | { 33 | // CPU utilisation is the percentage of CPU time that is not idle. 34 | record: 'node:windows_node_cpu_utilisation:avg1m', 35 | expr: ||| 36 | 1 - avg by (%(clusterLabel)s, instance) ( 37 | rate(windows_cpu_time_total{%(windowsExporterSelector)s,mode="idle"}[1m]) 38 | ) 39 | ||| % $._config, 40 | }, 41 | { 42 | record: ':windows_node_memory_utilisation:', 43 | expr: ||| 44 | 1 - 45 | sum by (%(clusterLabel)s) (windows_memory_available_bytes{%(windowsExporterSelector)s}) 46 | / 47 | sum by (%(clusterLabel)s) (windows_os_visible_memory_bytes{%(windowsExporterSelector)s}) 48 | ||| % $._config, 49 | }, 50 | // Add separate rules for Free & Total, so we can aggregate across clusters 51 | // in dashboards.
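// "Free" is approximated as available plus cache bytes, as recorded below.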
52 | { 53 | record: ':windows_node_memory_MemFreeCached_bytes:sum', 54 | expr: ||| 55 | sum by (%(clusterLabel)s) (windows_memory_available_bytes{%(windowsExporterSelector)s} + windows_memory_cache_bytes{%(windowsExporterSelector)s}) 56 | ||| % $._config, 57 | }, 58 | { 59 | record: 'node:windows_node_memory_totalCached_bytes:sum', 60 | expr: ||| 61 | (windows_memory_cache_bytes{%(windowsExporterSelector)s} + windows_memory_modified_page_list_bytes{%(windowsExporterSelector)s} + windows_memory_standby_cache_core_bytes{%(windowsExporterSelector)s} + windows_memory_standby_cache_normal_priority_bytes{%(windowsExporterSelector)s} + windows_memory_standby_cache_reserve_bytes{%(windowsExporterSelector)s}) 62 | ||| % $._config, 63 | }, 64 | { 65 | record: ':windows_node_memory_MemTotal_bytes:sum', 66 | expr: ||| 67 | sum by (%(clusterLabel)s) (windows_os_visible_memory_bytes{%(windowsExporterSelector)s}) 68 | ||| % $._config, 69 | }, 70 | { 71 | // Available memory per node 72 | // SINCE 2018-02-08 73 | record: 'node:windows_node_memory_bytes_available:sum', 74 | expr: ||| 75 | sum by (%(clusterLabel)s, instance) ( 76 | (windows_memory_available_bytes{%(windowsExporterSelector)s}) 77 | ) 78 | ||| % $._config, 79 | }, 80 | { 81 | // Total memory per node 82 | record: 'node:windows_node_memory_bytes_total:sum', 83 | expr: ||| 84 | sum by (%(clusterLabel)s, instance) ( 85 | windows_os_visible_memory_bytes{%(windowsExporterSelector)s} 86 | ) 87 | ||| % $._config, 88 | }, 89 | { 90 | // Memory utilisation per node, normalized by cluster-wide total memory 91 | record: 'node:windows_node_memory_utilisation:ratio', 92 | expr: ||| 93 | (node:windows_node_memory_bytes_total:sum - node:windows_node_memory_bytes_available:sum) 94 | / 95 | scalar(sum(node:windows_node_memory_bytes_total:sum)) 96 | |||, 97 | }, 98 | { 99 | record: 'node:windows_node_memory_utilisation:', 100 | expr: ||| 101 | 1 - (node:windows_node_memory_bytes_available:sum / node:windows_node_memory_bytes_total:sum) 102 | ||| % $._config, 103 | }, 104 | { 105 | record: 'node:windows_node_memory_swap_io_pages:irate', 106 | expr: ||| 107 | irate(windows_memory_swap_page_operations_total{%(windowsExporterSelector)s}[5m]) 108 | ||| % $._config, 109 | }, 110 | { 111 | // Disk utilisation (seconds spent on I/O; as a per-second rate it is bounded by 1) 112 | record: ':windows_node_disk_utilisation:avg_irate', 113 | expr: ||| 114 | avg by (%(clusterLabel)s) (irate(windows_logical_disk_read_seconds_total{%(windowsExporterSelector)s}[1m]) + 115 | irate(windows_logical_disk_write_seconds_total{%(windowsExporterSelector)s}[1m]) 116 | ) 117 | ||| % $._config, 118 | }, 119 | { 120 | // Disk utilisation (seconds spent on I/O; as a per-second rate it is bounded by 1) 121 | record: 'node:windows_node_disk_utilisation:avg_irate', 122 | expr: ||| 123 | avg by (%(clusterLabel)s, instance) ( 124 | (irate(windows_logical_disk_read_seconds_total{%(windowsExporterSelector)s}[1m]) + 125 | irate(windows_logical_disk_write_seconds_total{%(windowsExporterSelector)s}[1m])) 126 | ) 127 | ||| % $._config, 128 | }, 129 | { 130 | record: 'node:windows_node_filesystem_usage:', 131 | expr: ||| 132 | max by (%(clusterLabel)s,instance,volume)( 133 | (windows_logical_disk_size_bytes{%(windowsExporterSelector)s} 134 | - windows_logical_disk_free_bytes{%(windowsExporterSelector)s}) 135 | / windows_logical_disk_size_bytes{%(windowsExporterSelector)s} 136 | ) 137 | ||| % $._config, 138 | }, 139 | { 140 | record: 'node:windows_node_filesystem_avail:', 141 | expr: ||| 142 | max by (%(clusterLabel)s, instance, volume)
(windows_logical_disk_free_bytes{%(windowsExporterSelector)s} / windows_logical_disk_size_bytes{%(windowsExporterSelector)s}) 143 | ||| % $._config, 144 | }, 145 | { 146 | record: ':windows_node_net_utilisation:sum_irate', 147 | expr: ||| 148 | sum by (%(clusterLabel)s) (irate(windows_net_bytes_total{%(windowsExporterSelector)s}[1m])) 149 | ||| % $._config, 150 | }, 151 | { 152 | record: 'node:windows_node_net_utilisation:sum_irate', 153 | expr: ||| 154 | sum by (%(clusterLabel)s, instance) ( 155 | (irate(windows_net_bytes_total{%(windowsExporterSelector)s}[1m])) 156 | ) 157 | ||| % $._config, 158 | }, 159 | { 160 | record: ':windows_node_net_saturation:sum_irate', 161 | expr: ||| 162 | sum by (%(clusterLabel)s) (irate(windows_net_packets_received_discarded_total{%(windowsExporterSelector)s}[1m])) + 163 | sum by (%(clusterLabel)s) (irate(windows_net_packets_outbound_discarded_total{%(windowsExporterSelector)s}[1m])) 164 | ||| % $._config, 165 | }, 166 | { 167 | record: 'node:windows_node_net_saturation:sum_irate', 168 | expr: ||| 169 | sum by (%(clusterLabel)s, instance) ( 170 | (irate(windows_net_packets_received_discarded_total{%(windowsExporterSelector)s}[1m]) + 171 | irate(windows_net_packets_outbound_discarded_total{%(windowsExporterSelector)s}[1m])) 172 | ) 173 | ||| % $._config, 174 | }, 175 | ], 176 | }, 177 | { 178 | name: 'windows.pod.rules', 179 | rules: [ 180 | { 181 | record: 'windows_pod_container_available', 182 | expr: ||| 183 | windows_container_available{%(windowsExporterSelector)s, container_id != ""} * on(container_id, %(clusterLabel)s) group_left(container, pod, namespace) max(kube_pod_container_info{%(kubeStateMetricsSelector)s, container_id != ""}) by(container, container_id, pod, namespace, %(clusterLabel)s) 184 | ||| % $._config, 185 | }, 186 | { 187 | record: 'windows_container_total_runtime', 188 | expr: ||| 189 | windows_container_cpu_usage_seconds_total{%(windowsExporterSelector)s, container_id != ""} * on(container_id, %(clusterLabel)s) group_left(container, pod, namespace) max(kube_pod_container_info{%(kubeStateMetricsSelector)s, container_id != ""}) by(container, container_id, pod, namespace, %(clusterLabel)s) 190 | ||| % $._config, 191 | }, 192 | { 193 | record: 'windows_container_memory_usage', 194 | expr: ||| 195 | windows_container_memory_usage_commit_bytes{%(windowsExporterSelector)s, container_id != ""} * on(container_id, %(clusterLabel)s) group_left(container, pod, namespace) max(kube_pod_container_info{%(kubeStateMetricsSelector)s, container_id != ""}) by(container, container_id, pod, namespace, %(clusterLabel)s) 196 | ||| % $._config, 197 | }, 198 | { 199 | record: 'windows_container_private_working_set_usage', 200 | expr: ||| 201 | windows_container_memory_usage_private_working_set_bytes{%(windowsExporterSelector)s, container_id != ""} * on(container_id, %(clusterLabel)s) group_left(container, pod, namespace) max(kube_pod_container_info{%(kubeStateMetricsSelector)s, container_id != ""}) by(container, container_id, pod, namespace, %(clusterLabel)s) 202 | ||| % $._config, 203 | }, 204 | { 205 | record: 'windows_container_network_received_bytes_total', 206 | expr: ||| 207 | windows_container_network_receive_bytes_total{%(windowsExporterSelector)s, container_id != ""} * on(container_id, %(clusterLabel)s) group_left(container, pod, namespace) max(kube_pod_container_info{%(kubeStateMetricsSelector)s, container_id != ""}) by(container, container_id, pod, namespace, %(clusterLabel)s) 208 | ||| % $._config, 209 | }, 210 | { 211 | record: 
'windows_container_network_transmitted_bytes_total', 212 | expr: ||| 213 | windows_container_network_transmit_bytes_total{%(windowsExporterSelector)s, container_id != ""} * on(container_id, %(clusterLabel)s) group_left(container, pod, namespace) max(kube_pod_container_info{%(kubeStateMetricsSelector)s, container_id != ""}) by(container, container_id, pod, namespace, %(clusterLabel)s) 214 | ||| % $._config, 215 | }, 216 | { 217 | record: 'kube_pod_windows_container_resource_memory_request', 218 | expr: ||| 219 | max by (%(clusterLabel)s, namespace, pod, container) ( 220 | kube_pod_container_resource_requests{resource="memory",%(kubeStateMetricsSelector)s} 221 | ) * on(container,pod,namespace,%(clusterLabel)s) (windows_pod_container_available) 222 | ||| % $._config, 223 | }, 224 | { 225 | record: 'kube_pod_windows_container_resource_memory_limit', 226 | expr: ||| 227 | kube_pod_container_resource_limits{resource="memory",%(kubeStateMetricsSelector)s} * on(container,pod,namespace,%(clusterLabel)s) (windows_pod_container_available) 228 | ||| % $._config, 229 | }, 230 | { 231 | record: 'kube_pod_windows_container_resource_cpu_cores_request', 232 | expr: ||| 233 | max by (%(clusterLabel)s, namespace, pod, container) ( 234 | kube_pod_container_resource_requests{resource="cpu",%(kubeStateMetricsSelector)s} 235 | ) * on(container,pod,namespace,%(clusterLabel)s) (windows_pod_container_available) 236 | ||| % $._config, 237 | }, 238 | { 239 | record: 'kube_pod_windows_container_resource_cpu_cores_limit', 240 | expr: ||| 241 | kube_pod_container_resource_limits{resource="cpu",%(kubeStateMetricsSelector)s} * on(container,pod,namespace,%(clusterLabel)s) (windows_pod_container_available) 242 | ||| % $._config, 243 | }, 244 | { 245 | record: 'namespace_pod_container:windows_container_cpu_usage_seconds_total:sum_rate', 246 | expr: ||| 247 | sum by (%(clusterLabel)s, namespace, pod, container) ( 248 | rate(windows_container_total_runtime{}[5m]) 249 | ) 250 | ||| % $._config, 251 | }, 252 | ], 253 | }, 254 | ], 255 | }, 256 | } 257 | -------------------------------------------------------------------------------- /scripts/check-selectors-ksm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Set -u to error out if we use an unset variable. 4 | # Set -o pipefail to propagate errors in a pipeline. 5 | set -uo pipefail 6 | 7 | # Remove kube-state-metrics directory if it exists. 8 | rm -rf kube-state-metrics 9 | 10 | # Clone kube-state-metrics repository. 11 | git clone https://github.com/kubernetes/kube-state-metrics --depth 1 12 | 13 | # Set the repository root. 14 | repository_root=$(git rev-parse --show-toplevel) 15 | 16 | # Change directory to kube-state-metrics. 17 | cd kube-state-metrics || exit 18 | 19 | # Grep all metrics in the codebase. 20 | find internal/store -type f -not -name '*_test.go' -exec sed -nE 's/.*"(kube_[^"]+)".*/\1/p' {} \; | sort -u > metrics.txt 21 | 22 | # Set the KSM selector specifier. 23 | ksm_selector="kubeStateMetricsSelector" 24 | 25 | # Set the paths to the alerts, lib and rules directories. 26 | alerts_path="$repository_root/alerts" 27 | lib_path="$repository_root/lib" 28 | rules_path="$repository_root/rules" 29 | 30 | # Read metrics.txt line by line. 
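# For each metric, report any usage whose label selector block omits the KSM selector.
# A hypothetical offending line would be reported as, e.g.: rules/example.libsonnet:12:kube_pod_info{node!=""} (illustrative only, not a real finding).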
31 | while IFS= read -r metric; do 32 | selector_misses=$(\ 33 | grep --only-matching --color=always --line-number "$metric{[^}]*}" --directories=recurse "$alerts_path" "$lib_path" "$rules_path" |\ 34 | grep --invert-match "$ksm_selector" \ 35 | ) 36 | if [ -n "$selector_misses" ]; then 37 | echo "The following $metric metrics are missing the $ksm_selector specifier:" 38 | echo "$selector_misses" 39 | fi 40 | done < metrics.txt 41 | 42 | # Clean artefacts. 43 | rm metrics.txt 44 | cd .. || exit 45 | rm -rf kube-state-metrics 46 | 47 | # TODO: Currently, there are only two possible states the workflow can report: success or failure. 48 | # We could benefit from a third "warning" state, for cases where we observe an overlap of selectors for the same metric. 49 | # Ref: https://docs.github.com/en/actions/creating-actions/setting-exit-codes-for-actions#about-exit-codes 50 | -------------------------------------------------------------------------------- /scripts/tools.go: -------------------------------------------------------------------------------- 1 | //go:build tools 2 | // +build tools 3 | 4 | // Package tools tracks dependencies for tools that are used in the build process. 5 | // See https://github.com/golang/go/issues/25922 6 | package tools 7 | 8 | import ( 9 | _ "github.com/Kunde21/markdownfmt/v3/cmd/markdownfmt" 10 | _ "github.com/cloudflare/pint/cmd/pint" 11 | _ "github.com/errata-ai/vale/v3/cmd/vale" 12 | _ "github.com/google/go-jsonnet/cmd/jsonnet" 13 | _ "github.com/google/go-jsonnet/cmd/jsonnet-lint" 14 | _ "github.com/google/go-jsonnet/cmd/jsonnetfmt" 15 | _ "github.com/grafana/dashboard-linter" 16 | _ "github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb" 17 | _ "github.com/prometheus/prometheus/cmd/promtool" 18 | ) 19 | -------------------------------------------------------------------------------- /tests/apiserver-availability-test.yaml: -------------------------------------------------------------------------------- 1 | rule_files: 2 | - ../prometheus_alerts.yaml 3 | - ../prometheus_rules.yaml 4 | 5 | evaluation_interval: 1m 6 | 7 | tests: 8 | - name: calculate apiserver request total increase 1h rate 9 | interval: 1m 10 | input_series: 11 | # 100 requests in the overall interval, 99 successful and 1 error 12 | - series: 'apiserver_request_total{job="kube-apiserver",verb="GET",code="200"}' 13 | values: '0 10 20 50 90 99' 14 | - series: 'apiserver_request_total{job="kube-apiserver",verb="GET",code="500"}' 15 | values: '0x2 1x2' 16 | 17 | promql_expr_test: 18 | - eval_time: 5m 19 | expr: code_verb:apiserver_request_total:increase1h{verb="GET"} 20 | exp_samples: 21 | - labels: 'code_verb:apiserver_request_total:increase1h{code="200", verb="GET"}' 22 | value: 99.0 23 | - labels: 'code_verb:apiserver_request_total:increase1h{code="500", verb="GET"}' 24 | value: 1.0 25 | 26 | - name: calculate apiserver request total increase 30d rate 27 | interval: 1m 28 | input_series: 29 | - series: code_verb:apiserver_request_total:increase1h{verb="GET",code="200"} 30 | values: '10+10x9' 31 | - series: code_verb:apiserver_request_total:increase1h{verb="GET",code="500"} 32 | values: '0+1x9' 33 | 34 | promql_expr_test: 35 | - eval_time: 10m 36 | expr: code_verb:apiserver_request_total:increase30d{verb="GET"} 37 | exp_samples: 38 | - labels: 'code_verb:apiserver_request_total:increase30d{code="200", verb="GET"}' 39 | value: 3.96e+4 # average of the input series values times 24 (hours) times 30 (days) 40 | - labels: 'code_verb:apiserver_request_total:increase30d{code="500", verb="GET"}' 41 | value:
3.24e+3 42 | -------------------------------------------------------------------------------- /tests/apps_alerts-test.yaml: -------------------------------------------------------------------------------- 1 | rule_files: 2 | - ../prometheus_alerts.yaml 3 | 4 | tests: 5 | - interval: 1m 6 | name: KubePdbNotEnoughHealthyPods fires when current healthy pods are fewer than desired 7 | input_series: 8 | - series: 'kube_poddisruptionbudget_status_desired_healthy{cluster="cluster1", namespace="ns1", poddisruptionbudget="pdb1", job="kube-state-metrics"}' 9 | values: '4x15' 10 | - series: 'kube_poddisruptionbudget_status_current_healthy{cluster="cluster1", namespace="ns1", poddisruptionbudget="pdb1", job="kube-state-metrics"}' 11 | values: '3x15' 12 | alert_rule_test: 13 | - eval_time: 14m 14 | alertname: KubePdbNotEnoughHealthyPods 15 | - eval_time: 15m 16 | alertname: KubePdbNotEnoughHealthyPods 17 | exp_alerts: 18 | - exp_labels: 19 | severity: "warning" 20 | cluster: "cluster1" 21 | namespace: "ns1" 22 | poddisruptionbudget: "pdb1" 23 | job: "kube-state-metrics" 24 | exp_annotations: 25 | description: "PDB ns1/pdb1 expects 1 more healthy pods. The desired number of healthy pods has not been met for at least 15m." 26 | runbook_url: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepdbnotenoughhealthypods" 27 | summary: "PDB does not have enough healthy pods." 28 | 29 | - interval: 1m 30 | name: KubeStatefulSetUpdateNotRolledOut still fires even if another label (e.g. instance) is present 31 | input_series: 32 | - series: 'kube_statefulset_status_current_revision{job="kube-state-metrics", cluster="c1", namespace="ns1", statefulset="ss1", revision="foo", instance="custom"}' 33 | values: '1x15' 34 | - series: 'kube_statefulset_status_update_revision{job="kube-state-metrics", cluster="c1", namespace="ns1", statefulset="ss1", revision="bar", instance="custom"}' 35 | values: '1x15' 36 | - series: 'kube_statefulset_replicas{job="kube-state-metrics", cluster="c1", namespace="ns1", statefulset="ss1", instance="custom"}' 37 | values: '5x15' 38 | - series: 'kube_statefulset_status_replicas_updated{job="kube-state-metrics", cluster="c1", namespace="ns1", statefulset="ss1", instance="custom"}' 39 | values: '1x15' 40 | alert_rule_test: 41 | - eval_time: 14m 42 | alertname: KubeStatefulSetUpdateNotRolledOut 43 | - eval_time: 15m 44 | alertname: KubeStatefulSetUpdateNotRolledOut 45 | exp_alerts: 46 | - exp_labels: 47 | cluster: "c1" 48 | job: "kube-state-metrics" 49 | namespace: "ns1" 50 | severity: "warning" 51 | statefulset: "ss1" 52 | exp_annotations: 53 | description: "StatefulSet ns1/ss1 update has not been rolled out." 54 | runbook_url: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout" 55 | summary: "StatefulSet update has not been rolled out." 56 | --------------------------------------------------------------------------------