├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── archetypes └── default.md ├── config.yaml ├── content ├── _index.md ├── docs │ └── add-runbook.md └── runbooks │ ├── _index.md │ ├── alertmanager │ ├── AlertmanagerClusterCrashlooping.md │ ├── AlertmanagerClusterDown.md │ ├── AlertmanagerClusterFailedToSendAlerts.md │ ├── AlertmanagerConfigInconsistent.md │ ├── AlertmanagerFailedReload.md │ ├── AlertmanagerFailedToSendAlerts.md │ ├── AlertmanagerMembersInconsistent.md │ └── _index.md │ ├── etcd │ ├── _index.md │ ├── etcdBackendQuotaLowSpace.md │ ├── etcdGRPCRequestsSlow.md │ ├── etcdHighFsyncDurations.md │ ├── etcdHighNumberOfFailedGRPCRequests.md │ ├── etcdInsufficientMembers.md │ ├── etcdMembersDown.md │ └── etcdNoLeader.md │ ├── general │ ├── InfoInhibitor.md │ ├── NodeNetworkInterfaceFlapping.md │ ├── TargetDown.md │ ├── Watchdog.md │ └── _index.md │ ├── kube-state-metrics │ ├── KubeStateMetricsListErrors.md │ ├── KubeStateMetricsShardingMismatch.md │ ├── KubeStateMetricsShardsMissing.md │ ├── KubeStateMetricsWatchErrors.md │ └── _index.md │ ├── kubernetes │ ├── CPUThrottlingHigh.md │ ├── KubeAPIDown.md │ ├── KubeAPIErrorBudgetBurn.md │ ├── KubeAPITerminatedRequests.md │ ├── KubeAggregatedAPIDown.md │ ├── KubeAggregatedAPIErrors.md │ ├── KubeCPUOvercommit.md │ ├── KubeCPUQuotaOvercommit.md │ ├── KubeClientCertificateExpiration.md │ ├── KubeClientErrors.md │ ├── KubeContainerWaiting.md │ ├── KubeControllerManagerDown.md │ ├── KubeDaemonSetMisScheduled.md │ ├── KubeDaemonSetNotScheduled.md │ ├── KubeDaemonSetRolloutStuck.md │ ├── KubeDeploymentGenerationMismatch.md │ ├── KubeDeploymentReplicasMismatch.md │ ├── KubeHpaMaxedOut.md │ ├── KubeHpaReplicasMismatch.md │ ├── KubeJobCompletion.md │ ├── KubeJobFailed.md │ ├── KubeMemoryOvercommit.md │ ├── KubeMemoryQuotaOvercommit.md │ ├── KubeNodeNotReady.md │ ├── KubeNodeReadinessFlapping.md │ ├── KubeNodeUnreachable.md │ ├── KubePersistentVolumeErrors.md │ ├── KubePersistentVolumeFillingUp.md │ ├── KubePodCrashLooping.md │ ├── KubePodNotReady.md │ ├── KubeProxyDown.md │ ├── KubeQuotaAlmostFull.md │ ├── KubeQuotaExceeded.md │ ├── KubeQuotaFullyUsed.md │ ├── KubeSchedulerDown.md │ ├── KubeStatefulSetGenerationMismatch.md │ ├── KubeStatefulSetReplicasMismatch.md │ ├── KubeStatefulSetUpdateNotRolledOut.md │ ├── KubeVersionMismatch.md │ ├── KubeletClientCertificateExpiration.md │ ├── KubeletClientCertificateRenewalErrors.md │ ├── KubeletDown.md │ ├── KubeletPlegDurationHigh.md │ ├── KubeletPodStartUpLatencyHigh.md │ ├── KubeletServerCertificateExpiration.md │ ├── KubeletServerCertificateRenewalErrors.md │ ├── KubeletTooManyPods.md │ └── _index.md │ ├── node │ ├── NodeClockNotSynchronising.md │ ├── NodeClockSkewDetected.md │ ├── NodeFileDescriptorLimit.md │ ├── NodeFilesystemAlmostOutOfFiles.md │ ├── NodeFilesystemAlmostOutOfSpace.md │ ├── NodeFilesystemFilesFillingUp.md │ ├── NodeFilesystemSpaceFillingUp.md │ ├── NodeHighNumberConntrackEntriesUsed.md │ ├── NodeNetworkReceiveErrs.md │ ├── NodeNetworkTransmitErrs.md │ ├── NodeRAIDDegraded.md │ ├── NodeRAIDDiskFailure.md │ ├── NodeTextFileCollectorScrapeError.md │ └── _index.md │ ├── prometheus-operator │ ├── ConfigReloaderSidecarErrors.md │ ├── PrometheusOperatorListErrors.md │ ├── PrometheusOperatorNodeLookupErrors.md │ ├── PrometheusOperatorNotReady.md │ ├── PrometheusOperatorReconcileErrors.md │ ├── PrometheusOperatorRejectedResources.md │ ├── PrometheusOperatorSyncFailed │ ├── PrometheusOperatorSyncFailed.md │ ├── PrometheusOperatorWatchErrors.md │ └── _index.md │ └── prometheus │ ├── 
PrometheusBadConfig.md │ ├── PrometheusDuplicateTimestamps.md │ ├── PrometheusErrorSendingAlertsToAnyAlertmanager.md │ ├── PrometheusErrorSendingAlertsToSomeAlertmanagers.md │ ├── PrometheusLabelLimitHit.md │ ├── PrometheusMissingRuleEvaluations.md │ ├── PrometheusNotConnectedToAlertmanagers.md │ ├── PrometheusNotIngestingSamples.md │ ├── PrometheusNotificationQueueRunningFull.md │ ├── PrometheusOutOfOrderTimestamps.md │ ├── PrometheusRemoteStorageFailures.md │ ├── PrometheusRemoteWriteBehind.md │ ├── PrometheusRemoteWriteDesiredShards.md │ ├── PrometheusRuleFailures.md │ ├── PrometheusTSDBCompactionsFailing.md │ ├── PrometheusTSDBReloadsFailing.md │ ├── PrometheusTargetLimitHit.md │ ├── PrometheusTargetSyncFailure.md │ └── _index.md └── layouts └── 404.html /.gitignore: -------------------------------------------------------------------------------- 1 | public/ 2 | resources/ 3 | .hugo_build.lock 4 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "themes/book"] 2 | path = themes/book 3 | url = https://github.com/alex-shpak/hugo-book 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Prometheus-Operator Runbooks 2 | 3 | This repo contains the official [runbooks](https://en.wikipedia.org/wiki/Runbook) for the various alerts sent out by components of the prometheus-operator ecosystem. 4 | 5 | The live version can be found at https://runbooks.prometheus-operator.dev/ 6 | 7 | For information about contributing, see [add-runbook.md](./content/docs/add-runbook.md). 
8 | -------------------------------------------------------------------------------- /archetypes/default.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "{{ replace .Name "-" " " | title }}" 3 | date: {{ .Date }} 4 | draft: true 5 | --- 6 | 7 | -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | baseURL: "https://runbooks.prometheus-operator.dev" 2 | languageCode: "en-us" 3 | title: "kube-prometheus runbooks" 4 | 5 | theme: book 6 | 7 | params: 8 | BookRepo: "https://github.com/prometheus-operator/runbooks" 9 | BookEditPath: "edit/main" 10 | BookSearch: true 11 | BookSection: "runbooks" 12 | #BookMenuBundle: "/menu" 13 | 14 | markup: 15 | goldmark: 16 | renderer: 17 | # Allows rendering HTML tags. This would only be truly "unsafe" if we were accepting untrusted user input and rendering it, which we aren't. 18 | unsafe: true 19 | -------------------------------------------------------------------------------- /content/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Introduction 3 | type: docs 4 | --- 5 | 6 | # Welcome! 7 | 8 | Welcome to the site hosting runbooks for alerts shipped with the 9 | [kube-prometheus](https://github.com/prometheus-operator/kube-prometheus) project. 10 | 11 | ## Reason 12 | 13 | Kube-prometheus was always meant to provide a complete monitoring solution for Kubernetes environments. The project 14 | already includes various components to fulfill this goal, and one crucial part is the included alerting rules. 15 | However, what good are those alerting rules when one doesn't know what to do when an alert fires? 16 | 17 | ## Goal 18 | 19 | We aim to ship a meaningful runbook for every alert in the kube-prometheus project and provide enough insight to help 20 | kube-prometheus users during incidents. 21 | 22 | ## How to contribute? 23 | 24 | If you find any issues with the current runbooks, please use the `Edit this page` link at the bottom of the runbook page. 25 | 26 | To add a new runbook, please follow the [add runbook](/docs/add-runbook) guide. 27 | 28 | If you find any other issues, please [open an issue on GitHub](https://github.com/prometheus-operator/runbooks/issues/new) 29 | or ask questions in the [prometheus-operator slack channel](https://kubernetes.slack.com/archives/CFFDS2Z7F). 30 | -------------------------------------------------------------------------------- /content/docs/add-runbook.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Add Runbook 3 | menu: 4 | after: 5 | weight: -100 6 | --- 7 | 8 | # Adding a new runbook 9 | 10 | ## How? 11 | 12 | 1. [Figure out the alert category from the alert name](#finding-correct-component) 13 | 2. Open a PR with the new file placed in the correct component subdirectory. You can use 14 | [links below to open a PR directly](#pr-links) 15 | 3. Name the new file after the alert it describes. 16 | 4. Fill in the new file following [a template below](#template). 17 | 5. Remember to put the alert name at the top of the file. 18 | 19 | ### Finding correct component 20 | 21 | All alerts are prefixed with the name of the component. If the alert is not prefixed, it should go into the "general" 22 | component category. 23 | 24 | For example, `KubeStateMetricsListErrors` suggests it is a kube-state-metrics alert, but `Watchdog` is a "general" one.
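If it is not obvious which component ships a given alert, one way to check is to look up the PrometheusRule object that defines it. This assumes the prometheus-operator CRDs are installed in your cluster, and the alert name below is only an example:

```shell
# List the PrometheusRule objects that ship the alerting rules.
kubectl get prometheusrules --all-namespaces

# Find the rule group that defines a given alert (the alert name is an example).
kubectl get prometheusrules --all-namespaces -o yaml | grep -B 5 'alert: KubeStateMetricsListErrors'
```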
25 | 26 | ### PR links 27 | 28 | - [New alertmanager runbook]({{< param BookRepo >}}/new/main/content/runbooks/alertmanager) 29 | - [New kube-state-metrics runbook]({{< param BookRepo >}}/new/main/content/runbooks/kube-state-metrics) 30 | - [New kubernetes runbook]({{< param BookRepo >}}/new/main/content/runbooks/kubernetes) 31 | - [New node runbook]({{< param BookRepo >}}/new/main/content/runbooks/node) 32 | - [New prometheus runbook]({{< param BookRepo >}}/new/main/content/runbooks/prometheus) 33 | - [New prometheus-operator runbook]({{< param BookRepo >}}/new/main/content/runbooks/prometheus-operator) 34 | - [New general runbook]({{< param BookRepo >}}/new/main/content/runbooks/general) 35 | 36 | ## Template 37 | 38 | Runbook example based on a NodeFilesystemSpaceFillingUp (thanks to @beorn7): 39 | 40 | ``` 41 | # NodeFilesystemSpaceFillingUp 42 | 43 | ## Meaning 44 | 45 | This alert is based on an extrapolation of the space used in a file system. It fires if both the current usage is above a certain threshold _and_ the extrapolation predicts to run out of space in a certain time. This is a warning-level alert if that time is less than 24h. It's a critical alert if that time is less than 4h. 46 | 47 |
48 | Full context 49 | 50 | Here is where you can optionally describe some more details about the alert. The "meaning" is the short version for an on-call engineer to quickly read through. The "details" are for learning about the bigger picture or the finer details. 51 | 52 | > NOTE: The blank lines above and below the text inside this `
<details>` tag are [required to use markdown inside of html tags][1] 53 |
54 | 55 | 56 | ## Impact 57 | 58 | A filesystem running completely full is obviously very bad for any process that needs to write to the filesystem. But even before a filesystem runs completely full, performance usually degrades. 59 | 60 | ## Diagnosis 61 | 62 | Study the recent trends of filesystem usage on a dashboard. Sometimes a periodic pattern of writing and cleaning up can trick the linear prediction into a false alert. 63 | 64 | Use the usual OS tools to investigate which directories are the worst and/or most recent offenders. 65 | 66 | Is this some irregular condition, e.g. a process failing to clean up behind itself, or is this organic growth? 67 | 68 | ## Mitigation 69 | 70 | 71 | 72 | 73 | [1]: https://github.github.com/gfm/#html-block 74 | ``` 75 | 76 | ### Guidelines 77 | 78 | The purpose of this repository is to have documentation for every alert shipped by kube-prometheus (not only by prometheus-operator). In the long run, we are aiming to support as many k8s flavors as possible. If possible, try to ensure the 'Diagnosis/Mitigation' sections are applicable to all certified kubernetes distributions. 79 | 80 | The primary target for these runbooks is folks who are novices and don't have much insight into what to do with the alerts shipped in kube-prometheus. As a result, try to avoid excessive jargon and abbreviations. 81 | 82 | ### Testing locally 83 | 84 | To test your changes locally: 85 | 86 | 1. Install [Hugo](https://gohugo.io/getting-started/installing/) 87 | 2. Run `git submodule init` and `git submodule update` to clone the Hugo theme 88 | 3. Run `hugo server` and navigate to http://localhost:1313/ in your browser 89 | -------------------------------------------------------------------------------- /content/runbooks/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | bookHidden: true 3 | --- 4 | -------------------------------------------------------------------------------- /content/runbooks/alertmanager/AlertmanagerClusterCrashlooping.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Alertmanager Cluster Crashlooping 3 | weight: 20 4 | --- 5 | 6 | # AlertmanagerClusterCrashlooping 7 | 8 | ## Meaning 9 | 10 | Half or more of the Alertmanager instances within the same cluster are crashlooping. 11 | 12 | ## Impact 13 | 14 | Alerts could be notified multiple times, unless pods are crashing too fast, in which case no alerts can be sent at all. 15 | 16 | ## Diagnosis 17 | 18 | ```shell 19 | kubectl get pod -l app=alertmanager 20 | 21 | NAMESPACE NAME READY STATUS RESTARTS AGE 22 | default alertmanager-main-0 1/2 CrashLoopBackOff 37107 2d 23 | default alertmanager-main-1 2/2 Running 0 43d 24 | default alertmanager-main-2 2/2 Running 0 43d 25 | ``` 26 | 27 | Find the root cause by looking at the events for a given pod/deployment: 28 | 29 | ```shell 30 | kubectl get events --field-selector involvedObject.name=alertmanager-main-0 31 | ``` 32 | 33 | ## Mitigation 34 | 35 | Make sure pods have enough resources (CPU, memory) to work correctly. 36 | -------------------------------------------------------------------------------- /content/runbooks/alertmanager/AlertmanagerClusterDown.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Alertmanager Cluster Down 3 | weight: 20 4 | --- 5 | 6 | # AlertmanagerClusterDown 7 | 8 | ## Meaning 9 | 10 | Half or more of the Alertmanager instances within the same cluster are down.
11 | 12 | ## Impact 13 | 14 | The cluster is unstable; if anything else goes wrong, you may lose the whole cluster. 15 | 16 | ## Diagnosis 17 | 18 | Verify why pods are not running. 19 | You can get a big picture with `events`. 20 | 21 | ```shell 22 | $ kubectl get events --field-selector involvedObject.kind=Pod | grep alertmanager 23 | ``` 24 | 25 | ## Mitigation 26 | 27 | There are no cheap options to mitigate this risk. 28 | Verifying any new changes in a pre-production environment before they reach production should improve stability. 29 | -------------------------------------------------------------------------------- /content/runbooks/alertmanager/AlertmanagerClusterFailedToSendAlerts.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Alertmanager Cluster Failed To Send Alerts 3 | weight: 20 4 | --- 5 | 6 | # AlertmanagerClusterFailedToSendAlerts 7 | 8 | ## Meaning 9 | 10 | All instances failed to send notifications to an integration. 11 | 12 | ## Impact 13 | 14 | You will not receive a notification when an alert is raised. 15 | 16 | ## Diagnosis 17 | 18 | No alerts are received at the integration level from the cluster. 19 | 20 | ## Mitigation 21 | 22 | Depending on the integration, correct the integration on the faulty instances (network, authorization token, firewall...). 23 | -------------------------------------------------------------------------------- /content/runbooks/alertmanager/AlertmanagerConfigInconsistent.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Alertmanager ConfigInconsistent 3 | weight: 20 4 | --- 5 | 6 | # AlertmanagerConfigInconsistent 7 | 8 | ## Meaning 9 | 10 | The configuration between instances inside a cluster is inconsistent. 11 | 12 | ## Impact 13 | 14 | Configuration inconsistencies can take many forms, so the impact is hard to predict. 15 | Nevertheless, in most cases alerts might be lost or routed to the incorrect integration. 16 | 17 | ## Diagnosis 18 | 19 | Run a `diff` tool across all deployed `alertmanager.yml` files to find what is wrong. 20 | You could run a job within your CI to avoid this issue in the future. 21 | 22 | ## Mitigation 23 | 24 | Delete the incorrect secret and deploy the correct one. 25 | -------------------------------------------------------------------------------- /content/runbooks/alertmanager/AlertmanagerFailedReload.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Alertmanager Failed Reload 3 | weight: 20 4 | --- 5 | 6 | # AlertmanagerFailedReload 7 | 8 | ## Meaning 9 | 10 | The alert `AlertmanagerFailedReload` is triggered when the Alertmanager instance 11 | for the cluster monitoring stack has consistently failed to reload its 12 | configuration for a certain period. 13 | 14 | ## Impact 15 | 16 | The impact depends on the type of error you will find in the logs. 17 | Most of the time, the previous configuration is still in effect, thanks to multiple 18 | instances, so avoid deleting existing pods. 19 | 20 | ## Diagnosis 21 | 22 | Verify whether there is an error in the `config-reloader` container logs. 23 | Here is an example with network issues. 24 | 25 | ```shell 26 | $ kubectl logs sts/alertmanager-main -c config-reloader 27 | 28 | level=error ts=2021-09-24T11:24:52.69629226Z caller=runutil.go:101 msg="function failed.
Retrying in next tick" err="trigger reload: reload request failed: Post \"http://localhost:9093/alertmanager/-/reload\": dial tcp [::1]:9093: connect: connection refused" 29 | ``` 30 | 31 | You can also directly verify the `alertmanager.yaml` file (default: `/etc/alertmanager/config/alertmanager.yaml`). 32 | 33 | ## Mitigation 34 | 35 | Running [amtool check-config alertmanager.yaml](https://github.com/prometheus/alertmanager#amtool) 36 | on your configuration file will help you detect problems related to syntax. 37 | You could also roll back `alertmanager.yaml` to the previous version in order 38 | to get back to a stable configuration. 39 | -------------------------------------------------------------------------------- /content/runbooks/alertmanager/AlertmanagerFailedToSendAlerts.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Alertmanager Failed To Send Alerts 3 | weight: 20 4 | --- 5 | 6 | # AlertmanagerFailedToSendAlerts 7 | 8 | ## Meaning 9 | 10 | At least one instance is unable to route alerts to the corresponding integration. 11 | 12 | ## Impact 13 | 14 | No impact, since another instance should be able to send the notification, 15 | unless `AlertmanagerClusterFailedToSendAlerts` is also triggered for the same integration. 16 | 17 | ## Diagnosis 18 | 19 | Verify the number of failed notifications per Alertmanager instance for 20 | a specific integration. 21 | 22 | You can look at the metrics exposed in the Prometheus console using PromQL. 23 | For example, the following query will display the number of failed 24 | notifications per instance for the PagerDuty integration. 25 | We have 3 instances involved in the example below. 26 | 27 | ```promql 28 | rate(alertmanager_notifications_failed_total{integration="pagerduty"}[5m]) 29 | ``` 30 | 31 | ![image](https://user-images.githubusercontent.com/3153333/143552468-ff573f1a-19a6-44ea-9c85-631687d01bf9.png) 32 | 33 | 34 | ## Mitigation 35 | 36 | Depending on the integration, you can have a look at the Alertmanager logs 37 | and act accordingly (network, authorization token, firewall...). 38 | 39 | ```shell 40 | kubectl -n monitoring logs -l 'alertmanager=main' -c alertmanager 41 | ``` 42 | -------------------------------------------------------------------------------- /content/runbooks/alertmanager/AlertmanagerMembersInconsistent.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Alertmanager Members Inconsistent 3 | weight: 20 4 | --- 5 | 6 | # AlertmanagerMembersInconsistent 7 | 8 | ## Meaning 9 | 10 | At least one of the alertmanager cluster members cannot be found. 11 | 12 | ## Impact 13 | 14 | Notifications may be duplicated, because instances that cannot see each other cannot deduplicate alerts between themselves. 15 | 16 | ## Diagnosis 17 | 18 | Check whether the IP addresses discovered by the alertmanager cluster are the same ones as in the alertmanager Service. The following example shows a possible inconsistency in the Endpoint IP addresses: 19 | 20 | ```shell 21 | $ kubectl describe svc alertmanager-main 22 | 23 | Name: alertmanager-main 24 | Namespace: monitoring 25 | ... 26 | Endpoints: 10.128.2.3:9095,10.129.2.5:9095,10.131.0.44:9095 27 | 28 | $ kubectl get pod -o wide | grep alertmanager-main 29 | 30 | alertmanager-main-0 5/5 Running 0 11d 10.129.2.6 31 | alertmanager-main-1 5/5 Running 0 2d16h 10.131.0.44 32 | alertmanager-main-2 5/5 Running 0 6d 10.128.2.3 33 | ``` 34 | 35 | ## Mitigation 36 | 37 | Deleting an incorrect Endpoint should trigger its recreation with a correct IP address.
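A quick way to spot which address is stale is to compare the Endpoints of the Service with the IPs of the running pods. The namespace, Service name, and label below are examples; adjust them to your deployment:

```shell
# IPs currently registered in the Service endpoints.
kubectl -n monitoring get endpoints alertmanager-main \
  -o jsonpath='{range .subsets[*].addresses[*]}{.ip}{"\n"}{end}' | sort

# IPs of the running Alertmanager pods.
kubectl -n monitoring get pods -l alertmanager=main \
  -o jsonpath='{range .items[*]}{.status.podIP}{"\n"}{end}' | sort
```

Any address that appears in only one of the two lists points at the member that needs to be recreated.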
36 | -------------------------------------------------------------------------------- /content/runbooks/alertmanager/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: alertmanager 3 | bookCollapseSection: true 4 | bookFlatSection: true 5 | weight: 10 6 | --- 7 | 8 | -------------------------------------------------------------------------------- /content/runbooks/etcd/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: etcd 3 | bookCollapseSection: true 4 | bookFlatSection: true 5 | weight: 10 6 | --- 7 | 8 | -------------------------------------------------------------------------------- /content/runbooks/etcd/etcdBackendQuotaLowSpace.md: -------------------------------------------------------------------------------- 1 | # etcdBackendQuotaLowSpace 2 | 3 | ## Meaning 4 | 5 | This alert fires when the total existing DB size exceeds 95% of the maximum 6 | DB quota. The consumed space is represented in Prometheus by the metric 7 | `etcd_mvcc_db_total_size_in_bytes`, and the DB quota size is defined by 8 | `etcd_server_quota_backend_bytes`. 9 | 10 | ## Impact 11 | 12 | In case the DB size exceeds the DB quota, no writes can be performed anymore on 13 | the etcd cluster. This further prevents any updates in the cluster, such as the 14 | creation of pods. 15 | 16 | ## Diagnosis 17 | 18 | The following two approaches can be used for the diagnosis. 19 | 20 | ### CLI Checks 21 | 22 | To run `etcdctl` commands, we need to `exec` into the `etcdctl` container of any 23 | etcd pod. 24 | 25 | ```shell 26 | $ NAMESPACE="kube-etcd" 27 | $ kubectl exec -it -c etcdctl -n $NAMESPACE $(kubectl get po -l app=etcd -oname -n $NAMESPACE | awk -F"/" 'NR==1{ print $2 }') -- sh 28 | ``` 29 | 30 | Validate that the `etcdctl` command is available: 31 | 32 | ```shell 33 | $ etcdctl version 34 | ``` 35 | 36 | `etcdctl` can be used to fetch the DB size of the etcd endpoints. 37 | 38 | ```shell 39 | $ etcdctl endpoint status -w table 40 | ``` 41 | 42 | ### PromQL queries 43 | 44 | Check the percentage consumption of the etcd DB with the following query in the 45 | metrics console: 46 | 47 | ```promql 48 | (etcd_mvcc_db_total_size_in_bytes / etcd_server_quota_backend_bytes) * 100 49 | ``` 50 | 51 | Check the DB size in MB that can be reduced after defragmentation: 52 | 53 | ```promql 54 | (etcd_mvcc_db_total_size_in_bytes - etcd_mvcc_db_total_size_in_use_in_bytes)/1024/1024 55 | ``` 56 | 57 | ## Mitigation 58 | 59 | ### Capacity planning 60 | 61 | If `etcd_mvcc_db_total_size_in_bytes` shows that you are growing close to 62 | `etcd_server_quota_backend_bytes`, etcd has almost reached its maximum capacity and it is time to 63 | start planning for a new cluster. 64 | 65 | In the meantime, before that migration happens, you can use defragmentation to gain some time. 66 | 67 | ### Defrag 68 | 69 | When the etcd DB size increases, we can defragment the existing etcd DB to optimize 70 | DB consumption as described in [etcdDefragmentation](https://etcd.io/docs/v3.4.0/op-guide/maintenance/). 71 | Run the following command in all etcd pods. 72 | 73 | ```shell 74 | $ etcdctl defrag 75 | ``` 76 | 77 | As validation, check the endpoint status of the etcd members to see the reduced 78 | size of the etcd DB. Use for this purpose the same diagnostic approaches as listed 79 | above. More space should be available now.
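Defragmentation reclaims the most space when old revisions have been compacted first. A minimal sketch of the full sequence, assuming it is run from the `etcdctl` container as in the Diagnosis section (flags and endpoints may differ in your distribution):

```shell
# 1. Find the current revision and compact away older revisions.
rev=$(etcdctl endpoint status --write-out=json | egrep -o '"revision":[0-9]*' | egrep -o '[0-9]+')
etcdctl compact "$rev"

# 2. Defragment this member (repeat on every member, one at a time).
etcdctl defrag

# 3. If the NOSPACE alarm was already raised, clear it once space has been reclaimed.
etcdctl alarm disarm

# 4. Confirm the new DB size.
etcdctl endpoint status -w table
```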
80 | -------------------------------------------------------------------------------- /content/runbooks/etcd/etcdGRPCRequestsSlow.md: -------------------------------------------------------------------------------- 1 | # etcdGRPCRequestsSlow 2 | 3 | ## Meaning 4 | 5 | This alert fires when the 99th percentile of etcd gRPC requests are too slow. 6 | 7 | ## Impact 8 | 9 | When requests are too slow, they can lead to various scenarios like leader 10 | election failure, slow reads and writes. 11 | 12 | ## Diagnosis 13 | 14 | This could be result of slow disk (due to fragmented state) or CPU contention. 15 | 16 | ### Slow disk 17 | 18 | One of the most common reasons for slow gRPC requests is disk. Checking disk 19 | related metrics and dashboards should provide a more clear picture. 20 | 21 | #### PromQL queries used to troubleshoot 22 | 23 | Verify the value of how slow the etcd gRPC requests are by using the following 24 | query in the metrics console: 25 | 26 | ```promql 27 | histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) without(grpc_type)) 28 | ``` 29 | 30 | That result should give a rough timeline of when the issue started. 31 | 32 | `etcd_disk_wal_fsync_duration_seconds_bucket` reports the etcd disk fsync 33 | duration, `etcd_server_leader_changes_seen_total` reports the leader changes. To 34 | rule out a slow disk and confirm that the disk is reasonably fast, 99th 35 | percentile of the `etcd_disk_wal_fsync_duration_seconds_bucket` should be less 36 | than 10ms. Query in metrics UI: 37 | 38 | ```promql 39 | histogram_quantile(0.99, sum by (instance, le) (irate(etcd_disk_wal_fsync_duration_seconds_bucket{job="etcd"}[5m]))) 40 | ``` 41 | 42 | #### Console dashboards 43 | 44 | In the OpenShift dashboard console under Observe section, select the etcd 45 | dashboard. There are both RPC rate as well as Disk Sync Duration dashboards 46 | which will assist with further issues. 47 | 48 | ### Resource exhaustion 49 | 50 | It can happen that etcd responds slower due to CPU resource exhaustion. 51 | This was seen in some cases when one application was requesting too much CPU 52 | which led to this alert firing for multiple methods. 53 | 54 | Often if this is the case, we also see 55 | `etcd_disk_wal_fsync_duration_seconds_bucket` slower as well. 56 | 57 | To confirm this is the cause of the slow requests either: 58 | 59 | 1. In OpenShift console on primary page under "Cluster utilization" view the 60 | requested CPU vs available. 61 | 62 | 2. PromQL query is the following to see top consumers of CPU: 63 | 64 | ```promql 65 | topk(25, sort_desc( 66 | sum by (namespace) ( 67 | ( 68 | sum(avg_over_time(pod:container_cpu_usage:sum{container="",pod!=""}[5m])) BY (namespace, pod) 69 | * 70 | on(pod,namespace) group_left(node) (node_namespace_pod:kube_pod_info:) 71 | ) 72 | * 73 | on(node) group_left(role) (max by (node) (kube_node_role{role=~".+"})) 74 | ) 75 | )) 76 | ``` 77 | 78 | ## Mitigation 79 | 80 | ### Fragmented state 81 | 82 | In the case of slow fisk or when the etcd DB size increases, we can defragment 83 | existing etcd DB to optimize DB consumption as described in 84 | [etcdDefragmentation](https://etcd.io/docs/v3.4.0/op-guide/maintenance/). 85 | Run the following command in all etcd pods. 86 | 87 | ```console 88 | $ etcdctl defrag 89 | ``` 90 | 91 | As validation, check the endpoint status of etcd members to know the reduced 92 | size of etcd DB. Use for this purpose the same diagnostic approaches as listed 93 | above. 
More space should be available now. 94 | 95 | Further info on etcd best practices can be found in the [etcdPractices](https://docs.openshift.com/container-platform/4.7/scalability_and_performance/recommended-host-practices.html#recommended-etcd-practices). 96 | -------------------------------------------------------------------------------- /content/runbooks/etcd/etcdHighFsyncDurations.md: -------------------------------------------------------------------------------- 1 | # etcdHighFsyncDurations 2 | 3 | ## Meaning 4 | 5 | This alert fires when the 99th percentile of etcd disk fsync duration is too 6 | high for 10 minutes. 7 | 8 |
9 | Full context 10 | 11 | Every write request sent to etcd has to be [fsync'd][fsync] to disk by the leader node, transmitted to its peers, and fsync'd to those disks as well before etcd can tell the client that the write request succeeded (as part of the [Raft consensus algorithm][raft]). As a result of all those fsync's, etcd cares a LOT about disk latency, which this alert picks up on. 12 | 13 | Etcd instances perform poorly on network-attached storage. Directly-attached spinning disks may work, but solid-state disks or better [are recommended][etcd-disks] for larger clusters. For very large clusters, you may even consider a [separate etcd cluster just for events][etcd-events] to reduce the write load. 14 | 15 |
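A common way to confirm whether a disk can keep up with etcd is the `fio` benchmark suggested in the etcd documentation. The directory and sizes below are illustrative; the test directory should live on the same device as the etcd data dir:

```shell
# Write test that issues an fdatasync after every write, mimicking etcd's WAL behaviour.
fio --rw=write --ioengine=sync --fdatasync=1 \
    --directory=/var/lib/etcd/fio-test --size=22m --bs=2300 --name=etcd-fsync-check
```

The reported fdatasync percentiles should stay well below 10ms, matching the threshold used in the Diagnosis section below.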
16 | 17 | ## Impact 18 | 19 | When this happens it can lead to various scenarios like leader election failure, 20 | frequent leader elections, slow reads and writes. 21 | 22 | ## Diagnosis 23 | 24 | This could be result of slow disk possibly due to fragmented state in etcd or 25 | simply due to slow disk. 26 | 27 | ### Slow disk 28 | 29 | Checking disk related metrics and dashboards should provide a more clear 30 | picture. 31 | 32 | #### PromQL queries used to troubleshoot 33 | 34 | `etcd_disk_wal_fsync_duration_seconds_bucket` reports the etcd disk fsync 35 | duration, `etcd_server_leader_changes_seen_total` reports the leader changes. To 36 | rule out a slow disk and confirm that the disk is reasonably fast, 99th 37 | percentile of the `etcd_disk_wal_fsync_duration_seconds_bucket` should be less 38 | than 10ms. Query in metrics UI: 39 | 40 | ```promql 41 | histogram_quantile(0.99, sum by (instance, le) (irate(etcd_disk_wal_fsync_duration_seconds_bucket{job="etcd"}[5m]))) 42 | ``` 43 | 44 | ## Mitigation 45 | 46 | ### Fragmented state 47 | 48 | In the case of slow fisk or when the etcd DB size increases, we can defragment 49 | existing etcd DB to optimize DB consumption as described in 50 | [here][etcdDefragmentation]. Run the following command in all etcd pods. 51 | 52 | ```console 53 | $ etcdctl defrag 54 | ``` 55 | 56 | As validation, check the endpoint status of etcd members to know the reduced 57 | size of etcd DB. Use for this purpose the same diagnostic approaches as listed 58 | above. More space should be available now. 59 | 60 | Further info on etcd best practices can be found in the [OpenShift docs 61 | here][etcdPractices]. 62 | 63 | - [fsync](https://man7.org/linux/man-pages/man2/fsync.2.html) 64 | - [raft](https://en.wikipedia.org/wiki/Raft_(algorithm)#Log_replication) 65 | - [etcd-disks](https://etcd.io/docs/v3.5/op-guide/hardware/#disks) 66 | - [etcd-events](https://github.com/kubernetes/kubernetes/issues/4432) 67 | - [etcdDefragmentation](https://etcd.io/docs/v3.4.0/op-guide/maintenance/) 68 | - [etcdPractices](https://docs.openshift.com/container-platform/4.7/scalability_and_performance/recommended-host-practices.html#recommended-etcd-practices_) 69 | -------------------------------------------------------------------------------- /content/runbooks/etcd/etcdHighNumberOfFailedGRPCRequests.md: -------------------------------------------------------------------------------- 1 | # etcdHighNumberOfFailedGRPCRequests 2 | 3 | ## Meaning 4 | 5 | This alert fires when at least 5% of etcd gRPC requests failed in the past 10 6 | minutes. 7 | 8 | ## Impact 9 | 10 | First establish which gRPC method is failing, this will be visible in the alert. 11 | If it's not part of the alert, the following query will display method and etcd 12 | instance that has failing requests: 13 | 14 | ```promql 15 | 100 * sum without(grpc_type, grpc_code) 16 | (rate(grpc_server_handled_total{grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded",job="etcd"}[5m])) 17 | / sum without(grpc_type, grpc_code) 18 | (rate(grpc_server_handled_total{job="etcd"}[5m])) > 5 and on() 19 | (sum(cluster_infrastructure_provider{type!~"ipi|BareMetal"} == bool 1)) 20 | ``` 21 | 22 | ## Diagnosis 23 | 24 | All the gRPC errors should also be logged in each respective etcd instance logs. 25 | You can get the instance name from the alert that is firing or by running the 26 | query detailed above. Those etcd instance logs should serve as further insight 27 | into what is wrong. 
28 | 29 | To get logs of etcd containers either check the instance from the alert and 30 | check logs directly or run the following: 31 | 32 | ```shell 33 | NAMESPACE="kube-etcd" 34 | kubectl logs -n $NAMESPACE -lapp=etcd etcd 35 | ``` 36 | 37 | ## Mitigation 38 | 39 | Depending on the above diagnosis, the issue will most likely be described in the 40 | error log line of either etcd or openshift-etcd-operator. Most likely causes 41 | tend to be networking issues. 42 | -------------------------------------------------------------------------------- /content/runbooks/etcd/etcdInsufficientMembers.md: -------------------------------------------------------------------------------- 1 | # etcdInsufficientMembers 2 | 3 | ## Meaning 4 | 5 | This alert fires when there are fewer instances available than are needed by 6 | etcd to be healthy. 7 | This means that etcd cluster has not enough members in the cluster to create quorum. 8 | 9 | ## Impact 10 | 11 | When etcd does not have a majority of instances available the Kubernetes and 12 | OpenShift APIs will reject read and write requests and operations that preserve 13 | the health of workloads cannot be performed. 14 | 15 | In general loosing quorum will switch etcd to read only, which effectively renders k8s api read only. 16 | It is possible to read the current state, but not possible to update it. 17 | 18 | ## Diagnosis 19 | 20 | This can occur when multiple control plane nodes are powered off or are unable to 21 | connect to each other via the network. Check that all control plane nodes are 22 | powered on and that network connections between each machine are functional. 23 | 24 | Check any other critical, warning or info alerts firing that can assist with the 25 | diagnosis. 26 | 27 | Login to the cluster. Check health of master nodes if any of them is in 28 | `NotReady` state or not. 29 | 30 | ```shell 31 | $ kubectl get nodes -l node-role.kubernetes.io/master= 32 | ``` 33 | 34 | ### General etcd health 35 | 36 | To run `etcdctl` commands, we need to `exec` into the `etcdctl` container of any 37 | etcd pod. 38 | 39 | ```shell 40 | $ kubectl exec -c etcdctl -n openshift-etcd $(kubectl get po -l app=etcd -oname -n openshift-etcd | awk -F"/" 'NR==1{ print $2 }') 41 | ``` 42 | 43 | Validate that the `etcdctl` command is available: 44 | 45 | ```shell 46 | $ etcdctl version 47 | ``` 48 | 49 | Run the following command to get the health of etcd: 50 | 51 | ```shell 52 | $ etcdctl endpoint health -w table 53 | ``` 54 | 55 | ## Mitigation 56 | 57 | ### Disaster and recovery 58 | 59 | If an upgrade is in progress, the alert may automatically resolve in some time 60 | when the master node comes up again. If MCO is not working on the master node, 61 | check the cloud provider to verify if the master node instances are running or not. 62 | 63 | In the case when you are running on AWS, the AWS instance retirement might need 64 | a manual reboot of the master node. 65 | 66 | As a last resort if none of the above fix the issue and the alert is still 67 | firing, for etcd specific issues follow the steps described in the 68 | [disaster-recovery](https://docs.openshift.com/container-platform/4.7/backup_and_restore/control_plane_backup_and_restore/disaster_recovery/about-disaster-recovery.html). 
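Before resorting to full disaster recovery, it can help to confirm which members the cluster still expects versus which endpoints actually respond. From the `etcdctl` container (same access pattern as in the Diagnosis section; flags may vary between etcdctl versions):

```shell
# Members etcd still expects to be part of the cluster.
etcdctl member list -w table

# Health of every endpoint known to the cluster.
etcdctl endpoint health --cluster -w table
```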
69 | -------------------------------------------------------------------------------- /content/runbooks/etcd/etcdMembersDown.md: -------------------------------------------------------------------------------- 1 | # etcdMembersDown 2 | 3 | ## Meaning 4 | 5 | This alert fires when one or more etcd member goes down and evaluates the 6 | number of etcd members that are currently down. Often, this alert was observed 7 | as part of a cluster upgrade when a master node is being upgraded and requires a 8 | reboot. 9 | 10 | ## Impact 11 | 12 | In etcd a majority of (n/2)+1 has to agree on membership changes or key-value 13 | upgrade proposals. With this approach, a split-brain inconsistency can be 14 | avoided. In the case that only one member is down in a 3-member cluster, it 15 | still can make forward progress. Due to the fact that the quorum is 2 and 2 16 | members are still alive. However, when more members are down, the cluster 17 | becomes unrecoverable. 18 | 19 | ## Diagnosis 20 | 21 | Login to the cluster. Check health of master nodes if any of them is in 22 | `NotReady` state or not. 23 | 24 | ```shell 25 | $ kubectl get nodes -l node-role.kubernetes.io/master= 26 | ``` 27 | 28 | In case there is no upgrade going on, but there is a change in the 29 | `machineconfig` for the master pool causing a rolling reboot of each master 30 | node, this alert can be triggered as well. We can check if the 31 | `machineconfiguration.openshift.io/state : Working` annotation is set for any of 32 | the master nodes. This is the case when the [machine-config-operator 33 | (MCO)](https://github.com/openshift/machine-config-operator) is working on it. 34 | 35 | ```shell 36 | $ kubectl get nodes -l node-role.kubernetes.io/master= -o template --template='{{range .items}}{{"===> node:> "}}{{.metadata.name}}{{"\n"}}{{range $k, $v := .metadata.annotations}}{{println $k ":" $v}}{{end}}{{"\n"}}{{end}}' 37 | ``` 38 | 39 | ### General etcd health 40 | 41 | To run `etcdctl` commands, we need to `exec` into the `etcdctl` container of any 42 | etcd pod. 43 | 44 | ```shell 45 | $ kubectl exec -c etcdctl -n openshift-etcd $(kubectl get po -l app=etcd -oname -n openshift-etcd | awk -F"/" 'NR==1{ print $2 }') 46 | ``` 47 | 48 | Validate that the `etcdctl` command is available: 49 | 50 | ```shell 51 | $ etcdctl version 52 | ``` 53 | 54 | Run the following command to get the health of etcd: 55 | 56 | ```shell 57 | $ etcdctl endpoint health -w table 58 | ``` 59 | 60 | ## Mitigation 61 | 62 | If an upgrade is in progress, the alert may automatically resolve in some time 63 | when the master node comes up again. If MCO is not working on the master node, 64 | check the cloud provider to verify if the master node instances are running or not. 65 | 66 | In the case when you are running on AWS, the AWS instance retirement might need 67 | a manual reboot of the master node. 68 | -------------------------------------------------------------------------------- /content/runbooks/etcd/etcdNoLeader.md: -------------------------------------------------------------------------------- 1 | # etcdNoLeader 2 | 3 | ## Meaning 4 | 5 | This alert is triggered when etcd cluster does not have a leader for more than 1 6 | minute. 7 | This can happen if nodes from the cluster are orphaned - they were part of the cluster 8 | but now they are in minority and thus can not form a cluster, 9 | for example due to network partition. 
10 | 11 | ## Impact 12 | 13 | When there is no leader, the Kubernetes API will not be able to work 14 | as expected: the cluster cannot process any write requests, and any writes 15 | are queued for processing until a new leader is elected. Operations 16 | that preserve the health of the workloads cannot be performed. 17 | 18 | In general, losing quorum switches etcd to read-only, which effectively renders the k8s API read-only. 19 | It is possible to read the current state, but not possible to update it. 20 | 21 | ## Diagnosis 22 | 23 | ### Control plane nodes issue 24 | 25 | This can occur when multiple control plane nodes are powered off or are unable to 26 | connect to each other via the network. Check that all control plane nodes are 27 | powered on and that network connections between each machine are functional. 28 | 29 | ### Slow disk issue 30 | 31 | Another potential cause could be a slow disk. Inspect the `Disk Sync 32 | Duration` dashboard, as well as `Total Leader Elections Per Day`, to get more 33 | insight and help with the diagnosis. 34 | 35 | ### Other 36 | 37 | Check the logs of the etcd containers for any further information and to verify 38 | that etcd does not have a leader. 39 | Logs should contain something like `etcdserver: no leader`. 40 | 41 | ## Mitigation 42 | 43 | ### Disaster and recovery 44 | 45 | Follow the steps described in the [disaster-recovery](https://docs.openshift.com/container-platform/4.7/backup_and_restore/control_plane_backup_and_restore/disaster_recovery/about-disaster-recovery.html) documentation. 46 | -------------------------------------------------------------------------------- /content/runbooks/general/InfoInhibitor.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Info Inhibitor 3 | weight: 20 4 | --- 5 | 6 | # InfoInhibitor 7 | 8 | ## Meaning 9 | 10 | This is an alert that is used to inhibit info alerts. 11 | 12 | By themselves, the info-level alerts are sometimes very noisy, 13 | but they are relevant when combined with other alerts. 14 | 15 |
16 | Full context 17 | 18 | More information about the alert and design considerations can be found in a [kube-prometheus issue](https://github.com/prometheus-operator/kube-prometheus/issues/861) 19 |
20 | 21 | ## Impact 22 | 23 | Alert does not have any impact and it is used only as a workaround to a missing feature in alertmanager. 24 | 25 | ## Diagnosis 26 | 27 | This alert fires whenever there's a `severity="info"` alert, 28 | and stops firing when another alert with severity of `warning` or 29 | `critical` starts firing on the same namespace. 30 | 31 | ## Mitigation 32 | 33 | This alert should be routed to a null receiver and configured to inhibit 34 | alerts with `severity="info"`. Such configuration is available at https://github.com/prometheus-operator/kube-prometheus/blob/main/manifests/alertmanager-secret.yaml 35 | -------------------------------------------------------------------------------- /content/runbooks/general/NodeNetworkInterfaceFlapping.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Node Network Interface Flapping 3 | weight: 20 4 | --- 5 | 6 | # NodeNetworkInterfaceFlapping 7 | 8 | ## Meaning 9 | 10 | Network interface is often changing its status 11 | 12 | ## Impact 13 | 14 | Applications on the node may no longer be able to operate with other services. 15 | Network attached storage performance issues or even data loss. 16 | 17 | ## Diagnosis 18 | 19 | Investigate networking issues on the node and to connected hardware. 20 | Check physical cables, check networking firewall rules and so on. 21 | 22 | ## Mitigation 23 | 24 | Cordon and drain node to migrate apps from it. 25 | -------------------------------------------------------------------------------- /content/runbooks/general/TargetDown.md: -------------------------------------------------------------------------------- 1 | # TargetDown 2 | 3 | ## Meaning 4 | 5 | The alert means that one or more prometheus scrape targets are down. It fires when at least 10% of scrape targets in a Service are unreachable. 6 | 7 |
8 | Full context 9 | 10 | Prometheus works by sending an HTTP GET request to all of its "targets" every few seconds. So TargetDown really means that Prometheus just can't access your service, which may or may not mean it's actually down. If your service appears to be running fine, a common cause could be a misconfigured ServiceMonitor (maybe the port or path is incorrect), a misconfigured NetworkPolicy, or Service with incorrect labelSelectors that isn't selecting any Pods. 11 | 12 |
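For example, to rule out a Service whose selector matches no pods (the namespace and Service name are placeholders), compare the selector with the resulting Endpoints:

```shell
# The labels the Service selects on.
kubectl -n my-namespace get service my-service -o jsonpath='{.spec.selector}'

# If the ENDPOINTS column is empty, no pod currently matches that selector.
kubectl -n my-namespace get endpoints my-service
```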
13 | 14 | ## Impact 15 | 16 | Metrics from a particular target cannot be scraped as such there is no data for this target in Prometheus and alerting can be hindered. 17 | 18 | ## Diagnosis 19 | 20 | `/targets` page in Prometheus UI can be used to check the scrape error for the particular target. 21 | 22 | `up == 0` query can be used to check the trend over time. 23 | 24 | ## Mitigation 25 | 26 | Mitigation depends on the error reported by prometheus and there is no generic one. 27 | -------------------------------------------------------------------------------- /content/runbooks/general/Watchdog.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Watchdog 3 | weight: 20 4 | --- 5 | 6 | # Watchdog 7 | 8 | ## Meaning 9 | 10 | This is an alert meant to ensure that the entire alerting pipeline is functional. 11 | This alert is always firing, therefore it should always be firing in Alertmanager 12 | and always fire against a receiver. 13 | 14 | ## Impact 15 | 16 | If not firing then it should alert external systems that this alerting system 17 | is no longer working. 18 | 19 | ## Diagnosis 20 | 21 | Misconfigured alertmanager, bad credentials, bad endpoint, firewalls.. 22 | Check alertmanager logs. 23 | 24 | ## Mitigation 25 | 26 | There are integrations with various notification 27 | mechanisms that send a notification when this alert is not firing. 28 | For example the `DeadMansSnitch` integration in PagerDuty. 29 | -------------------------------------------------------------------------------- /content/runbooks/general/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: general 3 | bookCollapseSection: true 4 | bookFlatSection: true 5 | weight: 1 6 | --- 7 | 8 | -------------------------------------------------------------------------------- /content/runbooks/kube-state-metrics/KubeStateMetricsListErrors.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube State Metrics List Errors 3 | weight: 20 4 | --- 5 | 6 | # KubeStateMetricsListErrors 7 | 8 | ## Meaning 9 | 10 | kube-state-metrics is experiencing errors in list operations. 11 | 12 | ## Impact 13 | 14 | Unable to get metrics for certain resources 15 | 16 | ## Diagnosis 17 | 18 | Check kube-state-metric container logs. 19 | Check service account token. 20 | Check networking rules and network policies. 21 | 22 | ## Mitigation 23 | 24 | TODO 25 | -------------------------------------------------------------------------------- /content/runbooks/kube-state-metrics/KubeStateMetricsShardingMismatch.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube State Metrics Sharding Mismatch 3 | weight: 20 4 | --- 5 | 6 | # KubeStateMetricsShardingMismatch 7 | 8 | ## Meaning 9 | 10 | kube-state-metrics sharding is misconfigured. 11 | 12 | ## Impact 13 | 14 | Unable to get metrics for certain resources. 15 | Some metrics can be unavailable. 16 | 17 | ## Diagnosis 18 | 19 | Check kube-state-metric container logs for each shard. 20 | Check service account token. 21 | Check networking rules and network policies. 
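For example, to pull recent logs from every shard at once (the namespace and label are examples; adjust them to your installation):

```shell
kubectl -n monitoring logs -l app.kubernetes.io/name=kube-state-metrics --prefix --tail=50
```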
22 | 23 | ## Mitigation 24 | 25 | TODO 26 | -------------------------------------------------------------------------------- /content/runbooks/kube-state-metrics/KubeStateMetricsShardsMissing.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube State Metrics Shards Missing 3 | weight: 20 4 | --- 5 | 6 | # KubeStateMetricsShardsMissing 7 | 8 | ## Meaning 9 | 10 | kube-state-metrics shards are missing. 11 | 12 | ## Impact 13 | 14 | Unable to get metrics for certain resources. 15 | Some metrics can be unavailable. 16 | 17 | ## Diagnosis 18 | 19 | Check kube-state-metric container logs for each shard. 20 | Check if certain pods were forcefully evicted. 21 | Check service account token. 22 | Check networking rules and network policies. 23 | 24 | ## Mitigation 25 | 26 | TODO 27 | -------------------------------------------------------------------------------- /content/runbooks/kube-state-metrics/KubeStateMetricsWatchErrors.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube State Metric sWatch Errors 3 | weight: 20 4 | --- 5 | 6 | # KubeStateMetricsWatchErrors 7 | 8 | ## Meaning 9 | 10 | kube-state-metrics is experiencing errors in watch operations. 11 | 12 | ## Impact 13 | 14 | Unable to get metrics for certain resources 15 | 16 | ## Diagnosis 17 | 18 | Check kube-state-metric container logs. 19 | Check service account token. 20 | Check networking rules and network policies. 21 | 22 | ## Mitigation 23 | 24 | TODO 25 | -------------------------------------------------------------------------------- /content/runbooks/kube-state-metrics/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: kube-state-metrics 3 | bookCollapseSection: true 4 | bookFlatSection: true 5 | weight: 10 6 | --- 7 | 8 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/CPUThrottlingHigh.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: CPU Throttling High 3 | weight: 20 4 | --- 5 | 6 | # CPU Throttling High 7 | 8 | ## Meaning 9 | 10 | Processes experience elevated CPU throttling. 11 | 12 | ## Impact 13 | 14 | The alert is purely informative and unless there is some other issue with 15 | the application, it can be skipped. 16 | 17 | ## Diagnosis 18 | 19 | - Check if application is performing normally 20 | - Check if CPU resource requests are adjusted accordingly to the app usage 21 | - Check kernel version in the node 22 | 23 | ## Mitigation 24 | 25 | **Notice**: 26 | User shouldn't increase CPU limits unless the application is behaving 27 | erratically (another alert firing). 28 | 29 | For this particular reason, the alert is inhibited by default in 30 | kube-prometheus and can be sent only if another alert in the same namespace 31 | is firing. 32 | 33 | **When mixed with other alerts**: 34 | 35 | Give specific container in the pod more CPU limits. Requests can stay the same. 36 | 37 | In specific cases kubernetes node has too old kernel which is known to have 38 | issues with assigning CPU resources to the process [see here](https://github.com/kubernetes/kubernetes/issues/67577) 39 | 40 | In certain scenarios ensure to use CPU Pinning and isolation - in short give 41 | to the container full CPU cores. 
42 | Also update the app so that it is aware it runs in cgroups, 43 | or explicitly set the number of CPUs it can use, or limit its number of threads. 44 | 45 | Longer and more detailed info - [PDF from Intel](https://builders.intel.com/docs/networkbuilders/cpu-pin-and-isolation-in-kubernetes-app-note.pdf) 46 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeAPIDown.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube API Down 3 | weight: 20 4 | --- 5 | 6 | # KubeAPIDown 7 | 8 | ## Meaning 9 | 10 | The `KubeAPIDown` alert is triggered when all Kubernetes API servers have not 11 | been reachable by the monitoring system for more than 15 minutes. 12 | 13 | ## Impact 14 | 15 | This is a critical alert. The Kubernetes API is not responding. The 16 | cluster may be partially or fully non-functional. 17 | 18 | Applications which do not use the Kubernetes API directly will continue to work, but changing Kubernetes resources 19 | in the cluster is not possible. 20 | 21 | Services using the Kubernetes API directly will start to behave erratically. 22 | 23 | ## Diagnosis 24 | 25 | Check the status of the API server targets in the Prometheus UI. 26 | 27 | Then, confirm whether the API is also unresponsive for you: 28 | 29 | ```shell 30 | $ kubectl cluster-info 31 | ``` 32 | 33 | If you can still reach the API server, there may be a network issue between the 34 | Prometheus instances and the API server pods. Check the status of the API server 35 | pods. 36 | 37 | ```shell 38 | $ kubectl -n kube-system get pods 39 | $ kubectl -n kube-system logs -l 'component=kube-apiserver' 40 | ``` 41 | 42 | - Check networking on the node. 43 | - Check firewall on the node. 44 | - Investigate kube-proxy logs. 45 | - Investigate NetworkPolicies to confirm traffic between Prometheus and the kube API is not filtered out. 46 | 47 | 48 | ## Mitigation 49 | 50 | If you can still reach the API server intermittently, you may be able to treat this 51 | like any other failing deployment. If not, it's possible you may have to refer 52 | to the disaster recovery documentation. 53 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeAPIErrorBudgetBurn.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube API Error Budget Burn 3 | weight: 20 4 | --- 5 | 6 | # KubeAPIErrorBudgetBurn 7 | 8 | ## Impact 9 | 10 | The overall availability of your Kubernetes cluster isn't guaranteed any more. 11 | There may be **too many errors** returned by the APIServer and/or **responses take too long** to guarantee proper reconciliation. 12 | 13 | **This is always important; the only deciding factor is how urgent it is at the current rate.** 14 | 15 |
16 | Full context 17 | 18 | This alert essentially means that a higher-than-expected percentage of the operations kube-apiserver is performing are erroring. Since random errors are inevitable, kube-apiserver has a "budget" of errors that it is allowed to make before triggering this alert. 19 | 20 | Learn more about Multiple Burn Rate Alerts in the [SRE Workbook Chapter 5](https://sre.google/workbook/alerting-on-slos/#recommended_time_windows_and_burn_rates_f). 21 | 22 |
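As a rough illustration (not the exact recording rules behind the alert), the error-ratio part of the burn rate can be checked against Prometheus directly; the service address below is the kube-prometheus default and the `1h` window matches the `long` label of the fastest-burning alert:

```shell
# Fraction of apiserver requests that returned a 5xx over the last hour.
$ curl -s 'http://prometheus-k8s.monitoring.svc:9090/api/v1/query' \
    --data-urlencode 'query=sum(rate(apiserver_request_total{job="apiserver",code=~"5.."}[1h])) / sum(rate(apiserver_request_total{job="apiserver"}[1h]))'
```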
23 | 24 | ## Critical 25 | 26 | First check the labels `long` and `short`. 27 | 28 | * `long: 1h` and `short: 5m`: less than **~2 days** -- You should fix the problem as soon as possible! 29 | * `long: 6h` and `short: 30m`: less than **~5 days** -- Track this down now but no immediate fix required. 30 | 31 | ## Warning 32 | 33 | First check the labels `long` and `short`. 34 | 35 | * `long: 1d` and `short: 2h`: less than **~10 days** -- This is problematic in the long run. You should take a look in the next 24-48 hours. 36 | * `long: 3d` and `short: 6h`: less than **~30 days** -- (the entire window of the error budget) at this rate. This means that at the end of the next 30 days there won't be any error budget left at this rate. It's fine to leave this over the weekend and have someone take a look in the coming days at working hours. 37 | 38 | _Example: If you have a 99% availability target this means that at the end of 30 days you're going to be below 99% at this rate._ 39 | 40 | ## Runbook 41 | 42 | 1. Take a look at the APIServer Grafana dashboard. 43 | 1. At the very top check your current availability and how much percent of error budget is left. This should indicate the severity too. 44 | 1. Do you see an elevated error rate in reads or writes? 45 | 1. Do you see too many slow requests in reads or writes? 46 | 1. Check the logs for kube-apiserver using the following Loki query: `{component="kube-apiserver"}` 47 | 1. Run debugging queries in Prometheus or Grafana Explore to dig deeper. 48 | 1. If you don't see anything obvious with the error rates, it might be too many slow requests. [Check the queries below!](#example-queries-for-slow-requests) 49 | 1. Maybe it's some dependency of the APIServer? etcd? 50 | 51 | ### Example Queries for slow requests 52 | 53 | Change the rate window according to your `long` label from the alert. 54 | Make sure to update the alert threshold too, like `> 0.01` to `> 14.4 * 0.01` for example. 55 | 56 | #### Slow Read Requests 57 | 58 | If you don't get any results back then there aren't too many slow requests - that's good. 59 | If you get results than you know what type of requests are too slow. 
60 | 61 | Cluster scoped: 62 | 63 | ```promql 64 | ( 65 | sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="40",scope="cluster",verb=~"LIST|GET"}[3d])) 66 | - 67 | sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d])) 68 | ) 69 | / 70 | sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d])) 71 | > 0.01 72 | ``` 73 | 74 | Namespace scoped: 75 | 76 | ```promql 77 | ( 78 | sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="5",scope="namespace",verb=~"LIST|GET"}[3d])) 79 | - 80 | sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d])) 81 | ) 82 | / 83 | sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d])) 84 | > 0.01 85 | ``` 86 | 87 | Resource scoped: 88 | 89 | ```promql 90 | ( 91 | sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="1",scope=~"resource|",verb=~"LIST|GET"}[3d])) or vector(0) 92 | - 93 | sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d])) 94 | ) 95 | / 96 | sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d])) 97 | > 0.01 98 | ``` 99 | 100 | #### Slow Write Requests 101 | 102 | ```promql 103 | ( 104 | sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) 105 | - 106 | sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="1",verb=~"POST|PUT|PATCH|DELETE"}[3d])) 107 | ) 108 | / 109 | sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) 110 | > 0.01 111 | ``` 112 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeAPITerminatedRequests.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube API Terminated Requests 3 | weight: 20 4 | --- 5 | 6 | # KubeAPITerminatedRequests 7 | 8 | ## Meaning 9 | 10 | The apiserver has terminated over 20% of its incoming requests. 11 | 12 | ## Impact 13 | 14 | Client will not be able to interact with the cluster. 15 | Some in-cluster services this may degrade or make service unavailable. 16 | 17 | ## Diagnosis 18 | 19 | Use the `apiserver_flowcontrol_rejected_requests_total` metric to determine 20 | which flow schema is throttling the traffic to the API Server. 21 | The flow schema also provides information on the affected resources and subjects. 22 | 23 | ## Mitigation 24 | 25 | TODO 26 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeAggregatedAPIDown.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube Aggregated API Down 3 | weight: 20 4 | --- 5 | 6 | # KubeAggregatedAPIDown 7 | 8 | ## Meaning 9 | 10 | Kubernetes aggregated API has reported errors. 11 | It has appeared unavailable X times averaged over the past 10m. 12 | 13 | ## Impact 14 | 15 | From minor such as inability to see cluster metrics to more severe such as 16 | unable to use custom metrics to scale or even unable to use cluster. 17 | 18 | ## Diagnosis 19 | 20 | - Check networking on the node. 21 | - Check firewall on the node. 22 | - Investigate additional API logs. 23 | - Investigate NetworkPolicies if kubeApi - additional API was not filtered out. 24 | - Investigate NetworkPolicies if prometheus/additional api was not filtered out. 
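For example, the failing aggregated API can usually be identified from the `APIService` conditions; the `v1beta1.metrics.k8s.io` name below is just an illustration:

```shell
# Aggregated APIs report Available=False together with a reason when unreachable.
$ kubectl get apiservices
$ kubectl describe apiservice v1beta1.metrics.k8s.io
```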
25 | 26 | ## Mitigation 27 | 28 | TODO 29 | 30 | See [APIServer aggregation](https://kubernetes.io/docs/concepts/extend-kubernetes/api-extension/apiserver-aggregation/) 31 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeAggregatedAPIErrors.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube Aggregated API Errors 3 | weight: 20 4 | --- 5 | 6 | # KubeAggregatedAPIErrors 7 | 8 | ## Meaning 9 | 10 | Kubernetes aggregated API has reported errors. 11 | It has appeared unavailable over 4 times averaged over the past 10m. 12 | 13 | ## Impact 14 | 15 | From minor such as inability to see cluster metrics to more severe such as 16 | unable to use custom metrics to scale or even unable to use cluster. 17 | 18 | ## Diagnosis 19 | 20 | - Check networking on the node. 21 | - Check firewall on the node. 22 | - Investigate additional API logs. 23 | - Investigate NetworkPolicies if kubeApi - additional API was not filtered out. 24 | - Investigate NetworkPolicies if prometheus/additional API was not filtered out. 25 | 26 | ## Mitigation 27 | 28 | TODO 29 | 30 | See [APIServer aggregation](https://kubernetes.io/docs/concepts/extend-kubernetes/api-extension/apiserver-aggregation/) 31 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeCPUOvercommit.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube CPU Overcommit 3 | weight: 20 4 | --- 5 | 6 | # KubeCPUOvercommit 7 | 8 | ## Meaning 9 | 10 | Cluster has overcommitted CPU resource requests for Pods 11 | and cannot tolerate node failure. 12 | 13 |
14 | Full context 15 | 16 | Total number of CPU requests for pods exceeds cluster capacity. 17 | In case of node failure some pods will not fit in the remaining nodes. 18 | 19 |
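A rough sketch of how to compare requested CPU with allocatable capacity per node, using the allocation summary from `kubectl describe`:

```shell
# "Allocated resources" lists requested CPU (and memory) versus allocatable capacity.
$ kubectl describe nodes | grep -A 5 'Allocated resources'
```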
20 | 21 | ## Impact 22 | 23 | The cluster cannot tolerate node failure. In the event of a node failure, some Pods will be in `Pending` state. 24 | 25 | ## Diagnosis 26 | 27 | - Check if CPU resource requests are adjusted to the app usage 28 | - Check if some nodes are available and not cordoned 29 | - Check if cluster-autoscaler has issues with adding new nodes 30 | 31 | ## Mitigation 32 | 33 | - Add more nodes to the cluster - usually it is better to have more smaller 34 | nodes, than few bigger. 35 | 36 | - Add different node pools with different instance types to avoid problem 37 | when using only one instance type in the cloud. 38 | 39 | - Use pod priorities to avoid important services from losing performance, 40 | see [pod priority and preemption](https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/) 41 | 42 | - Fine tune settings for special pods used with [cluster-autoscaler](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/FAQ.md#how-does-cluster-autoscaler-work-with-pod-priority-and-preemption) 43 | 44 | - Prepare performance tests for the expected workload, plan cluster capacity 45 | accordingly. 46 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeCPUQuotaOvercommit.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube CPU Quota Overcommit 3 | weight: 20 4 | --- 5 | 6 | # KubeCPUQuotaOvercommit 7 | 8 | ## Meaning 9 | 10 | Cluster has overcommitted CPU resource requests for Namespaces and cannot tolerate node failure. 11 | 12 | ## Impact 13 | 14 | In the event of a node failure, some Pods will be in `Pending` state due to a lack of available CPU resources. 15 | 16 | ## Diagnosis 17 | 18 | - Check if CPU resource requests are adjusted to the app usage 19 | - Check if some nodes are available and not cordoned 20 | - Check if cluster-autoscaler has issues with adding new nodes 21 | - Check if the given namespace usage grows in time more than expected 22 | 23 | ## Mitigation 24 | 25 | - Review existing quota for given namespace and adjust it accordingly. 26 | 27 | - Add more nodes to the cluster - usually it is better to have more smaller 28 | nodes, than few bigger. 29 | 30 | - Add different node pools with different instance types to avoid problem 31 | when using only one instance type in the cloud. 32 | 33 | - Use pod priorities to avoid important services from losing performance, 34 | see [pod priority and preemption](https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/) 35 | 36 | - Fine tune settings for special pods used with [cluster-autoscaler](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/FAQ.md#how-does-cluster-autoscaler-work-with-pod-priority-and-preemption) 37 | 38 | - Prepare performance tests for the expected workload, plan cluster capacity 39 | accordingly. 40 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeClientCertificateExpiration.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube Client Certificate Expiration 3 | weight: 20 4 | --- 5 | 6 | # KubeClientCertificateExpiration 7 | 8 | ## Meaning 9 | 10 | A client certificate used to authenticate to the apiserver is expiring in less than 7 days (warning alert) or 24 hours (critical alert). 11 | 12 | ## Impact 13 | 14 | Client will not be able to interact with the cluster. 
15 | In cluster services communicating with Kubernetes API may degrade or become unavailable. 16 | 17 | ## Diagnosis 18 | 19 | Check when certificate was issued and when it expires. 20 | Check serviceAccounts and service account tokens. 21 | 22 | ## Mitigation 23 | 24 | Update client certificate. 25 | 26 | For in-cluster clients recreate pods. 27 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeClientErrors.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube Client Errors 3 | weight: 20 4 | --- 5 | 6 | # KubeClientErrors 7 | 8 | ## Meaning 9 | 10 | Kubernetes API server client is experiencing over 1% error rate in the last 15 minutes. 11 | 12 | ## Impact 13 | 14 | Specific kubernetes client may malfunction. Service degradation. 15 | 16 | ## Diagnosis 17 | 18 | Usual issues: 19 | 20 | - networking errors 21 | - too low resources to perform given API calls (usually too low CPU/memory requests) 22 | - wrong api client (old libraries) 23 | - investigate if the app does not request more data than it really requires 24 | from kubernetes API, for example it has too wide permissions and scans for 25 | resources in all namespaces. 26 | 27 | Check logs from client side (sometimes app logs). 28 | 29 | 30 | ## Mitigation 31 | 32 | TODO 33 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeContainerWaiting.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube Container Waiting 3 | weight: 20 4 | --- 5 | 6 | # KubeContainerWaiting 7 | 8 | ## Meaning 9 | 10 | Container in pod is in Waiting state for too long. 11 | 12 | ## Impact 13 | 14 | Service degradation or unavailability. 15 | 16 | ## Diagnosis 17 | 18 | - Check pod events via `kubectl -n $NAMESPACE describe pod $POD`. 19 | - Check pod logs via `kubectl -n $NAMESPACE logs $POD -c $CONTAINER` 20 | - Check for missing files such as configmaps/secrets/volumes 21 | - Check for pod requests, especially special ones such as GPU. 22 | - Check for node taints and capabilities. 23 | 24 | ## Mitigation 25 | 26 | See [Container waiting](https://kubernetes.io/docs/tasks/debug-application-cluster/debug-application/#my-pod-stays-waiting) 27 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeControllerManagerDown.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube Controller Manager Down 3 | weight: 20 4 | --- 5 | 6 | # KubeControllerManagerDown 7 | 8 | ## Meaning 9 | 10 | KubeControllerManager has disappeared from Prometheus target discovery. 11 | 12 | ## Impact 13 | 14 | The cluster is not functional and Kubernetes resources cannot be reconciled. 15 | 16 |
17 | Full context 18 | 19 | More about kube-controller-manager function can be found at https://kubernetes.io/docs/reference/command-line-tools-reference/kube-controller-manager/ 20 | 21 |
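A first check, assuming a kubeadm-style cluster where the controller manager runs as a static pod labelled `component=kube-controller-manager` (on managed control planes these pods are not visible):

```shell
$ kubectl -n kube-system get pods -l component=kube-controller-manager
$ kubectl -n kube-system logs -l component=kube-controller-manager --tail=100
```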
22 | 23 | ## Diagnosis 24 | 25 | TODO 26 | 27 | ## Mitigation 28 | 29 | See old CoreOS docs in [Web Archive](http://web.archive.org/web/20201026205154/https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html) 30 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeDaemonSetMisScheduled.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube DaemonSet MisScheduled 3 | weight: 20 4 | --- 5 | 6 | # KubeDaemonSetMisScheduled 7 | 8 | ## Meaning 9 | 10 | A number of pods of daemonset are running where they are not supposed to run. 11 | 12 | ## Impact 13 | 14 | Service degradation or unavailability. 15 | Excessive resource usage where they could be used by other apps. 16 | 17 | ## Diagnosis 18 | 19 | Usually happens when specifying wrong pod nodeSelector/taints/affinities or 20 | node (node pools) were tainted and existing pods were not scheduled for eviction. 21 | 22 | - Check daemonset status via `kubectl -n $NAMESPACE describe daemonset $NAME`. 23 | - Check [DaemonSet update strategy](https://kubernetes.io/docs/tasks/manage-daemon/update-daemon-set/) 24 | - Check the status of the pods which belong to the replica sets under the deployment. 25 | - Check pod template parameters such as: 26 | - pod priority - maybe it was evicted by other more important pods 27 | - affinity rules - maybe due to affinities and not enough nodes it is not 28 | possible to schedule pods 29 | - Check node taints and labels 30 | - Check logs for [node-feature-discovery](https://kubernetes-sigs.github.io/node-feature-discovery/master/get-started/index.html) 31 | and other supporting tools such as gpu-feature-discovery 32 | 33 | ## Mitigation 34 | 35 | Update DaemonSet and apply change, delete pods manually. 36 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeDaemonSetNotScheduled.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube DaemonSet Not Scheduled 3 | weight: 20 4 | --- 5 | 6 | # KubeDaemonSetNotScheduled 7 | 8 | ## Meaning 9 | 10 | A number of pods of daemonset are not scheduled. 11 | 12 | ## Impact 13 | 14 | Service degradation or unavailability. 15 | 16 | ## Diagnosis 17 | 18 | Usually happens when specifying wrong pod taints/affinities or lack of 19 | resources on the nodes. 20 | 21 | - Check daemonset status via `kubectl -n $NAMESPACE describe daemonset $NAME`. 22 | - Check [DaemonSet update strategy](https://kubernetes.io/docs/tasks/manage-daemon/update-daemon-set/) 23 | - Check the status of the pods which belong to the replica sets under the deployment. 24 | - Check pod template parameters such as: 25 | - pod priority - maybe it was evicted by other more important pods 26 | - resources - maybe it tries to use unavailable resource, such as GPU but 27 | there is limited number of nodes with GPU 28 | - affinity rules - maybe due to affinities and not enough nodes it is not 29 | possible to schedule pods 30 | - Check if Horizontal Pod Autoscaler (HPA) is not triggered due to untested 31 | values (requests values). 32 | - Check if cluster-autoscaler is able to create new nodes - see its logs or 33 | cluster-autoscaler status configmap. 34 | 35 | ## Mitigation 36 | 37 | Set proper priority class for important dameonsets to system-node-critical. 
38 | 39 | See [DaemonSet rolling update is stuck](https://kubernetes.io/docs/tasks/manage-daemon/update-daemon-set/#daemonset-rolling-update-is-stuck) 40 | 41 | In some rare cases you may need to change node affinities or delete pod 42 | manually if this is special daemonset which has specific pod priority class 43 | and is limited to only 1 replica (so it runs on specific node only) 44 | 45 | See [Debugging Pods](https://kubernetes.io/docs/tasks/debug-application-cluster/debug-application/#debugging-pods) 46 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeDaemonSetRolloutStuck.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube DaemonSet Rollout Stuck 3 | weight: 20 4 | --- 5 | 6 | # KubeDaemonSetRolloutStuck 7 | 8 | ## Meaning 9 | 10 | DaemonSet update is stuck waiting for replaced pod. 11 | 12 | 13 | ## Impact 14 | 15 | Service degradation or unavailability. 16 | 17 | ## Diagnosis 18 | 19 | - Check daemonset status via `kubectl -n $NAMESPACE describe daemonset $NAME`. 20 | - Check [DaemonSet update strategy](https://kubernetes.io/docs/tasks/manage-daemon/update-daemon-set/) 21 | - Check the status of the pods which belong to the replica sets under the deployment. 22 | - Check pod template parameters such as: 23 | - pod priority - maybe it was evicted by other more important pods 24 | - resources - maybe it tries to use unavailable resource, such as GPU but 25 | there is limited number of nodes with GPU 26 | - affinity rules - maybe due to affinities and not enough nodes it is not 27 | possible to schedule pods 28 | - pod termination grace period - if too long then pods may be for too long 29 | in terminating state 30 | - Check if Horizontal Pod Autoscaler (HPA) is not triggered due to untested 31 | values (requests values). 32 | - Check if cluster-autoscaler is able to create new nodes - see its logs or 33 | cluster-autoscaler status configmap. 34 | 35 | ## Mitigation 36 | 37 | See [DaemonSet rolling update is stuck](https://kubernetes.io/docs/tasks/manage-daemon/update-daemon-set/#daemonset-rolling-update-is-stuck) 38 | 39 | In some rare cases you may need to change node affinities or delete pod 40 | manually if this is special daemonset 41 | which has pod priority class system-cluster-critical and is limited to only 42 | 1 replica (so it runs on specific node only) 43 | 44 | See [Debugging Pods](https://kubernetes.io/docs/tasks/debug-application-cluster/debug-application/#debugging-pods) 45 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeDeploymentGenerationMismatch.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube Deployment Generation Mismatch 3 | weight: 20 4 | --- 5 | 6 | # KubeDeploymentGenerationMismatch 7 | 8 | ## Meaning 9 | 10 | Deployment generation mismatch due to possible roll-back. 11 | 12 | ## Impact 13 | 14 | Service degradation or unavailability. 15 | 16 | ## Diagnosis 17 | 18 | See [Kubernetes Docs - Failed Deployment](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#failed-deployment) 19 | 20 | - Check out rollout history `kubectl -n $NAMESPACE rollout history deployment $NAME` 21 | - Check rollout status if it is not paused 22 | - Check deployment status via `kubectl -n $NAMESPACE describe deployment $NAME`. 23 | - Check how many replicas are there declared. 
24 | - Investigate if new pods are not crashing. 25 | - Check the status of the pods which belong to the replica sets under the deployment. 26 | - Check pod template parameters such as: 27 | - pod priority - maybe it was evicted by other more important pods 28 | - resources - maybe it tries to use unavailable resource, such as GPU 29 | but there is limited number of nodes with GPU 30 | - affinity rules - maybe due to affinities and not enough nodes it is 31 | not possible to schedule pods 32 | - pod termination grace period - if too long then pods may be for too long 33 | in terminating state 34 | - Check if Horizontal Pod Autoscaler (HPA) is not triggered due to untested 35 | values (requests values). 36 | - Check if cluster-autoscaler is able to create new nodes - see its logs or 37 | cluster-autoscaler status configmap. 38 | 39 | ## Mitigation 40 | 41 | Depending on the conditions usually adding new nodes solves the issue. 42 | 43 | Otherwise probably deployment or HPA definition needs to be fixed. 44 | If you can not add nodes then you can change rolling update strategy to `Recreate`. 45 | Sometimes manually deleting pod helps :) 46 | 47 | In rare cases roll back to previous version - see [Kubernetes Docs - Rolling Back](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#rolling-back-to-a-previous-revision) 48 | 49 | In extremely rare situations scale oldest ReplicaSets to 0 and delete them. 50 | 51 | See [Debugging Pods](https://kubernetes.io/docs/tasks/debug-application-cluster/debug-application/#debugging-pods) 52 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeDeploymentReplicasMismatch.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube Deployment Replicas Mismatch 3 | weight: 20 4 | --- 5 | 6 | # KubeDeploymentReplicasMismatch 7 | 8 | ## Meaning 9 | 10 | Deployment has not matched the expected number of replicas. 11 | 12 |
13 | Full context 14 | 15 | The Kubernetes Deployment resource does not have the number of replicas that was 16 | declared to be in operation. 17 | For example, the deployment is expected to have 3 replicas, but it has had fewer than 18 | that for a noticeable period of time. 19 | 20 | On rare occasions there may be more replicas than expected and the system did 21 | not clean them up. 22 |
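A quick way to compare declared and available replicas, using the usual `$NAMESPACE`/`$NAME` placeholders (a sketch):

```shell
# Prints "<desired> <available>", e.g. "3 1" when two replicas are missing.
$ kubectl -n $NAMESPACE get deployment $NAME -o jsonpath='{.spec.replicas} {.status.availableReplicas}{"\n"}'
```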
23 | 24 | ## Impact 25 | 26 | Service degradation or unavailability. 27 | 28 | ## Diagnosis 29 | 30 | - Check deployment status via `kubectl -n $NAMESPACE describe deployment $NAME`. 31 | - Check how many replicas are there declared. 32 | - Check the status of the pods which belong to the replica sets under the deployment. 33 | - Check pod template parameters such as: 34 | - pod priority - maybe it was evicted by other more important pods 35 | - resources - maybe it tries to use unavailable resource, such as GPU 36 | but there is limited number of nodes with GPU 37 | - affinity rules - maybe due to affinities and not enough nodes it is 38 | not possible to schedule pods 39 | - pod termination grace period - if too long then pods may be for too long 40 | in terminating state 41 | - Check if Horizontal Pod Autoscaler (HPA) is not triggered due to untested 42 | values (requests values). 43 | - Check if cluster-autoscaler is able to create new nodes - see its logs or 44 | cluster-autoscaler status configmap. 45 | 46 | ## Mitigation 47 | 48 | Depending on the conditions usually adding new nodes solves the issue. 49 | 50 | Otherwise probably deployment or HPA definition needs to be fixed. 51 | 52 | See [Debugging Pods](https://kubernetes.io/docs/tasks/debug-application-cluster/debug-application/#debugging-pods) 53 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeHpaMaxedOut.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube HPA Maxed Out 3 | weight: 20 4 | --- 5 | 6 | # KubeHpaMaxedOut 7 | 8 | ## Meaning 9 | 10 | Horizontal Pod Autoscaler has been running at max replicas for longer 11 | than 15 minutes. 12 | 13 | ## Impact 14 | 15 | Horizontal Pod Autoscaler won't be able to add new pods and thus scale application. 16 | **Notice** for some services maximizing HPA is in fact desired. 17 | 18 | ## Diagnosis 19 | 20 | Check why HPA was unable to scale: 21 | 22 | - max replicas too low 23 | - too low value for requests such as CPU? 24 | 25 | ## Mitigation 26 | 27 | If using basic metrics like CPU/Memory then ensure to set proper values for 28 | `requests`. 29 | For memory based scaling ensure there are no memory leaks. 30 | If using custom metrics then fine-tune how app scales accordingly to it. 31 | 32 | Use performance tests to see how the app scales. 33 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeHpaReplicasMismatch.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube HPA Replicas Mismatch 3 | weight: 20 4 | --- 5 | 6 | # KubeHpaReplicasMismatch 7 | 8 | ## Meaning 9 | 10 | Horizontal Pod Autoscaler has not matched the desired number of replicas for 11 | longer than 15 minutes. 12 | 13 | ## Impact 14 | 15 | HPA was unable to schedule desired number of pods. 16 | 17 | ## Diagnosis 18 | 19 | Check why HPA was unable to scale: 20 | 21 | - not enough nodes in the cluster 22 | - hitting resource quotas in the cluster 23 | - pods evicted due to pod priority 24 | 25 | ## Mitigation 26 | 27 | In case of cluster-autoscaler you may need to set up preemtive pod pools to 28 | ensure nodes are created on time. 
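If cluster-autoscaler is in use, its status ConfigMap shows whether a scale-up is in progress or blocked; the name and namespace below are the defaults and may differ in your installation:

```shell
$ kubectl -n kube-system get configmap cluster-autoscaler-status -o yaml
```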
29 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeJobCompletion.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube Job Completion 3 | weight: 20 4 | --- 5 | 6 | # KubeJobCompletion 7 | 8 | ## Meaning 9 | 10 | Job is taking more than 1h to complete. 11 | 12 | ## Impact 13 | 14 | - Long processing of batch jobs. 15 | - Possible issues with scheduling next Job 16 | 17 | ## Diagnosis 18 | 19 | - Check job via `kubectl -n $NAMESPACE describe jobs $JOB`. 20 | - Check pod events via `kubectl -n $NAMESPACE describe job $JOB`. 21 | 22 | ## Mitigation 23 | 24 | - Give it more resources so it finishes faster, if applicable. 25 | - See [Job patterns](https://kubernetes.io/docs/tasks/job/) 26 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeJobFailed.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube Job Failed 3 | weight: 20 4 | --- 5 | 6 | # KubeJobFailed 7 | 8 | ## Meaning 9 | 10 | Job failed complete. 11 | 12 | ## Impact 13 | 14 | Failure of processing of a scheduled task. 15 | 16 | ## Diagnosis 17 | 18 | - Check job via `kubectl -n $NAMESPACE describe jobs $JOB`. 19 | - Check pod events via `kubectl -n $NAMESPACE describe pod $POD_FROM_JOB`. 20 | - Check pod logs via `kubectl -n $NAMESPACE log pod $POD_FROM_JOB`. 21 | 22 | ## Mitigation 23 | 24 | - See [Debugging Pods](https://kubernetes.io/docs/tasks/debug-application-cluster/debug-application/#debugging-pods) 25 | - See [Job patterns](https://kubernetes.io/docs/tasks/job/) 26 | - redesign job so that it is idempotent (can be re-run many times which will 27 | always produce the same output even if input differs) 28 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeMemoryOvercommit.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube Memory Overcommit 3 | weight: 20 4 | aliases: 5 | - /kubememovercommit/ 6 | --- 7 | 8 | # KubeMemoryOvercommit 9 | 10 | ## Meaning 11 | 12 | Cluster has overcommitted Memory resource requests for Pods 13 | and cannot tolerate node failure. 14 | 15 |
16 | Full context 17 | 18 | Total number of Memory requests for pods exceeds cluster capacity. 19 | In case of node failure some pods will not fit in the remaining nodes. 20 | 21 |
22 | 23 | ## Impact 24 | 25 | The cluster cannot tolerate node failure. In the event of a node failure, 26 | some Pods will be in `Pending` state. 27 | 28 | ## Diagnosis 29 | 30 | - Check if Memory resource requests are adjusted to the app usage 31 | - Check if some nodes are available and not cordoned 32 | - Check if cluster-autoscaler has issues with adding new nodes 33 | 34 | ## Mitigation 35 | 36 | - Add more nodes to the cluster - usually it is better to have more smaller 37 | nodes, than few bigger. 38 | 39 | - Add different node pools with different instance types to avoid problem 40 | when using only one instance type in the cloud. 41 | 42 | - Use pod priorities to avoid important services from losing performance, 43 | see [pod priority and preemption](https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/) 44 | 45 | - Fine tune settings for special pods used with [cluster-autoscaler](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/FAQ.md#how-does-cluster-autoscaler-work-with-pod-priority-and-preemption) 46 | 47 | - Prepare performance tests for the expected workload, plan cluster capacity 48 | accordingly. 49 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeMemoryQuotaOvercommit.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube Memory Quota Overcommit 3 | weight: 20 4 | aliases: 5 | - /kubememquotaovercommit/ 6 | --- 7 | 8 | # KubeMemoryQuotaOvercommit 9 | 10 | ## Meaning 11 | 12 | Cluster has overcommitted memory resource requests for Namespaces. 13 | 14 | ## Impact 15 | 16 | Various services degradation or unavailability in case of single node failure. 17 | 18 | ## Diagnosis 19 | 20 | - Check if Memory resource requests are adjusted to the app usage 21 | - Check if some nodes are available and not cordoned 22 | - Check if cluster-autoscaler has issues with adding new nodes 23 | - Check if the given namespace usage grows in time more than expected 24 | 25 | ## Mitigation 26 | 27 | - Review existing quota for given namespace and adjust it accordingly. 28 | 29 | - Add more nodes to the cluster - usually it is better to have more smaller 30 | nodes, than few bigger. 31 | 32 | - Add different node pools with different instance types to avoid problem 33 | when using only one instance type in the cloud. 34 | 35 | - Use pod priorities to avoid important services from losing performance, 36 | see [pod priority and preemption](https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/) 37 | 38 | - Fine tune settings for special pods used with [cluster-autoscaler](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/FAQ.md#how-does-cluster-autoscaler-work-with-pod-priority-and-preemption) 39 | 40 | - Prepare performance tests for the expected workload, plan cluster capacity 41 | accordingly. 42 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeNodeNotReady.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube Node Not Ready 3 | weight: 20 4 | --- 5 | 6 | # KubeNodeNotReady 7 | 8 | ## Meaning 9 | 10 | KubeNodeNotReady alert is fired when a Kubernetes node is not in `Ready` 11 | state for a certain period. In this case, the node is not able to host any new 12 | pods as described [here][KubeNode]. 
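To quickly list nodes that are currently not `Ready` (a sketch; cordoned but otherwise Ready nodes are not caught by this filter):

```shell
# Ready nodes are filtered out; the header plus any NotReady/Unknown nodes remain.
$ kubectl get nodes -o wide | grep -v ' Ready '
```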
13 | 14 | ## Impact 15 | 16 | The performance of the cluster deployments is affected, depending on the overall 17 | workload and the type of the node. 18 | 19 | ## Diagnosis 20 | 21 | The notification details should list the node that's not ready. For example: 22 | 23 | ```txt 24 | - alertname = KubeNodeNotReady 25 | ... 26 | - node = node1.example.com 27 | ... 28 | ``` 29 | 30 | Log in to the cluster. Check the status of that node: 31 | 32 | ```shell 33 | $ kubectl get node $NODE -o yaml 34 | ``` 35 | 36 | The output should describe why the node isn't ready (e.g. timeouts reaching the 37 | API or kubelet). 38 | 39 | ## Mitigation 40 | 41 | Once the underlying problem has been resolved, the node should become ready again; 42 | if it does not recover, the instance should be terminated and replaced. 43 | 44 | See [KubeNode](https://kubernetes.io/docs/concepts/architecture/nodes/#condition) 45 | See [node problem detector](https://github.com/kubernetes/node-problem-detector) 46 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeNodeReadinessFlapping.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube Node Readiness Flapping 3 | weight: 20 4 | --- 5 | 6 | # KubeNodeReadinessFlapping 7 | 8 | ## Meaning 9 | 10 | The readiness status of a node has changed a few times in the last 15 minutes. 11 | 12 | ## Impact 13 | 14 | The performance of the cluster deployments is affected, depending on the overall 15 | workload and the type of the node. 16 | 17 | ## Diagnosis 18 | 19 | The notification details should list the node whose readiness is flapping. For example: 20 | 21 | ```txt 22 | - alertname = KubeNodeReadinessFlapping 23 | ... 24 | - node = node1.example.com 25 | ... 26 | ``` 27 | 28 | Log in to the cluster. Check the status of that node: 29 | 30 | ```shell 31 | $ kubectl get node $NODE -o yaml 32 | ``` 33 | 34 | The output should describe why the node is not reachable. 35 | 36 | Common failure scenarios: 37 | 38 | - disruptive software upgrades 39 | - network partitioning due to hardware failures 40 | - firewall rules 41 | - virtual machines suspended due to storage area network problems 42 | - system crashes / freezes due to software or hardware malfunctions 43 | 44 | ## Mitigation 45 | 46 | In case of maintenance ensure to [cordon and drain the node](https://kubernetes.io/docs/tasks/administer-cluster/safely-drain-node/). 47 | 48 | In other cases ensure storage and networking redundancy if applicable. 49 | 50 | See [KubeNode](https://kubernetes.io/docs/concepts/architecture/nodes/#condition) 51 | See [node problem detector](https://github.com/kubernetes/node-problem-detector) 52 | See [Watchdog timer](https://en.wikipedia.org/wiki/Watchdog_timer) 53 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeNodeUnreachable.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube Node Unreachable 3 | weight: 20 4 | --- 5 | 6 | # KubeNodeUnreachable 7 | 8 | ## Meaning 9 | 10 | Kubernetes node is unreachable and some workloads may be rescheduled. 11 | 12 | ## Impact 13 | 14 | The performance of the cluster deployments is affected, depending on the overall 15 | workload and the type of the node. 16 | 17 | ## Diagnosis 18 | 19 | The notification details should list the node that's not reachable. For example: 20 | 21 | ```txt 22 | - alertname = KubeNodeUnreachable 23 | ... 24 | - node = node1.example.com 25 | ...
26 | ``` 27 | 28 | Log in to the cluster. Check the status of that node: 29 | 30 | ```shell 31 | $ kubectl get node $NODE -o yaml 32 | ``` 33 | 34 | The output should describe why the node is not reachable. 35 | 36 | Common failure scenarios: 37 | 38 | - disruptive software upgrades 39 | - network partitioning due to hardware failures 40 | - firewall rules 41 | - virtual machines suspended due to storage area network problems 42 | - system crashes / freezes due to software or hardware malfunctions 43 | 44 | ## Mitigation 45 | 46 | In case of maintenance ensure to [cordon and drain the node](https://kubernetes.io/docs/tasks/administer-cluster/safely-drain-node/). 47 | 48 | In other cases ensure storage and networking redundancy if applicable. 49 | 50 | See [KubeNode](https://kubernetes.io/docs/concepts/architecture/nodes/#condition) 51 | See [node problem detector](https://github.com/kubernetes/node-problem-detector) 52 | See [Watchdog timer](https://en.wikipedia.org/wiki/Watchdog_timer) 53 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubePersistentVolumeErrors.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube Persistent Volume Errors 3 | weight: 20 4 | --- 5 | 6 | # KubePersistentVolumeErrors 7 | 8 | ## Meaning 9 | 10 | PersistentVolume is having issues with provisioning. 11 | 12 | ## Impact 13 | 14 | Volume may be unavailable or have data errors (corrupted storage). 15 | 16 | Service degradation, data loss. 17 | 18 | ## Diagnosis 19 | 20 | - Check PV events via `kubectl describe pv $PV`. 21 | - Check storage provider logs. 22 | - Check storage quotas in the cloud. 23 | 24 | ## Mitigation 25 | 26 | In the happy scenario storage is just not provisioned as fast as expected. 27 | In the worst scenario there is data corruption or data loss. Restore from backup. 28 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubePersistentVolumeFillingUp.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube Persistent Volume Filling Up 3 | weight: 20 4 | --- 5 | 6 | # KubePersistentVolumeFillingUp 7 | 8 | ## Meaning 9 | 10 | There can be various reasons why a volume is filling up. 11 | This runbook does not cover application specific reasons, only mitigations 12 | for volumes that are legitimately filling. 13 | 14 | As always, refer to the recommended scenarios for the given service. 15 | 16 | ## Impact 17 | 18 | Service degradation, switching to read only mode. 19 | 20 | ## Diagnosis 21 | 22 | Check app usage over time. 23 | Check if there are any configurations such as snapshotting or automatic data retention. 24 | 25 | ## Mitigation 26 | 27 | ### Data retention 28 | 29 | Deleting no longer needed data is the fastest and the cheapest solution. 30 | 31 | Ask the service owner if specific old data can be deleted. 32 | Enable data retention especially for snapshots, if possible. 33 | 34 | ### Data export 35 | 36 | If data is not needed in the service but needs to be processed later 37 | then send it somewhere else, for example to an S3 bucket. 38 | 39 | ### Data rebalance in the cluster 40 | 41 | Some services automatically rebalance data on the cluster when one node 42 | fills up. 43 | Some allow rebalancing data across existing nodes, others may require 44 | adding new nodes.
45 | If this is supported then increase number of replicas and wait for data 46 | migration or trigger it manually. 47 | 48 | Example services that support this: 49 | 50 | - cassandra 51 | - ceph 52 | - elasticsearch/opensearch 53 | - gluster 54 | - hadoop 55 | - kafka 56 | - minio 57 | 58 | **Notice**: some services may require special scaling conditions such as 59 | adding twice more nodes than exist now. 60 | 61 | ### Direct Volume resizing 62 | 63 | If volume resizing is available, it's easiest to increase the capacity of 64 | the volume. 65 | 66 | To check if volume expansion is available, run this with your namespace 67 | and PVC-name replaced. 68 | 69 | ```shell 70 | $ kubectl get storageclass `kubectl -n get pvc -ojson | jq -r '.spec.storageClassName'` 71 | NAME PROVISIONER RECLAIMPOLICY VOLUMEBINDINGMODE ALLOWVOLUMEEXPANSION AGE 72 | standard (default) kubernetes.io/gce-pd Delete Immediate true 28d 73 | ``` 74 | 75 | In this case `ALLOWVOLUMEEXPANSION` is true, so we can make use of the feature. 76 | 77 | To resize the volume run: 78 | 79 | ```shell 80 | $ kubectl -n edit pvc 81 | ``` 82 | 83 | And edit `.spec.resources.requests.storage` to the new desired storage size. 84 | Eventually the PVC status will say "Waiting for user to (re-)start a pod to 85 | finish file system resize of volume on node." 86 | 87 | You can check this with: 88 | 89 | ```shell 90 | $ kubectl -n get pvc 91 | ``` 92 | 93 | Once the PVC status says to restart the respective pod, run this to restart it 94 | (this automatically finds the pod that mounts the PVC and deletes it, 95 | if you know the pod name, you can also just simply delete that pod): 96 | 97 | ```shell 98 | $ kubectl -n delete pod `kubectl -n get pod -ojson | jq -r '.items[] | select(.spec.volumes[] .persistentVolumeClaim.claimName=="") | .metadata.name'` 99 | ``` 100 | 101 | ### Migrate data to a new, larger volume 102 | 103 | When resizing is not available and the data is not safe to be deleted, 104 | then the only way is to create a larger volume and migrate the data. 105 | 106 | TODO 107 | 108 | ### Purge volume 109 | 110 | When the data is ephemeral and volume expansion is not available, 111 | it may be best to purge the volume. 112 | 113 | **WARNING/DANGER**: This will permanently delete the data on the volume. 114 | Performing these steps is your responsibility. 115 | 116 | TODO 117 | 118 | ### Migrate data to new, larger instance pool in the same cluster 119 | 120 | In very specific scenarios it is better to schedule data migration in the 121 | same cluster but to a new instances. 122 | This is sometimes hard to accomplish due to the way how certain resources 123 | are managed in kubernetes. 124 | 125 | In general procedure is like this: 126 | 127 | - add new nodes with bigger capacity than existing cluster 128 | - trigger data migration 129 | - scale in to 0 old instance pool and after that delete it. 130 | 131 | ### Migrate data to new, larger cluster 132 | 133 | This is most common scenario, but is much more expensive and may be a bit 134 | time consuming. 135 | Also sometimes this causes split brain issues when writing. 
136 | 137 | In general procedure is like this, this is only a suggestion, though: 138 | 139 | - create data snapshot on existing cluster 140 | - add new cluster with bigger capacity than existing cluster 141 | - start data restore on new cluster based on the snapshot 142 | - switch old cluster to read only mode 143 | - reconfigure networking to point to new cluster 144 | - trigger data migration from old cluster to new cluster to sync difference 145 | between snapshot and latest writes 146 | - remove old cluster 147 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubePodCrashLooping.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube Pod Crash Looping 3 | weight: 20 4 | --- 5 | 6 | # KubePodCrashLooping 7 | 8 | ## Meaning 9 | 10 | Pod is in CrashLoop which means the app dies or is unresponsive and 11 | kubernetes tries to restart it automatically. 12 | 13 | ## Impact 14 | 15 | Service degradation or unavailability. 16 | Inability to do rolling upgrades. 17 | Certain apps will not perform required tasks such as data migrations. 18 | 19 | ## Diagnosis 20 | 21 | - Check template via `kubectl -n $NAMESPACE get pod $POD`. 22 | - Check pod events via `kubectl -n $NAMESPACE describe pod $POD`. 23 | - Check pod logs via `kubectl -n $NAMESPACE logs $POD -c $CONTAINER` 24 | - Check pod template parameters such as: 25 | - pod priority 26 | - resources - maybe it tries to use unavailable resource, such as GPU but 27 | there is limited number of nodes with GPU 28 | - readiness and liveness probes may be incorrect - wrong port or command, 29 | check is failing too fast due to short timeout for response 30 | 31 | Other things to check: 32 | 33 | - app responding extremely slow due to resource constraints such as memory too 34 | low, not enough CPU which is required on start 35 | - app waits for other services to start, such as database 36 | - misconfiguration causing app crash on start 37 | - missing files such as configmaps/secrets/volumes 38 | - read only filesystem 39 | - wrong user permissions in container 40 | - lack of special container capabilities (securityContext) 41 | - app is executed in different directory than expected 42 | (for example WORKDIR from Docerkfile is not used in OpenShift) 43 | 44 | ## Mitigation 45 | 46 | Talk with developers or read documentation about the app, ensure to define 47 | sane default values to start the app. 48 | 49 | See [Debugging Pods](https://kubernetes.io/docs/tasks/debug-application-cluster/debug-application/#debugging-pods) 50 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubePodNotReady.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube Pod Not Ready 3 | weight: 20 4 | --- 5 | 6 | # KubePodNotReady 7 | 8 | ## Meaning 9 | 10 | Pod has been in a non-ready state for more than 15 minutes. 11 | 12 | State Running but not ready means readiness probe fails. 13 | State Pending means pod can not be created for specific namespace and node. 14 | 15 |
16 | Full context 17 | 18 | Pod failed to reach ready state, depending on the readiness/liveness probes. 19 | See [pod-lifecycle](https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/) 20 | 21 |
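The failing condition is often visible directly in the Pod status; a sketch using the usual placeholders:

```shell
# Prints each condition as "<Type>=<Status> <message>", e.g. "Ready=False containers with unready status: ...".
$ kubectl -n $NAMESPACE get pod $POD -o jsonpath='{range .status.conditions[*]}{.type}={.status} {.message}{"\n"}{end}'
```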
22 | 23 | ## Impact 24 | 25 | Service degradation or unavailability. 26 | Pod not attached to service, thus not getting any traffic. 27 | 28 | ## Diagnosis 29 | 30 | - Check template via `kubectl -n $NAMESPACE get pod $POD`. 31 | - Check pod events via `kubectl -n $NAMESPACE describe pod $POD`. 32 | - Check pod logs via `kubectl -n $NAMESPACE logs $POD -c $CONTAINER` 33 | - Check pod template parameters such as: 34 | - pod priority 35 | - resources - maybe it tries to use unavailable resource, such as GPU but 36 | there is limited number of nodes with GPU 37 | - readiness and liveness probes may be incorrect - wrong port or command, 38 | check is failing too fast due to short timeout for response 39 | - stuck or long running init containers 40 | 41 | Other things to check: 42 | 43 | - app responding extremely slow due to resource constraints such as memory too 44 | low, not enough CPU which is required on start 45 | - app waits for other services to start, such as database 46 | - misconfiguration causing app crash on start 47 | - missing files such as configmaps/secrets/volumes 48 | - read only filesystem 49 | - wrong user permissions in container 50 | - lack of special container capabilities (securityContext) 51 | - app is executed in different directory than expected 52 | (for example WORKDIR from Docerkfile is not used in OpenShift) 53 | 54 | ## Mitigation 55 | 56 | Talk with developers or read documentation about the app, ensure to define 57 | sane default values to start the app. 58 | 59 | See [Debugging Pods](https://kubernetes.io/docs/tasks/debug-application-cluster/debug-application/#debugging-pods) 60 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeProxyDown.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: KubeProxy Down 3 | weight: 20 4 | --- 5 | 6 | # KubeProxyDown 7 | 8 | ## Meaning 9 | 10 | The `KubeProxyDown` alert is triggered when all Kubernetes Proxy instances have not 11 | been reachable by the monitoring system for more than 15 minutes. 12 | 13 | ## Impact 14 | 15 | kube-proxy is a network proxy that runs on each node in your cluster, 16 | implementing part of the Kubernetes Service concept. 17 | 18 | kube-proxy maintains network rules on nodes. 19 | These network rules allow network communication to your Pods 20 | from network sessions inside or outside of your cluster. 21 | 22 | kube-proxy uses the operating system packet filtering layer if 23 | there is one and it's available. Otherwise, kube-proxy forwards the traffic 24 | itself. 25 | 26 | ## Diagnosis 27 | 28 | Check the status of the `kube-proxy` daemon sets in the `kube-system` namespace. 29 | 30 | ```console 31 | kubectl get pods -l k8s-app=kube-proxy -n kube-system 32 | ``` 33 | 34 | Check the specific daemon-set for logs with the following command: 35 | 36 | ```console 37 | kubectl logs -n kube-system kube-proxy-b9g23 38 | ``` 39 | 40 | ## Mitigation 41 | 42 | ### AWS EKS 43 | If you are running AWS EKS cluster and you find that the `kube-proxy` pods are all running normally, make sure to update the `kube-proxy-config` cm as shown below. 44 | 45 | ```console 46 | kubectl edit cm -n kube-system kube-proxy-config 47 | ... 48 | metricsBindAddress: 0.0.0.0:10249 49 | ... 50 | ``` 51 | This setting configures the IP address with port for the metrics server to serve on (set to '0.0.0.0:10249' for all IPv4 interfaces and '[::]:10249' for all IPv6 interfaces). 
More information on the [documentation page](https://kubernetes.io/docs/reference/command-line-tools-reference/kube-proxy/) 52 | 53 | Then just go delete `kube-proxy` pods and new ones will be created automatically. 54 | 55 | ```console 56 | kubectl delete pod -l k8s-app=kube-proxy -n kube-system 57 | ``` 58 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeQuotaAlmostFull.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube Quota Almost Full 3 | weight: 20 4 | --- 5 | 6 | # KubeQuotaAlmostFull 7 | 8 | ## Meaning 9 | 10 | Cluster reaches to the allowed limits for given namespace. 11 | 12 | ## Impact 13 | 14 | In the future deployments may not be possbile. 15 | 16 | ## Diagnosis 17 | 18 | - Check resource usage for the namespace in given time span 19 | 20 | ## Mitigation 21 | 22 | - Review existing quota for given namespace and adjust it accordingly. 23 | - Review resources used by the quota and fine tune them. 24 | - Continue with standard capacity planning procedures. 25 | - See [Quotas](https://kubernetes.io/docs/concepts/policy/resource-quotas/) 26 | 27 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeQuotaExceeded.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube Quota Exceeded 3 | weight: 20 4 | --- 5 | 6 | # KubeQuotaExceeded 7 | 8 | ## Meaning 9 | 10 | Cluster reaches to the allowed hard limits for given namespace. 11 | 12 | ## Impact 13 | 14 | Inability to create resources in kubernetes. 15 | 16 | ## Diagnosis 17 | 18 | - Check resource usage for the namespace in given time span 19 | 20 | ## Mitigation 21 | 22 | - Review existing quota for given namespace and adjust it accordingly. 23 | - Review resources used by the quota and fine tune them. 24 | - Continue with standard capacity planning procedures. 25 | - See [Quotas](https://kubernetes.io/docs/concepts/policy/resource-quotas/) 26 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeQuotaFullyUsed.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube Quota Fully Used 3 | weight: 20 4 | --- 5 | 6 | # KubeQuotaFullyUsed 7 | 8 | ## Meaning 9 | 10 | Cluster reached allowed limits for given namespace. 11 | 12 | ## Impact 13 | 14 | New app installations may not be possible. 15 | 16 | ## Diagnosis 17 | 18 | - Check resource usage for the namespace in given time span 19 | 20 | ## Mitigation 21 | 22 | - Review existing quota for given namespace and adjust it accordingly. 23 | - Review resources used by the quota and fine tune them. 24 | - Continue with standard capacity planning procedures. 25 | - See [Quotas](https://kubernetes.io/docs/concepts/policy/resource-quotas/) 26 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeSchedulerDown.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube Scheduler Down 3 | weight: 20 4 | --- 5 | 6 | # KubeSchedulerDown 7 | 8 | ## Meaning 9 | 10 | Kube Scheduler has disappeared from Prometheus target discovery. 11 | 12 | ## Impact 13 | 14 | This is a critical alert. The cluster may partially or fully non-functional. 15 | 16 | ## Diagnosis 17 | 18 | To be added. 
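As a starting point, assuming a kubeadm-style cluster where the scheduler runs as a static pod and uses Lease-based leader election (adjust or skip on managed control planes):

```shell
$ kubectl -n kube-system get pods -l component=kube-scheduler
$ kubectl -n kube-system logs -l component=kube-scheduler --tail=100
# Check which instance (if any) currently holds the scheduler leader lease.
$ kubectl -n kube-system get lease kube-scheduler -o yaml
```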
19 | 20 | ## Mitigation 21 | 22 | See old CoreOS docs in [Web Archive](http://web.archive.org/web/20201026205154/https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html) 23 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeStatefulSetGenerationMismatch.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube StatefulSet Generation Mismatch 3 | weight: 20 4 | --- 5 | 6 | # KubeStatefulSetGenerationMismatch 7 | 8 | ## Meaning 9 | 10 | StatefulSet generation mismatch due to possible roll-back. 11 | 12 | ## Impact 13 | 14 | Service degradation or unavailability. 15 | 16 | ## Diagnosis 17 | 18 | See [Kubernetes Docs - Failed Deployment](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#failed-deployment), 19 | which can also be applied to StatefulSets to some extent. 20 | 21 | - Check the rollout history via `kubectl -n $NAMESPACE rollout history statefulset $NAME`. 22 | - Check the rollout status and whether it has been paused. 23 | - Check statefulset status via `kubectl -n $NAMESPACE describe statefulset $NAME`. 24 | - Check how many replicas are declared. 25 | - Investigate whether new pods are crashing. 26 | - Look for issues with PersistentVolumes attached to the StatefulSet. 27 | - Check the status of the pods which belong to the statefulset. 28 | - Check pod template parameters such as: 29 | - pod priority - maybe it was evicted by other, more important pods 30 | - resources - maybe it tries to use an unavailable resource, such as a GPU, 31 | but there is only a limited number of nodes with GPUs 32 | - affinity rules - maybe due to affinities and not enough nodes it is 33 | not possible to schedule pods 34 | - pod termination grace period - if too long, pods may stay in the 35 | terminating state for too long 36 | - Check whether the Horizontal Pod Autoscaler (HPA) is being triggered due to untested 37 | values (resource requests). 38 | - Check whether cluster-autoscaler is able to create new nodes - see its logs or 39 | the cluster-autoscaler status configmap. 40 | 41 | ## Mitigation 42 | 43 | StatefulSets are quite specific and usually run special scripts on pod termination. 44 | See if special commands are executed, such as data migration, which may significantly slow down progress. 45 | 46 | In case of a scale-out, adding new nodes usually solves the issue. 47 | 48 | Otherwise, the statefulset definition probably needs to be fixed. 49 | 50 | In rare cases, roll back to the previous version - see [Kubernetes Docs - Rolling Back](https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/#rolling-updates) 51 | 52 | In extremely rare situations it may be better to delete problematic pods. 53 | 54 | See [Debugging Pods](https://kubernetes.io/docs/tasks/debug-application-cluster/debug-application/#debugging-pods) 55 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeStatefulSetReplicasMismatch.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube StatefulSet Replicas Mismatch 3 | weight: 20 4 | --- 5 | 6 | # KubeStatefulSetReplicasMismatch 7 | 8 | ## Meaning 9 | 10 | StatefulSet has not matched the expected number of replicas. 11 | 12 |
13 | Full context 14 | 15 | The Kubernetes StatefulSet resource does not have the number of replicas that were 16 | declared to be in operation. 17 | For example, the statefulset is expected to have 3 replicas, but it has fewer than 18 | that for a noticeable period of time. 19 | 20 | On rare occasions there may be more replicas than there should be and the system did not 21 | clean them up. 22 |
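A quick way to compare the declared and ready replica counts (a minimal sketch; `$NAMESPACE` and `$NAME` are placeholders taken from the alert labels, and the `app` label selector is an assumption - adjust it to the statefulset's actual selector):

```console
kubectl -n $NAMESPACE get statefulset $NAME          # READY column shows ready/desired
kubectl -n $NAMESPACE get pods -l app=$NAME -o wide  # assumes an "app" label; use the statefulset's real selector
```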
23 | 24 | ## Impact 25 | 26 | Service degradation or unavailability. 27 | 28 | ## Diagnosis 29 | 30 | - Check the statefulset via `kubectl -n $NAMESPACE describe statefulset $NAME`. 31 | - Check how many replicas are declared. 32 | - Check the status of the pods which belong to the 33 | statefulset. 34 | - Check pod template parameters such as: 35 | - pod priority - maybe it was evicted by other, more important pods 36 | - resources - maybe it tries to use an unavailable resource, such as a GPU, but 37 | there is only a limited number of nodes with GPUs 38 | - affinity rules - maybe due to affinities and not enough nodes it is 39 | not possible to schedule pods 40 | - pod termination grace period - if too long, pods may stay in the 41 | terminating state for too long 42 | - Check if there are issues with attaching disks to the statefulset - for example 43 | the disk was in Zone A, but the pod is scheduled in Zone B. 44 | - Check whether the Horizontal Pod Autoscaler (HPA) is being triggered due to untested 45 | values (resource requests). 46 | - Check whether cluster-autoscaler is able to create new nodes - see its logs or 47 | the cluster-autoscaler status configmap. 48 | 49 | ## Mitigation 50 | 51 | Depending on the conditions, adding new nodes usually solves the issue. 52 | 53 | Set proper affinity rules to schedule pods in the same zone to avoid issues 54 | with volumes. 55 | 56 | See [Debugging Pods](https://kubernetes.io/docs/tasks/debug-application-cluster/debug-application/#debugging-pods) 57 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeStatefulSetUpdateNotRolledOut.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube StatefulSet Update Not Rolled Out 3 | weight: 20 4 | --- 5 | 6 | # KubeStatefulSetUpdateNotRolledOut 7 | 8 | ## Meaning 9 | 10 | StatefulSet update has not been rolled out. 11 | 12 | ## Impact 13 | 14 | Service degradation or unavailability. 15 | 16 | ## Diagnosis 17 | 18 | - Check the statefulset via `kubectl -n $NAMESPACE describe statefulset $NAME`. 19 | - Check whether the statefulset update was paused manually (see status). 20 | - Check how many replicas are declared. 21 | - Check the status of the pods which belong to the 22 | statefulset. 23 | - Check pod template parameters such as: 24 | - pod priority - maybe it was evicted by other, more important pods 25 | - resources - maybe it tries to use an unavailable resource, such as a GPU, but 26 | there is only a limited number of nodes with GPUs 27 | - affinity rules - maybe due to affinities and not enough nodes it is 28 | not possible to schedule pods 29 | - pod termination grace period - if too long, pods may stay in the 30 | terminating state for too long 31 | - Check if there are issues with attaching disks to the statefulset - for example 32 | the disk was in Zone A, but the pod is scheduled in Zone B. 33 | - Check whether the Horizontal Pod Autoscaler (HPA) is being triggered due to untested 34 | values (resource requests). 35 | - Check whether cluster-autoscaler is able to create new nodes - see its logs or 36 | the cluster-autoscaler status configmap.
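To see how far the update has progressed, a minimal sketch (`$NAMESPACE` and `$NAME` are placeholders taken from the alert labels):

```console
kubectl -n $NAMESPACE rollout status statefulset $NAME
kubectl -n $NAMESPACE rollout history statefulset $NAME
kubectl -n $NAMESPACE get statefulset $NAME -o jsonpath='{.status.updatedReplicas}/{.spec.replicas}{"\n"}'
```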
37 | 38 | ## Mitigation 39 | 40 | TODO 41 | 42 | See [Debugging Pods](https://kubernetes.io/docs/tasks/debug-application-cluster/debug-application/#debugging-pods) 43 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeVersionMismatch.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kube Version Mismatch 3 | weight: 20 4 | --- 5 | 6 | # KubeVersionMismatch 7 | 8 | ## Meaning 9 | 10 | Different semantic versions of Kubernetes components are running. 11 | This usually happens during the Kubernetes cluster upgrade process. 12 | 13 |
14 | Full context 15 | 16 | Kubernetes control plane nodes or worker nodes use different versions. 17 | This usually happens when a Kubernetes cluster is upgraded between minor or 18 | major versions. 19 | 20 |
21 | 22 | ## Impact 23 | 24 | Incompatible API versions between Kubernetes components may cause a very 25 | broad range of issues, affecting anything from single containers, through app stability, 26 | up to whole-cluster stability. 27 | 28 | ## Diagnosis 29 | 30 | - Check existing Kubernetes versions via `kubectl get nodes` and look at the 31 | VERSION column 32 | - Check if there is an ongoing Kubernetes upgrade - especially in managed services 33 | in the cloud 34 | 35 | ## Mitigation 36 | 37 | - Drain affected nodes, then upgrade or replace them with newer ones, 38 | see [Safely drain node](https://kubernetes.io/docs/tasks/administer-cluster/safely-drain-node/) 39 | 40 | - Ensure proper control plane and node pool versions are set when 41 | creating clusters. 42 | - Enable automatic cluster updates for the control plane and node pools. 43 | - Set proper maintenance windows for the clusters. 44 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeletClientCertificateExpiration.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kubelet Client Certificate Expiration 3 | weight: 20 4 | --- 5 | 6 | # KubeletClientCertificateExpiration 7 | 8 | ## Meaning 9 | 10 | The client certificate for the Kubelet on a node expires soon or has already expired. 11 | 12 | ## Impact 13 | 14 | The node will not be able to be used within the cluster. 15 | 16 | ## Diagnosis 17 | 18 | Check when the certificate was issued and when it expires. 19 | 20 | ## Mitigation 21 | 22 | Update certificates on the cluster control plane nodes and the worker nodes. 23 | Refer to the documentation of the tool used to create the cluster. 24 | 25 | Another option is to delete the node if only one is affected. 26 | 27 | In extreme situations, recreate the cluster. 28 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeletClientCertificateRenewalErrors.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kubelet Client Certificate Renewal Errors 3 | weight: 20 4 | --- 5 | 6 | # KubeletClientCertificateRenewalErrors 7 | 8 | ## Meaning 9 | 10 | The Kubelet on a node has failed to renew its client certificate 11 | (XX errors in the last 15 minutes). 12 | 13 | ## Impact 14 | 15 | The node will not be able to be used within the cluster. 16 | 17 | ## Diagnosis 18 | 19 | Check when the certificate was issued and when it expires. 20 | 21 | ## Mitigation 22 | 23 | Update certificates on the cluster control plane nodes and the worker nodes. 24 | Refer to the documentation of the tool used to create the cluster. 25 | 26 | Another option is to delete the node if only one is affected. 27 | 28 | In extreme situations, recreate the cluster. 29 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeletDown.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kubelet Down 3 | weight: 20 4 | --- 5 | 6 | # KubeletDown 7 | 8 | ## Meaning 9 | 10 | This alert is triggered when the monitoring system has not been able to reach 11 | any of the cluster's Kubelets for more than 15 minutes. 12 | 13 | ## Impact 14 | 15 | This alert represents a critical threat to the cluster's stability.
Excluding 16 | the possibility of a network issue preventing the monitoring system from 17 | scraping Kubelet metrics, multiple nodes in the cluster are likely unable to 18 | respond to configuration changes for pods and other resources, and some 19 | debugging tools are likely not functional, e.g. `kubectl exec` and `kubectl logs`. 20 | 21 | ## Diagnosis 22 | 23 | Check the status of nodes and for recent events on `Node` objects, or for recent 24 | events in general: 25 | 26 | ```shell 27 | $ kubectl get nodes 28 | $ kubectl describe node $NODE_NAME 29 | $ kubectl get events --field-selector 'involvedObject.kind=Node' 30 | $ kubectl get events 31 | ``` 32 | 33 | If you have SSH access to the nodes, access the logs for the Kubelet directly: 34 | 35 | ```shell 36 | $ journalctl -b -f -u kubelet.service 37 | ``` 38 | 39 | ## Mitigation 40 | 41 | The mitigation depends on what is causing the Kubelets to become 42 | unresponsive. Check for widespread networking issues, or node-level 43 | configuration issues. 44 | 45 | See [Kubernetes Docs - kubelet](https://kubernetes.io/docs/reference/command-line-tools-reference/kubelet/) 46 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeletPlegDurationHigh.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kubelet Pod Lifecycle Event Generator Duration High 3 | weight: 20 4 | --- 5 | 6 | # KubeletPlegDurationHigh 7 | 8 | ## Meaning 9 | 10 | The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of 11 | XX seconds on a node. 12 | 13 | ## Impact 14 | 15 | TODO 16 | 17 | ## Diagnosis 18 | 19 | TODO 20 | 21 | ## Mitigation 22 | 23 | TODO 24 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeletPodStartUpLatencyHigh.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kubelet Pod Start Up Latency High 3 | weight: 20 4 | --- 5 | 6 | # KubeletPodStartUpLatencyHigh 7 | 8 | ## Meaning 9 | 10 | Kubelet Pod startup 99th percentile latency is XX seconds on a node. 11 | 12 | ## Impact 13 | 14 | Slow pod starts. 15 | 16 | ## Diagnosis 17 | 18 | Usually exhausted IOPS for the node's storage. 19 | 20 | ## Mitigation 21 | 22 | [Cordon and drain the node](https://kubernetes.io/docs/tasks/administer-cluster/safely-drain-node/) and delete it. 23 | If the issue persists, look into the node logs. 24 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeletServerCertificateExpiration.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kubelet Server Certificate Expiration 3 | weight: 20 4 | --- 5 | 6 | # KubeletServerCertificateExpiration 7 | 8 | ## Meaning 9 | 10 | The server certificate for the Kubelet on a node expires soon or has already expired. 11 | 12 | ## Impact 13 | 14 | **Critical** - The cluster will be in an inoperable state. 15 | 16 | ## Diagnosis 17 | 18 | Check when the certificate was issued and when it expires. 19 | 20 | ## Mitigation 21 | 22 | Update certificates on the cluster control plane nodes and the worker nodes. 23 | Refer to the documentation of the tool used to create the cluster. 24 | 25 | Another option is to delete the node if only one is affected. 26 | 27 | In extreme situations, recreate the cluster.
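Before rotating anything, it helps to confirm the current certificate dates (a minimal sketch, assuming SSH access to the node and the default kubelet PKI directory; the exact path and file name vary between distributions and depend on whether certificate rotation is enabled):

```shell
$ openssl x509 -noout -subject -dates -in /var/lib/kubelet/pki/kubelet.crt
```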
28 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeletServerCertificateRenewalErrors.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kubelet Server Certificate Renewal Errors 3 | weight: 20 4 | --- 5 | 6 | # KubeletServerCertificateRenewalErrors 7 | 8 | ## Meaning 9 | 10 | The Kubelet on a node has failed to renew its server certificate 11 | (XX errors in the last 5 minutes). 12 | 13 | ## Impact 14 | 15 | **Critical** - The cluster will be in an inoperable state. 16 | 17 | ## Diagnosis 18 | 19 | Check when the certificate was issued and when it expires. 20 | 21 | ## Mitigation 22 | 23 | Update certificates on the cluster control plane nodes and the worker nodes. 24 | Refer to the documentation of the tool used to create the cluster. 25 | 26 | Another option is to delete the node if only one is affected. 27 | 28 | In extreme situations, recreate the cluster. 29 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/KubeletTooManyPods.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kubelet Too Many Pods 3 | weight: 20 4 | --- 5 | 6 | # KubeletTooManyPods 7 | 8 | ## Meaning 9 | 10 | The alert fires when a specific node is running >95% of its capacity of pods 11 | (110 by default). 12 | 13 |
14 | Full context 15 | 16 | Kubelets have a configuration that limits how many Pods they can run. 17 | The default value of this is 110 Pods per Kubelet, but it is configurable 18 | (and this alert takes that configuration into account with the 19 | `kube_node_status_capacity_pods` metric). 20 | 21 |
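To compare the current pod count on a node against its configured capacity, a minimal sketch (`$NODE_NAME` is a placeholder; the capacity value reflects the kubelet's `maxPods` setting):

```console
kubectl get node $NODE_NAME -o jsonpath='{.status.capacity.pods}{"\n"}'
kubectl get pods --all-namespaces --field-selector spec.nodeName=$NODE_NAME --no-headers | wc -l
```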
22 | 23 | ## Impact 24 | 25 | Running many pods (more than 110) on a single node places a strain on the 26 | Container Runtime Interface (CRI), Container Network Interface (CNI), 27 | and the operating system itself. Approaching that limit may affect performance 28 | and availability of that node. 29 | 30 | ## Diagnosis 31 | 32 | Check the number of pods on a given node by running: 33 | 34 | ```shell 35 | kubectl get pods --all-namespaces --field-selector spec.nodeName=$NODE_NAME 36 | ``` 37 | 38 | ## Mitigation 39 | 40 | Since Kubernetes only officially supports [110 pods per node](https://kubernetes.io/docs/setup/best-practices/cluster-large/), 41 | you should preferably move pods onto other nodes or expand your cluster with more worker nodes. 42 | 43 | If you're certain the node can handle more pods, you can raise the max pods 44 | per node limit by changing `maxPods` in your [KubeletConfiguration](https://kubernetes.io/docs/reference/config-api/kubelet-config.v1beta1/) 45 | (for kubeadm-based clusters) or changing the setting in your cloud provider's 46 | dashboard (if supported). 47 | -------------------------------------------------------------------------------- /content/runbooks/kubernetes/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: kubernetes 3 | bookCollapseSection: true 4 | bookFlatSection: true 5 | weight: 10 6 | --- 7 | 8 | -------------------------------------------------------------------------------- /content/runbooks/node/NodeClockNotSynchronising.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Node Clock Not Synchronising 3 | weight: 20 4 | --- 5 | 6 | # NodeClockNotSynchronising 7 | 8 | ## Meaning 9 | 10 | Clock not synchronising. 11 | 12 | ## Impact 13 | 14 | Time is not automatically synchronizing on the node. This can cause issues with handling TLS as well as problems with other time-sensitive applications. 15 | 16 | ## Diagnosis 17 | 18 | TODO 19 | 20 | ## Mitigation 21 | 22 | See [Node Clock Skew Detected]({{< ref "./NodeClockSkewDetected.md" >}}) for mitigation steps. 23 | -------------------------------------------------------------------------------- /content/runbooks/node/NodeClockSkewDetected.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Node Clock Skew Detected 3 | weight: 20 4 | --- 5 | 6 | # NodeClockSkewDetected 7 | 8 | ## Meaning 9 | 10 | Clock skew detected. 11 | 12 | ## Impact 13 | 14 | Time is skewed on the node. This can cause issues with handling TLS as well as problems with other time-sensitive applications. 15 | 16 | ## Diagnosis 17 | 18 | TODO 19 | 20 | ## Mitigation 21 | 22 | Ensure the time synchronization service is running. 23 | Set proper time servers. 24 | Ensure time is synced on server start, especially when using 25 | low power mode or hibernation. 26 | 27 | Some resource-consuming process can cause issues on given hardware, 28 | so move it to different servers. 29 | 30 | On physical servers, check whether the on-board battery requires replacement. 31 | Check for hardware errors. 32 | Check for firmware updates. 33 | Ensure newer hardware is used (like the server mainboard and so on).
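To verify time synchronization on the node, a minimal sketch, assuming a systemd-based node; the second command applies only if chrony is the time service in use:

```shell
$ timedatectl status
$ chronyc tracking
```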
34 | -------------------------------------------------------------------------------- /content/runbooks/node/NodeFileDescriptorLimit.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Node File Descriptor Limit 3 | weight: 20 4 | --- 5 | 6 | # NodeFileDescriptorLimit 7 | 8 | ## Meaning 9 | 10 | This alert is triggered when a node's kernel is found to be running out of 11 | available file descriptors -- a `warning` level alert at greater than 70% usage 12 | and a `critical` level alert at greater than 90% usage. 13 | 14 | ## Impact 15 | 16 | Applications on the node may no longer be able to open and operate on 17 | files. This is likely to have severe consequences for anything scheduled on this 18 | node. 19 | 20 | ## Diagnosis 21 | 22 | You can open a shell on the node and use the standard Linux utilities to 23 | diagnose the issue: 24 | 25 | ```shell 26 | $ NODE_NAME='' 27 | 28 | $ oc debug "node/$NODE_NAME" 29 | # sysctl -a | grep 'fs.file-' 30 | fs.file-max = 1597016 31 | fs.file-nr = 7104 0 1597016 32 | # lsof -n 33 | ``` 34 | 35 | ## Mitigation 36 | 37 | Reduce the number of files opened simultaneously by either adjusting application 38 | configuration or by moving some applications to other nodes. 39 | -------------------------------------------------------------------------------- /content/runbooks/node/NodeFilesystemAlmostOutOfFiles.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Node Filesystem Almost Out Of Files 3 | weight: 20 4 | --- 5 | 6 | # NodeFilesystemAlmostOutOfFiles 7 | 8 | ## Meaning 9 | 10 | This alert is similar to the NodeFilesystemSpaceFillingUp alert, but rather 11 | than being based on a prediction that a filesystem will run out of inodes in a 12 | certain amount of time, it uses simple static thresholds. The alert will fire 13 | at a `warning` level at 5% of available inodes left, and at a `critical` level 14 | with 3% of available inodes left. 15 | 16 | ## Impact 17 | 18 | A node's filesystem becoming full can have a far-reaching impact, as it may 19 | cause any or all of the applications scheduled to that node to experience 20 | anything from performance degradation to full inoperability. Depending on the 21 | node and filesystem involved, this could pose a critical threat to the stability 22 | of the cluster. 23 | 24 | ## Diagnosis 25 | 26 | 27 | ## Mitigation 28 | 29 | See [Node Filesystem Files Filling Up]({{< ref "./NodeFilesystemFilesFillingUp.md" >}}) 30 | 31 | -------------------------------------------------------------------------------- /content/runbooks/node/NodeFilesystemAlmostOutOfSpace.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Node Filesystem Almost Out Of Space 3 | weight: 20 4 | --- 5 | 6 | # NodeFilesystemAlmostOutOfSpace 7 | 8 | ## Meaning 9 | 10 | This alert is similar to the NodeFilesystemSpaceFillingUp alert, but rather 11 | than being based on a prediction that a filesystem will become full in a certain 12 | amount of time, it uses simple static thresholds. The alert will fire at a 13 | `warning` level at 5% space left, and at a `critical` level with 3% space left. 14 | 15 | ## Impact 16 | 17 | A node's filesystem becoming full can have a far-reaching impact, as it may 18 | cause any or all of the applications scheduled to that node to experience 19 | anything from performance degradation to full inoperability.
Depending on the 20 | node and filesystem involved, this could pose a critical threat to the stability 21 | of the cluster. 22 | 23 | ## Diagnosis 24 | 25 | ## Mitigation 26 | 27 | See [Node Filesystem Files Filling Up]({{< ref "./NodeFilesystemFilesFillingUp.md" >}}) 28 | -------------------------------------------------------------------------------- /content/runbooks/node/NodeFilesystemFilesFillingUp.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Node Filesystem Files Filling Up 3 | weight: 20 4 | --- 5 | 6 | # NodeFilesystemFilesFillingUp 7 | 8 | ## Meaning 9 | 10 | This alert is similar to the NodeFilesystemSpaceFillingUp alert, but 11 | predicts the filesystem will run out of inodes rather than bytes of storage 12 | space. The alert fires at a `critical` level when the filesystem is predicted to 13 | run out of available inodes within four hours. 14 | 15 | ## Impact 16 | 17 | A node's filesystem becoming full can have a far-reaching impact, as it may 18 | cause any or all of the applications scheduled to that node to experience 19 | anything from performance degradation to full inoperability. Depending on the 20 | node and filesystem involved, this could pose a critical threat to the stability 21 | of the cluster. 22 | 23 | ## Diagnosis 24 | 25 | Note the `instance` and `mountpoint` labels from the alert. You can graph the 26 | usage history of this filesystem with the following query in the OpenShift web 27 | console: 28 | 29 | ```promql 30 | node_filesystem_files_free{ 31 | instance="", 32 | mountpoint="" 33 | } 34 | ``` 35 | 36 | You can also open a debug session on the node and use the standard Linux 37 | utilities to locate the source of the usage: 38 | 39 | ```shell 40 | $ MOUNT_POINT='' 41 | $ NODE_NAME='' 42 | 43 | $ oc debug "node/$NODE_NAME" 44 | $ df -hi "/host/$MOUNT_POINT" 45 | ``` 46 | 47 | Note that in many cases a filesystem running out of inodes will still have 48 | available storage. Running out of inodes is often caused by a very large number of small 49 | files being created by an application. 50 | 51 | ## Mitigation 52 | 53 | The number of inodes allocated to a filesystem is usually based on the storage 54 | size. You may be able to solve the problem, or buy time, by increasing the size of 55 | the storage volume. Otherwise, determine the application that is creating large 56 | numbers of files and adjust its configuration or provide it dedicated storage. 57 | 58 | See [Node Filesystem Space Filling Up]({{< ref "./NodeFilesystemSpaceFillingUp.md" >}}) for additional mitigation steps. 59 | -------------------------------------------------------------------------------- /content/runbooks/node/NodeFilesystemSpaceFillingUp.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Node Filesystem Space Filling Up 3 | weight: 20 4 | --- 5 | 6 | # NodeFilesystemSpaceFillingUp 7 | 8 | ## Meaning 9 | 10 | This alert is based on an extrapolation of the space used in a file system. It 11 | fires if both the current usage is above a certain threshold _and_ the 12 | extrapolation predicts it will run out of space within a certain time. This is a 13 | warning-level alert if that time is less than 24h. It's a critical alert if that 14 | time is less than 4h. 15 | 16 |
17 | Full context 18 | 19 | The filesystem on Kubernetes nodes mainly consists of the operating system, 20 | [container ephemeral storage][1], container images, and container logs. 21 | Since Kubelet automatically handles [cleaning up old logs][2] and 22 | [deleting unused images][3], container ephemeral storage is a common cause of 23 | this alert, although the alert may be triggered before Kubelet's garbage 24 | collection kicks in. 25 | 26 |
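To see which of these consumers is actually using the space, a rough sketch; it assumes a debug session with the node's root filesystem mounted at `/host` and containerd as the runtime, so adjust the paths for other setups:

```shell
$ df -h /host
$ du -xsh /host/var/lib/containerd /host/var/log/pods /host/var/lib/kubelet/pods 2>/dev/null
```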
27 | 28 | ## Impact 29 | 30 | A filesystem running full is very bad for any process that needs to write to the 31 | filesystem. But even before a filesystem runs full, performance usually 32 | degrades. 33 | 34 | ## Diagnosis 35 | 36 | Study the recent trends of filesystem usage on a dashboard. Sometimes a periodic 37 | pattern of writing and cleaning up can trick the linear prediction into a false 38 | alert. Use the usual OS tools to investigate what directories are the worst 39 | and/or recent offenders. Is this some irregular condition, e.g. a process failing 40 | to clean up behind itself, or is this organic growth? If monitoring is enabled, 41 | the following metric can be watched in PromQL. 42 | 43 | ```promql 44 | node_filesystem_free_bytes 45 | ``` 46 | 47 | Check the alert's `mountpoint` label. 48 | 49 | ## Mitigation 50 | 51 | If the `mountpoint` label is `/`, `/sysroot` or `/var`, then 52 | removing unused images may solve the issue: 53 | 54 | Debug the node by accessing the node filesystem: 55 | 56 | ```shell 57 | $ NODE_NAME= 58 | $ kubectl -n default debug node/$NODE_NAME 59 | $ chroot /host 60 | ``` 61 | 62 | Remove dangling images: 63 | 64 | ```shell 65 | $ docker image prune  # assuming a Docker runtime on the node; adjust for other runtimes 66 | ``` 67 | 68 | Remove unused images: 69 | 70 | ```shell 71 | $ crictl rmi --prune  # assuming a CRI runtime with crictl available on the node 72 | ``` 73 | 74 | Exit debug: 75 | 76 | ```shell 77 | $ exit 78 | $ exit 79 | ``` 80 | 81 | [1]: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#local-ephemeral-storage 82 | [2]: https://kubernetes.io/docs/concepts/cluster-administration/logging/ 83 | [3]: https://kubernetes.io/docs/concepts/architecture/garbage-collection/#containers-images 84 | -------------------------------------------------------------------------------- /content/runbooks/node/NodeHighNumberConntrackEntriesUsed.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Node High Number Conntrack Entries Used 3 | weight: 20 4 | --- 5 | 6 | # NodeHighNumberConntrackEntriesUsed 7 | 8 | ## Meaning 9 | 10 | The number of conntrack entries is getting close to the limit. 11 | 12 | ## Impact 13 | 14 | When the limit is reached, some connections will be dropped, degrading service quality. 15 | 16 | ## Diagnosis 17 | 18 | Check the current conntrack value on the node. 19 | Check which apps are generating a lot of connections. 20 | 21 | ## Mitigation 22 | 23 | Migrate some pods to other nodes. 24 | Bump the conntrack limit directly on the node, remembering to make it persistent across node reboots. 25 | -------------------------------------------------------------------------------- /content/runbooks/node/NodeNetworkReceiveErrs.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Node Network Receive Errors 3 | weight: 20 4 | --- 5 | 6 | # NodeNetworkReceiveErrs 7 | 8 | ## Meaning 9 | 10 | Network interface is reporting many receive errors. 11 | 12 | ## Impact 13 | 14 | Applications on the node may no longer be able to communicate with other services. 15 | Network-attached storage may suffer performance issues or even data loss. 16 | 17 | ## Diagnosis 18 | 19 | Investigate networking issues on the node and its connected hardware. 20 | Check physical cables, networking firewall rules, and so on. 21 | 22 | ## Mitigation 23 | 24 | In general the mitigation landscape is quite vast; some suggestions: 25 | 26 | - Ensure some node capacity is left unallocated (CPU/memory) for handling 27 | networking.
28 | - [Increase TX queue length](https://access.redhat.com/documentation/en-us/red_hat_openstack_platform/13/html/ovs-dpdk_end_to_end_troubleshooting_guide/high_packet_loss_in_the_tx_queue_of_the_instance_s_tap_interface) 29 | - Spread services to other nodes/pods. 30 | - Replace physical cables, change ports. 31 | - Look into introducing Quality of Service or other 32 | [TCP congestion avoidance algorithms](https://en.wikipedia.org/wiki/TCP_congestion_control) 33 | -------------------------------------------------------------------------------- /content/runbooks/node/NodeNetworkTransmitErrs.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Node Network Transmit Errors 3 | weight: 20 4 | --- 5 | 6 | # NodeNetworkTransmitErrs 7 | 8 | ## Meaning 9 | 10 | Network interface is reporting many transmit errors. 11 | 12 | ## Impact 13 | 14 | Applications on the node may no longer be able to communicate with other services. 15 | Network-attached storage may suffer performance issues or even data loss. 16 | 17 | ## Diagnosis 18 | 19 | Investigate networking issues on the node and its connected hardware. 20 | Check network interface saturation. 21 | Check CPU usage saturation. 22 | Check physical cables, networking firewall rules, and so on. 23 | 24 | ## Mitigation 25 | 26 | In general the mitigation landscape is quite vast; some suggestions: 27 | 28 | - Ensure some node capacity is left unallocated (CPU/memory) for handling 29 | networking. 30 | - [Increase TX queue length](https://access.redhat.com/documentation/en-us/red_hat_openstack_platform/13/html/ovs-dpdk_end_to_end_troubleshooting_guide/high_packet_loss_in_the_tx_queue_of_the_instance_s_tap_interface) 31 | - Spread services to other nodes/pods. 32 | - Replace physical cables, change ports. 33 | - Look into introducing Quality of Service or other 34 | [TCP congestion avoidance algorithms](https://en.wikipedia.org/wiki/TCP_congestion_control) 35 | -------------------------------------------------------------------------------- /content/runbooks/node/NodeRAIDDegraded.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Node RAID Degraded 3 | weight: 20 4 | --- 5 | 6 | # NodeRAIDDegraded 7 | 8 | ## Meaning 9 | 10 | RAID Array is degraded. 11 | 12 | This alert is triggered when a node has a storage configuration with a RAID array, 13 | and the array is reporting as being in a degraded state due to one or more disk 14 | failures. 15 | 16 | ## Impact 17 | 18 | The affected node could go offline at any moment if the RAID array fully fails 19 | due to further issues with disks. 20 | 21 | ## Diagnosis 22 | 23 | You can open a shell on the node and use the standard Linux utilities to 24 | diagnose the issue, but you may need to install additional software in the debug 25 | container: 26 | 27 | ```shell 28 | $ NODE_NAME='' 29 | 30 | $ oc debug "node/$NODE_NAME" 31 | $ cat /proc/mdstat 32 | ``` 33 | 34 | ## Mitigation 35 | 36 | Cordon and drain the node if possible, then proceed to RAID recovery. 37 | 38 | See the Red Hat Enterprise Linux [documentation][1] for potential steps.
39 | 40 | [1]: https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/8/html/managing_storage_devices/managing-raid_managing-storage-devices 41 | -------------------------------------------------------------------------------- /content/runbooks/node/NodeRAIDDiskFailure.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Node RAID Disk Failure 3 | weight: 20 4 | --- 5 | 6 | # NodeRAIDDiskFailure 7 | 8 | See [Node RAID Degraded]({{< ref "./NodeRAIDDegraded.md" >}}) 9 | -------------------------------------------------------------------------------- /content/runbooks/node/NodeTextFileCollectorScrapeError.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Node Text File Collector Scrape Error 3 | weight: 20 4 | --- 5 | 6 | # NodeTextFileCollectorScrapeError 7 | 8 | ## Meaning 9 | 10 | Node Exporter text file collector failed to scrape. 11 | 12 | ## Impact 13 | 14 | Missing metrics from additional scripts. 15 | 16 | ## Diagnosis 17 | 18 | - Check node_exporter logs 19 | - Check the script supervisor (like systemd or cron) for more information about the failed script execution 20 | 21 | ## Mitigation 22 | 23 | Check whether the provided configuration is valid and whether files were renamed during upgrades. 24 | -------------------------------------------------------------------------------- /content/runbooks/node/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: node 3 | bookCollapseSection: true 4 | bookFlatSection: true 5 | weight: 10 6 | --- 7 | 8 | -------------------------------------------------------------------------------- /content/runbooks/prometheus-operator/ConfigReloaderSidecarErrors.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Config Reloader Sidecar Errors 3 | weight: 20 4 | --- 5 | 6 | # ConfigReloaderSidecarErrors 7 | 8 | ## Meaning 9 | 10 | Errors encountered while the config-reloader sidecar attempts to sync 11 | configuration in a given namespace. 12 | 13 | ## Impact 14 | 15 | As a result, configuration for services such as prometheus or alertmanager may be 16 | stale and cannot be automatically updated. 17 | 18 | ## Diagnosis 19 | 20 | Check config-reloader logs and the configuration which it tries to reload. 21 | 22 | ## Mitigation 23 | 24 | Usually this means the new config was rejected by the controlled app because it contains 25 | errors such as unknown configuration sections or bad resource definitions. 26 | 27 | You can prevent such issues with better config testing tools in CI/CD systems 28 | such as: 29 | 30 | - [yamllint](https://yamllint.readthedocs.io/en/stable/) 31 | - [yamale](https://github.com/23andMe/Yamale) 32 | - [promtool](https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/) 33 | - [jq](https://stedolan.github.io/jq/) 34 | - yq (note that there are Python and Go versions) 35 | - [conftest](https://www.conftest.dev/) 36 | - some apps have a syntax-checking command switch 37 | -------------------------------------------------------------------------------- /content/runbooks/prometheus-operator/PrometheusOperatorListErrors.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Prometheus Operator List Errors 3 | weight: 20 4 | --- 5 | 6 | # PrometheusOperatorListErrors 7 | 8 | ## Meaning 9 | 10 | Errors while performing list operations in the controller.
11 | 12 | ## Impact 13 | 14 | Prometheus Operator has trouble managing its operands and Custom Resources. 15 | 16 | ## Diagnosis 17 | 18 | - Check logs of the Prometheus Operator pod. 19 | - Check service account tokens. 20 | - Check Prometheus Operator RBAC configuration. 21 | 22 | ## Mitigation 23 | -------------------------------------------------------------------------------- /content/runbooks/prometheus-operator/PrometheusOperatorNodeLookupErrors.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Prometheus Operator Node Lookup Errors 3 | weight: 20 4 | --- 5 | 6 | # PrometheusOperatorNodeLookupErrors 7 | 8 | ## Meaning 9 | 10 | Errors while reconciling information about kubernetes nodes. 11 | 12 | ## Impact 13 | 14 | Prometheus Operator is not able to configure the Prometheus scrape configuration. 15 | 16 | ## Diagnosis 17 | 18 | - Check logs of the Prometheus Operator pod. 19 | - Check the kubelet Service managed by Prometheus Operator: 20 | ```shell 21 | $ kubectl describe service -n kube-system -l app.kubernetes.io/managed-by=prometheus-operator 22 | ``` 23 | ## Mitigation 24 | 25 | TODO 26 | -------------------------------------------------------------------------------- /content/runbooks/prometheus-operator/PrometheusOperatorNotReady.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Prometheus Operator NotReady 3 | weight: 20 4 | --- 5 | 6 | # PrometheusOperatorNotReady 7 | 8 | ## Meaning 9 | 10 | Prometheus operator is not ready. 11 | 12 | ## Impact 13 | 14 | Prometheus Operator is not able to perform any operation. 15 | 16 | ## Diagnosis 17 | 18 | - Check the Prometheus Operator Deployment configuration. 19 | - Check logs of the Prometheus Operator pod. 20 | - Check service account tokens. 21 | 22 | ## Mitigation 23 | 24 | TODO 25 | -------------------------------------------------------------------------------- /content/runbooks/prometheus-operator/PrometheusOperatorReconcileErrors.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Prometheus Operator Reconcile Errors 3 | weight: 20 4 | --- 5 | 6 | # PrometheusOperatorReconcileErrors 7 | 8 | ## Meaning 9 | 10 | Errors during controller reconciliation. 11 | 12 | ## Impact 13 | 14 | Prometheus Operator will not be able to manage Prometheuses/Alertmanagers. 15 | 16 | ## Diagnosis 17 | 18 | Check logs of the Prometheus Operator pod. 19 | Check service account tokens. 20 | 21 | ## Mitigation 22 | -------------------------------------------------------------------------------- /content/runbooks/prometheus-operator/PrometheusOperatorRejectedResources.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Prometheus Operator Rejected Resources 3 | weight: 20 4 | --- 5 | 6 | # PrometheusOperatorRejectedResources 7 | 8 | ## Meaning 9 | 10 | Custom Resources managed by Prometheus Operator were rejected and not propagated to operands (prometheus, alertmanager). 11 | 12 | ## Impact 13 | 14 | The Custom Resource won't be used by prometheus-operator and thus the configuration it carries won't be translated to prometheus or alertmanager configuration. 15 | 16 | ## Diagnosis 17 | 18 | - Check newly created Custom Resources like Prometheus, Alertmanager, Rules, Probes, ServiceMonitors, and others that have a CRD used by Prometheus Operator. 19 | - Check logs of the Prometheus Operator pod.
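The operator logs usually name the rejected object and the reason, so they are a good place to start. A minimal sketch, assuming the operator runs as a Deployment named `prometheus-operator` in the `monitoring` namespace (adjust both to your installation):

```console
kubectl -n monitoring logs deploy/prometheus-operator | grep -i -E 'reject|invalid' | tail -n 20
```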
20 | 21 | ## Mitigation 22 | 23 | Fix the newly created Custom Resource to conform to the schema defined in the CRD and reapply it to the cluster. 24 | 25 | Consider using a tool like [`kubeconform`](https://github.com/yannh/kubeconform) to validate newly created resources. You can check [kube-prometheus integration of such a tool in the CI pipeline](https://github.com/prometheus-operator/kube-prometheus/blob/main/Makefile#L65-L67). 26 | -------------------------------------------------------------------------------- /content/runbooks/prometheus-operator/PrometheusOperatorSyncFailed: -------------------------------------------------------------------------------- 1 | # PrometheusOperatorSyncFailed 2 | 3 | ## Meaning 4 | 5 | There were problems with the Prometheus components synchronization that is usually done by prometheus-operator. 6 | 7 | ## Impact 8 | 9 | New changes promoted to prometheus-operator cannot be deployed, or there could be some problems with auto-healing. 10 | 11 | ## Diagnosis 12 | 13 | Check prometheus-operator pod logs by running, e.g. `kubectl logs -f -n ` 14 | 15 | ## Mitigation 16 | 17 | Apply any configuration fixes diagnosed by reviewing the prometheus-operator logs. 18 | -------------------------------------------------------------------------------- /content/runbooks/prometheus-operator/PrometheusOperatorSyncFailed.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Prometheus Operator Sync Failed 3 | weight: 20 4 | --- 5 | 6 | # PrometheusOperatorSyncFailed 7 | 8 | ## Meaning 9 | 10 | The last controller reconciliation failed. 11 | 12 | ## Impact 13 | 14 | Prometheus Operator will not be able to manage Prometheuses/Alertmanagers. 15 | 16 | ## Diagnosis 17 | 18 | Check logs of the Prometheus Operator pod. 19 | Check service account tokens. 20 | 21 | ## Mitigation 22 | -------------------------------------------------------------------------------- /content/runbooks/prometheus-operator/PrometheusOperatorWatchErrors.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Prometheus Operator Watch Errors 3 | weight: 20 4 | --- 5 | 6 | # PrometheusOperatorWatchErrors 7 | 8 | ## Meaning 9 | 10 | Errors while performing watch operations in the controller. 11 | 12 | ## Impact 13 | 14 | Prometheus Operator will not be able to manage Prometheuses/Alertmanagers. 15 | 16 | ## Diagnosis 17 | 18 | Check logs of the Prometheus Operator pod. 19 | Check service account tokens. 20 | 21 | ## Mitigation 22 | -------------------------------------------------------------------------------- /content/runbooks/prometheus-operator/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: prometheus-operator 3 | bookCollapseSection: true 4 | bookFlatSection: true 5 | weight: 10 6 | --- 7 | 8 | -------------------------------------------------------------------------------- /content/runbooks/prometheus/PrometheusBadConfig.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Prometheus Bad Config 3 | weight: 20 4 | --- 5 | 6 | # PrometheusBadConfig 7 | 8 | ## Meaning 9 | 10 | The alert fires when Prometheus cannot successfully reload the configuration file 11 | due to the file having incorrect content. 12 | 13 | ## Impact 14 | 15 | Configuration cannot be reloaded and Prometheus operates with the last known good 16 | configuration.
17 | Configuration changes in any Prometheus, Probe, PodMonitor, 18 | or ServiceMonitor objects may not be picked up by the Prometheus server. 19 | 20 | ## Diagnosis 21 | 22 | Check the prometheus container logs for an explanation of which part of the 23 | configuration is problematic. 24 | 25 | Usually this can occur when ServiceMonitors or 26 | PodMonitors share the same job label. 27 | 28 | ## Mitigation 29 | 30 | Remove the conflicting configuration option. 31 | -------------------------------------------------------------------------------- /content/runbooks/prometheus/PrometheusDuplicateTimestamps.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Prometheus Duplicate Timestamps 3 | weight: 20 4 | --- 5 | 6 | # PrometheusDuplicateTimestamps 7 | 8 | Find the Prometheus pod that this alert concerns. 9 | 10 | ```shell 11 | $ kubectl -n get pod 12 | prometheus-k8s-0 2/2 Running 1 122m 13 | prometheus-k8s-1 2/2 Running 1 122m 14 | ``` 15 | 16 | Look at the logs of each of them; there should be a log line such as: 17 | 18 | ```shell 19 | $ kubectl -n logs prometheus-k8s-0 20 | level=warn ts=2021-01-04T15:08:55.613Z caller=scrape.go:1372 component="scrape manager" scrape_pool=default/main-ingress-nginx-controller/0 target=http://10.0.7.3:10254/metrics msg="Error on ingesting samples with different value but same timestamp" num_dropped=16 21 | ``` 22 | 23 | Now there is a judgement call to make; this could be the result of: 24 | 25 | * Faulty configuration, which could be resolved by removing the offending 26 | `ServiceMonitor` or `PodMonitor` object, which can be identified through 27 | the `scrape_pool` label in the log line, which is in the format of 28 | `//`. 29 | 30 | * The target is reporting faulty data; sometimes this can be resolved by 31 | restarting the target, or it might need to be fixed in the code of the offending 32 | application. 33 | 34 | Further reading: [blog](https://www.robustperception.io/debugging-out-of-order-samples) 35 | -------------------------------------------------------------------------------- /content/runbooks/prometheus/PrometheusErrorSendingAlertsToAnyAlertmanager.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Prometheus Error Sending Alerts To Any Alertmanager 3 | weight: 20 4 | --- 5 | 6 | # PrometheusErrorSendingAlertsToAnyAlertmanager 7 | 8 | ## Meaning 9 | 10 | Prometheus has encountered errors sending alerts to any Alertmanager. 11 | 12 | ## Impact 13 | 14 | All alerts may be lost. 15 | 16 | ## Diagnosis 17 | 18 | Check for connectivity issues between Prometheus and the AlertManager cluster. 19 | Check NetworkPolicies and network saturation. 20 | Check whether AlertManager is overloaded or does not have enough resources. 21 | 22 | ## Mitigation 23 | 24 | Run multiple AlertManager instances and spread them across nodes. 25 | -------------------------------------------------------------------------------- /content/runbooks/prometheus/PrometheusErrorSendingAlertsToSomeAlertmanagers.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Prometheus Error Sending Alerts To Some Alertmanagers 3 | weight: 20 4 | --- 5 | 6 | # PrometheusErrorSendingAlertsToSomeAlertmanagers 7 | 8 | ## Meaning 9 | 10 | Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager. 11 | 12 | ## Impact 13 | 14 | Some alerts may be lost. 15 | 16 | ## Diagnosis 17 | 18 | Check for connectivity issues between Prometheus and AlertManager.
19 | Check NetworkPolicies and network saturation. 20 | Check whether AlertManager is overloaded or does not have enough resources. 21 | 22 | ## Mitigation 23 | 24 | Run multiple AlertManager instances and spread them across nodes. 25 | -------------------------------------------------------------------------------- /content/runbooks/prometheus/PrometheusLabelLimitHit.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Prometheus Label Limit Hit 3 | weight: 20 4 | --- 5 | 6 | # PrometheusLabelLimitHit 7 | 8 | ## Meaning 9 | 10 | Prometheus has dropped targets because some scrape configs have exceeded the labels limit. 11 | 12 | ## Impact 13 | 14 | Metrics and alerts may be missing or inaccurate. 15 | 16 | ## Diagnosis 17 | 18 | 19 | ## Mitigation 20 | 21 | Start thinking about sharding Prometheus. 22 | Increase the scrape interval to scrape less frequently. 23 | -------------------------------------------------------------------------------- /content/runbooks/prometheus/PrometheusMissingRuleEvaluations.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Prometheus Missing Rule Evaluations 3 | weight: 20 4 | --- 5 | 6 | # PrometheusMissingRuleEvaluations 7 | 8 | ## Meaning 9 | 10 | Prometheus is missing rule evaluations due to slow rule group evaluation. 11 | 12 | ## Impact 13 | 14 | Metrics and alerts may be missing or inaccurate. 15 | 16 | ## Diagnosis 17 | 18 | Check which rules fail and try to calculate them differently. 19 | 20 | ## Mitigation 21 | 22 | Sometimes giving more CPU is the only way to fix it. 23 | -------------------------------------------------------------------------------- /content/runbooks/prometheus/PrometheusNotConnectedToAlertmanagers.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Prometheus Not Connected To Alertmanagers 3 | weight: 20 4 | --- 5 | 6 | # PrometheusNotConnectedToAlertmanagers 7 | 8 | ## Meaning 9 | 10 | Prometheus is not connected to any Alertmanagers. 11 | 12 | ## Impact 13 | 14 | Sending alerts is not possible. 15 | 16 | ## Diagnosis 17 | 18 | Check for connectivity issues between Prometheus and AlertManager. 19 | Check NetworkPolicies and network saturation. 20 | Check whether AlertManager is overloaded or does not have enough resources. 21 | 22 | ## Mitigation 23 | 24 | Run multiple AlertManager instances and spread them across nodes. 25 | -------------------------------------------------------------------------------- /content/runbooks/prometheus/PrometheusNotIngestingSamples.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Prometheus Not Ingesting Samples 3 | weight: 20 4 | --- 5 | 6 | # PrometheusNotIngestingSamples 7 | 8 | ## Meaning 9 | 10 | Prometheus is not ingesting samples. 11 | 12 | ## Impact 13 | 14 | Missing metrics. 15 | 16 | ## Diagnosis 17 | 18 | TODO 19 | 20 | ## Mitigation 21 | 22 | TODO 23 | -------------------------------------------------------------------------------- /content/runbooks/prometheus/PrometheusNotificationQueueRunningFull.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Prometheus Notification Queue Running Full 3 | weight: 20 4 | --- 5 | 6 | # PrometheusNotificationQueueRunningFull 7 | 8 | ## Meaning 9 | 10 | The Prometheus alert notification queue is predicted to run full in less than 30m. 11 | 12 | ## Impact 13 | 14 | Failure to send alerts.
15 | 16 | ## Diagnosis 17 | 18 | Check the prometheus container logs for an explanation of which part of the 19 | configuration is problematic. 20 | 21 | ## Mitigation 22 | 23 | Remove the conflicting configuration option. 24 | 25 | Check if there is an option to decrease the number of alerts firing, 26 | for example by sharding Prometheus. 27 | -------------------------------------------------------------------------------- /content/runbooks/prometheus/PrometheusOutOfOrderTimestamps.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Prometheus Out Of Order Timestamps 3 | weight: 20 4 | --- 5 | 6 | # PrometheusOutOfOrderTimestamps 7 | 8 | More information in this [blog](https://www.robustperception.io/debugging-out-of-order-samples) 9 | -------------------------------------------------------------------------------- /content/runbooks/prometheus/PrometheusRemoteStorageFailures.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Prometheus Remote Storage Failures 3 | weight: 20 4 | --- 5 | 6 | # PrometheusRemoteStorageFailures 7 | 8 | ## Meaning 9 | 10 | Prometheus fails to send samples to remote storage. 11 | 12 | ## Impact 13 | 14 | Metrics and alerts may be missing or inaccurate. 15 | 16 | ## Diagnosis 17 | 18 | Check prometheus logs and remote storage logs. 19 | Investigate network issues. 20 | Check configs and credentials. 21 | 22 | ## Mitigation 23 | 24 | TODO 25 | -------------------------------------------------------------------------------- /content/runbooks/prometheus/PrometheusRemoteWriteBehind.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Prometheus Remote Write Behind 3 | weight: 20 4 | --- 5 | 6 | # PrometheusRemoteWriteBehind 7 | 8 | ## Meaning 9 | 10 | Prometheus remote write is behind. 11 | 12 | ## Impact 13 | 14 | Metrics and alerts may be missing or inaccurate. 15 | Increased data lag between locations. 16 | 17 | ## Diagnosis 18 | 19 | Check prometheus logs and remote storage logs. 20 | Investigate network issues. 21 | Check configs and credentials. 22 | 23 | ## Mitigation 24 | 25 | Probably the amount of data sent to the remote system is too high 26 | for the given network connectivity speed. 27 | You may need to limit which metrics to send to minimize transfers. 28 | 29 | See [Prometheus Remote Storage Failures]({{< ref "./PrometheusRemoteStorageFailures.md" >}}) 30 | -------------------------------------------------------------------------------- /content/runbooks/prometheus/PrometheusRemoteWriteDesiredShards.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: PrometheusRemoteWriteDesiredShards 3 | weight: 20 4 | --- 5 | 6 | # PrometheusRemoteWriteDesiredShards 7 | 8 | ## Meaning 9 | 10 | Prometheus remote write desired shards calculation wants to run 11 | more than the configured max shards. 12 | 13 | 14 | ## Impact 15 | 16 | Metrics and alerts may be missing or inaccurate. 17 | 18 | 19 | ## Diagnosis 20 | 21 | Check metric cardinality. 22 | 23 | Check prometheus logs and remote storage logs. 24 | Investigate network issues. 25 | Check configs and credentials. 26 | 27 | ## Mitigation 28 | 29 | Probably the amount of data sent to the remote system is too high 30 | for the given network connectivity speed. 31 | You may need to limit which metrics to send to minimize transfers.
32 | 33 | See [Prometheus Remote Storage Failures]({{< ref "./PrometheusRemoteStorageFailures.md" >}}) 34 | -------------------------------------------------------------------------------- /content/runbooks/prometheus/PrometheusRuleFailures.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Prometheus Rule Failures 3 | weight: 20 4 | --- 5 | 6 | # PrometheusRuleFailures 7 | 8 | ## Meaning 9 | 10 | Prometheus is failing rule evaluations. 11 | Prometheus rules are incorrect or failed to calculate. 12 | 13 | ## Impact 14 | 15 | Metrics and alerts may be missing or inaccurate. 16 | 17 | ## Diagnosis 18 | 19 | Your best starting point is the rules page of the Prometheus UI (:9090/rules). 20 | It will show the error. 21 | 22 | You can also evaluate the rule expression yourself, using the UI, or maybe 23 | using PromLens to help debug expression issues. 24 | 25 | ## Mitigation 26 | 27 | Fix the rules. 28 | -------------------------------------------------------------------------------- /content/runbooks/prometheus/PrometheusTSDBCompactionsFailing.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Prometheus TSDB Compactions Failing 3 | weight: 20 4 | --- 5 | 6 | # PrometheusTSDBCompactionsFailing 7 | 8 | ## Meaning 9 | 10 | Prometheus has issues compacting blocks. 11 | 12 | ## Impact 13 | 14 | Metrics and alerts may be missing or inaccurate. 15 | 16 | ## Diagnosis 17 | 18 | Check storage used by the pod. 19 | This can happen if there is a lot going on in the cluster and 20 | Prometheus did not manage to compact the data. 21 | 22 | ## Mitigation 23 | 24 | At first, just wait; it may fix itself after some time. 25 | 26 | Increase Prometheus pod memory so that it caches more from disk. 27 | Try expanding volumes if they are too small or too slow. 28 | Change the PVC storageClass to a more performant one. 29 | -------------------------------------------------------------------------------- /content/runbooks/prometheus/PrometheusTSDBReloadsFailing.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Prometheus TSDB Reloads Failing 3 | weight: 20 4 | --- 5 | 6 | # PrometheusTSDBReloadsFailing 7 | 8 | ## Meaning 9 | 10 | Prometheus has issues reloading blocks from disk. 11 | 12 | ## Impact 13 | 14 | Metrics and alerts may be missing or inaccurate. 15 | 16 | ## Diagnosis 17 | 18 | Check storage used by the pod. 19 | 20 | ## Mitigation 21 | 22 | Increase Prometheus pod memory so that it caches more from disk. 23 | Try expanding volumes if they are too small or too slow. 24 | Change the PVC storageClass to a more performant one. 25 | -------------------------------------------------------------------------------- /content/runbooks/prometheus/PrometheusTargetLimitHit.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Prometheus Target Limit Hit 3 | weight: 20 4 | --- 5 | 6 | # PrometheusTargetLimitHit 7 | 8 | ## Meaning 9 | 10 | Prometheus has dropped targets because some scrape configs have exceeded the targets limit. 11 | 12 | ## Impact 13 | 14 | Metrics and alerts may be missing or inaccurate. 15 | 16 | ## Diagnosis 17 | 18 | 19 | ## Mitigation 20 | 21 | Start thinking about sharding Prometheus. 22 | Increase the scrape interval to scrape less frequently.
23 | -------------------------------------------------------------------------------- /content/runbooks/prometheus/PrometheusTargetSyncFailure.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Prometheus Target Sync Failure 3 | weight: 20 4 | --- 5 | 6 | # PrometheusTargetSyncFailure 7 | 8 | ## Meaning 9 | 10 | This alert is triggered when at least one of the Prometheus instances has 11 | consistently failed to sync its configuration. 12 | 13 | ## Impact 14 | 15 | Metrics and alerts may be missing or inaccurate. 16 | 17 | ## Diagnosis 18 | 19 | Determine whether the alert is for the cluster or user workload Prometheus by 20 | inspecting the alert's `namespace` label. 21 | 22 | Check the logs for the appropriate Prometheus instance: 23 | 24 | ```shell 25 | $ NAMESPACE='' 26 | 27 | $ oc -n $NAMESPACE logs -l 'app=prometheus' 28 | level=error ... msg="Creating target failed" ... 29 | ``` 30 | 31 | ## Mitigation 32 | 33 | If the logs indicate a syntax or other configuration error, correct the 34 | corresponding `ServiceMonitor`, `PodMonitor`, or other configuration 35 | resource. In almost all cases, the operator should prevent this from happening. 36 | -------------------------------------------------------------------------------- /content/runbooks/prometheus/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: prometheus 3 | bookCollapseSection: true 4 | bookFlatSection: true 5 | weight: 10 6 | --- 7 | 8 | -------------------------------------------------------------------------------- /layouts/404.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {{ partial "docs/html-head" . }} 6 | {{ partial "docs/inject/head" . }} 7 | 8 | 18 | 19 | 20 | 21 |
22 |
23 |

404

24 |

Page Not Found. Don't panic!

25 |

If you were directed here by a link from an alert, we sadly don't have a runbook for it... yet!

26 |

All runbooks on this site are created in the open and maintained by a team of passionate people.

27 |

We would like to have runbooks for all alerts shipped with kube-prometheus, but we sadly don't have time to write them all.

28 |

If you would like to help us, please consider opening a pull request. Thank you!

29 |

30 | Add runbook 31 |

32 |

33 | Open an issue 34 |

35 |

36 | Back to main page 37 |

38 |
39 |
40 | 41 | {{ partial "docs/inject/body" . }} 42 | 43 | 44 | 45 | --------------------------------------------------------------------------------