├── .github └── workflows │ └── documentation.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── cpu-limits-low-perc-state-variables.tf ├── cpu-limits-low-perc-state.tf ├── cpu-limits-low-perc-variables.tf ├── cpu-limits-low-perc.tf ├── cpu-limits-low-variables.tf ├── cpu-limits-low.tf ├── cpu-on-dns-pods-high-variables.tf ├── cpu-on-dns-pods-high.tf ├── cpu-requests-low-perc-state-variables.tf ├── cpu-requests-low-perc-state.tf ├── cpu-requests-low-perc-variables.tf ├── cpu-requests-low-perc.tf ├── cpu-requests-low-variables.tf ├── cpu-requests-low.tf ├── daemonset-incomplete-variables.tf ├── daemonset-incomplete.tf ├── daemonset-multiple-restarts-variables.tf ├── daemonset-multiple-restarts.tf ├── datadog-agent-variables.tf ├── datadog-agent.tf ├── deploy-desired-vs-status-variables.tf ├── deploy-desired-vs-status.tf ├── deployment-multiple-restarts-variables.tf ├── deployment-multiple-restarts.tf ├── examples └── example.tf ├── hpa-status-variables.tf ├── hpa-status.tf ├── main.tf ├── memory-limits-low-perc-state-variables.tf ├── memory-limits-low-perc-state.tf ├── memory-limits-low-perc-variables.tf ├── memory-limits-low-perc.tf ├── memory-limits-low-variables.tf ├── memory-limits-low.tf ├── memory-requests-low-perc-state-variables.tf ├── memory-requests-low-perc-state.tf ├── memory-requests-low-perc-variables.tf ├── memory-requests-low-perc.tf ├── memory-requests-low-variables.tf ├── memory-requests-low.tf ├── module_description.md ├── network-unavailable-variables.tf ├── network-unavailable.tf ├── node-diskpressure-variables.tf ├── node-diskpressure.tf ├── node-memory-used-percent-variables.tf ├── node-memory-used-percent.tf ├── node-memorypressure-variables.tf ├── node-memorypressure.tf ├── node-ready-variables.tf ├── node-ready.tf ├── node-status-variables.tf ├── node-status.tf ├── persistent-volumes-variables.tf ├── persistent-volumes.tf ├── pid-pressure-variables.tf ├── pid-pressure.tf ├── pod-count-per-node-high-variables.tf ├── pod-count-per-node-high.tf ├── pod-ready-variables.tf ├── pod-ready.tf ├── pod-restarts-variables.tf ├── pod-restarts.tf ├── pods-failed-variables.tf ├── pods-failed.tf ├── pods-pending-variables.tf ├── pods-pending.tf ├── provider.tf ├── renovate.json ├── replicaset-incomplete-variables.tf ├── replicaset-incomplete.tf ├── replicaset-unavailable-variables.tf ├── replicaset-unavailable.tf └── variables.tf /.github/workflows/documentation.yaml: -------------------------------------------------------------------------------- 1 | name: Generate terraform docs 2 | 3 | on: 4 | push: 5 | # don't run when we push a tag 6 | tags-ignore: 7 | - '*' 8 | # don't run when we merge to main 9 | # the action should have run already 10 | branches-ignore: 11 | - 'main' 12 | jobs: 13 | pre-commit: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: terraform-linters/setup-tflint@v2 17 | name: Setup TFLint 18 | with: 19 | tflint_version: v0.38.1 20 | - uses: actions/checkout@v3 21 | - uses: actions/setup-python@v4 22 | - uses: pre-commit/action@v3.0.0 23 | # pre-commit fails if it changed files 24 | # we want to go on 25 | continue-on-error: true 26 | - uses: pre-commit/action@v3.0.0 27 | - uses: EndBug/add-and-commit@v9 28 | with: 29 | default_author: github_actions 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .terraform 2 | # lock file should live in top level module, it is generated by the pre-commit hook 3 
| .terraform.lock.hcl 4 | README.md.orig.* 5 | README.md.toc.* 6 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/gruntwork-io/pre-commit 3 | rev: v0.1.12 4 | hooks: 5 | - id: terraform-fmt 6 | - id: terraform-validate 7 | - id: tflint 8 | - repo: https://github.com/kabisa/terraform-datadog-pre-commit-hook 9 | rev: "1.3.6" 10 | hooks: 11 | - id: terraform-datadog-docs 12 | args: 13 | - "." -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Kabisa 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /cpu-limits-low-perc-state-variables.tf: -------------------------------------------------------------------------------- 1 | variable "cpu_limits_low_perc_state_enabled" { 2 | type = bool 3 | default = false 4 | description = "CPU state limits are only available when the state metrics api is deployed https://github.com/kubernetes/kube-state-metrics" 5 | } 6 | 7 | variable "cpu_limits_low_perc_state_warning" { 8 | type = number 9 | default = 95 10 | } 11 | 12 | variable "cpu_limits_low_perc_state_critical" { 13 | type = number 14 | default = 100 15 | } 16 | 17 | variable "cpu_limits_low_perc_state_evaluation_period" { 18 | type = string 19 | default = "last_5m" 20 | } 21 | 22 | variable "cpu_limits_low_perc_state_note" { 23 | type = string 24 | default = "" 25 | } 26 | 27 | variable "cpu_limits_low_perc_state_docs" { 28 | type = string 29 | default = "If the node where a Pod is running has enough of a resource available, it's possible (and allowed) for a container to use more of a resource than its request for that resource specifies. However, a container is not allowed to use more than its resource limit. 
https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/" 30 | } 31 | 32 | variable "cpu_limits_low_perc_state_filter_override" { 33 | type = string 34 | default = "" 35 | } 36 | 37 | variable "cpu_limits_low_perc_state_alerting_enabled" { 38 | type = bool 39 | default = true 40 | } 41 | 42 | variable "cpu_limits_low_perc_state_no_data_timeframe" { 43 | type = number 44 | default = null 45 | } 46 | 47 | variable "cpu_limits_low_perc_state_notify_no_data" { 48 | type = bool 49 | default = false 50 | } 51 | 52 | variable "cpu_limits_low_perc_state_ok_threshold" { 53 | type = number 54 | default = null 55 | } 56 | 57 | variable "cpu_limits_low_perc_state_priority" { 58 | description = "Number from 1 (high) to 5 (low)." 59 | 60 | type = number 61 | default = 3 62 | } 63 | -------------------------------------------------------------------------------- /cpu-limits-low-perc-state.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | cpu_limits_low_perc_state_filter = coalesce( 3 | var.cpu_limits_low_perc_state_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "cpu_limits_low_perc_state" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Available CPU for Limits in percentages Low" 13 | query = "max(${var.cpu_limits_low_perc_state_evaluation_period}):( sum:kubernetes_state.container.cpu_limit{${local.cpu_limits_low_perc_state_filter}} by {host,kube_cluster_name} / sum:kubernetes_state.node.cpu_capacity{${local.cpu_limits_low_perc_state_filter}} by {host,kube_cluster_name}) * 100 > ${var.cpu_limits_low_perc_state_critical}" 14 | alert_message = "Kubernetes cluster {{kube_cluster_name.name}} cpu room for limits / percentage is too low" 15 | recovery_message = "Kubernetes cluster {{kube_cluster_name.name}} cpu limits / percentage has recovered" 16 | 17 | # monitor level vars 18 | enabled = var.cpu_limits_low_perc_state_enabled 19 | alerting_enabled = var.cpu_limits_low_perc_state_alerting_enabled 20 | critical_threshold = var.cpu_limits_low_perc_state_critical 21 | warning_threshold = var.cpu_limits_low_perc_state_warning 22 | priority = min(var.cpu_limits_low_perc_state_priority + var.priority_offset, 5) 23 | docs = var.cpu_limits_low_perc_state_docs 24 | note = var.cpu_limits_low_perc_state_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /cpu-limits-low-perc-variables.tf: -------------------------------------------------------------------------------- 1 | variable "cpu_limits_low_perc_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "cpu_limits_low_perc_warning" { 7 | type = number 8 | default = 95 9 | } 10 | 11 | variable "cpu_limits_low_perc_critical" { 12 | type = number 13 | default = 100 14 | } 15 | 16 | variable "cpu_limits_low_perc_evaluation_period" { 17 | type = string 18 | default = "last_5m" 19 | } 20 | 21 | variable "cpu_limits_low_perc_note" { 22 | type = string 23 | default = "" 24 | } 25 | 26 | variable "cpu_limits_low_perc_docs" { 27 | type = string 28 | default = "If the node where a Pod is running has enough of a resource available, it's possible (and allowed) for a container to 
use more of a resource than its request for that resource specifies. However, a container is not allowed to use more than its resource limit. https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/" 29 | } 30 | 31 | variable "cpu_limits_low_perc_filter_override" { 32 | type = string 33 | default = "" 34 | } 35 | 36 | variable "cpu_limits_low_perc_alerting_enabled" { 37 | type = bool 38 | default = true 39 | } 40 | 41 | variable "cpu_limits_low_perc_no_data_timeframe" { 42 | type = number 43 | default = null 44 | } 45 | 46 | variable "cpu_limits_low_perc_notify_no_data" { 47 | type = bool 48 | default = false 49 | } 50 | 51 | variable "cpu_limits_low_perc_ok_threshold" { 52 | type = number 53 | default = null 54 | } 55 | 56 | variable "cpu_limits_low_perc_priority" { 57 | description = "Number from 1 (high) to 5 (low)." 58 | 59 | type = number 60 | default = 3 61 | } 62 | -------------------------------------------------------------------------------- /cpu-limits-low-perc.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | cpu_limits_low_perc_filter = coalesce( 3 | var.cpu_limits_low_perc_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "cpu_limits_low_perc" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Available CPU for Limits in percentages Low" 13 | query = "max(${var.cpu_limits_low_perc_evaluation_period}):(sum:kubernetes.cpu.limits{${local.cpu_limits_low_perc_filter}} by {host,kube_cluster_name} / max:system.cpu.num_cores{${local.cpu_limits_low_perc_filter}} by {host,kube_cluster_name}) * 100 > ${var.cpu_limits_low_perc_critical}" 14 | alert_message = "Kubernetes cluster {{kube_cluster_name.name}} cpu room for limits / percentage is too low" 15 | recovery_message = "Kubernetes cluster {{kube_cluster_name.name}} cpu limits / percentage has recovered" 16 | 17 | # monitor level vars 18 | enabled = var.cpu_limits_low_perc_enabled 19 | alerting_enabled = var.cpu_limits_low_perc_alerting_enabled 20 | critical_threshold = var.cpu_limits_low_perc_critical 21 | warning_threshold = var.cpu_limits_low_perc_warning 22 | priority = min(var.cpu_limits_low_perc_priority + var.priority_offset, 5) 23 | docs = var.cpu_limits_low_perc_docs 24 | note = var.cpu_limits_low_perc_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /cpu-limits-low-variables.tf: -------------------------------------------------------------------------------- 1 | variable "cpu_limits_low_enabled" { 2 | type = bool 3 | default = false 4 | description = "This monitor is based on absolute values and thus less useful. Prefer setting cpu_limits_low_perc_enabled to true." 
5 | } 6 | 7 | variable "cpu_limits_low_warning" { 8 | type = number 9 | default = 0 10 | } 11 | 12 | variable "cpu_limits_low_critical" { 13 | type = number 14 | default = -30 15 | } 16 | 17 | variable "cpu_limits_low_evaluation_period" { 18 | type = string 19 | default = "last_5m" 20 | } 21 | 22 | variable "cpu_limits_low_note" { 23 | type = string 24 | default = "" 25 | } 26 | 27 | variable "cpu_limits_low_docs" { 28 | type = string 29 | default = "If the node where a Pod is running has enough of a resource available, it's possible (and allowed) for a container to use more of a resource than its request for that resource specifies. However, a container is not allowed to use more than its resource limit. https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/" 30 | } 31 | 32 | variable "cpu_limits_low_filter_override" { 33 | type = string 34 | default = "" 35 | } 36 | 37 | variable "cpu_limits_low_alerting_enabled" { 38 | type = bool 39 | default = true 40 | } 41 | 42 | variable "cpu_limits_low_no_data_timeframe" { 43 | type = number 44 | default = null 45 | } 46 | 47 | variable "cpu_limits_low_notify_no_data" { 48 | type = bool 49 | default = false 50 | } 51 | 52 | variable "cpu_limits_low_ok_threshold" { 53 | type = number 54 | default = null 55 | } 56 | 57 | variable "cpu_limits_low_priority" { 58 | description = "Number from 1 (high) to 5 (low)." 59 | 60 | type = number 61 | default = 3 62 | } 63 | -------------------------------------------------------------------------------- /cpu-limits-low.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | cpu_limits_low_filter = coalesce( 3 | var.cpu_limits_low_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "cpu_limits_low" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Available CPU for Limits Low" 13 | query = "min(${var.cpu_limits_low_evaluation_period}):max:system.cpu.num_cores{${local.cpu_limits_low_filter}} by {kube_cluster_name,host} - sum:kubernetes.cpu.limits{${local.cpu_limits_low_filter}} by {kube_cluster_name,host} < ${var.cpu_limits_low_critical}" 14 | alert_message = "Kubernetes cluster {{kube_cluster_name.name}} cpu room for limits is too low " 15 | recovery_message = "Kubernetes cluster {{kube_cluster_name.name}} cpu limits has recovered" 16 | 17 | 18 | # monitor level vars 19 | enabled = var.cpu_limits_low_enabled 20 | alerting_enabled = var.cpu_limits_low_alerting_enabled 21 | critical_threshold = var.cpu_limits_low_critical 22 | warning_threshold = var.cpu_limits_low_warning 23 | priority = min(var.cpu_limits_low_priority + var.priority_offset, 5) 24 | docs = var.cpu_limits_low_docs 25 | note = var.cpu_limits_low_note 26 | 27 | # module level vars 28 | env = var.env 29 | service = var.service 30 | service_display_name = var.service_display_name 31 | notification_channel = var.notification_channel 32 | additional_tags = var.additional_tags 33 | locked = var.locked 34 | name_prefix = var.name_prefix 35 | name_suffix = var.name_suffix 36 | } 37 | -------------------------------------------------------------------------------- /cpu-on-dns-pods-high-variables.tf: -------------------------------------------------------------------------------- 1 | variable "cpu_on_dns_pods_high_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "cpu_on_dns_pods_high_warning" { 7 | type = number 8 | default = 70 9 | } 10 | 11 | variable "cpu_on_dns_pods_high_critical" { 12 | type = number 13 | default = 
85 14 | } 15 | 16 | variable "cpu_on_dns_pods_high_evaluation_period" { 17 | type = string 18 | default = "last_30m" 19 | } 20 | 21 | variable "cpu_on_dns_pods_high_note" { 22 | type = string 23 | default = "" 24 | } 25 | 26 | variable "cpu_on_dns_pods_high_docs" { 27 | type = string 28 | default = "" 29 | } 30 | 31 | variable "cpu_on_dns_pods_high_filter_override" { 32 | type = string 33 | default = "" 34 | } 35 | 36 | variable "dns_filter_tags" { 37 | description = <<-EOD 38 | Getting all the DNS containers by default is hard to do. 39 | What we do is build a list of Datadog tags / filters that should help us find them 40 | We then build a filter in the following way: ($originalfilterstring) AND (item1 OR item2 OR item3...) 41 | If that doesn't work for your use case, you can override the filter list or use cpu_on_dns_pods_high_filter_override 42 | EOD 43 | type = list(string) 44 | default = [ 45 | "kube_service:kube-dns", 46 | "short_image:coredns", 47 | "short_image:ucp-coredns", 48 | "short_image:ucp-kube-dns", 49 | ] 50 | } 51 | 52 | variable "cpu_on_dns_pods_high_alerting_enabled" { 53 | type = bool 54 | default = true 55 | } 56 | 57 | variable "cpu_on_dns_pods_high_no_data_timeframe" { 58 | type = number 59 | default = null 60 | } 61 | 62 | variable "cpu_on_dns_pods_high_notify_no_data" { 63 | type = bool 64 | default = false 65 | } 66 | 67 | variable "cpu_on_dns_pods_high_ok_threshold" { 68 | type = number 69 | default = null 70 | } 71 | 72 | variable "cpu_on_dns_pods_high_priority" { 73 | description = "Number from 1 (high) to 5 (low)." 74 | 75 | type = number 76 | default = 2 77 | } 78 | -------------------------------------------------------------------------------- /cpu-on-dns-pods-high.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | dns_filter_tags = join(" OR ", var.dns_filter_tags) 3 | filter_string = "(${var.filter_str}) AND (${local.dns_filter_tags})" 4 | cpu_on_dns_pods_high_filter = coalesce( 5 | var.cpu_on_dns_pods_high_filter_override, 6 | local.filter_string 7 | ) 8 | } 9 | 10 | module "cpu_on_dns_pods_high" { 11 | source = "kabisa/generic-monitor/datadog" 12 | version = "1.0.0" 13 | 14 | name = "CPU Usage on DNS pods is high" 15 | query = "avg(${var.cpu_on_dns_pods_high_evaluation_period}):avg:docker.cpu.usage{${local.cpu_on_dns_pods_high_filter}} by {kube_cluster_name,host,container_name} > ${var.cpu_on_dns_pods_high_critical}" 16 | alert_message = "Kubernetes CPU usage on DNS pods is too high" 17 | recovery_message = "Kubernetes CPU usage on DNS pods has recovered" 18 | 19 | # monitor level vars 20 | enabled = var.cpu_on_dns_pods_high_enabled 21 | alerting_enabled = var.cpu_on_dns_pods_high_alerting_enabled 22 | critical_threshold = var.cpu_on_dns_pods_high_critical 23 | warning_threshold = var.cpu_on_dns_pods_high_warning 24 | priority = min(var.cpu_on_dns_pods_high_priority + var.priority_offset, 5) 25 | docs = var.cpu_on_dns_pods_high_docs 26 | note = var.cpu_on_dns_pods_high_note 27 | 28 | # module level vars 29 | env = var.env 30 | service = var.service 31 | service_display_name = var.service_display_name 32 | notification_channel = var.notification_channel 33 | additional_tags = var.additional_tags 34 | locked = var.locked 35 | name_prefix = var.name_prefix 36 | name_suffix = var.name_suffix 37 | } 38 | -------------------------------------------------------------------------------- /cpu-requests-low-perc-state-variables.tf:
-------------------------------------------------------------------------------- 1 | variable "cpu_requests_low_perc_state_enabled" { 2 | type = bool 3 | default = false 4 | description = "CPU state limits are only available when the state metrics api is deployed https://github.com/kubernetes/kube-state-metrics" 5 | } 6 | 7 | variable "cpu_requests_low_perc_state_warning" { 8 | type = number 9 | default = 80 10 | } 11 | 12 | variable "cpu_requests_low_perc_state_critical" { 13 | type = number 14 | default = 95 15 | } 16 | 17 | variable "cpu_requests_low_perc_state_evaluation_period" { 18 | type = string 19 | default = "last_5m" 20 | } 21 | 22 | variable "cpu_requests_low_perc_state_note" { 23 | type = string 24 | default = "" 25 | } 26 | 27 | variable "cpu_requests_low_perc_state_docs" { 28 | type = string 29 | default = "If the node where a Pod is running has enough of a resource available, it's possible (and allowed) for a container to use more of a resource than its request for that resource specifies. However, a container is not allowed to use more than its resource limit. https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/" 30 | } 31 | 32 | variable "cpu_requests_low_perc_state_filter_override" { 33 | type = string 34 | default = "" 35 | } 36 | 37 | variable "cpu_requests_low_perc_state_alerting_enabled" { 38 | type = bool 39 | default = true 40 | } 41 | 42 | variable "cpu_requests_low_perc_state_no_data_timeframe" { 43 | type = number 44 | default = null 45 | } 46 | 47 | variable "cpu_requests_low_perc_state_notify_no_data" { 48 | type = bool 49 | default = false 50 | } 51 | 52 | variable "cpu_requests_low_perc_state_ok_threshold" { 53 | type = number 54 | default = null 55 | } 56 | 57 | variable "cpu_requests_low_perc_state_priority" { 58 | description = "Number from 1 (high) to 5 (low)." 
59 | 60 | type = number 61 | default = 3 62 | } 63 | -------------------------------------------------------------------------------- /cpu-requests-low-perc-state.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | cpu_requests_low_perc_state_filter = coalesce( 3 | var.cpu_requests_low_perc_state_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "cpu_requests_low_perc_state" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Available CPU for requests in percentages Low" 13 | query = "max(${var.cpu_requests_low_perc_state_evaluation_period}):( sum:kubernetes_state.container.cpu_requested{${local.cpu_requests_low_perc_state_filter}} by {host,kube_cluster_name} / sum:kubernetes_state.node.cpu_capacity{${local.cpu_requests_low_perc_state_filter}} by {host,kube_cluster_name} ) * 100 > ${var.cpu_requests_low_perc_state_critical}" 14 | alert_message = "Kubernetes cluster cpu room for requests / percentage is too low" 15 | recovery_message = "Kubernetes cluster cpu requests / percentage has recovered" 16 | 17 | # monitor level vars 18 | enabled = var.cpu_requests_low_perc_state_enabled 19 | alerting_enabled = var.cpu_requests_low_perc_state_alerting_enabled 20 | critical_threshold = var.cpu_requests_low_perc_state_critical 21 | warning_threshold = var.cpu_requests_low_perc_state_warning 22 | priority = min(var.cpu_requests_low_perc_state_priority + var.priority_offset, 5) 23 | docs = var.cpu_requests_low_perc_state_docs 24 | note = var.cpu_requests_low_perc_state_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /cpu-requests-low-perc-variables.tf: -------------------------------------------------------------------------------- 1 | variable "cpu_requests_low_perc_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "cpu_requests_low_perc_warning" { 7 | type = number 8 | default = 80 9 | } 10 | 11 | variable "cpu_requests_low_perc_critical" { 12 | type = number 13 | default = 95 14 | } 15 | 16 | variable "cpu_requests_low_perc_evaluation_period" { 17 | type = string 18 | default = "last_5m" 19 | } 20 | 21 | variable "cpu_requests_low_perc_note" { 22 | type = string 23 | default = "" 24 | } 25 | 26 | variable "cpu_requests_low_perc_docs" { 27 | type = string 28 | default = "If the node where a Pod is running has enough of a resource available, it's possible (and allowed) for a container to use more of a resource than its request for that resource specifies. However, a container is not allowed to use more than its resource limit. 
https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/" 29 | } 30 | 31 | variable "cpu_requests_low_perc_filter_override" { 32 | type = string 33 | default = "" 34 | } 35 | 36 | variable "cpu_requests_low_perc_alerting_enabled" { 37 | type = bool 38 | default = true 39 | } 40 | 41 | variable "cpu_requests_low_perc_no_data_timeframe" { 42 | type = number 43 | default = null 44 | } 45 | 46 | variable "cpu_requests_low_perc_notify_no_data" { 47 | type = bool 48 | default = false 49 | } 50 | 51 | variable "cpu_requests_low_perc_ok_threshold" { 52 | type = number 53 | default = null 54 | } 55 | 56 | variable "cpu_requests_low_perc_priority" { 57 | description = "Number from 1 (high) to 5 (low)." 58 | 59 | type = number 60 | default = 3 61 | } 62 | -------------------------------------------------------------------------------- /cpu-requests-low-perc.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | cpu_requests_low_perc_filter = coalesce( 3 | var.cpu_requests_low_perc_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "cpu_requests_low_perc" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Available CPU for requests in percentages Low" 13 | query = "max(${var.cpu_requests_low_perc_evaluation_period}):100 * sum:kubernetes.cpu.requests{${local.cpu_requests_low_perc_filter}} by {kube_cluster_name,host} / max:system.cpu.num_cores{${local.cpu_requests_low_perc_filter}} by {kube_cluster_name,host} > ${var.cpu_requests_low_perc_critical}" 14 | alert_message = "Kubernetes cluster cpu room for requests / percentage is too low" 15 | recovery_message = "Kubernetes cluster cpu requests / percentage has recovered" 16 | 17 | # monitor level vars 18 | enabled = var.cpu_requests_low_perc_enabled 19 | alerting_enabled = var.cpu_requests_low_perc_alerting_enabled 20 | critical_threshold = var.cpu_requests_low_perc_critical 21 | warning_threshold = var.cpu_requests_low_perc_warning 22 | priority = min(var.cpu_requests_low_perc_priority + var.priority_offset, 5) 23 | docs = var.cpu_requests_low_perc_docs 24 | note = var.cpu_requests_low_perc_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /cpu-requests-low-variables.tf: -------------------------------------------------------------------------------- 1 | variable "cpu_requests_low_enabled" { 2 | type = bool 3 | default = false 4 | description = "This monitor is based on absolute values and thus less useful. Prefer setting cpu_requests_low_perc_enabled to true." 
5 | } 6 | 7 | variable "cpu_requests_low_warning" { 8 | type = number 9 | default = 1 10 | } 11 | 12 | variable "cpu_requests_low_critical" { 13 | type = number 14 | default = "0.5" 15 | } 16 | 17 | variable "cpu_requests_low_evaluation_period" { 18 | type = string 19 | default = "last_5m" 20 | } 21 | 22 | variable "cpu_requests_low_note" { 23 | type = string 24 | default = "" 25 | } 26 | 27 | variable "cpu_requests_low_docs" { 28 | type = string 29 | default = "If the node where a Pod is running has enough of a resource available, it's possible (and allowed) for a container to use more of a resource than its request for that resource specifies. However, a container is not allowed to use more than its resource limit. https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/" 30 | } 31 | 32 | variable "cpu_requests_low_filter_override" { 33 | type = string 34 | default = "" 35 | } 36 | 37 | variable "cpu_requests_low_alerting_enabled" { 38 | type = bool 39 | default = true 40 | } 41 | 42 | variable "cpu_requests_low_no_data_timeframe" { 43 | type = number 44 | default = null 45 | } 46 | 47 | variable "cpu_requests_low_notify_no_data" { 48 | type = bool 49 | default = false 50 | } 51 | 52 | variable "cpu_requests_low_ok_threshold" { 53 | type = number 54 | default = null 55 | } 56 | 57 | variable "cpu_requests_low_priority" { 58 | description = "Number from 1 (high) to 5 (low)." 59 | 60 | type = number 61 | default = 3 62 | } 63 | -------------------------------------------------------------------------------- /cpu-requests-low.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | cpu_requests_low_filter = coalesce( 3 | var.cpu_requests_low_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "cpu_requests_low" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Available CPU for Requests Low" 13 | query = "max(${var.cpu_requests_low_evaluation_period}):max:system.cpu.num_cores{${local.cpu_requests_low_filter}} by {kube_cluster_name,host} - sum:kubernetes.cpu.requests{${local.cpu_requests_low_filter}} by {kube_cluster_name,host} < ${var.cpu_requests_low_critical}" 14 | alert_message = "Kubernetes cluster cpu room for requests is too low" 15 | recovery_message = "Kubernetes cluster cpu requests has recovered" 16 | 17 | # monitor level vars 18 | enabled = var.cpu_requests_low_enabled 19 | alerting_enabled = var.cpu_requests_low_alerting_enabled 20 | critical_threshold = var.cpu_requests_low_critical 21 | warning_threshold = var.cpu_requests_low_warning 22 | priority = min(var.cpu_requests_low_priority + var.priority_offset, 5) 23 | docs = var.cpu_requests_low_docs 24 | note = var.cpu_requests_low_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /daemonset-incomplete-variables.tf: -------------------------------------------------------------------------------- 1 | variable "daemonset_incomplete_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "daemonset_incomplete_critical" { 7 | type = number 8 | default = 0 9 | description = "alert is raised when (desired - running) > 
daemonset_incomplete_critical" 10 | } 11 | 12 | variable "daemonset_incomplete_evaluation_period" { 13 | type = string 14 | default = "last_15m" 15 | } 16 | 17 | variable "daemonset_incomplete_note" { 18 | type = string 19 | default = "" 20 | } 21 | 22 | variable "daemonset_incomplete_docs" { 23 | type = string 24 | default = "In kubernetes a daemonset is responsible for running the same pod across all Nodes. An example for when this fails, is when the image cannot be pulled, the pod fails to initialize or no resources are available on the cluster\nThis alert is raised when (desired - running) > 0" 25 | } 26 | 27 | variable "daemonset_incomplete_filter_override" { 28 | type = string 29 | default = "" 30 | } 31 | 32 | variable "daemonset_incomplete_alerting_enabled" { 33 | type = bool 34 | default = true 35 | } 36 | 37 | variable "daemonset_incomplete_no_data_timeframe" { 38 | type = number 39 | default = null 40 | } 41 | 42 | variable "daemonset_incomplete_notify_no_data" { 43 | type = bool 44 | default = false 45 | } 46 | 47 | variable "daemonset_incomplete_ok_threshold" { 48 | type = number 49 | default = null 50 | } 51 | 52 | variable "daemonset_incomplete_priority" { 53 | description = "Number from 1 (high) to 5 (low)." 54 | 55 | type = number 56 | default = 2 57 | } 58 | -------------------------------------------------------------------------------- /daemonset-incomplete.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | daemonset_incomplete_filter = coalesce( 3 | var.daemonset_incomplete_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "daemonset_incomplete" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Daemonset Incomplete" 13 | query = "min(${var.daemonset_incomplete_evaluation_period}):max:kubernetes_state.daemonset.scheduled{${local.daemonset_incomplete_filter}} by {kube_daemon_set,kube_cluster_name} - min:kubernetes_state.daemonset.ready{${local.daemonset_incomplete_filter}} by {kube_daemon_set,kube_cluster_name} > 0" 14 | alert_message = "Kubernetes Daemonset {{kube_daemon_set}} is incomplete. 
Missing pod count:{{value}}" 15 | recovery_message = "Kubernetes Daemonset {{kube_daemon_set}} has recovered" 16 | 17 | # monitor level vars 18 | enabled = var.state_metrics_monitoring && var.daemonset_incomplete_enabled 19 | alerting_enabled = var.daemonset_incomplete_alerting_enabled 20 | critical_threshold = var.daemonset_incomplete_critical 21 | # no warning threshold for this monitor 22 | priority = min(var.daemonset_incomplete_priority + var.priority_offset, 5) 23 | docs = var.daemonset_incomplete_docs 24 | note = var.daemonset_incomplete_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /daemonset-multiple-restarts-variables.tf: -------------------------------------------------------------------------------- 1 | variable "daemonset_multiple_restarts_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "daemonset_multiple_restarts_warning" { 7 | type = number 8 | default = null 9 | } 10 | 11 | variable "daemonset_multiple_restarts_critical" { 12 | type = number 13 | default = 5.0 14 | } 15 | 16 | variable "daemonset_multiple_restarts_evaluation_period" { 17 | type = string 18 | default = "last_15m" 19 | } 20 | 21 | variable "daemonset_multiple_restarts_note" { 22 | type = string 23 | default = "" 24 | } 25 | 26 | variable "daemonset_multiple_restarts_docs" { 27 | type = string 28 | default = "If a container restarts once, it can be considered 'normal behaviour' for K8s. A Daemonset restarting multiple times though is a problem" 29 | } 30 | 31 | variable "daemonset_multiple_restarts_filter_override" { 32 | type = string 33 | default = "" 34 | } 35 | 36 | variable "daemonset_multiple_restarts_alerting_enabled" { 37 | type = bool 38 | default = true 39 | } 40 | 41 | variable "daemonset_multiple_restarts_no_data_timeframe" { 42 | type = number 43 | default = null 44 | } 45 | 46 | variable "daemonset_multiple_restarts_notify_no_data" { 47 | type = bool 48 | default = false 49 | } 50 | 51 | variable "daemonset_multiple_restarts_ok_threshold" { 52 | type = number 53 | default = null 54 | } 55 | 56 | variable "daemonset_multiple_restarts_name_prefix" { 57 | type = string 58 | default = "" 59 | } 60 | 61 | variable "daemonset_multiple_restarts_name_suffix" { 62 | type = string 63 | default = "" 64 | } 65 | 66 | variable "daemonset_multiple_restarts_priority" { 67 | description = "Number from 1 (high) to 5 (low)." 
68 | 69 | type = number 70 | default = 3 71 | } 72 | 73 | variable "daemonset_multiple_restarts_notification_channel_override" { 74 | type = string 75 | default = "" 76 | } 77 | -------------------------------------------------------------------------------- /daemonset-multiple-restarts.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | daemonset_multiple_restarts_filter = coalesce( 3 | var.daemonset_multiple_restarts_filter_override, 4 | "${var.filter_str}${var.filter_str_concatenation}kube_daemon_set:*" 5 | ) 6 | } 7 | 8 | module "daemonset_multiple_restarts" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Daemonset Multiple Restarts" 13 | query = "max(${var.daemonset_multiple_restarts_evaluation_period}):clamp_min(max:kubernetes.containers.restarts{${local.daemonset_multiple_restarts_filter}} by {kube_daemon_set} - hour_before(max:kubernetes.containers.restarts{${local.daemonset_multiple_restarts_filter}} by {kube_daemon_set}), 0) > ${var.daemonset_multiple_restarts_critical}" 14 | 15 | # alert specific configuration 16 | require_full_window = true 17 | alert_message = "Kubernetes Daemonset {{kube_daemon_set.name}} has more than {{threshold}} ({{value}}) restarts within one hour" 18 | recovery_message = "Kubernetes Daemonset {{kube_daemon_set.name}} is now at {{value}} restarts of the last hour" 19 | 20 | # monitor level vars 21 | enabled = var.daemonset_multiple_restarts_enabled 22 | alerting_enabled = var.daemonset_multiple_restarts_alerting_enabled 23 | warning_threshold = var.daemonset_multiple_restarts_warning 24 | critical_threshold = var.daemonset_multiple_restarts_critical 25 | priority = min(var.daemonset_multiple_restarts_priority + var.priority_offset, 5) 26 | docs = var.daemonset_multiple_restarts_docs 27 | note = var.daemonset_multiple_restarts_note 28 | notification_channel = try(coalesce(var.daemonset_multiple_restarts_notification_channel_override, var.notification_channel), "") 29 | 30 | # module level vars 31 | env = var.env 32 | service = var.service 33 | service_display_name = var.service_display_name 34 | additional_tags = var.additional_tags 35 | locked = var.locked 36 | name_prefix = var.name_prefix 37 | name_suffix = var.name_suffix 38 | } 39 | -------------------------------------------------------------------------------- /datadog-agent-variables.tf: -------------------------------------------------------------------------------- 1 | variable "datadog_agent_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "datadog_agent_evaluation_period" { 7 | type = string 8 | default = "last_5m" 9 | } 10 | 11 | variable "datadog_agent_note" { 12 | type = string 13 | default = "" 14 | } 15 | 16 | variable "datadog_agent_docs" { 17 | type = string 18 | default = "" 19 | } 20 | 21 | variable "datadog_agent_filter_override" { 22 | type = string 23 | default = "" 24 | } 25 | 26 | variable "datadog_agent_alerting_enabled" { 27 | type = bool 28 | default = true 29 | } 30 | 31 | variable "datadog_agent_no_data_timeframe" { 32 | type = number 33 | default = null 34 | } 35 | 36 | variable "datadog_agent_notify_no_data" { 37 | type = bool 38 | default = false 39 | } 40 | 41 | variable "datadog_agent_ok_threshold" { 42 | type = number 43 | default = null 44 | } 45 | 46 | variable "datadog_agent_priority" { 47 | description = "Number from 1 (high) to 5 (low)." 
48 | 49 | type = number 50 | default = 2 51 | } 52 | -------------------------------------------------------------------------------- /datadog-agent.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | datadog_agent_filter = coalesce( 3 | var.datadog_agent_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "datadog_agent" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Datadog agent not running" 13 | query = "avg(${var.datadog_agent_evaluation_period}):avg:datadog.agent.running{${local.datadog_agent_filter}} by {host,kube_cluster_name} < 1" 14 | alert_message = "Datadog Agent not running on {{host.name}} in Cluster: {{kube_cluster_name.name}}" 15 | recovery_message = "Agent running again" 16 | notify_no_data = true 17 | no_data_message = "Datadog agent is not running on {{host.name}} in Cluster: {{kube_cluster_name.name}}" 18 | 19 | # monitor level vars 20 | enabled = var.datadog_agent_enabled 21 | alerting_enabled = var.datadog_agent_alerting_enabled 22 | critical_threshold = 1 23 | # no warning threshold for this monitor 24 | priority = min(var.datadog_agent_priority + var.priority_offset, 5) 25 | docs = var.datadog_agent_docs 26 | note = var.datadog_agent_note 27 | 28 | # module level vars 29 | env = var.env 30 | service = var.service 31 | service_display_name = var.service_display_name 32 | notification_channel = var.notification_channel 33 | additional_tags = var.additional_tags 34 | locked = var.locked 35 | name_prefix = var.name_prefix 36 | name_suffix = var.name_suffix 37 | } 38 | -------------------------------------------------------------------------------- /deploy-desired-vs-status-variables.tf: -------------------------------------------------------------------------------- 1 | variable "deploy_desired_vs_status_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "deploy_desired_vs_status_warning" { 7 | type = number 8 | default = 1 9 | # warning at 1 difference 10 | } 11 | 12 | variable "deploy_desired_vs_status_critical" { 13 | type = number 14 | default = 10 15 | # critical at 10 difference 16 | } 17 | 18 | variable "deploy_desired_vs_status_evaluation_period" { 19 | type = string 20 | default = "last_15m" 21 | } 22 | 23 | variable "deploy_desired_vs_status_note" { 24 | type = string 25 | default = "" 26 | } 27 | 28 | variable "deploy_desired_vs_status_docs" { 29 | type = string 30 | default = "The amount of expected pods to run minus the actual number" 31 | } 32 | 33 | variable "deploy_desired_vs_status_filter_override" { 34 | type = string 35 | default = "" 36 | } 37 | 38 | variable "deploy_desired_vs_status_alerting_enabled" { 39 | type = bool 40 | default = true 41 | } 42 | 43 | variable "deploy_desired_vs_status_no_data_timeframe" { 44 | type = number 45 | default = null 46 | } 47 | 48 | variable "deploy_desired_vs_status_notify_no_data" { 49 | type = bool 50 | default = false 51 | } 52 | 53 | variable "deploy_desired_vs_status_ok_threshold" { 54 | type = number 55 | default = null 56 | } 57 | 58 | variable "deploy_desired_vs_status_priority" { 59 | description = "Number from 1 (high) to 5 (low)." 
60 | 61 | type = number 62 | default = 3 63 | } 64 | -------------------------------------------------------------------------------- /deploy-desired-vs-status.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | deploy_desired_vs_status_filter = coalesce( 3 | var.deploy_desired_vs_status_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "deploy_desired_vs_status" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Desired pods vs current pods (Deployments)" 13 | query = "avg(${var.deploy_desired_vs_status_evaluation_period}):max:kubernetes_state.deployment.replicas_desired{${local.deploy_desired_vs_status_filter}} by {kube_cluster_name} - max:kubernetes_state.deployment.replicas_available{${local.deploy_desired_vs_status_filter}} by {kube_cluster_name} > ${var.deploy_desired_vs_status_critical}" 14 | alert_message = "Kubernetes is having trouble getting all the pods to start. (Based on replicas number in all the deployments)" 15 | recovery_message = "All pods described in deployments have started" 16 | notify_no_data = true 17 | no_data_message = "Kubernetes State data missing for {{kube_cluster_name.name}}" 18 | 19 | # monitor level vars 20 | enabled = var.state_metrics_monitoring && var.deploy_desired_vs_status_enabled 21 | alerting_enabled = var.deploy_desired_vs_status_alerting_enabled 22 | critical_threshold = var.deploy_desired_vs_status_critical 23 | warning_threshold = var.deploy_desired_vs_status_warning 24 | priority = min(var.deploy_desired_vs_status_priority + var.priority_offset, 5) 25 | docs = var.deploy_desired_vs_status_docs 26 | note = var.deploy_desired_vs_status_note 27 | 28 | # module level vars 29 | env = var.env 30 | service = var.service 31 | service_display_name = var.service_display_name 32 | notification_channel = var.notification_channel 33 | additional_tags = var.additional_tags 34 | locked = var.locked 35 | name_prefix = var.name_prefix 36 | name_suffix = var.name_suffix 37 | } 38 | -------------------------------------------------------------------------------- /deployment-multiple-restarts-variables.tf: -------------------------------------------------------------------------------- 1 | variable "deployment_multiple_restarts_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "deployment_multiple_restarts_warning" { 7 | type = number 8 | default = null 9 | } 10 | 11 | variable "deployment_multiple_restarts_critical" { 12 | type = number 13 | default = 5.0 14 | } 15 | 16 | variable "deployment_multiple_restarts_evaluation_period" { 17 | type = string 18 | default = "last_15m" 19 | } 20 | 21 | variable "deployment_multiple_restarts_note" { 22 | type = string 23 | default = "" 24 | } 25 | 26 | variable "deployment_multiple_restarts_docs" { 27 | type = string 28 | default = "If a container restarts once, it can be considered 'normal behaviour' for K8s. 
A Deployment restarting multiple times though is a problem" 29 | } 30 | 31 | variable "deployment_multiple_restarts_filter_override" { 32 | type = string 33 | default = "" 34 | } 35 | 36 | variable "deployment_multiple_restarts_alerting_enabled" { 37 | type = bool 38 | default = true 39 | } 40 | 41 | variable "deployment_multiple_restarts_no_data_timeframe" { 42 | type = number 43 | default = null 44 | } 45 | 46 | variable "deployment_multiple_restarts_notify_no_data" { 47 | type = bool 48 | default = false 49 | } 50 | 51 | variable "deployment_multiple_restarts_ok_threshold" { 52 | type = number 53 | default = null 54 | } 55 | 56 | variable "deployment_multiple_restarts_name_prefix" { 57 | type = string 58 | default = "" 59 | } 60 | 61 | variable "deployment_multiple_restarts_name_suffix" { 62 | type = string 63 | default = "" 64 | } 65 | 66 | variable "deployment_multiple_restarts_priority" { 67 | description = "Number from 1 (high) to 5 (low)." 68 | 69 | type = number 70 | default = 3 71 | } 72 | 73 | variable "deployment_multiple_restarts_notification_channel_override" { 74 | type = string 75 | default = "" 76 | } 77 | -------------------------------------------------------------------------------- /deployment-multiple-restarts.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | deployment_multiple_restarts_filter = coalesce( 3 | var.deployment_multiple_restarts_filter_override, 4 | "${var.filter_str}${var.filter_str_concatenation}kube_deployment:*" 5 | ) 6 | } 7 | 8 | module "deployment_multiple_restarts" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Deployment Multiple Restarts" 13 | query = "max(${var.deployment_multiple_restarts_evaluation_period}):clamp_min(max:kubernetes.containers.restarts{${local.deployment_multiple_restarts_filter}} by {kube_deployment} - hour_before(max:kubernetes.containers.restarts{${local.deployment_multiple_restarts_filter}} by {kube_deployment}), 0) > ${var.deployment_multiple_restarts_critical}" 14 | 15 | # alert specific configuration 16 | require_full_window = true 17 | alert_message = "Kubernetes Deployment {{kube_deployment.name}} has more than {{threshold}} ({{value}}) restarts within one hour" 18 | recovery_message = "Kubernetes Deployment {{kube_deployment.name}} is now at {{value}} restarts of the last hour" 19 | 20 | # monitor level vars 21 | enabled = var.deployment_multiple_restarts_enabled 22 | alerting_enabled = var.deployment_multiple_restarts_alerting_enabled 23 | warning_threshold = var.deployment_multiple_restarts_warning 24 | critical_threshold = var.deployment_multiple_restarts_critical 25 | priority = min(var.deployment_multiple_restarts_priority + var.priority_offset, 5) 26 | docs = var.deployment_multiple_restarts_docs 27 | note = var.deployment_multiple_restarts_note 28 | notification_channel = try(coalesce(var.deployment_multiple_restarts_notification_channel_override, var.notification_channel), "") 29 | 30 | # module level vars 31 | env = var.env 32 | service = var.service 33 | service_display_name = var.service_display_name 34 | additional_tags = var.additional_tags 35 | locked = var.locked 36 | name_prefix = var.name_prefix 37 | name_suffix = var.name_suffix 38 | } 39 | -------------------------------------------------------------------------------- /examples/example.tf: -------------------------------------------------------------------------------- 1 | # tflint-ignore: terraform_module_version 2 | module "kubernetes" { 3 | source = 
"kabisa/kubernetes/datadog" 4 | 5 | notification_channel = "mail@example.com" 6 | service = "Kubernetes" 7 | env = "prd" 8 | filter_str = "kube_cluster_name:production" 9 | } 10 | -------------------------------------------------------------------------------- /hpa-status-variables.tf: -------------------------------------------------------------------------------- 1 | variable "hpa_status_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "hpa_status_evaluation_period" { 7 | type = string 8 | default = "last_15m" 9 | } 10 | 11 | variable "hpa_status_note" { 12 | type = string 13 | default = "" 14 | } 15 | 16 | variable "hpa_status_docs" { 17 | type = string 18 | default = "The Horizontal Pod Autoscaler automatically scales the number of Pods in a replication controller, deployment, replica set or stateful set based on observed CPU utilization\nWhen the HPA is unavailable, the situation could arise that not enough resources are provisioned to handle the incoming load\nhttps://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/" 19 | } 20 | 21 | variable "hpa_status_filter_override" { 22 | type = string 23 | default = "" 24 | } 25 | 26 | variable "hpa_status_alerting_enabled" { 27 | type = bool 28 | default = true 29 | } 30 | 31 | variable "hpa_status_no_data_timeframe" { 32 | type = number 33 | default = null 34 | } 35 | 36 | variable "hpa_status_notify_no_data" { 37 | type = bool 38 | default = false 39 | } 40 | 41 | variable "hpa_status_ok_threshold" { 42 | type = number 43 | default = null 44 | } 45 | 46 | variable "hpa_status_priority" { 47 | description = "Number from 1 (high) to 5 (low)." 48 | 49 | type = number 50 | default = 3 51 | } 52 | -------------------------------------------------------------------------------- /hpa-status.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | hpa_status_filter = ( 3 | var.hpa_status_filter_override != "" ? 
var.hpa_status_filter_override : var.filter_str 4 | ) 5 | } 6 | 7 | module "hpa_status" { 8 | source = "kabisa/generic-monitor/datadog" 9 | version = "1.0.0" 10 | 11 | name = "HPA Status not OK" 12 | query = "avg(${var.hpa_status_evaluation_period}):avg:kubernetes_state.hpa.condition{${local.hpa_status_filter}} by {hpa,kube_namespace,status,condition} < 1" 13 | alert_message = "Kubernetes HPA Status for Node {{node}} is not ok" 14 | recovery_message = "Kubernetes HPA Status for Node {{node}} has recovered" 15 | 16 | 17 | # monitor level vars 18 | enabled = var.state_metrics_monitoring && var.hpa_status_enabled 19 | alerting_enabled = var.hpa_status_alerting_enabled 20 | critical_threshold = 1 21 | # No warning_threshold possible 22 | priority = min(var.hpa_status_priority + var.priority_offset, 5) 23 | docs = var.hpa_status_docs 24 | note = var.hpa_status_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /main.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kabisa/terraform-datadog-kubernetes/36572ad31f227a1b4326c4211d4522f865ac3270/main.tf -------------------------------------------------------------------------------- /memory-limits-low-perc-state-variables.tf: -------------------------------------------------------------------------------- 1 | variable "memory_limits_low_perc_state_enabled" { 2 | type = bool 3 | default = false 4 | description = "Memory state limits are only available when the state metrics api is deployed https://github.com/kubernetes/kube-state-metrics" 5 | } 6 | 7 | variable "memory_limits_low_perc_state_warning" { 8 | type = number 9 | default = 95 10 | } 11 | 12 | variable "memory_limits_low_perc_state_critical" { 13 | type = number 14 | default = 100 15 | } 16 | 17 | variable "memory_limits_low_perc_state_evaluation_period" { 18 | type = string 19 | default = "last_5m" 20 | } 21 | 22 | variable "memory_limits_low_perc_state_note" { 23 | type = string 24 | default = "" 25 | } 26 | 27 | variable "memory_limits_low_perc_state_docs" { 28 | type = string 29 | default = "If the node where a Pod is running has enough of a resource available, it's possible (and allowed) for a container to use more of a resource than its request for that resource specifies. However, a container is not allowed to use more than its resource limit. https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/" 30 | } 31 | 32 | variable "memory_limits_low_perc_state_filter_override" { 33 | type = string 34 | default = "" 35 | } 36 | 37 | variable "memory_limits_low_perc_state_alerting_enabled" { 38 | type = bool 39 | default = true 40 | } 41 | 42 | variable "memory_limits_low_perc_state_no_data_timeframe" { 43 | type = number 44 | default = null 45 | } 46 | 47 | variable "memory_limits_low_perc_state_notify_no_data" { 48 | type = bool 49 | default = false 50 | } 51 | 52 | variable "memory_limits_low_perc_state_ok_threshold" { 53 | type = number 54 | default = null 55 | } 56 | 57 | variable "memory_limits_low_perc_state_priority" { 58 | description = "Number from 1 (high) to 5 (low)." 
59 | 60 | type = number 61 | default = 3 62 | } 63 | -------------------------------------------------------------------------------- /memory-limits-low-perc-state.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | memory_limits_low_perc_state_filter = coalesce( 3 | var.memory_limits_low_perc_state_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "memory_limits_low_perc_state" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Available Memory for Limits in percentage Low" 13 | query = "max(${var.memory_limits_low_perc_state_evaluation_period}):( sum:kubernetes_state.container.memory_limit{${local.memory_limits_low_perc_state_filter}} by {host,kube_cluster_name} / sum:kubernetes_state.node.memory_allocatable{${local.memory_limits_low_perc_state_filter}} by {host,kube_cluster_name}) * 100 > ${var.memory_limits_low_perc_state_critical}" 14 | alert_message = "Kubernetes cluster memory room for limits in percentage is too low" 15 | recovery_message = "Kubernetes cluster memory limits in percentage has recovered" 16 | 17 | # monitor level vars 18 | enabled = var.memory_limits_low_perc_state_enabled 19 | alerting_enabled = var.memory_limits_low_perc_state_alerting_enabled 20 | critical_threshold = var.memory_limits_low_perc_state_critical 21 | warning_threshold = var.memory_limits_low_perc_state_warning 22 | priority = min(var.memory_limits_low_perc_state_priority + var.priority_offset, 5) 23 | docs = var.memory_limits_low_perc_state_docs 24 | note = var.memory_limits_low_perc_state_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /memory-limits-low-perc-variables.tf: -------------------------------------------------------------------------------- 1 | variable "memory_limits_low_perc_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "memory_limits_low_perc_warning" { 7 | type = number 8 | default = 95 9 | } 10 | 11 | variable "memory_limits_low_perc_critical" { 12 | type = number 13 | default = 100 14 | } 15 | 16 | variable "memory_limits_low_perc_evaluation_period" { 17 | type = string 18 | default = "last_5m" 19 | } 20 | 21 | variable "memory_limits_low_perc_note" { 22 | type = string 23 | default = "" 24 | } 25 | 26 | variable "memory_limits_low_perc_docs" { 27 | type = string 28 | default = "If the node where a Pod is running has enough of a resource available, it's possible (and allowed) for a container to use more of a resource than its request for that resource specifies. However, a container is not allowed to use more than its resource limit. 
https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/" 29 | } 30 | 31 | variable "memory_limits_low_perc_filter_override" { 32 | type = string 33 | default = "" 34 | } 35 | 36 | variable "memory_limits_low_perc_alerting_enabled" { 37 | type = bool 38 | default = true 39 | } 40 | 41 | variable "memory_limits_low_perc_no_data_timeframe" { 42 | type = number 43 | default = null 44 | } 45 | 46 | variable "memory_limits_low_perc_notify_no_data" { 47 | type = bool 48 | default = false 49 | } 50 | 51 | variable "memory_limits_low_perc_ok_threshold" { 52 | type = number 53 | default = null 54 | } 55 | 56 | variable "memory_limits_low_perc_priority" { 57 | description = "Number from 1 (high) to 5 (low)." 58 | 59 | type = number 60 | default = 3 61 | } 62 | -------------------------------------------------------------------------------- /memory-limits-low-perc.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | memory_limits_low_perc_filter = coalesce( 3 | var.memory_limits_low_perc_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "memory_limits_low_perc" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Available Memory for Limits in percentage Low" 13 | query = "max(${var.memory_limits_low_perc_evaluation_period}):( max:kubernetes.memory.limits{${local.memory_limits_low_perc_filter}} by {host,kube_cluster_name}/ max:system.mem.total{${local.memory_limits_low_perc_filter}} by {host,kube_cluster_name}) * 100 > ${var.memory_limits_low_perc_critical}" 14 | alert_message = "Kubernetes cluster memory room for limits in percentage is too low" 15 | recovery_message = "Kubernetes cluster memory limits in percentage has recovered" 16 | 17 | # monitor level vars 18 | enabled = var.memory_limits_low_perc_enabled 19 | alerting_enabled = var.memory_limits_low_perc_alerting_enabled 20 | critical_threshold = var.memory_limits_low_perc_critical 21 | warning_threshold = var.memory_limits_low_perc_warning 22 | priority = min(var.memory_limits_low_perc_priority + var.priority_offset, 5) 23 | docs = var.memory_limits_low_perc_docs 24 | note = var.memory_limits_low_perc_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /memory-limits-low-variables.tf: -------------------------------------------------------------------------------- 1 | variable "memory_limits_low_enabled" { 2 | type = bool 3 | default = false 4 | description = "This monitor is based on absolute values and thus less useful. Prefer setting memory_limits_low_perc_enabled to true." 
5 | } 6 | 7 | variable "memory_limits_low_warning" { 8 | type = number 9 | default = 4000000000 10 | } 11 | 12 | variable "memory_limits_low_critical" { 13 | type = number 14 | default = 3000000000 15 | } 16 | 17 | variable "memory_limits_low_evaluation_period" { 18 | type = string 19 | default = "last_5m" 20 | } 21 | 22 | variable "memory_limits_low_note" { 23 | type = string 24 | default = "" 25 | } 26 | 27 | variable "memory_limits_low_docs" { 28 | type = string 29 | default = "If the node where a Pod is running has enough of a resource available, it's possible (and allowed) for a container to use more of a resource than its request for that resource specifies. However, a container is not allowed to use more than its resource limit. https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/" 30 | } 31 | 32 | variable "memory_limits_low_filter_override" { 33 | type = string 34 | default = "" 35 | } 36 | 37 | variable "memory_limits_low_alerting_enabled" { 38 | type = bool 39 | default = true 40 | } 41 | 42 | variable "memory_limits_low_no_data_timeframe" { 43 | type = number 44 | default = null 45 | } 46 | 47 | variable "memory_limits_low_notify_no_data" { 48 | type = bool 49 | default = false 50 | } 51 | 52 | variable "memory_limits_low_ok_threshold" { 53 | type = number 54 | default = null 55 | } 56 | 57 | variable "memory_limits_low_priority" { 58 | description = "Number from 1 (high) to 5 (low)." 59 | 60 | type = number 61 | default = 3 62 | } 63 | -------------------------------------------------------------------------------- /memory-limits-low.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | memory_limits_low_filter = coalesce( 3 | var.memory_limits_low_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "memory_limits_low" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Available Memory for Limits Low" 13 | query = "avg(${var.memory_limits_low_evaluation_period}):max:system.mem.total{${local.memory_limits_low_filter}} by {host,kube_cluster_name} - max:kubernetes.memory.limits{${local.memory_limits_low_filter}} by {host,kube_cluster_name} < ${var.memory_limits_low_critical}" 14 | alert_message = "Kubernetes cluster memory room for limits is too low" 15 | recovery_message = "Kubernetes cluster memory limits has recovered" 16 | 17 | # monitor level vars 18 | enabled = var.memory_limits_low_enabled 19 | alerting_enabled = var.memory_limits_low_alerting_enabled 20 | critical_threshold = var.memory_limits_low_critical 21 | warning_threshold = var.memory_limits_low_warning 22 | priority = min(var.memory_limits_low_priority + var.priority_offset, 5) 23 | docs = var.memory_limits_low_docs 24 | note = var.memory_limits_low_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /memory-requests-low-perc-state-variables.tf: -------------------------------------------------------------------------------- 1 | variable "memory_requests_low_perc_state_enabled" { 2 | type = bool 3 | default = false 4 | description = "Memory state limits are only available when the state metrics api is deployed 
https://github.com/kubernetes/kube-state-metrics" 5 | } 6 | 7 | variable "memory_requests_low_perc_state_warning" { 8 | type = number 9 | default = 85 10 | } 11 | 12 | variable "memory_requests_low_perc_state_critical" { 13 | type = number 14 | default = 95 15 | } 16 | 17 | variable "memory_requests_low_perc_state_evaluation_period" { 18 | type = string 19 | default = "last_5m" 20 | } 21 | 22 | variable "memory_requests_low_perc_state_note" { 23 | type = string 24 | default = "" 25 | } 26 | 27 | variable "memory_requests_low_perc_state_docs" { 28 | type = string 29 | default = "If the node where a Pod is running has enough of a resource available, it's possible (and allowed) for a container to use more of a resource than its request for that resource specifies. However, a container is not allowed to use more than its resource limit. https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/" 30 | } 31 | 32 | variable "memory_requests_low_perc_state_filter_override" { 33 | type = string 34 | default = "" 35 | } 36 | 37 | variable "memory_requests_low_perc_state_alerting_enabled" { 38 | type = bool 39 | default = true 40 | } 41 | 42 | variable "memory_requests_low_perc_state_no_data_timeframe" { 43 | type = number 44 | default = null 45 | } 46 | 47 | variable "memory_requests_low_perc_state_notify_no_data" { 48 | type = bool 49 | default = false 50 | } 51 | 52 | variable "memory_requests_low_perc_state_ok_threshold" { 53 | type = number 54 | default = null 55 | } 56 | 57 | variable "memory_requests_low_perc_state_priority" { 58 | description = "Number from 1 (high) to 5 (low)." 59 | 60 | type = number 61 | default = 3 62 | } 63 | -------------------------------------------------------------------------------- /memory-requests-low-perc-state.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | memory_requests_low_perc_state_filter = coalesce( 3 | var.memory_requests_low_perc_state_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "memory_requests_low_perc_state" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Available Memory for Requests in percentage Low" 13 | query = "max(${var.memory_requests_low_perc_state_evaluation_period}):( max:kubernetes_state.container.memory_requested{${local.memory_requests_low_perc_state_filter}} / max:kubernetes_state.node.memory_allocatable{${local.memory_requests_low_perc_state_filter}} ) * 100 > ${var.memory_requests_low_perc_state_critical}" 14 | alert_message = "Kubernetes cluster memory room for Requests in percentage is too low" 15 | recovery_message = "Kubernetes cluster memory Requests in percentage has recovered" 16 | 17 | # monitor level vars 18 | enabled = var.memory_requests_low_perc_state_enabled 19 | alerting_enabled = var.memory_requests_low_perc_state_alerting_enabled 20 | critical_threshold = var.memory_requests_low_perc_state_critical 21 | warning_threshold = var.memory_requests_low_perc_state_warning 22 | priority = min(var.memory_requests_low_perc_state_priority + var.priority_offset, 5) 23 | docs = var.memory_requests_low_perc_state_docs 24 | note = var.memory_requests_low_perc_state_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | 
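The kube-state-metrics based monitors in this module (such as memory_limits_low_perc_state above and memory_requests_low_perc_state below) default to disabled because they require kube-state-metrics to be deployed in the cluster. A minimal, illustrative sketch of a caller that enables them is shown below; the module source address, cluster filter, and notification channel are placeholder values, and the full set of module-level inputs lives in variables.tf.

```hcl
# Hypothetical caller enabling the kube-state-metrics based memory monitors.
# The source address, filter value and channel are illustrative only.
module "k8s_monitors" {
  source = "kabisa/kubernetes/datadog" # or a local path to this module

  filter_str           = "kube_cluster_name:example-cluster"
  env                  = "prd"
  service              = "kubernetes"
  notification_channel = "@slack-platform-team"

  # Both default to false; only enable them when kube-state-metrics is running.
  memory_limits_low_perc_state_enabled   = true
  memory_requests_low_perc_state_enabled = true
}
```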
-------------------------------------------------------------------------------- /memory-requests-low-perc-variables.tf: -------------------------------------------------------------------------------- 1 | variable "memory_requests_low_perc_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "memory_requests_low_perc_warning" { 7 | type = number 8 | default = 85 9 | } 10 | 11 | variable "memory_requests_low_perc_critical" { 12 | type = number 13 | default = 95 14 | } 15 | 16 | variable "memory_requests_low_perc_evaluation_period" { 17 | type = string 18 | default = "last_5m" 19 | } 20 | 21 | variable "memory_requests_low_perc_note" { 22 | type = string 23 | default = "" 24 | } 25 | 26 | variable "memory_requests_low_perc_docs" { 27 | type = string 28 | default = "If the node where a Pod is running has enough of a resource available, it's possible (and allowed) for a container to use more of a resource than its request for that resource specifies. However, a container is not allowed to use more than its resource limit. https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/" 29 | } 30 | 31 | variable "memory_requests_low_perc_filter_override" { 32 | type = string 33 | default = "" 34 | } 35 | 36 | variable "memory_requests_low_perc_alerting_enabled" { 37 | type = bool 38 | default = true 39 | } 40 | 41 | variable "memory_requests_low_perc_no_data_timeframe" { 42 | type = number 43 | default = null 44 | } 45 | 46 | variable "memory_requests_low_perc_notify_no_data" { 47 | type = bool 48 | default = false 49 | } 50 | 51 | variable "memory_requests_low_perc_ok_threshold" { 52 | type = number 53 | default = null 54 | } 55 | 56 | variable "memory_requests_low_perc_priority" { 57 | description = "Number from 1 (high) to 5 (low)." 
58 | 59 | type = number 60 | default = 3 61 | } 62 | -------------------------------------------------------------------------------- /memory-requests-low-perc.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | memory_requests_low_perc_filter = coalesce( 3 | var.memory_requests_low_perc_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "memory_requests_low_perc" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Available Memory for Requests in percentage Low" 13 | query = "max(${var.memory_requests_low_perc_evaluation_period}):( max:kubernetes.memory.requests{${local.memory_requests_low_perc_filter}} / max:system.mem.total{${local.memory_requests_low_perc_filter}} ) * 100 > ${var.memory_requests_low_perc_critical}" 14 | alert_message = "Kubernetes cluster memory room for Requests in percentage is too low" 15 | recovery_message = "Kubernetes cluster memory Requests in percentage has recovered" 16 | 17 | # monitor level vars 18 | enabled = var.memory_requests_low_perc_enabled 19 | alerting_enabled = var.memory_requests_low_perc_alerting_enabled 20 | critical_threshold = var.memory_requests_low_perc_critical 21 | warning_threshold = var.memory_requests_low_perc_warning 22 | priority = min(var.memory_requests_low_perc_priority + var.priority_offset, 5) 23 | docs = var.memory_requests_low_perc_docs 24 | note = var.memory_requests_low_perc_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /memory-requests-low-variables.tf: -------------------------------------------------------------------------------- 1 | variable "memory_requests_low_enabled" { 2 | type = bool 3 | default = false 4 | description = "This monitor is based on absolute values and thus less useful. Prefer setting memory_requests_low_perc_enabled to true." 5 | } 6 | 7 | variable "memory_requests_low_warning" { 8 | type = number 9 | default = 4000000000 # bytes, roughly 4 GB 10 | } 11 | 12 | variable "memory_requests_low_critical" { 13 | type = number 14 | default = 3000000000 # bytes, roughly 3 GB 15 | } 16 | 17 | variable "memory_requests_low_evaluation_period" { 18 | type = string 19 | default = "last_5m" 20 | } 21 | 22 | variable "memory_requests_low_note" { 23 | type = string 24 | default = "" 25 | } 26 | 27 | variable "memory_requests_low_docs" { 28 | type = string 29 | default = "If the node where a Pod is running has enough of a resource available, it's possible (and allowed) for a container to use more of a resource than its request for that resource specifies. However, a container is not allowed to use more than its resource limit.
https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/" 30 | } 31 | 32 | variable "memory_requests_low_filter_override" { 33 | type = string 34 | default = "" 35 | } 36 | 37 | variable "memory_requests_low_alerting_enabled" { 38 | type = bool 39 | default = true 40 | } 41 | 42 | variable "memory_requests_low_no_data_timeframe" { 43 | type = number 44 | default = null 45 | } 46 | 47 | variable "memory_requests_low_notify_no_data" { 48 | type = bool 49 | default = false 50 | } 51 | 52 | variable "memory_requests_low_ok_threshold" { 53 | type = number 54 | default = null 55 | } 56 | 57 | variable "memory_requests_low_priority" { 58 | description = "Number from 1 (high) to 5 (low)." 59 | 60 | type = number 61 | default = 3 62 | } 63 | -------------------------------------------------------------------------------- /memory-requests-low.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | memory_requests_low_filter = coalesce( 3 | var.memory_requests_low_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "memory_requests_low" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Available Memory for Requests Low" 13 | query = "avg(${var.memory_requests_low_evaluation_period}):max:system.mem.total{${local.memory_requests_low_filter}} by {host,kube_cluster_name} - max:kubernetes.memory.requests{${local.memory_requests_low_filter}} by {host,kube_cluster_name} < ${var.memory_requests_low_critical}" 14 | alert_message = "Total memory available for requests on {{ host }} is low ({{value}})" 15 | recovery_message = "Total memory available for requests on {{ host }} has recovered ({{value}})" 16 | 17 | # monitor level vars 18 | enabled = var.memory_requests_low_enabled 19 | alerting_enabled = var.memory_requests_low_alerting_enabled 20 | critical_threshold = var.memory_requests_low_critical 21 | warning_threshold = var.memory_requests_low_warning 22 | priority = min(var.memory_requests_low_priority + var.priority_offset, 5) 23 | docs = var.memory_requests_low_docs 24 | note = var.memory_requests_low_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /module_description.md: -------------------------------------------------------------------------------- 1 | This module mainly checks the Kubernetes resource level and cluster health. 2 | System level monitoring can best be implemented with the [system module](https://github.com/kabisa/terraform-datadog-system). 3 | Docker/Container level monitoring can best be implemented with the [docker module](https://github.com/kabisa/terraform-datadog-docker-container).
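Every monitor in this module follows the same pattern: an `<monitor>_enabled` flag, `<monitor>_warning`/`<monitor>_critical` thresholds, a `<monitor>_filter_override` that falls back to the shared `filter_str` via `coalesce()`, and a priority that is capped at 5 via `min(<monitor>_priority + priority_offset, 5)`. As an illustrative sketch of overriding a single monitor from a caller (variable names come from the monitor files in this repository; the values are made up):

```hcl
# Illustrative caller: scope one monitor more narrowly and raise its priority,
# while shifting the priority of every other monitor with priority_offset.
module "k8s_monitors" {
  source = "../terraform-datadog-kubernetes" # path is illustrative

  filter_str           = "kube_cluster_name:example-cluster"
  notification_channel = "@pagerduty-platform"

  # Only the pods_failed monitor uses this narrower filter; all others keep filter_str.
  pods_failed_filter_override = "kube_cluster_name:example-cluster,kube_namespace:payments"
  pods_failed_priority        = 1

  # Added to each monitor's own priority and capped at 5 (lowest urgency).
  priority_offset = 1
}
```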
4 | 5 | # Recent changes: 6 | 7 | - switch from kubernetes_state to kubernetes_state_core as a default https://docs.datadoghq.com/integrations/kubernetes_state_core/?tab=helm 8 | - upgrade provider to ~> 3.12 9 | -------------------------------------------------------------------------------- /network-unavailable-variables.tf: -------------------------------------------------------------------------------- 1 | variable "network_unavailable_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "network_unavailable_critical" { 7 | type = number 8 | default = 0 9 | description = "alert is raised when (desired - running) > network_unavailable_critical" 10 | } 11 | 12 | variable "network_unavailable_evaluation_period" { 13 | type = string 14 | default = "last_5m" 15 | } 16 | 17 | variable "network_unavailable_note" { 18 | type = string 19 | default = "" 20 | } 21 | 22 | variable "network_unavailable_docs" { 23 | type = string 24 | default = "All your nodes need network connections, and this status indicates that there’s something wrong with a node’s network connection. Either it wasn’t set up properly (due to route exhaustion or a misconfiguration), or there’s a physical problem with the network connection to your hardware." 25 | } 26 | 27 | variable "network_unavailable_filter_override" { 28 | type = string 29 | default = "" 30 | } 31 | 32 | variable "network_unavailable_alerting_enabled" { 33 | type = bool 34 | default = true 35 | } 36 | 37 | variable "network_unavailable_no_data_timeframe" { 38 | type = number 39 | default = null 40 | } 41 | 42 | variable "network_unavailable_notify_no_data" { 43 | type = bool 44 | default = false 45 | } 46 | 47 | variable "network_unavailable_ok_threshold" { 48 | type = number 49 | default = null 50 | } 51 | 52 | variable "network_unavailable_priority" { 53 | description = "Number from 1 (high) to 5 (low)." 54 | 55 | type = number 56 | default = 3 57 | } 58 | -------------------------------------------------------------------------------- /network-unavailable.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | network_unavailable_filter = coalesce( 3 | var.network_unavailable_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "network_unavailable" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Nodes with Network Unavailable" 13 | query = "avg(${var.network_unavailable_evaluation_period}):max:kubernetes_state.node.by_condition{${local.network_unavailable_filter} AND condition:networkunavailable AND (status:true OR status:unknown)} by {kube_cluster_name,host} > ${var.network_unavailable_critical}" 14 | alert_message = "Kubernetes cluster node {{node}} has no network. 
Meaning it is not accessible" 15 | recovery_message = "Kubernetes cluster node {{node}} has come back on the network" 16 | 17 | # monitor level vars 18 | enabled = var.state_metrics_monitoring && var.network_unavailable_enabled 19 | alerting_enabled = var.network_unavailable_alerting_enabled 20 | critical_threshold = var.network_unavailable_critical 21 | # no warning threshold for this monitor 22 | priority = min(var.network_unavailable_priority + var.priority_offset, 5) 23 | docs = var.network_unavailable_docs 24 | note = var.network_unavailable_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /node-diskpressure-variables.tf: -------------------------------------------------------------------------------- 1 | variable "node_diskpressure_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "node_diskpressure_critical" { 7 | type = number 8 | default = 0 9 | description = "alert is raised when (desired - running) > node_diskpressure_critical" 10 | } 11 | 12 | variable "node_diskpressure_evaluation_period" { 13 | type = string 14 | default = "last_5m" 15 | } 16 | 17 | variable "node_diskpressure_note" { 18 | type = string 19 | default = "" 20 | } 21 | 22 | variable "node_diskpressure_docs" { 23 | type = string 24 | default = "Disk pressure is a condition indicating that a node is using too much disk space or is using disk space too fast, according to the thresholds you have set in your Kubernetes configuration. This is important to monitor because it might mean that you need to add more disk space, if your application legitimately needs more space. Or it might mean that an application is misbehaving and filling up the disk prematurely in an unanticipated manner. Either way, it’s a condition which needs your attention." 25 | } 26 | 27 | variable "node_diskpressure_filter_override" { 28 | type = string 29 | default = "" 30 | } 31 | 32 | variable "node_diskpressure_alerting_enabled" { 33 | type = bool 34 | default = true 35 | } 36 | 37 | variable "node_diskpressure_no_data_timeframe" { 38 | type = number 39 | default = null 40 | } 41 | 42 | variable "node_diskpressure_notify_no_data" { 43 | type = bool 44 | default = false 45 | } 46 | 47 | variable "node_diskpressure_ok_threshold" { 48 | type = number 49 | default = null 50 | } 51 | 52 | variable "node_diskpressure_priority" { 53 | description = "Number from 1 (high) to 5 (low)." 
54 | 55 | type = number 56 | default = 3 57 | } 58 | -------------------------------------------------------------------------------- /node-diskpressure.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | node_diskpressure_filter = coalesce( 3 | var.node_diskpressure_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "node_diskpressure" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Nodes with Diskpressure" 13 | query = "avg(${var.node_diskpressure_evaluation_period}):max:kubernetes_state.node.by_condition{${local.node_diskpressure_filter} AND condition:diskpressure AND (status:true OR status:unknown)} by {kube_cluster_name,host} > ${var.node_diskpressure_critical}" 14 | alert_message = "Kubernetes cluster node {{node}} has diskpressure. Meaning it is low on disk space (Logging, emptydir volumes, caching, etc)" 15 | recovery_message = "Kubernetes cluster node {{node}} no longer has problems with DiskPressure." 16 | 17 | # monitor level vars 18 | enabled = var.state_metrics_monitoring && var.node_diskpressure_enabled 19 | alerting_enabled = var.node_diskpressure_alerting_enabled 20 | critical_threshold = var.node_diskpressure_critical 21 | # no warning threshold for this monitor 22 | priority = min(var.node_diskpressure_priority + var.priority_offset, 5) 23 | docs = var.node_diskpressure_docs 24 | note = var.node_diskpressure_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /node-memory-used-percent-variables.tf: -------------------------------------------------------------------------------- 1 | variable "node_memory_used_percent_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "node_memory_used_percent_warning" { 7 | type = number 8 | default = 80 9 | # 80 % 10 | } 11 | 12 | variable "node_memory_used_percent_critical" { 13 | type = number 14 | default = 90 15 | # 90 % 16 | } 17 | 18 | variable "node_memory_used_percent_evaluation_period" { 19 | type = string 20 | default = "last_5m" 21 | } 22 | 23 | variable "node_memory_used_percent_note" { 24 | type = string 25 | default = "" 26 | } 27 | 28 | variable "node_memory_used_percent_docs" { 29 | type = string 30 | default = "" 31 | } 32 | 33 | variable "node_memory_used_percent_filter_override" { 34 | type = string 35 | default = "" 36 | } 37 | 38 | variable "node_memory_used_percent_alerting_enabled" { 39 | type = bool 40 | default = true 41 | } 42 | 43 | variable "node_memory_used_percent_no_data_timeframe" { 44 | type = number 45 | default = null 46 | } 47 | 48 | variable "node_memory_used_percent_notify_no_data" { 49 | type = bool 50 | default = false 51 | } 52 | 53 | variable "node_memory_used_percent_ok_threshold" { 54 | type = number 55 | default = null 56 | } 57 | 58 | variable "node_memory_used_percent_priority" { 59 | description = "Number from 1 (high) to 5 (low)." 
60 | 61 | type = number 62 | default = 2 63 | } 64 | -------------------------------------------------------------------------------- /node-memory-used-percent.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | node_memory_used_percent_filter = coalesce( 3 | var.node_memory_used_percent_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "node_memory_used_percent" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Memory Used Percent" 13 | query = "avg(${var.node_memory_used_percent_evaluation_period}):( 100 * max:kubernetes.memory.usage{${local.node_memory_used_percent_filter}} by {host,kube_cluster_name} ) / max:system.mem.total{${local.node_memory_used_percent_filter}} by {host,kube_cluster_name} > ${var.node_memory_used_percent_critical}" 14 | alert_message = "Available memory on ${var.service} Node {{host.name}} has dropped below {{threshold}} and has {{value}}% available" 15 | recovery_message = "Available memory on ${var.service} Node {{host.name}} has recovered {{value}}%" 16 | 17 | # monitor level vars 18 | enabled = var.node_memory_used_percent_enabled 19 | alerting_enabled = var.node_memory_used_percent_alerting_enabled 20 | critical_threshold = var.node_memory_used_percent_critical 21 | warning_threshold = var.node_memory_used_percent_warning 22 | priority = min(var.node_memory_used_percent_priority + var.priority_offset, 5) 23 | docs = var.node_memory_used_percent_docs 24 | note = var.node_memory_used_percent_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /node-memorypressure-variables.tf: -------------------------------------------------------------------------------- 1 | variable "node_memorypressure_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "node_memorypressure_critical" { 7 | type = number 8 | default = 0 9 | description = "alert is raised when (desired - running) > node_memorypressure_critical" 10 | } 11 | 12 | variable "node_memorypressure_evaluation_period" { 13 | type = string 14 | default = "last_5m" 15 | } 16 | 17 | variable "node_memorypressure_note" { 18 | type = string 19 | default = "" 20 | } 21 | 22 | variable "node_memorypressure_docs" { 23 | type = string 24 | default = "Memory pressure is a resourcing condition indicating that your node is running out of memory. Similar to CPU resourcing, you don’t want to run out of memory. You especially need to watch for this condition because it could mean there’s a memory leak in one of your applications." 25 | } 26 | 27 | variable "node_memorypressure_filter_override" { 28 | type = string 29 | default = "" 30 | } 31 | 32 | variable "node_memorypressure_alerting_enabled" { 33 | type = bool 34 | default = true 35 | } 36 | 37 | variable "node_memorypressure_no_data_timeframe" { 38 | type = number 39 | default = null 40 | } 41 | 42 | variable "node_memorypressure_notify_no_data" { 43 | type = bool 44 | default = false 45 | } 46 | 47 | variable "node_memorypressure_ok_threshold" { 48 | type = number 49 | default = null 50 | } 51 | 52 | variable "node_memorypressure_priority" { 53 | description = "Number from 1 (high) to 5 (low)." 
54 | 55 | type = number 56 | default = 3 57 | } 58 | -------------------------------------------------------------------------------- /node-memorypressure.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | node_memorypressure_filter = coalesce( 3 | var.node_memorypressure_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "node_memorypressure" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Nodes with Memorypressure" 13 | query = "avg(${var.node_memorypressure_evaluation_period}):max:kubernetes_state.node.by_condition{${local.node_memorypressure_filter} AND condition:memorypressure AND (status:true OR status:unknown)} by {kube_cluster_name,host} > ${var.node_memorypressure_critical}" 14 | alert_message = "Kubernetes cluster node {{node}} has memorypressure. Meaning it is low on memory" 15 | recovery_message = "Kubernetes cluster node {{node}} no longer has Memory Pressure." 16 | 17 | # monitor level vars 18 | enabled = var.state_metrics_monitoring && var.node_memorypressure_enabled 19 | alerting_enabled = var.node_memorypressure_alerting_enabled 20 | critical_threshold = var.node_memorypressure_critical 21 | # no warning threshold for this monitor 22 | priority = min(var.node_memorypressure_priority + var.priority_offset, 5) 23 | docs = var.node_memorypressure_docs 24 | note = var.node_memorypressure_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /node-ready-variables.tf: -------------------------------------------------------------------------------- 1 | variable "node_ready_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "node_ready_critical" { 7 | type = number 8 | default = 1 9 | } 10 | 11 | variable "node_ready_evaluation_period" { 12 | type = string 13 | default = "last_5m" 14 | } 15 | 16 | variable "node_ready_note" { 17 | type = string 18 | default = "" 19 | } 20 | 21 | variable "node_ready_docs" { 22 | type = string 23 | default = "Checks to see if the node is in ready status or not" 24 | } 25 | 26 | variable "node_ready_filter_override" { 27 | type = string 28 | default = "" 29 | } 30 | 31 | variable "node_ready_alerting_enabled" { 32 | type = bool 33 | default = true 34 | } 35 | 36 | variable "node_ready_no_data_timeframe" { 37 | type = number 38 | default = null 39 | } 40 | 41 | variable "node_ready_notify_no_data" { 42 | type = bool 43 | default = false 44 | } 45 | 46 | variable "node_ready_ok_threshold" { 47 | type = number 48 | default = null 49 | } 50 | 51 | variable "node_ready_priority" { 52 | description = "Number from 1 (high) to 5 (low)." 
53 | 54 | type = number 55 | default = 2 56 | } 57 | -------------------------------------------------------------------------------- /node-ready.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | node_ready_filter = coalesce( 3 | var.node_ready_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "node_ready" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Node Not Ready" 13 | query = "avg(${var.node_ready_evaluation_period}):count_nonzero(sum:kubernetes_state.node.by_condition{${local.node_ready_filter} AND (NOT condition:ready) AND (status:true OR status:unknown)} by {kube_cluster_name,host}) > ${var.node_ready_critical}" 14 | alert_message = "Kubernetes cluster node {{host}} is not ready." 15 | recovery_message = "Kubernetes cluster node {{host}} is ready again." 16 | 17 | # monitor level vars 18 | enabled = var.state_metrics_monitoring && var.node_ready_enabled 19 | alerting_enabled = var.node_ready_alerting_enabled 20 | critical_threshold = var.node_ready_critical 21 | # no warning threshold for this monitor 22 | priority = min(var.node_ready_priority + var.priority_offset, 5) 23 | docs = var.node_ready_docs 24 | note = var.node_ready_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /node-status-variables.tf: -------------------------------------------------------------------------------- 1 | variable "node_status_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "node_status_evaluation_period" { 7 | type = string 8 | default = "last_5m" 9 | } 10 | 11 | variable "node_status_note" { 12 | type = string 13 | default = "" 14 | } 15 | 16 | variable "node_status_docs" { 17 | type = string 18 | default = "This cluster state metric provides a high-level overview of a node’s health and whether the scheduler can place pods on that node. It runs checks on the following node conditions\nhttps://kubernetes.io/docs/concepts/architecture/nodes/#condition" 19 | } 20 | 21 | variable "node_status_filter_override" { 22 | type = string 23 | default = "" 24 | } 25 | 26 | variable "node_status_alerting_enabled" { 27 | type = bool 28 | default = true 29 | } 30 | 31 | variable "node_status_no_data_timeframe" { 32 | type = number 33 | default = null 34 | } 35 | 36 | variable "node_status_notify_no_data" { 37 | type = bool 38 | default = false 39 | } 40 | 41 | variable "node_status_ok_threshold" { 42 | type = number 43 | default = null 44 | } 45 | 46 | variable "node_status_priority" { 47 | description = "Number from 1 (high) to 5 (low)." 
48 | 49 | type = number 50 | default = 2 51 | } 52 | -------------------------------------------------------------------------------- /node-status.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | node_status_filter = coalesce( 3 | var.node_status_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "node_status" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Node Status not OK" 13 | query = "avg(${var.node_status_evaluation_period}):avg:kubernetes_state.node.status{${local.node_status_filter}} by {kube_cluster_name,node} < 1" 14 | alert_message = "Kubernetes Node Status for Node {{node}} is not ok" 15 | recovery_message = "Kubernetes Node Status for Node {{node}} has recovered" 16 | require_full_window = false 17 | 18 | # monitor level vars 19 | enabled = var.state_metrics_monitoring && var.node_status_enabled 20 | alerting_enabled = var.node_status_alerting_enabled 21 | critical_threshold = 1 22 | # No warning possible for status that is either 0 or 1 23 | priority = min(var.node_status_priority + var.priority_offset, 5) 24 | docs = var.node_status_docs 25 | note = var.node_status_note 26 | 27 | # module level vars 28 | env = var.env 29 | service = var.service 30 | service_display_name = var.service_display_name 31 | notification_channel = var.notification_channel 32 | additional_tags = var.additional_tags 33 | locked = var.locked 34 | name_prefix = var.name_prefix 35 | name_suffix = var.name_suffix 36 | } 37 | -------------------------------------------------------------------------------- /persistent-volumes-variables.tf: -------------------------------------------------------------------------------- 1 | variable "persistent_volumes_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "persistent_volumes_warning" { 7 | type = number 8 | default = 0 9 | } 10 | 11 | variable "persistent_volumes_critical" { 12 | type = number 13 | default = 1 14 | } 15 | 16 | variable "persistent_volumes_evaluation_period" { 17 | type = string 18 | default = "last_5m" 19 | } 20 | 21 | variable "persistent_volumes_note" { 22 | type = string 23 | default = "" 24 | } 25 | 26 | variable "persistent_volumes_docs" { 27 | type = string 28 | default = "" 29 | } 30 | 31 | variable "persistent_volumes_filter_override" { 32 | type = string 33 | default = "" 34 | } 35 | 36 | variable "persistent_volumes_alerting_enabled" { 37 | type = bool 38 | default = true 39 | } 40 | 41 | variable "persistent_volumes_no_data_timeframe" { 42 | type = number 43 | default = null 44 | } 45 | 46 | variable "persistent_volumes_notify_no_data" { 47 | type = bool 48 | default = false 49 | } 50 | 51 | variable "persistent_volumes_ok_threshold" { 52 | type = number 53 | default = null 54 | } 55 | 56 | variable "persistent_volumes_priority" { 57 | description = "Number from 1 (high) to 5 (low)." 
58 | 59 | type = number 60 | default = 3 61 | } 62 | -------------------------------------------------------------------------------- /persistent-volumes.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | persistent_volumes_filter = coalesce( 3 | var.persistent_volumes_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "persistent_volumes_low" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Failed Persistent Volume Claims" 13 | query = "avg(${var.persistent_volumes_evaluation_period}):max:kubernetes_state.persistentvolume.by_phase{${local.persistent_volumes_filter} AND phase:failed} > ${var.persistent_volumes_critical}" 14 | alert_message = "There are failed Physical Volume Claims, storage has problems" 15 | recovery_message = "There are no failed Physical Volume Claims" 16 | 17 | # monitor level vars 18 | enabled = var.persistent_volumes_enabled 19 | alerting_enabled = var.persistent_volumes_alerting_enabled 20 | critical_threshold = var.persistent_volumes_critical 21 | warning_threshold = var.persistent_volumes_warning 22 | priority = min(var.persistent_volumes_priority + var.priority_offset, 5) 23 | docs = var.persistent_volumes_docs 24 | note = var.persistent_volumes_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /pid-pressure-variables.tf: -------------------------------------------------------------------------------- 1 | variable "pid_pressure_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "pid_pressure_critical" { 7 | type = number 8 | default = 0 9 | description = "alert is raised when (desired - running) > pid_pressure_critical" 10 | } 11 | 12 | variable "pid_pressure_evaluation_period" { 13 | type = string 14 | default = "last_5m" 15 | } 16 | 17 | variable "pid_pressure_note" { 18 | type = string 19 | default = "" 20 | } 21 | 22 | variable "pid_pressure_docs" { 23 | type = string 24 | default = "PID pressure is a rare condition where a pod or container spawns too many processes and starves the node of available process IDs. Each node has a limited number of process IDs to distribute amongst running processes; and if it runs out of IDs, no other processes can be started. Kubernetes lets you set PID thresholds for pods to limit their ability to perform runaway process-spawning, and a PID pressure condition means that one or more pods are using up their allocated PIDs and need to be examined." 25 | } 26 | 27 | variable "pid_pressure_filter_override" { 28 | type = string 29 | default = "" 30 | } 31 | 32 | variable "pid_pressure_alerting_enabled" { 33 | type = bool 34 | default = true 35 | } 36 | 37 | variable "pid_pressure_no_data_timeframe" { 38 | type = number 39 | default = null 40 | } 41 | 42 | variable "pid_pressure_notify_no_data" { 43 | type = bool 44 | default = false 45 | } 46 | 47 | variable "pid_pressure_ok_threshold" { 48 | type = number 49 | default = null 50 | } 51 | 52 | variable "pid_pressure_priority" { 53 | description = "Number from 1 (high) to 5 (low)." 
54 | 55 | type = number 56 | default = 3 57 | } 58 | -------------------------------------------------------------------------------- /pid-pressure.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | pid_pressure_filter = coalesce( 3 | var.pid_pressure_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "pid_pressure" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Nodes with PID Pressure" 13 | query = "avg(${var.pid_pressure_evaluation_period}):max:kubernetes_state.node.by_condition{${local.pid_pressure_filter} AND condition:pidpressure AND (status:true OR status:unknown)} by {kube_cluster_name,host} > ${var.pid_pressure_critical}" 14 | alert_message = "Kubernetes cluster node {{node}} has PID Pressure, meaning it may not be able to start more containers" 15 | recovery_message = "Kubernetes cluster node {{node}} no longer has PID pressure." 16 | 17 | # monitor level vars 18 | enabled = var.state_metrics_monitoring && var.pid_pressure_enabled 19 | alerting_enabled = var.pid_pressure_alerting_enabled 20 | critical_threshold = var.pid_pressure_critical 21 | # no warning threshold for this monitor 22 | priority = min(var.pid_pressure_priority + var.priority_offset, 5) 23 | docs = var.pid_pressure_docs 24 | note = var.pid_pressure_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /pod-count-per-node-high-variables.tf: -------------------------------------------------------------------------------- 1 | variable "pod_count_per_node_high_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "pod_count_per_node_high_warning" { 7 | type = number 8 | default = 90.0 9 | } 10 | 11 | variable "pod_count_per_node_high_critical" { 12 | type = number 13 | default = 100.0 14 | } 15 | 16 | variable "pod_count_per_node_high_warning_recovery" { 17 | type = number 18 | default = null 19 | } 20 | 21 | variable "pod_count_per_node_high_critical_recovery" { 22 | type = number 23 | default = null 24 | } 25 | 26 | variable "pod_count_per_node_high_evaluation_period" { 27 | type = string 28 | default = "last_10m" 29 | } 30 | 31 | variable "pod_count_per_node_high_note" { 32 | type = string 33 | default = "" 34 | } 35 | 36 | variable "pod_count_per_node_high_docs" { 37 | type = string 38 | default = "" 39 | } 40 | 41 | variable "pod_count_per_node_high_filter_override" { 42 | type = string 43 | default = "" 44 | } 45 | 46 | variable "pod_count_per_node_high_alerting_enabled" { 47 | type = bool 48 | default = true 49 | } 50 | 51 | variable "pod_count_per_node_high_no_data_timeframe" { 52 | type = number 53 | default = null 54 | } 55 | 56 | variable "pod_count_per_node_high_notify_no_data" { 57 | type = bool 58 | default = false 59 | } 60 | 61 | variable "pod_count_per_node_high_ok_threshold" { 62 | type = number 63 | default = null 64 | } 65 | 66 | variable "pod_count_per_node_high_name_prefix" { 67 | type = string 68 | default = "" 69 | } 70 | 71 | variable "pod_count_per_node_high_name_suffix" { 72 | type = string 73 | default = "" 74 | } 75 | 76 | variable "pod_count_per_node_high_priority" { 77 | description = "Number from 1 (high) to 5
(low)." 78 | 79 | type = number 80 | default = 2 81 | } 82 | -------------------------------------------------------------------------------- /pod-count-per-node-high.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | pod_count_per_node_high_filter = coalesce( 3 | var.pod_count_per_node_high_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "pod_count_per_node_high" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Pod count per node high" 13 | query = "min(${var.pod_count_per_node_high_evaluation_period}):sum:kubernetes.pods.running{${local.pod_count_per_node_high_filter}} by {host} > ${var.pod_count_per_node_high_critical}" 14 | 15 | # alert specific configuration 16 | require_full_window = false 17 | alert_message = "Pod count per node high ({{ value }}) in {{ service }} exceeds {{ threshold }}" 18 | recovery_message = "Pod count per node high ({{ value }}) in {{ service }} has recovered" 19 | 20 | # monitor level vars 21 | enabled = var.pod_count_per_node_high_enabled 22 | alerting_enabled = var.pod_count_per_node_high_alerting_enabled 23 | critical_threshold = var.pod_count_per_node_high_critical 24 | critical_recovery = var.pod_count_per_node_high_critical_recovery 25 | warning_threshold = var.pod_count_per_node_high_warning 26 | warning_recovery = var.pod_count_per_node_high_warning_recovery 27 | priority = min(var.pod_count_per_node_high_priority + var.priority_offset, 5) 28 | docs = var.pod_count_per_node_high_docs 29 | note = var.pod_count_per_node_high_note 30 | 31 | # module level vars 32 | env = var.env 33 | service = var.service 34 | service_display_name = var.service_display_name 35 | notification_channel = var.notification_channel 36 | additional_tags = var.additional_tags 37 | locked = var.locked 38 | name_prefix = var.name_prefix 39 | name_suffix = var.name_suffix 40 | } 41 | -------------------------------------------------------------------------------- /pod-ready-variables.tf: -------------------------------------------------------------------------------- 1 | variable "pod_ready_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "pod_ready_evaluation_period" { 7 | type = string 8 | default = "last_30m" 9 | } 10 | 11 | variable "pod_ready_note" { 12 | type = string 13 | default = "" 14 | } 15 | 16 | variable "pod_ready_docs" { 17 | type = string 18 | default = "A pod may be running but not available, meaning it is not ready and able to accept traffic. This is normal during certain circumstances, such as when a pod is newly launched or when a change is made and deployed to the specification of that pod. But if you see spikes in the number of unavailable pods, or pods that are consistently unavailable, it might indicate a problem with their configuration.\nhttps://www.datadoghq.com/blog/monitoring-kubernetes-performance-metrics/" 19 | } 20 | 21 | variable "pod_ready_filter_override" { 22 | type = string 23 | default = "" 24 | } 25 | 26 | variable "pod_ready_alerting_enabled" { 27 | type = bool 28 | default = true 29 | } 30 | 31 | variable "pod_ready_no_data_timeframe" { 32 | type = number 33 | default = null 34 | } 35 | 36 | variable "pod_ready_notify_no_data" { 37 | type = bool 38 | default = false 39 | } 40 | 41 | variable "pod_ready_ok_threshold" { 42 | type = number 43 | default = null 44 | } 45 | 46 | variable "pod_ready_priority" { 47 | description = "Number from 1 (high) to 5 (low)." 
48 | 49 | type = number 50 | default = 3 51 | } 52 | -------------------------------------------------------------------------------- /pod-ready.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | pod_ready_filter = coalesce( 3 | var.pod_ready_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "pod_ready" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Pod status not ready" 13 | query = "min(${var.pod_ready_evaluation_period}):sum:kubernetes_state.pod.count{${local.pod_ready_filter}} by {kube_cluster_name,kube_namespace} - sum:kubernetes_state.pod.ready{${local.pod_ready_filter}} by {kube_cluster_name,kube_namespace} > 0" 14 | alert_message = "Kubernetes Pod {{value}} status not ready in namespace {{kube_namespace}} " 15 | recovery_message = "Kubernetes Pod status recovered in namespace {{kube_namespace}}" 16 | 17 | # monitor level vars 18 | enabled = var.state_metrics_monitoring && var.pod_ready_enabled 19 | alerting_enabled = var.pod_ready_alerting_enabled 20 | critical_threshold = 0 21 | # No warning possible for status that is either 0 or 1 22 | priority = min(var.pod_ready_priority + var.priority_offset, 5) 23 | docs = var.pod_ready_docs 24 | note = var.pod_ready_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /pod-restarts-variables.tf: -------------------------------------------------------------------------------- 1 | variable "pod_restarts_enabled" { 2 | type = bool 3 | description = "Deprecated in favour of multiple restarts monitoring for Daemonset and Deployment" 4 | default = false 5 | } 6 | 7 | variable "pod_restarts_warning" { 8 | type = number 9 | default = 3 10 | } 11 | 12 | variable "pod_restarts_critical" { 13 | type = number 14 | default = 5 15 | } 16 | 17 | variable "pod_restarts_evaluation_period" { 18 | type = string 19 | default = "last_10m" 20 | } 21 | 22 | variable "pod_restarts_note" { 23 | type = string 24 | default = "" 25 | } 26 | 27 | variable "pod_restarts_docs" { 28 | type = string 29 | default = "" 30 | } 31 | 32 | variable "pod_restarts_filter_override" { 33 | type = string 34 | default = "" 35 | } 36 | 37 | variable "pod_restarts_alerting_enabled" { 38 | type = bool 39 | default = true 40 | } 41 | 42 | variable "pod_restarts_no_data_timeframe" { 43 | type = number 44 | default = null 45 | } 46 | 47 | variable "pod_restarts_notify_no_data" { 48 | type = bool 49 | default = false 50 | } 51 | 52 | variable "pod_restarts_ok_threshold" { 53 | type = number 54 | default = null 55 | } 56 | 57 | variable "pod_restarts_priority" { 58 | description = "Number from 1 (high) to 5 (low)." 
59 | 60 | type = number 61 | default = 2 62 | } 63 | -------------------------------------------------------------------------------- /pod-restarts.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | pod_restarts_filter = coalesce( 3 | var.pod_restarts_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "pod_restarts" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Restarting Pods" 13 | query = "change(avg(${var.pod_restarts_evaluation_period}),${var.pod_restarts_evaluation_period}):exclude_null(avg:kubernetes.containers.restarts{${local.pod_restarts_filter}} by {pod_name}) > ${var.pod_restarts_critical}" 14 | alert_message = "Pods are restarting multiple times in the last ${var.pod_restarts_evaluation_period}" 15 | recovery_message = "Pods restarting recovered" 16 | 17 | # monitor level vars 18 | enabled = var.pod_restarts_enabled 19 | alerting_enabled = var.pod_restarts_alerting_enabled 20 | critical_threshold = var.pod_restarts_critical 21 | warning_threshold = var.pod_restarts_warning 22 | priority = min(var.pod_restarts_priority + var.priority_offset, 5) 23 | docs = var.pod_restarts_docs 24 | note = var.pod_restarts_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /pods-failed-variables.tf: -------------------------------------------------------------------------------- 1 | variable "pods_failed_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "pods_failed_warning" { 7 | type = number 8 | default = null 9 | } 10 | 11 | variable "pods_failed_critical" { 12 | type = number 13 | default = 0.0 14 | } 15 | 16 | variable "pods_failed_evaluation_period" { 17 | type = string 18 | default = "last_10m" 19 | } 20 | 21 | variable "pods_failed_note" { 22 | type = string 23 | default = "" 24 | } 25 | 26 | variable "pods_failed_docs" { 27 | type = string 28 | default = "https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/" 29 | } 30 | 31 | variable "pods_failed_filter_override" { 32 | type = string 33 | default = "" 34 | } 35 | 36 | variable "pods_failed_alerting_enabled" { 37 | type = bool 38 | default = true 39 | } 40 | 41 | variable "pods_failed_no_data_timeframe" { 42 | type = number 43 | default = null 44 | } 45 | 46 | variable "pods_failed_notify_no_data" { 47 | type = bool 48 | default = false 49 | } 50 | 51 | variable "pods_failed_ok_threshold" { 52 | type = number 53 | default = null 54 | } 55 | 56 | variable "pods_failed_name_prefix" { 57 | type = string 58 | default = "" 59 | } 60 | 61 | variable "pods_failed_name_suffix" { 62 | type = string 63 | default = "" 64 | } 65 | 66 | variable "pods_failed_priority" { 67 | description = "Number from 1 (high) to 5 (low)." 
68 | 69 | type = number 70 | default = 3 71 | } 72 | -------------------------------------------------------------------------------- /pods-failed.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | pods_failed_filter = coalesce( 3 | var.pods_failed_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "pods_failed" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Pods Failed" 13 | query = "min(${var.pods_failed_evaluation_period}):default_zero(max:kubernetes_state.pod.status_phase{phase:failed${var.filter_str_concatenation}${local.pods_failed_filter}} by {kube_namespace}) > ${var.pods_failed_critical}" 14 | 15 | # alert specific configuration 16 | require_full_window = true 17 | alert_message = "Kubernetes pods failed ({{ value }}) in {{ service }} exceeds {{ threshold }}" 18 | recovery_message = "Kubernetes pods failed ({{ value }}) in {{ service }} has recovered" 19 | 20 | # monitor level vars 21 | enabled = var.pods_failed_enabled 22 | alerting_enabled = var.pods_failed_alerting_enabled 23 | warning_threshold = var.pods_failed_warning 24 | critical_threshold = var.pods_failed_critical 25 | priority = min(var.pods_failed_priority + var.priority_offset, 5) 26 | docs = var.pods_failed_docs 27 | note = var.pods_failed_note 28 | 29 | # module level vars 30 | env = var.env 31 | service = var.service 32 | service_display_name = var.service_display_name 33 | notification_channel = var.notification_channel 34 | additional_tags = var.additional_tags 35 | locked = var.locked 36 | name_prefix = var.name_prefix 37 | name_suffix = var.name_suffix 38 | } 39 | -------------------------------------------------------------------------------- /pods-pending-variables.tf: -------------------------------------------------------------------------------- 1 | variable "pods_pending_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "pods_pending_warning" { 7 | type = number 8 | default = null 9 | } 10 | 11 | variable "pods_pending_critical" { 12 | type = number 13 | default = 0.0 14 | } 15 | 16 | variable "pods_pending_evaluation_period" { 17 | type = string 18 | default = "last_10m" 19 | } 20 | 21 | variable "pods_pending_note" { 22 | type = string 23 | default = "" 24 | } 25 | 26 | variable "pods_pending_docs" { 27 | type = string 28 | default = "https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/" 29 | } 30 | 31 | variable "pods_pending_filter_override" { 32 | type = string 33 | default = "" 34 | } 35 | 36 | variable "pods_pending_alerting_enabled" { 37 | type = bool 38 | default = true 39 | } 40 | 41 | variable "pods_pending_no_data_timeframe" { 42 | type = number 43 | default = null 44 | } 45 | 46 | variable "pods_pending_notify_no_data" { 47 | type = bool 48 | default = false 49 | } 50 | 51 | variable "pods_pending_ok_threshold" { 52 | type = number 53 | default = null 54 | } 55 | 56 | variable "pods_pending_name_prefix" { 57 | type = string 58 | default = "" 59 | } 60 | 61 | variable "pods_pending_name_suffix" { 62 | type = string 63 | default = "" 64 | } 65 | 66 | variable "pods_pending_priority" { 67 | description = "Number from 1 (high) to 5 (low)." 
68 | 69 | type = number 70 | default = 3 71 | } 72 | -------------------------------------------------------------------------------- /pods-pending.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | pods_pending_filter = coalesce( 3 | var.pods_pending_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "pods_pending" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Pods Pending" 13 | query = "min(${var.pods_pending_evaluation_period}):default_zero(max:kubernetes_state.pod.status_phase{phase:pending${var.filter_str_concatenation}${local.pods_pending_filter}} by {kube_namespace}) > ${var.pods_pending_critical}" 14 | 15 | # alert specific configuration 16 | require_full_window = true 17 | alert_message = "Kubernetes pods pending ({{ value }}) in {{ service }} exceeds {{ threshold }}" 18 | recovery_message = "Kubernetes pods pending ({{ value }}) in {{ service }} has recovered" 19 | 20 | # monitor level vars 21 | enabled = var.pods_pending_enabled 22 | alerting_enabled = var.pods_pending_alerting_enabled 23 | warning_threshold = var.pods_pending_warning 24 | critical_threshold = var.pods_pending_critical 25 | priority = min(var.pods_pending_priority + var.priority_offset, 5) 26 | docs = var.pods_pending_docs 27 | note = var.pods_pending_note 28 | 29 | # module level vars 30 | env = var.env 31 | service = var.service 32 | service_display_name = var.service_display_name 33 | notification_channel = var.notification_channel 34 | additional_tags = var.additional_tags 35 | locked = var.locked 36 | name_prefix = var.name_prefix 37 | name_suffix = var.name_suffix 38 | } 39 | -------------------------------------------------------------------------------- /provider.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | datadog = { 4 | source = "DataDog/datadog" 5 | version = "~> 3.12" 6 | } 7 | } 8 | } -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://docs.renovatebot.com/renovate-schema.json", 3 | "extends": [ 4 | "config:base" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /replicaset-incomplete-variables.tf: -------------------------------------------------------------------------------- 1 | variable "replicaset_incomplete_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "replicaset_incomplete_critical" { 7 | type = number 8 | default = 0 9 | description = "alert is raised when (desired - running) > replicaset_incomplete_critical" 10 | } 11 | 12 | variable "replicaset_incomplete_evaluation_period" { 13 | type = string 14 | default = "last_15m" 15 | } 16 | 17 | variable "replicaset_incomplete_note" { 18 | type = string 19 | default = "There's also a monitor defined for when the replicaset is completely unavailable" 20 | } 21 | 22 | variable "replicaset_incomplete_docs" { 23 | type = string 24 | default = "In kubernetes a Replicaset is responsible for making sure a specific number of pods run. 
An example of a reason why that's not the case is when the image cannot be pulled, the pod fails to initialize, or no resources are available on the cluster.\nThis alert is raised when (desired - running) > 0" 25 | } 26 | 27 | variable "replicaset_incomplete_filter_override" { 28 | type = string 29 | default = "" 30 | } 31 | 32 | variable "replicaset_incomplete_alerting_enabled" { 33 | type = bool 34 | default = true 35 | } 36 | 37 | variable "replicaset_incomplete_no_data_timeframe" { 38 | type = number 39 | default = null 40 | } 41 | 42 | variable "replicaset_incomplete_notify_no_data" { 43 | type = bool 44 | default = false 45 | } 46 | 47 | variable "replicaset_incomplete_ok_threshold" { 48 | type = number 49 | default = null 50 | } 51 | 52 | variable "replicaset_incomplete_priority" { 53 | description = "Number from 1 (high) to 5 (low)." 54 | 55 | type = number 56 | default = 3 57 | } 58 | -------------------------------------------------------------------------------- /replicaset-incomplete.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | replicaset_incomplete_filter = coalesce( 3 | var.replicaset_incomplete_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "replicaset_incomplete" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Replicaset Incomplete" 13 | query = "min(${var.replicaset_incomplete_evaluation_period}):max:kubernetes_state.replicaset.replicas_desired{${local.replicaset_incomplete_filter}} by {kube_replica_set,kube_cluster_name} - min:kubernetes_state.replicaset.replicas_ready{${local.replicaset_incomplete_filter}} by {kube_replica_set,kube_cluster_name} > ${var.replicaset_incomplete_critical}" 14 | alert_message = "Kubernetes Replicaset {{kube_replica_set}} is incomplete. 
Missing pod count: {{value}}" 15 | recovery_message = "Kubernetes Replicaset {{kube_replica_set}} has recovered" 16 | 17 | # monitor level vars 18 | enabled = var.state_metrics_monitoring && var.replicaset_incomplete_enabled 19 | alerting_enabled = var.replicaset_incomplete_alerting_enabled 20 | critical_threshold = var.replicaset_incomplete_critical 21 | # No warning threshold for this monitor 22 | priority = min(var.replicaset_incomplete_priority + var.priority_offset, 5) 23 | docs = var.replicaset_incomplete_docs 24 | note = var.replicaset_incomplete_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /replicaset-unavailable-variables.tf: -------------------------------------------------------------------------------- 1 | variable "replicaset_unavailable_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "replicaset_unavailable_critical" { 7 | type = number 8 | default = 0 9 | description = "alert is raised when running == 0 and desired > 1" 10 | } 11 | 12 | variable "replicaset_unavailable_evaluation_period" { 13 | type = string 14 | default = "last_5m" 15 | } 16 | 17 | variable "replicaset_unavailable_note" { 18 | type = string 19 | default = "There's also a monitor defined for when the replicaset is only partially available" 20 | } 21 | 22 | variable "replicaset_unavailable_docs" { 23 | type = string 24 | default = "In kubernetes a Replicaset is responsible for making sure a specific number of pods run. An example of a reason why that's not the case is when the image cannot be pulled, the pod fails to initialize, or no resources are available on the cluster.\nThis alert is raised when running == 0 and desired > 1" 25 | } 26 | 27 | variable "replicaset_unavailable_filter_override" { 28 | type = string 29 | default = "" 30 | } 31 | 32 | variable "replicaset_unavailable_alerting_enabled" { 33 | type = bool 34 | default = true 35 | } 36 | 37 | variable "replicaset_unavailable_no_data_timeframe" { 38 | type = number 39 | default = null 40 | } 41 | 42 | variable "replicaset_unavailable_notify_no_data" { 43 | type = bool 44 | default = false 45 | } 46 | 47 | variable "replicaset_unavailable_ok_threshold" { 48 | type = number 49 | default = null 50 | } 51 | 52 | variable "replicaset_unavailable_priority" { 53 | description = "Number from 1 (high) to 5 (low)." 
54 | 55 | type = number 56 | default = 2 57 | } 58 | -------------------------------------------------------------------------------- /replicaset-unavailable.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | replicaset_unavailable_filter = coalesce( 3 | var.replicaset_unavailable_filter_override, 4 | var.filter_str 5 | ) 6 | rs_pods_ready = "min:kubernetes_state.replicaset.replicas_ready{${local.replicaset_unavailable_filter}} by {kube_replica_set,kube_cluster_name}" 7 | rs_pods_desired = "min:kubernetes_state.replicaset.replicas_desired{${local.replicaset_unavailable_filter}} by {kube_replica_set,kube_cluster_name}" 8 | } 9 | 10 | module "replicaset_unavailable" { 11 | source = "kabisa/generic-monitor/datadog" 12 | version = "1.0.0" 13 | 14 | name = "Replicaset Unavailable" 15 | # This (ab)uses a division by zero to make sure we don't get alerts when the number of desired pods < 2 16 | query = "max(${var.replicaset_unavailable_evaluation_period}):( ${local.rs_pods_ready} ) / ${local.rs_pods_desired} / ( ${local.rs_pods_desired} - 1 ) <= 0" 17 | alert_message = "Kubernetes Replicaset {{kube_replica_set}} is unavailable" 18 | recovery_message = "Kubernetes Replicaset {{kube_replica_set}} now has available pods" 19 | 20 | # monitor level vars 21 | enabled = var.state_metrics_monitoring && var.replicaset_unavailable_enabled 22 | alerting_enabled = var.replicaset_unavailable_alerting_enabled 23 | critical_threshold = 0 24 | # No warning threshold for this monitor 25 | priority = min(var.replicaset_unavailable_priority + var.priority_offset, 5) 26 | docs = var.replicaset_unavailable_docs 27 | note = var.replicaset_unavailable_note 28 | 29 | # module level vars 30 | env = var.env 31 | service = var.service 32 | service_display_name = var.service_display_name 33 | notification_channel = var.notification_channel 34 | additional_tags = var.additional_tags 35 | locked = var.locked 36 | name_prefix = var.name_prefix 37 | name_suffix = var.name_suffix 38 | } 39 | -------------------------------------------------------------------------------- /variables.tf: -------------------------------------------------------------------------------- 1 | variable "env" { 2 | type = string 3 | } 4 | 5 | variable "service" { 6 | type = string 7 | default = "Kubernetes" 8 | } 9 | 10 | variable "service_display_name" { 11 | description = "Readable version of service name of what you're monitoring." 12 | type = string 13 | default = null 14 | } 15 | 16 | variable "notification_channel" { 17 | type = string 18 | description = "The @user or @pagerduty parameters that indicate to Datadog where to send the alerts" 19 | } 20 | 21 | variable "additional_tags" { 22 | type = list(string) 23 | default = [] 24 | } 25 | 26 | variable "filter_str" { 27 | type = string 28 | } 29 | 30 | variable "locked" { 31 | type = bool 32 | default = true 33 | description = "Makes sure only the creator or admin can modify the monitor."
34 | } 35 | 36 | variable "state_metrics_monitoring" { 37 | type = bool 38 | default = true 39 | } 40 | 41 | variable "name_prefix" { 42 | type = string 43 | default = "" 44 | } 45 | 46 | variable "name_suffix" { 47 | type = string 48 | default = "" 49 | } 50 | 51 | variable "filter_str_concatenation" { 52 | description = "If filter_str contains an IN expression you need to switch the concatenation from ',' to ' AND '" 53 | default = "," 54 | } 55 | 56 | variable "priority_offset" { 57 | description = "For non-production workloads we can add 1 to the priorities (the result is capped at 5)" 58 | default = 0 59 | } 60 | --------------------------------------------------------------------------------
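A note on the query in replicaset-unavailable.tf: it deliberately divides by (desired - 1) so that ReplicaSets with fewer than two desired pods never trigger the monitor. A worked sketch of how the expression ready / desired / (desired - 1) <= 0 behaves, using illustrative numbers rather than real cluster data:

# desired = 1, ready = 0  ->  0 / 1 / (1 - 1)  ->  division by zero, the monitor receives no data and stays silent
# desired = 3, ready = 0  ->  0 / 3 / (3 - 1)  ->  0, which satisfies <= 0, so the monitor alerts
# desired = 3, ready = 2  ->  2 / 3 / (3 - 1)  ->  roughly 0.33, which is > 0, so no alert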
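Because the pod monitors build their filters as phase:failed${var.filter_str_concatenation}${local.pods_failed_filter}, the default "," only works for plain tag filters. A minimal usage sketch for the IN-expression case; the module source, cluster names, and notification channel below are assumptions for illustration, not values taken from this repository:

module "kubernetes_monitors" {
  # assumed registry address; point this at however you actually consume the module
  source = "kabisa/kubernetes/datadog"

  env                  = "prd"
  notification_channel = "@slack-example-alerts"

  # filter_str uses an IN expression, so the extra tags the monitors append
  # (such as phase:failed) must be joined with " AND " instead of ","
  filter_str               = "kube_cluster_name IN (cluster-a,cluster-b)"
  filter_str_concatenation = " AND "
}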
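Every monitor in this module derives its priority as min(<monitor>_priority + var.priority_offset, 5), so the offset can only push a monitor toward the lowest Datadog priority (5), never past it. A couple of illustrative evaluations using the defaults defined above:

# pods_failed_priority = 3 (default), priority_offset = 1            ->  min(3 + 1, 5) = 4
# replicaset_unavailable_priority = 2 (default), priority_offset = 0 ->  min(2 + 0, 5) = 2
# any combination that would exceed 5 is capped at 5 by the min()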