├── .github └── workflows │ └── documentation.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── cpu-limits-low-perc-state-variables.tf ├── cpu-limits-low-perc-state.tf ├── cpu-limits-low-perc-variables.tf ├── cpu-limits-low-perc.tf ├── cpu-limits-low-variables.tf ├── cpu-limits-low.tf ├── cpu-on-dns-pods-high-variables.tf ├── cpu-on-dns-pods-high.tf ├── cpu-requests-low-perc-state-variables.tf ├── cpu-requests-low-perc-state.tf ├── cpu-requests-low-perc-variables.tf ├── cpu-requests-low-perc.tf ├── cpu-requests-low-variables.tf ├── cpu-requests-low.tf ├── daemonset-incomplete-variables.tf ├── daemonset-incomplete.tf ├── daemonset-multiple-restarts-variables.tf ├── daemonset-multiple-restarts.tf ├── datadog-agent-variables.tf ├── datadog-agent.tf ├── deploy-desired-vs-status-variables.tf ├── deploy-desired-vs-status.tf ├── deployment-multiple-restarts-variables.tf ├── deployment-multiple-restarts.tf ├── examples └── example.tf ├── hpa-status-variables.tf ├── hpa-status.tf ├── main.tf ├── memory-limits-low-perc-state-variables.tf ├── memory-limits-low-perc-state.tf ├── memory-limits-low-perc-variables.tf ├── memory-limits-low-perc.tf ├── memory-limits-low-variables.tf ├── memory-limits-low.tf ├── memory-requests-low-perc-state-variables.tf ├── memory-requests-low-perc-state.tf ├── memory-requests-low-perc-variables.tf ├── memory-requests-low-perc.tf ├── memory-requests-low-variables.tf ├── memory-requests-low.tf ├── module_description.md ├── network-unavailable-variables.tf ├── network-unavailable.tf ├── node-diskpressure-variables.tf ├── node-diskpressure.tf ├── node-memory-used-percent-variables.tf ├── node-memory-used-percent.tf ├── node-memorypressure-variables.tf ├── node-memorypressure.tf ├── node-ready-variables.tf ├── node-ready.tf ├── node-status-variables.tf ├── node-status.tf ├── persistent-volumes-variables.tf ├── persistent-volumes.tf ├── pid-pressure-variables.tf ├── pid-pressure.tf ├── pod-count-per-node-high-variables.tf ├── pod-count-per-node-high.tf ├── pod-ready-variables.tf ├── pod-ready.tf ├── pod-restarts-variables.tf ├── pod-restarts.tf ├── pods-failed-variables.tf ├── pods-failed.tf ├── pods-pending-variables.tf ├── pods-pending.tf ├── provider.tf ├── renovate.json ├── replicaset-incomplete-variables.tf ├── replicaset-incomplete.tf ├── replicaset-unavailable-variables.tf ├── replicaset-unavailable.tf └── variables.tf /.github/workflows/documentation.yaml: -------------------------------------------------------------------------------- 1 | name: Generate terraform docs 2 | 3 | on: 4 | push: 5 | # don't run when we push a tag 6 | tags-ignore: 7 | - '*' 8 | # don't run when we merge to main 9 | # the action should have run already 10 | branches-ignore: 11 | - 'main' 12 | jobs: 13 | pre-commit: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: terraform-linters/setup-tflint@v2 17 | name: Setup TFLint 18 | with: 19 | tflint_version: v0.38.1 20 | - uses: actions/checkout@v3 21 | - uses: actions/setup-python@v4 22 | - uses: pre-commit/action@v3.0.0 23 | # pre-commit fails if it changed files 24 | # we want to go on 25 | continue-on-error: true 26 | - uses: pre-commit/action@v3.0.0 27 | - uses: EndBug/add-and-commit@v9 28 | with: 29 | default_author: github_actions 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .terraform 2 | # lock file should live in top level module, it is generated by the pre-commit hook 3 
| .terraform.lock.hcl 4 | README.md.orig.* 5 | README.md.toc.* 6 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/gruntwork-io/pre-commit 3 | rev: v0.1.12 4 | hooks: 5 | - id: terraform-fmt 6 | - id: terraform-validate 7 | - id: tflint 8 | - repo: https://github.com/kabisa/terraform-datadog-pre-commit-hook 9 | rev: "1.3.6" 10 | hooks: 11 | - id: terraform-datadog-docs 12 | args: 13 | - "." -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Kabisa 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /cpu-limits-low-perc-state-variables.tf: -------------------------------------------------------------------------------- 1 | variable "cpu_limits_low_perc_state_enabled" { 2 | type = bool 3 | default = false 4 | description = "CPU state limits are only available when the state metrics api is deployed https://github.com/kubernetes/kube-state-metrics" 5 | } 6 | 7 | variable "cpu_limits_low_perc_state_warning" { 8 | type = number 9 | default = 95 10 | } 11 | 12 | variable "cpu_limits_low_perc_state_critical" { 13 | type = number 14 | default = 100 15 | } 16 | 17 | variable "cpu_limits_low_perc_state_evaluation_period" { 18 | type = string 19 | default = "last_5m" 20 | } 21 | 22 | variable "cpu_limits_low_perc_state_note" { 23 | type = string 24 | default = "" 25 | } 26 | 27 | variable "cpu_limits_low_perc_state_docs" { 28 | type = string 29 | default = "If the node where a Pod is running has enough of a resource available, it's possible (and allowed) for a container to use more of a resource than its request for that resource specifies. However, a container is not allowed to use more than its resource limit. 
https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/" 30 | } 31 | 32 | variable "cpu_limits_low_perc_state_filter_override" { 33 | type = string 34 | default = "" 35 | } 36 | 37 | variable "cpu_limits_low_perc_state_alerting_enabled" { 38 | type = bool 39 | default = true 40 | } 41 | 42 | variable "cpu_limits_low_perc_state_no_data_timeframe" { 43 | type = number 44 | default = null 45 | } 46 | 47 | variable "cpu_limits_low_perc_state_notify_no_data" { 48 | type = bool 49 | default = false 50 | } 51 | 52 | variable "cpu_limits_low_perc_state_ok_threshold" { 53 | type = number 54 | default = null 55 | } 56 | 57 | variable "cpu_limits_low_perc_state_priority" { 58 | description = "Number from 1 (high) to 5 (low)." 59 | 60 | type = number 61 | default = 3 62 | } 63 | -------------------------------------------------------------------------------- /cpu-limits-low-perc-state.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | cpu_limits_low_perc_state_filter = coalesce( 3 | var.cpu_limits_low_perc_state_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "cpu_limits_low_perc_state" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Available CPU for Limits in percentages Low" 13 | query = "max(${var.cpu_limits_low_perc_state_evaluation_period}):( sum:kubernetes_state.container.cpu_limit{${local.cpu_limits_low_perc_state_filter}} by {host,kube_cluster_name} / sum:kubernetes_state.node.cpu_capacity{${local.cpu_limits_low_perc_state_filter}} by {host,kube_cluster_name}) * 100 > ${var.cpu_limits_low_perc_state_critical}" 14 | alert_message = "Kubernetes cluster {{kube_cluster_name.name}} cpu room for limits / percentage is too low" 15 | recovery_message = "Kubernetes cluster {{kube_cluster_name.name}} cpu limits / percentage has recovered" 16 | 17 | # monitor level vars 18 | enabled = var.cpu_limits_low_perc_state_enabled 19 | alerting_enabled = var.cpu_limits_low_perc_state_alerting_enabled 20 | critical_threshold = var.cpu_limits_low_perc_state_critical 21 | warning_threshold = var.cpu_limits_low_perc_state_warning 22 | priority = min(var.cpu_limits_low_perc_state_priority + var.priority_offset, 5) 23 | docs = var.cpu_limits_low_perc_state_docs 24 | note = var.cpu_limits_low_perc_state_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /cpu-limits-low-perc-variables.tf: -------------------------------------------------------------------------------- 1 | variable "cpu_limits_low_perc_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "cpu_limits_low_perc_warning" { 7 | type = number 8 | default = 95 9 | } 10 | 11 | variable "cpu_limits_low_perc_critical" { 12 | type = number 13 | default = 100 14 | } 15 | 16 | variable "cpu_limits_low_perc_evaluation_period" { 17 | type = string 18 | default = "last_5m" 19 | } 20 | 21 | variable "cpu_limits_low_perc_note" { 22 | type = string 23 | default = "" 24 | } 25 | 26 | variable "cpu_limits_low_perc_docs" { 27 | type = string 28 | default = "If the node where a Pod is running has enough of a resource available, it's possible (and allowed) for a container to 
use more of a resource than its request for that resource specifies. However, a container is not allowed to use more than its resource limit. https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/" 29 | } 30 | 31 | variable "cpu_limits_low_perc_filter_override" { 32 | type = string 33 | default = "" 34 | } 35 | 36 | variable "cpu_limits_low_perc_alerting_enabled" { 37 | type = bool 38 | default = true 39 | } 40 | 41 | variable "cpu_limits_low_perc_no_data_timeframe" { 42 | type = number 43 | default = null 44 | } 45 | 46 | variable "cpu_limits_low_perc_notify_no_data" { 47 | type = bool 48 | default = false 49 | } 50 | 51 | variable "cpu_limits_low_perc_ok_threshold" { 52 | type = number 53 | default = null 54 | } 55 | 56 | variable "cpu_limits_low_perc_priority" { 57 | description = "Number from 1 (high) to 5 (low)." 58 | 59 | type = number 60 | default = 3 61 | } 62 | -------------------------------------------------------------------------------- /cpu-limits-low-perc.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | cpu_limits_low_perc_filter = coalesce( 3 | var.cpu_limits_low_perc_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "cpu_limits_low_perc" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Available CPU for Limits in percentages Low" 13 | query = "max(${var.cpu_limits_low_perc_evaluation_period}):(sum:kubernetes.cpu.limits{${local.cpu_limits_low_perc_filter}} by {host,kube_cluster_name} / max:system.cpu.num_cores{${local.cpu_limits_low_perc_filter}} by {host,kube_cluster_name}) * 100 > ${var.cpu_limits_low_perc_critical}" 14 | alert_message = "Kubernetes cluster {{kube_cluster_name.name}} cpu room for limits / percentage is too low" 15 | recovery_message = "Kubernetes cluster {{kube_cluster_name.name}} cpu limits / percentage has recovered" 16 | 17 | # monitor level vars 18 | enabled = var.cpu_limits_low_perc_enabled 19 | alerting_enabled = var.cpu_limits_low_perc_alerting_enabled 20 | critical_threshold = var.cpu_limits_low_perc_critical 21 | warning_threshold = var.cpu_limits_low_perc_warning 22 | priority = min(var.cpu_limits_low_perc_priority + var.priority_offset, 5) 23 | docs = var.cpu_limits_low_perc_docs 24 | note = var.cpu_limits_low_perc_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /cpu-limits-low-variables.tf: -------------------------------------------------------------------------------- 1 | variable "cpu_limits_low_enabled" { 2 | type = bool 3 | default = false 4 | description = "This monitor is based on absolute values and thus less useful. Prefer setting cpu_limits_low_perc_enabled to true." 
5 | } 6 | 7 | variable "cpu_limits_low_warning" { 8 | type = number 9 | default = 0 10 | } 11 | 12 | variable "cpu_limits_low_critical" { 13 | type = number 14 | default = -30 15 | } 16 | 17 | variable "cpu_limits_low_evaluation_period" { 18 | type = string 19 | default = "last_5m" 20 | } 21 | 22 | variable "cpu_limits_low_note" { 23 | type = string 24 | default = "" 25 | } 26 | 27 | variable "cpu_limits_low_docs" { 28 | type = string 29 | default = "If the node where a Pod is running has enough of a resource available, it's possible (and allowed) for a container to use more of a resource than its request for that resource specifies. However, a container is not allowed to use more than its resource limit. https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/" 30 | } 31 | 32 | variable "cpu_limits_low_filter_override" { 33 | type = string 34 | default = "" 35 | } 36 | 37 | variable "cpu_limits_low_alerting_enabled" { 38 | type = bool 39 | default = true 40 | } 41 | 42 | variable "cpu_limits_low_no_data_timeframe" { 43 | type = number 44 | default = null 45 | } 46 | 47 | variable "cpu_limits_low_notify_no_data" { 48 | type = bool 49 | default = false 50 | } 51 | 52 | variable "cpu_limits_low_ok_threshold" { 53 | type = number 54 | default = null 55 | } 56 | 57 | variable "cpu_limits_low_priority" { 58 | description = "Number from 1 (high) to 5 (low)." 59 | 60 | type = number 61 | default = 3 62 | } 63 | -------------------------------------------------------------------------------- /cpu-limits-low.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | cpu_limits_low_filter = coalesce( 3 | var.cpu_limits_low_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "cpu_limits_low" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Available CPU for Limits Low" 13 | query = "min(${var.cpu_limits_low_evaluation_period}):max:system.cpu.num_cores{${local.cpu_limits_low_filter}} by {kube_cluster_name,host} - sum:kubernetes.cpu.limits{${local.cpu_limits_low_filter}} by {kube_cluster_name,host} < ${var.cpu_limits_low_critical}" 14 | alert_message = "Kubernetes cluster {{kube_cluster_name.name}} cpu room for limits is too low " 15 | recovery_message = "Kubernetes cluster {{kube_cluster_name.name}} cpu limits has recovered" 16 | 17 | 18 | # monitor level vars 19 | enabled = var.cpu_limits_low_enabled 20 | alerting_enabled = var.cpu_limits_low_alerting_enabled 21 | critical_threshold = var.cpu_limits_low_critical 22 | warning_threshold = var.cpu_limits_low_warning 23 | priority = min(var.cpu_limits_low_priority + var.priority_offset, 5) 24 | docs = var.cpu_limits_low_docs 25 | note = var.cpu_limits_low_note 26 | 27 | # module level vars 28 | env = var.env 29 | service = var.service 30 | service_display_name = var.service_display_name 31 | notification_channel = var.notification_channel 32 | additional_tags = var.additional_tags 33 | locked = var.locked 34 | name_prefix = var.name_prefix 35 | name_suffix = var.name_suffix 36 | } 37 | -------------------------------------------------------------------------------- /cpu-on-dns-pods-high-variables.tf: -------------------------------------------------------------------------------- 1 | variable "cpu_on_dns_pods_high_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "cpu_on_dns_pods_high_warning" { 7 | type = number 8 | default = 70 9 | } 10 | 11 | variable "cpu_on_dns_pods_high_critical" { 12 | type = number 13 | default = 
85 14 | } 15 | 16 | variable "cpu_on_dns_pods_high_evaluation_period" { 17 | type = string 18 | default = "last_30m" 19 | } 20 | 21 | variable "cpu_on_dns_pods_high_note" { 22 | type = string 23 | default = "" 24 | } 25 | 26 | variable "cpu_on_dns_pods_high_docs" { 27 | type = string 28 | default = "" 29 | } 30 | 31 | variable "cpu_on_dns_pods_high_filter_override" { 32 | type = string 33 | default = "" 34 | } 35 | 36 | variable "dns_filter_tags" { 37 | description = <<-EOD 38 | Getting all the DNS containers by default is hard to do. 39 | What we do is build a list of Datadog tags / filters that should help us find them 40 | We then build a filter in the following way: ($originalfilterstring) AND (item1 OR item2 OR item3...) 41 | If that doesn't work for your use case, you can override the filter list or use cpu_on_dns_pods_high_filter_override 42 | EOD 43 | type = list(string) 44 | default = [ 45 | "kube_service:kube-dns", 46 | "short_image:coredns", 47 | "short_image:ucp-coredns", 48 | "short_image:ucp-kube-dns", 49 | ] 50 | } 51 | 52 | variable "cpu_on_dns_pods_high_alerting_enabled" { 53 | type = bool 54 | default = true 55 | } 56 | 57 | variable "cpu_on_dns_pods_high_no_data_timeframe" { 58 | type = number 59 | default = null 60 | } 61 | 62 | variable "cpu_on_dns_pods_high_notify_no_data" { 63 | type = bool 64 | default = false 65 | } 66 | 67 | variable "cpu_on_dns_pods_high_ok_threshold" { 68 | type = number 69 | default = null 70 | } 71 | 72 | variable "cpu_on_dns_pods_high_priority" { 73 | description = "Number from 1 (high) to 5 (low)." 74 | 75 | type = number 76 | default = 2 77 | } 78 | -------------------------------------------------------------------------------- /cpu-on-dns-pods-high.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | dns_filter_tags = join(" OR ", var.dns_filter_tags) 3 | filter_string = "(${var.filter_str}) AND (${local.dns_filter_tags})" 4 | cpu_on_dns_pods_high_filter = coalesce( 5 | var.cpu_on_dns_pods_high_filter_override, 6 | local.filter_string 7 | ) 8 | } 9 | 10 | module "cpu_on_dns_pods_high" { 11 | source = "kabisa/generic-monitor/datadog" 12 | version = "1.0.0" 13 | 14 | name = "CPU Usage on DNS pods is high" 15 | query = "avg(${var.cpu_on_dns_pods_high_evaluation_period}):avg:docker.cpu.usage{${local.cpu_on_dns_pods_high_filter}} by {kube_cluster_name,host,container_name} > ${var.cpu_on_dns_pods_high_critical}" 16 | alert_message = "Kubernetes CPU usage on DNS pods is too high" 17 | recovery_message = "Kubernetes CPU usage on DNS pods has recovered" 18 | 19 | # monitor level vars 20 | enabled = var.cpu_on_dns_pods_high_enabled 21 | alerting_enabled = var.cpu_on_dns_pods_high_alerting_enabled 22 | critical_threshold = var.cpu_on_dns_pods_high_critical 23 | warning_threshold = var.cpu_on_dns_pods_high_warning 24 | priority = min(var.cpu_on_dns_pods_high_priority + var.priority_offset, 5) 25 | docs = var.cpu_on_dns_pods_high_docs 26 | note = var.cpu_on_dns_pods_high_note 27 | 28 | # module level vars 29 | env = var.env 30 | service = var.service 31 | service_display_name = var.service_display_name 32 | notification_channel = var.notification_channel 33 | additional_tags = var.additional_tags 34 | locked = var.locked 35 | name_prefix = var.name_prefix 36 | name_suffix = var.name_suffix 37 | } 38 | -------------------------------------------------------------------------------- /cpu-requests-low-perc-state-variables.tf:
-------------------------------------------------------------------------------- 1 | variable "cpu_requests_low_perc_state_enabled" { 2 | type = bool 3 | default = false 4 | description = "CPU state limits are only available when the state metrics api is deployed https://github.com/kubernetes/kube-state-metrics" 5 | } 6 | 7 | variable "cpu_requests_low_perc_state_warning" { 8 | type = number 9 | default = 80 10 | } 11 | 12 | variable "cpu_requests_low_perc_state_critical" { 13 | type = number 14 | default = 95 15 | } 16 | 17 | variable "cpu_requests_low_perc_state_evaluation_period" { 18 | type = string 19 | default = "last_5m" 20 | } 21 | 22 | variable "cpu_requests_low_perc_state_note" { 23 | type = string 24 | default = "" 25 | } 26 | 27 | variable "cpu_requests_low_perc_state_docs" { 28 | type = string 29 | default = "If the node where a Pod is running has enough of a resource available, it's possible (and allowed) for a container to use more of a resource than its request for that resource specifies. However, a container is not allowed to use more than its resource limit. https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/" 30 | } 31 | 32 | variable "cpu_requests_low_perc_state_filter_override" { 33 | type = string 34 | default = "" 35 | } 36 | 37 | variable "cpu_requests_low_perc_state_alerting_enabled" { 38 | type = bool 39 | default = true 40 | } 41 | 42 | variable "cpu_requests_low_perc_state_no_data_timeframe" { 43 | type = number 44 | default = null 45 | } 46 | 47 | variable "cpu_requests_low_perc_state_notify_no_data" { 48 | type = bool 49 | default = false 50 | } 51 | 52 | variable "cpu_requests_low_perc_state_ok_threshold" { 53 | type = number 54 | default = null 55 | } 56 | 57 | variable "cpu_requests_low_perc_state_priority" { 58 | description = "Number from 1 (high) to 5 (low)." 
59 | 60 | type = number 61 | default = 3 62 | } 63 | -------------------------------------------------------------------------------- /cpu-requests-low-perc-state.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | cpu_requests_low_perc_state_filter = coalesce( 3 | var.cpu_requests_low_perc_state_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "cpu_requests_low_perc_state" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Available CPU for requests in percentages Low" 13 | query = "max(${var.cpu_requests_low_perc_state_evaluation_period}):( sum:kubernetes_state.container.cpu_requested{${local.cpu_requests_low_perc_state_filter}} by {host,kube_cluster_name} / sum:kubernetes_state.node.cpu_capacity{${local.cpu_requests_low_perc_state_filter}} by {host,kube_cluster_name} ) * 100 > ${var.cpu_requests_low_perc_state_critical}" 14 | alert_message = "Kubernetes cluster cpu room for requests / percentage is too low" 15 | recovery_message = "Kubernetes cluster cpu requests / percentage has recovered" 16 | 17 | # monitor level vars 18 | enabled = var.cpu_requests_low_perc_state_enabled 19 | alerting_enabled = var.cpu_requests_low_perc_state_alerting_enabled 20 | critical_threshold = var.cpu_requests_low_perc_state_critical 21 | warning_threshold = var.cpu_requests_low_perc_state_warning 22 | priority = min(var.cpu_requests_low_perc_state_priority + var.priority_offset, 5) 23 | docs = var.cpu_requests_low_perc_state_docs 24 | note = var.cpu_requests_low_perc_state_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /cpu-requests-low-perc-variables.tf: -------------------------------------------------------------------------------- 1 | variable "cpu_requests_low_perc_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "cpu_requests_low_perc_warning" { 7 | type = number 8 | default = 80 9 | } 10 | 11 | variable "cpu_requests_low_perc_critical" { 12 | type = number 13 | default = 95 14 | } 15 | 16 | variable "cpu_requests_low_perc_evaluation_period" { 17 | type = string 18 | default = "last_5m" 19 | } 20 | 21 | variable "cpu_requests_low_perc_note" { 22 | type = string 23 | default = "" 24 | } 25 | 26 | variable "cpu_requests_low_perc_docs" { 27 | type = string 28 | default = "If the node where a Pod is running has enough of a resource available, it's possible (and allowed) for a container to use more of a resource than its request for that resource specifies. However, a container is not allowed to use more than its resource limit. 
https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/" 29 | } 30 | 31 | variable "cpu_requests_low_perc_filter_override" { 32 | type = string 33 | default = "" 34 | } 35 | 36 | variable "cpu_requests_low_perc_alerting_enabled" { 37 | type = bool 38 | default = true 39 | } 40 | 41 | variable "cpu_requests_low_perc_no_data_timeframe" { 42 | type = number 43 | default = null 44 | } 45 | 46 | variable "cpu_requests_low_perc_notify_no_data" { 47 | type = bool 48 | default = false 49 | } 50 | 51 | variable "cpu_requests_low_perc_ok_threshold" { 52 | type = number 53 | default = null 54 | } 55 | 56 | variable "cpu_requests_low_perc_priority" { 57 | description = "Number from 1 (high) to 5 (low)." 58 | 59 | type = number 60 | default = 3 61 | } 62 | -------------------------------------------------------------------------------- /cpu-requests-low-perc.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | cpu_requests_low_perc_filter = coalesce( 3 | var.cpu_requests_low_perc_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "cpu_requests_low_perc" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Available CPU for requests in percentages Low" 13 | query = "max(${var.cpu_requests_low_perc_evaluation_period}):100 * sum:kubernetes.cpu.requests{${local.cpu_requests_low_perc_filter}} by {kube_cluster_name,host} / max:system.cpu.num_cores{${local.cpu_requests_low_perc_filter}} by {kube_cluster_name,host} > ${var.cpu_requests_low_perc_critical}" 14 | alert_message = "Kubernetes cluster cpu room for requests / percentage is too low" 15 | recovery_message = "Kubernetes cluster cpu requests / percentage has recovered" 16 | 17 | # monitor level vars 18 | enabled = var.cpu_requests_low_perc_enabled 19 | alerting_enabled = var.cpu_requests_low_perc_alerting_enabled 20 | critical_threshold = var.cpu_requests_low_perc_critical 21 | warning_threshold = var.cpu_requests_low_perc_warning 22 | priority = min(var.cpu_requests_low_perc_priority + var.priority_offset, 5) 23 | docs = var.cpu_requests_low_perc_docs 24 | note = var.cpu_requests_low_perc_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /cpu-requests-low-variables.tf: -------------------------------------------------------------------------------- 1 | variable "cpu_requests_low_enabled" { 2 | type = bool 3 | default = false 4 | description = "This monitor is based on absolute values and thus less useful. Prefer setting cpu_requests_low_perc_enabled to true." 
5 | } 6 | 7 | variable "cpu_requests_low_warning" { 8 | type = number 9 | default = 1 10 | } 11 | 12 | variable "cpu_requests_low_critical" { 13 | type = number 14 | default = "0.5" 15 | } 16 | 17 | variable "cpu_requests_low_evaluation_period" { 18 | type = string 19 | default = "last_5m" 20 | } 21 | 22 | variable "cpu_requests_low_note" { 23 | type = string 24 | default = "" 25 | } 26 | 27 | variable "cpu_requests_low_docs" { 28 | type = string 29 | default = "If the node where a Pod is running has enough of a resource available, it's possible (and allowed) for a container to use more of a resource than its request for that resource specifies. However, a container is not allowed to use more than its resource limit. https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/" 30 | } 31 | 32 | variable "cpu_requests_low_filter_override" { 33 | type = string 34 | default = "" 35 | } 36 | 37 | variable "cpu_requests_low_alerting_enabled" { 38 | type = bool 39 | default = true 40 | } 41 | 42 | variable "cpu_requests_low_no_data_timeframe" { 43 | type = number 44 | default = null 45 | } 46 | 47 | variable "cpu_requests_low_notify_no_data" { 48 | type = bool 49 | default = false 50 | } 51 | 52 | variable "cpu_requests_low_ok_threshold" { 53 | type = number 54 | default = null 55 | } 56 | 57 | variable "cpu_requests_low_priority" { 58 | description = "Number from 1 (high) to 5 (low)." 59 | 60 | type = number 61 | default = 3 62 | } 63 | -------------------------------------------------------------------------------- /cpu-requests-low.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | cpu_requests_low_filter = coalesce( 3 | var.cpu_requests_low_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "cpu_requests_low" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Available CPU for Requests Low" 13 | query = "max(${var.cpu_requests_low_evaluation_period}):max:system.cpu.num_cores{${local.cpu_requests_low_filter}} by {kube_cluster_name,host} - sum:kubernetes.cpu.requests{${local.cpu_requests_low_filter}} by {kube_cluster_name,host} < ${var.cpu_requests_low_critical}" 14 | alert_message = "Kubernetes cluster cpu room for requests is too low" 15 | recovery_message = "Kubernetes cluster cpu requests has recovered" 16 | 17 | # monitor level vars 18 | enabled = var.cpu_requests_low_enabled 19 | alerting_enabled = var.cpu_requests_low_alerting_enabled 20 | critical_threshold = var.cpu_requests_low_critical 21 | warning_threshold = var.cpu_requests_low_warning 22 | priority = min(var.cpu_requests_low_priority + var.priority_offset, 5) 23 | docs = var.cpu_requests_low_docs 24 | note = var.cpu_requests_low_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /daemonset-incomplete-variables.tf: -------------------------------------------------------------------------------- 1 | variable "daemonset_incomplete_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "daemonset_incomplete_critical" { 7 | type = number 8 | default = 0 9 | description = "alert is raised when (desired - running) > 
daemonset_incomplete_critical" 10 | } 11 | 12 | variable "daemonset_incomplete_evaluation_period" { 13 | type = string 14 | default = "last_15m" 15 | } 16 | 17 | variable "daemonset_incomplete_note" { 18 | type = string 19 | default = "" 20 | } 21 | 22 | variable "daemonset_incomplete_docs" { 23 | type = string 24 | default = "In kubernetes a daemonset is responsible for running the same pod across all Nodes. An example for when this fails, is when the image cannot be pulled, the pod fails to initialize or no resources are available on the cluster\nThis alert is raised when (desired - running) > 0" 25 | } 26 | 27 | variable "daemonset_incomplete_filter_override" { 28 | type = string 29 | default = "" 30 | } 31 | 32 | variable "daemonset_incomplete_alerting_enabled" { 33 | type = bool 34 | default = true 35 | } 36 | 37 | variable "daemonset_incomplete_no_data_timeframe" { 38 | type = number 39 | default = null 40 | } 41 | 42 | variable "daemonset_incomplete_notify_no_data" { 43 | type = bool 44 | default = false 45 | } 46 | 47 | variable "daemonset_incomplete_ok_threshold" { 48 | type = number 49 | default = null 50 | } 51 | 52 | variable "daemonset_incomplete_priority" { 53 | description = "Number from 1 (high) to 5 (low)." 54 | 55 | type = number 56 | default = 2 57 | } 58 | -------------------------------------------------------------------------------- /daemonset-incomplete.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | daemonset_incomplete_filter = coalesce( 3 | var.daemonset_incomplete_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "daemonset_incomplete" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Daemonset Incomplete" 13 | query = "min(${var.daemonset_incomplete_evaluation_period}):max:kubernetes_state.daemonset.scheduled{${local.daemonset_incomplete_filter}} by {kube_daemon_set,kube_cluster_name} - min:kubernetes_state.daemonset.ready{${local.daemonset_incomplete_filter}} by {kube_daemon_set,kube_cluster_name} > 0" 14 | alert_message = "Kubernetes Daemonset {{kube_daemon_set}} is incomplete. 
Missing pod count:{{value}}" 15 | recovery_message = "Kubernetes Daemonset {{kube_daemon_set}} has recovered" 16 | 17 | # monitor level vars 18 | enabled = var.state_metrics_monitoring && var.daemonset_incomplete_enabled 19 | alerting_enabled = var.daemonset_incomplete_alerting_enabled 20 | critical_threshold = var.daemonset_incomplete_critical 21 | # no warning threshold for this monitor 22 | priority = min(var.daemonset_incomplete_priority + var.priority_offset, 5) 23 | docs = var.daemonset_incomplete_docs 24 | note = var.daemonset_incomplete_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /daemonset-multiple-restarts-variables.tf: -------------------------------------------------------------------------------- 1 | variable "daemonset_multiple_restarts_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "daemonset_multiple_restarts_warning" { 7 | type = number 8 | default = null 9 | } 10 | 11 | variable "daemonset_multiple_restarts_critical" { 12 | type = number 13 | default = 5.0 14 | } 15 | 16 | variable "daemonset_multiple_restarts_evaluation_period" { 17 | type = string 18 | default = "last_15m" 19 | } 20 | 21 | variable "daemonset_multiple_restarts_note" { 22 | type = string 23 | default = "" 24 | } 25 | 26 | variable "daemonset_multiple_restarts_docs" { 27 | type = string 28 | default = "If a container restarts once, it can be considered 'normal behaviour' for K8s. A Daemonset restarting multiple times though is a problem" 29 | } 30 | 31 | variable "daemonset_multiple_restarts_filter_override" { 32 | type = string 33 | default = "" 34 | } 35 | 36 | variable "daemonset_multiple_restarts_alerting_enabled" { 37 | type = bool 38 | default = true 39 | } 40 | 41 | variable "daemonset_multiple_restarts_no_data_timeframe" { 42 | type = number 43 | default = null 44 | } 45 | 46 | variable "daemonset_multiple_restarts_notify_no_data" { 47 | type = bool 48 | default = false 49 | } 50 | 51 | variable "daemonset_multiple_restarts_ok_threshold" { 52 | type = number 53 | default = null 54 | } 55 | 56 | variable "daemonset_multiple_restarts_name_prefix" { 57 | type = string 58 | default = "" 59 | } 60 | 61 | variable "daemonset_multiple_restarts_name_suffix" { 62 | type = string 63 | default = "" 64 | } 65 | 66 | variable "daemonset_multiple_restarts_priority" { 67 | description = "Number from 1 (high) to 5 (low)." 
68 | 69 | type = number 70 | default = 3 71 | } 72 | 73 | variable "daemonset_multiple_restarts_notification_channel_override" { 74 | type = string 75 | default = "" 76 | } 77 | -------------------------------------------------------------------------------- /daemonset-multiple-restarts.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | daemonset_multiple_restarts_filter = coalesce( 3 | var.daemonset_multiple_restarts_filter_override, 4 | "${var.filter_str}${var.filter_str_concatenation}kube_daemon_set:*" 5 | ) 6 | } 7 | 8 | module "daemonset_multiple_restarts" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Daemonset Multiple Restarts" 13 | query = "max(${var.daemonset_multiple_restarts_evaluation_period}):clamp_min(max:kubernetes.containers.restarts{${local.daemonset_multiple_restarts_filter}} by {kube_daemon_set} - hour_before(max:kubernetes.containers.restarts{${local.daemonset_multiple_restarts_filter}} by {kube_daemon_set}), 0) > ${var.daemonset_multiple_restarts_critical}" 14 | 15 | # alert specific configuration 16 | require_full_window = true 17 | alert_message = "Kubernetes Daemonset {{kube_daemon_set.name}} has more than {{threshold}} ({{value}}) restarts within one hour" 18 | recovery_message = "Kubernetes Daemonset {{kube_daemon_set.name}} is now at {{value}} restarts of the last hour" 19 | 20 | # monitor level vars 21 | enabled = var.daemonset_multiple_restarts_enabled 22 | alerting_enabled = var.daemonset_multiple_restarts_alerting_enabled 23 | warning_threshold = var.daemonset_multiple_restarts_warning 24 | critical_threshold = var.daemonset_multiple_restarts_critical 25 | priority = min(var.daemonset_multiple_restarts_priority + var.priority_offset, 5) 26 | docs = var.daemonset_multiple_restarts_docs 27 | note = var.daemonset_multiple_restarts_note 28 | notification_channel = try(coalesce(var.daemonset_multiple_restarts_notification_channel_override, var.notification_channel), "") 29 | 30 | # module level vars 31 | env = var.env 32 | service = var.service 33 | service_display_name = var.service_display_name 34 | additional_tags = var.additional_tags 35 | locked = var.locked 36 | name_prefix = var.name_prefix 37 | name_suffix = var.name_suffix 38 | } 39 | -------------------------------------------------------------------------------- /datadog-agent-variables.tf: -------------------------------------------------------------------------------- 1 | variable "datadog_agent_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "datadog_agent_evaluation_period" { 7 | type = string 8 | default = "last_5m" 9 | } 10 | 11 | variable "datadog_agent_note" { 12 | type = string 13 | default = "" 14 | } 15 | 16 | variable "datadog_agent_docs" { 17 | type = string 18 | default = "" 19 | } 20 | 21 | variable "datadog_agent_filter_override" { 22 | type = string 23 | default = "" 24 | } 25 | 26 | variable "datadog_agent_alerting_enabled" { 27 | type = bool 28 | default = true 29 | } 30 | 31 | variable "datadog_agent_no_data_timeframe" { 32 | type = number 33 | default = null 34 | } 35 | 36 | variable "datadog_agent_notify_no_data" { 37 | type = bool 38 | default = false 39 | } 40 | 41 | variable "datadog_agent_ok_threshold" { 42 | type = number 43 | default = null 44 | } 45 | 46 | variable "datadog_agent_priority" { 47 | description = "Number from 1 (high) to 5 (low)." 
48 | 49 | type = number 50 | default = 2 51 | } 52 | -------------------------------------------------------------------------------- /datadog-agent.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | datadog_agent_filter = coalesce( 3 | var.datadog_agent_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "datadog_agent" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Datadog agent not running" 13 | query = "avg(${var.datadog_agent_evaluation_period}):avg:datadog.agent.running{${local.datadog_agent_filter}} by {host,kube_cluster_name} < 1" 14 | alert_message = "Datadog Agent not running on {{host.name}} in Cluster: {{kube_cluster_name.name}}" 15 | recovery_message = "Agent running again" 16 | notify_no_data = true 17 | no_data_message = "Datadog agent is not running on {{host.name}} in Cluster: {{kube_cluster_name.name}}" 18 | 19 | # monitor level vars 20 | enabled = var.datadog_agent_enabled 21 | alerting_enabled = var.datadog_agent_alerting_enabled 22 | critical_threshold = 1 23 | # no warning threshold for this monitor 24 | priority = min(var.datadog_agent_priority + var.priority_offset, 5) 25 | docs = var.datadog_agent_docs 26 | note = var.datadog_agent_note 27 | 28 | # module level vars 29 | env = var.env 30 | service = var.service 31 | service_display_name = var.service_display_name 32 | notification_channel = var.notification_channel 33 | additional_tags = var.additional_tags 34 | locked = var.locked 35 | name_prefix = var.name_prefix 36 | name_suffix = var.name_suffix 37 | } 38 | -------------------------------------------------------------------------------- /deploy-desired-vs-status-variables.tf: -------------------------------------------------------------------------------- 1 | variable "deploy_desired_vs_status_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "deploy_desired_vs_status_warning" { 7 | type = number 8 | default = 1 9 | # warning at 1 difference 10 | } 11 | 12 | variable "deploy_desired_vs_status_critical" { 13 | type = number 14 | default = 10 15 | # critical at 10 difference 16 | } 17 | 18 | variable "deploy_desired_vs_status_evaluation_period" { 19 | type = string 20 | default = "last_15m" 21 | } 22 | 23 | variable "deploy_desired_vs_status_note" { 24 | type = string 25 | default = "" 26 | } 27 | 28 | variable "deploy_desired_vs_status_docs" { 29 | type = string 30 | default = "The amount of expected pods to run minus the actual number" 31 | } 32 | 33 | variable "deploy_desired_vs_status_filter_override" { 34 | type = string 35 | default = "" 36 | } 37 | 38 | variable "deploy_desired_vs_status_alerting_enabled" { 39 | type = bool 40 | default = true 41 | } 42 | 43 | variable "deploy_desired_vs_status_no_data_timeframe" { 44 | type = number 45 | default = null 46 | } 47 | 48 | variable "deploy_desired_vs_status_notify_no_data" { 49 | type = bool 50 | default = false 51 | } 52 | 53 | variable "deploy_desired_vs_status_ok_threshold" { 54 | type = number 55 | default = null 56 | } 57 | 58 | variable "deploy_desired_vs_status_priority" { 59 | description = "Number from 1 (high) to 5 (low)." 
60 | 61 | type = number 62 | default = 3 63 | } 64 | -------------------------------------------------------------------------------- /deploy-desired-vs-status.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | deploy_desired_vs_status_filter = coalesce( 3 | var.deploy_desired_vs_status_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "deploy_desired_vs_status" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Desired pods vs current pods (Deployments)" 13 | query = "avg(${var.deploy_desired_vs_status_evaluation_period}):max:kubernetes_state.deployment.replicas_desired{${local.deploy_desired_vs_status_filter}} by {kube_cluster_name} - max:kubernetes_state.deployment.replicas_available{${local.deploy_desired_vs_status_filter}} by {kube_cluster_name} > ${var.deploy_desired_vs_status_critical}" 14 | alert_message = "Kubernetes is having trouble getting all the pods to start. (Based on replicas number in all the deployments)" 15 | recovery_message = "All pods described in deployments have started" 16 | notify_no_data = true 17 | no_data_message = "Kubernetes State data missing for {{kube_cluster_name.name}}" 18 | 19 | # monitor level vars 20 | enabled = var.state_metrics_monitoring && var.deploy_desired_vs_status_enabled 21 | alerting_enabled = var.deploy_desired_vs_status_alerting_enabled 22 | critical_threshold = var.deploy_desired_vs_status_critical 23 | warning_threshold = var.deploy_desired_vs_status_warning 24 | priority = min(var.deploy_desired_vs_status_priority + var.priority_offset, 5) 25 | docs = var.deploy_desired_vs_status_docs 26 | note = var.deploy_desired_vs_status_note 27 | 28 | # module level vars 29 | env = var.env 30 | service = var.service 31 | service_display_name = var.service_display_name 32 | notification_channel = var.notification_channel 33 | additional_tags = var.additional_tags 34 | locked = var.locked 35 | name_prefix = var.name_prefix 36 | name_suffix = var.name_suffix 37 | } 38 | -------------------------------------------------------------------------------- /deployment-multiple-restarts-variables.tf: -------------------------------------------------------------------------------- 1 | variable "deployment_multiple_restarts_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "deployment_multiple_restarts_warning" { 7 | type = number 8 | default = null 9 | } 10 | 11 | variable "deployment_multiple_restarts_critical" { 12 | type = number 13 | default = 5.0 14 | } 15 | 16 | variable "deployment_multiple_restarts_evaluation_period" { 17 | type = string 18 | default = "last_15m" 19 | } 20 | 21 | variable "deployment_multiple_restarts_note" { 22 | type = string 23 | default = "" 24 | } 25 | 26 | variable "deployment_multiple_restarts_docs" { 27 | type = string 28 | default = "If a container restarts once, it can be considered 'normal behaviour' for K8s. 
A Deployment restarting multiple times though is a problem" 29 | } 30 | 31 | variable "deployment_multiple_restarts_filter_override" { 32 | type = string 33 | default = "" 34 | } 35 | 36 | variable "deployment_multiple_restarts_alerting_enabled" { 37 | type = bool 38 | default = true 39 | } 40 | 41 | variable "deployment_multiple_restarts_no_data_timeframe" { 42 | type = number 43 | default = null 44 | } 45 | 46 | variable "deployment_multiple_restarts_notify_no_data" { 47 | type = bool 48 | default = false 49 | } 50 | 51 | variable "deployment_multiple_restarts_ok_threshold" { 52 | type = number 53 | default = null 54 | } 55 | 56 | variable "deployment_multiple_restarts_name_prefix" { 57 | type = string 58 | default = "" 59 | } 60 | 61 | variable "deployment_multiple_restarts_name_suffix" { 62 | type = string 63 | default = "" 64 | } 65 | 66 | variable "deployment_multiple_restarts_priority" { 67 | description = "Number from 1 (high) to 5 (low)." 68 | 69 | type = number 70 | default = 3 71 | } 72 | 73 | variable "deployment_multiple_restarts_notification_channel_override" { 74 | type = string 75 | default = "" 76 | } 77 | -------------------------------------------------------------------------------- /deployment-multiple-restarts.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | deployment_multiple_restarts_filter = coalesce( 3 | var.deployment_multiple_restarts_filter_override, 4 | "${var.filter_str}${var.filter_str_concatenation}kube_deployment:*" 5 | ) 6 | } 7 | 8 | module "deployment_multiple_restarts" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Deployment Multiple Restarts" 13 | query = "max(${var.deployment_multiple_restarts_evaluation_period}):clamp_min(max:kubernetes.containers.restarts{${local.deployment_multiple_restarts_filter}} by {kube_deployment} - hour_before(max:kubernetes.containers.restarts{${local.deployment_multiple_restarts_filter}} by {kube_deployment}), 0) > ${var.deployment_multiple_restarts_critical}" 14 | 15 | # alert specific configuration 16 | require_full_window = true 17 | alert_message = "Kubernetes Deployment {{kube_deployment.name}} has more than {{threshold}} ({{value}}) restarts within one hour" 18 | recovery_message = "Kubernetes Deployment {{kube_deployment.name}} is now at {{value}} restarts of the last hour" 19 | 20 | # monitor level vars 21 | enabled = var.deployment_multiple_restarts_enabled 22 | alerting_enabled = var.deployment_multiple_restarts_alerting_enabled 23 | warning_threshold = var.deployment_multiple_restarts_warning 24 | critical_threshold = var.deployment_multiple_restarts_critical 25 | priority = min(var.deployment_multiple_restarts_priority + var.priority_offset, 5) 26 | docs = var.deployment_multiple_restarts_docs 27 | note = var.deployment_multiple_restarts_note 28 | notification_channel = try(coalesce(var.deployment_multiple_restarts_notification_channel_override, var.notification_channel), "") 29 | 30 | # module level vars 31 | env = var.env 32 | service = var.service 33 | service_display_name = var.service_display_name 34 | additional_tags = var.additional_tags 35 | locked = var.locked 36 | name_prefix = var.name_prefix 37 | name_suffix = var.name_suffix 38 | } 39 | -------------------------------------------------------------------------------- /examples/example.tf: -------------------------------------------------------------------------------- 1 | # tflint-ignore: terraform_module_version 2 | module "kubernetes" { 3 | source = 
"kabisa/kubernetes/datadog" 4 | 5 | notification_channel = "mail@example.com" 6 | service = "Kubernetes" 7 | env = "prd" 8 | filter_str = "kube_cluster_name:production" 9 | } 10 | -------------------------------------------------------------------------------- /hpa-status-variables.tf: -------------------------------------------------------------------------------- 1 | variable "hpa_status_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "hpa_status_evaluation_period" { 7 | type = string 8 | default = "last_15m" 9 | } 10 | 11 | variable "hpa_status_note" { 12 | type = string 13 | default = "" 14 | } 15 | 16 | variable "hpa_status_docs" { 17 | type = string 18 | default = "The Horizontal Pod Autoscaler automatically scales the number of Pods in a replication controller, deployment, replica set or stateful set based on observed CPU utilization\nWhen the HPA is unavailable, the situation could arise that not enough resources are provisioned to handle the incoming load\nhttps://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/" 19 | } 20 | 21 | variable "hpa_status_filter_override" { 22 | type = string 23 | default = "" 24 | } 25 | 26 | variable "hpa_status_alerting_enabled" { 27 | type = bool 28 | default = true 29 | } 30 | 31 | variable "hpa_status_no_data_timeframe" { 32 | type = number 33 | default = null 34 | } 35 | 36 | variable "hpa_status_notify_no_data" { 37 | type = bool 38 | default = false 39 | } 40 | 41 | variable "hpa_status_ok_threshold" { 42 | type = number 43 | default = null 44 | } 45 | 46 | variable "hpa_status_priority" { 47 | description = "Number from 1 (high) to 5 (low)." 48 | 49 | type = number 50 | default = 3 51 | } 52 | -------------------------------------------------------------------------------- /hpa-status.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | hpa_status_filter = ( 3 | var.hpa_status_filter_override != "" ? 
var.hpa_status_filter_override : var.filter_str 4 | ) 5 | } 6 | 7 | module "hpa_status" { 8 | source = "kabisa/generic-monitor/datadog" 9 | version = "1.0.0" 10 | 11 | name = "HPA Status not OK" 12 | query = "avg(${var.hpa_status_evaluation_period}):avg:kubernetes_state.hpa.condition{${local.hpa_status_filter}} by {hpa,kube_namespace,status,condition} < 1" 13 | alert_message = "Kubernetes HPA Status for Node {{node}} is not ok" 14 | recovery_message = "Kubernetes HPA Status for Node {{node}} has recovered" 15 | 16 | 17 | # monitor level vars 18 | enabled = var.state_metrics_monitoring && var.hpa_status_enabled 19 | alerting_enabled = var.hpa_status_alerting_enabled 20 | critical_threshold = 1 21 | # No warning_threshold possible 22 | priority = min(var.hpa_status_priority + var.priority_offset, 5) 23 | docs = var.hpa_status_docs 24 | note = var.hpa_status_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /main.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kabisa/terraform-datadog-kubernetes/36572ad31f227a1b4326c4211d4522f865ac3270/main.tf -------------------------------------------------------------------------------- /memory-limits-low-perc-state-variables.tf: -------------------------------------------------------------------------------- 1 | variable "memory_limits_low_perc_state_enabled" { 2 | type = bool 3 | default = false 4 | description = "Memory state limits are only available when the state metrics api is deployed https://github.com/kubernetes/kube-state-metrics" 5 | } 6 | 7 | variable "memory_limits_low_perc_state_warning" { 8 | type = number 9 | default = 95 10 | } 11 | 12 | variable "memory_limits_low_perc_state_critical" { 13 | type = number 14 | default = 100 15 | } 16 | 17 | variable "memory_limits_low_perc_state_evaluation_period" { 18 | type = string 19 | default = "last_5m" 20 | } 21 | 22 | variable "memory_limits_low_perc_state_note" { 23 | type = string 24 | default = "" 25 | } 26 | 27 | variable "memory_limits_low_perc_state_docs" { 28 | type = string 29 | default = "If the node where a Pod is running has enough of a resource available, it's possible (and allowed) for a container to use more of a resource than its request for that resource specifies. However, a container is not allowed to use more than its resource limit. https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/" 30 | } 31 | 32 | variable "memory_limits_low_perc_state_filter_override" { 33 | type = string 34 | default = "" 35 | } 36 | 37 | variable "memory_limits_low_perc_state_alerting_enabled" { 38 | type = bool 39 | default = true 40 | } 41 | 42 | variable "memory_limits_low_perc_state_no_data_timeframe" { 43 | type = number 44 | default = null 45 | } 46 | 47 | variable "memory_limits_low_perc_state_notify_no_data" { 48 | type = bool 49 | default = false 50 | } 51 | 52 | variable "memory_limits_low_perc_state_ok_threshold" { 53 | type = number 54 | default = null 55 | } 56 | 57 | variable "memory_limits_low_perc_state_priority" { 58 | description = "Number from 1 (high) to 5 (low)." 
59 | 60 | type = number 61 | default = 3 62 | } 63 | -------------------------------------------------------------------------------- /memory-limits-low-perc-state.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | memory_limits_low_perc_state_filter = coalesce( 3 | var.memory_limits_low_perc_state_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "memory_limits_low_perc_state" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Available Memory for Limits in percentage Low" 13 | query = "max(${var.memory_limits_low_perc_state_evaluation_period}):( sum:kubernetes_state.container.memory_limit{${local.memory_limits_low_perc_state_filter}} by {host,kube_cluster_name} / sum:kubernetes_state.node.memory_allocatable{${local.memory_limits_low_perc_state_filter}} by {host,kube_cluster_name}) * 100 > ${var.memory_limits_low_perc_state_critical}" 14 | alert_message = "Kubernetes cluster memory room for limits in percentage is too low" 15 | recovery_message = "Kubernetes cluster memory limits in percentage has recovered" 16 | 17 | # monitor level vars 18 | enabled = var.memory_limits_low_perc_state_enabled 19 | alerting_enabled = var.memory_limits_low_perc_state_alerting_enabled 20 | critical_threshold = var.memory_limits_low_perc_state_critical 21 | warning_threshold = var.memory_limits_low_perc_state_warning 22 | priority = min(var.memory_limits_low_perc_state_priority + var.priority_offset, 5) 23 | docs = var.memory_limits_low_perc_state_docs 24 | note = var.memory_limits_low_perc_state_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /memory-limits-low-perc-variables.tf: -------------------------------------------------------------------------------- 1 | variable "memory_limits_low_perc_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "memory_limits_low_perc_warning" { 7 | type = number 8 | default = 95 9 | } 10 | 11 | variable "memory_limits_low_perc_critical" { 12 | type = number 13 | default = 100 14 | } 15 | 16 | variable "memory_limits_low_perc_evaluation_period" { 17 | type = string 18 | default = "last_5m" 19 | } 20 | 21 | variable "memory_limits_low_perc_note" { 22 | type = string 23 | default = "" 24 | } 25 | 26 | variable "memory_limits_low_perc_docs" { 27 | type = string 28 | default = "If the node where a Pod is running has enough of a resource available, it's possible (and allowed) for a container to use more of a resource than its request for that resource specifies. However, a container is not allowed to use more than its resource limit. 
https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/" 29 | } 30 | 31 | variable "memory_limits_low_perc_filter_override" { 32 | type = string 33 | default = "" 34 | } 35 | 36 | variable "memory_limits_low_perc_alerting_enabled" { 37 | type = bool 38 | default = true 39 | } 40 | 41 | variable "memory_limits_low_perc_no_data_timeframe" { 42 | type = number 43 | default = null 44 | } 45 | 46 | variable "memory_limits_low_perc_notify_no_data" { 47 | type = bool 48 | default = false 49 | } 50 | 51 | variable "memory_limits_low_perc_ok_threshold" { 52 | type = number 53 | default = null 54 | } 55 | 56 | variable "memory_limits_low_perc_priority" { 57 | description = "Number from 1 (high) to 5 (low)." 58 | 59 | type = number 60 | default = 3 61 | } 62 | -------------------------------------------------------------------------------- /memory-limits-low-perc.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | memory_limits_low_perc_filter = coalesce( 3 | var.memory_limits_low_perc_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "memory_limits_low_perc" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Available Memory for Limits in percentage Low" 13 | query = "max(${var.memory_limits_low_perc_evaluation_period}):( max:kubernetes.memory.limits{${local.memory_limits_low_perc_filter}} by {host,kube_cluster_name}/ max:system.mem.total{${local.memory_limits_low_perc_filter}} by {host,kube_cluster_name}) * 100 > ${var.memory_limits_low_perc_critical}" 14 | alert_message = "Kubernetes cluster memory room for limits in percentage is too low" 15 | recovery_message = "Kubernetes cluster memory limits in percentage has recovered" 16 | 17 | # monitor level vars 18 | enabled = var.memory_limits_low_perc_enabled 19 | alerting_enabled = var.memory_limits_low_perc_alerting_enabled 20 | critical_threshold = var.memory_limits_low_perc_critical 21 | warning_threshold = var.memory_limits_low_perc_warning 22 | priority = min(var.memory_limits_low_perc_priority + var.priority_offset, 5) 23 | docs = var.memory_limits_low_perc_docs 24 | note = var.memory_limits_low_perc_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /memory-limits-low-variables.tf: -------------------------------------------------------------------------------- 1 | variable "memory_limits_low_enabled" { 2 | type = bool 3 | default = false 4 | description = "This monitor is based on absolute values and thus less useful. Prefer setting memory_limits_low_perc_enabled to true." 
5 | } 6 | 7 | variable "memory_limits_low_warning" { 8 | type = number 9 | default = 4000000000 10 | } 11 | 12 | variable "memory_limits_low_critical" { 13 | type = number 14 | default = 3000000000 15 | } 16 | 17 | variable "memory_limits_low_evaluation_period" { 18 | type = string 19 | default = "last_5m" 20 | } 21 | 22 | variable "memory_limits_low_note" { 23 | type = string 24 | default = "" 25 | } 26 | 27 | variable "memory_limits_low_docs" { 28 | type = string 29 | default = "If the node where a Pod is running has enough of a resource available, it's possible (and allowed) for a container to use more of a resource than its request for that resource specifies. However, a container is not allowed to use more than its resource limit. https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/" 30 | } 31 | 32 | variable "memory_limits_low_filter_override" { 33 | type = string 34 | default = "" 35 | } 36 | 37 | variable "memory_limits_low_alerting_enabled" { 38 | type = bool 39 | default = true 40 | } 41 | 42 | variable "memory_limits_low_no_data_timeframe" { 43 | type = number 44 | default = null 45 | } 46 | 47 | variable "memory_limits_low_notify_no_data" { 48 | type = bool 49 | default = false 50 | } 51 | 52 | variable "memory_limits_low_ok_threshold" { 53 | type = number 54 | default = null 55 | } 56 | 57 | variable "memory_limits_low_priority" { 58 | description = "Number from 1 (high) to 5 (low)." 59 | 60 | type = number 61 | default = 3 62 | } 63 | -------------------------------------------------------------------------------- /memory-limits-low.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | memory_limits_low_filter = coalesce( 3 | var.memory_limits_low_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "memory_limits_low" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Available Memory for Limits Low" 13 | query = "avg(${var.memory_limits_low_evaluation_period}):max:system.mem.total{${local.memory_limits_low_filter}} by {host,kube_cluster_name} - max:kubernetes.memory.limits{${local.memory_limits_low_filter}} by {host,kube_cluster_name} < ${var.memory_limits_low_critical}" 14 | alert_message = "Kubernetes cluster memory room for limits is too low" 15 | recovery_message = "Kubernetes cluster memory limits has recovered" 16 | 17 | # monitor level vars 18 | enabled = var.memory_limits_low_enabled 19 | alerting_enabled = var.memory_limits_low_alerting_enabled 20 | critical_threshold = var.memory_limits_low_critical 21 | warning_threshold = var.memory_limits_low_warning 22 | priority = min(var.memory_limits_low_priority + var.priority_offset, 5) 23 | docs = var.memory_limits_low_docs 24 | note = var.memory_limits_low_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /memory-requests-low-perc-state-variables.tf: -------------------------------------------------------------------------------- 1 | variable "memory_requests_low_perc_state_enabled" { 2 | type = bool 3 | default = false 4 | description = "Memory state limits are only available when the state metrics api is deployed 
https://github.com/kubernetes/kube-state-metrics" 5 | } 6 | 7 | variable "memory_requests_low_perc_state_warning" { 8 | type = number 9 | default = 85 10 | } 11 | 12 | variable "memory_requests_low_perc_state_critical" { 13 | type = number 14 | default = 95 15 | } 16 | 17 | variable "memory_requests_low_perc_state_evaluation_period" { 18 | type = string 19 | default = "last_5m" 20 | } 21 | 22 | variable "memory_requests_low_perc_state_note" { 23 | type = string 24 | default = "" 25 | } 26 | 27 | variable "memory_requests_low_perc_state_docs" { 28 | type = string 29 | default = "If the node where a Pod is running has enough of a resource available, it's possible (and allowed) for a container to use more of a resource than its request for that resource specifies. However, a container is not allowed to use more than its resource limit. https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/" 30 | } 31 | 32 | variable "memory_requests_low_perc_state_filter_override" { 33 | type = string 34 | default = "" 35 | } 36 | 37 | variable "memory_requests_low_perc_state_alerting_enabled" { 38 | type = bool 39 | default = true 40 | } 41 | 42 | variable "memory_requests_low_perc_state_no_data_timeframe" { 43 | type = number 44 | default = null 45 | } 46 | 47 | variable "memory_requests_low_perc_state_notify_no_data" { 48 | type = bool 49 | default = false 50 | } 51 | 52 | variable "memory_requests_low_perc_state_ok_threshold" { 53 | type = number 54 | default = null 55 | } 56 | 57 | variable "memory_requests_low_perc_state_priority" { 58 | description = "Number from 1 (high) to 5 (low)." 59 | 60 | type = number 61 | default = 3 62 | } 63 | -------------------------------------------------------------------------------- /memory-requests-low-perc-state.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | memory_requests_low_perc_state_filter = coalesce( 3 | var.memory_requests_low_perc_state_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "memory_requests_low_perc_state" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Available Memory for Requests in percentage Low" 13 | query = "max(${var.memory_requests_low_perc_state_evaluation_period}):( max:kubernetes_state.container.memory_requested{${local.memory_requests_low_perc_state_filter}} / max:kubernetes_state.node.memory_allocatable{${local.memory_requests_low_perc_state_filter}} ) * 100 > ${var.memory_requests_low_perc_state_critical}" 14 | alert_message = "Kubernetes cluster memory room for Requests in percentage is too low" 15 | recovery_message = "Kubernetes cluster memory Requests in percentage has recovered" 16 | 17 | # monitor level vars 18 | enabled = var.memory_requests_low_perc_state_enabled 19 | alerting_enabled = var.memory_requests_low_perc_state_alerting_enabled 20 | critical_threshold = var.memory_requests_low_perc_state_critical 21 | warning_threshold = var.memory_requests_low_perc_state_warning 22 | priority = min(var.memory_requests_low_perc_state_priority + var.priority_offset, 5) 23 | docs = var.memory_requests_low_perc_state_docs 24 | note = var.memory_requests_low_perc_state_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | 
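The kube-state-metrics based monitors in this module (such as memory_limits_low_perc_state above and memory_requests_low_perc_state below) default to disabled because they require kube-state-metrics to be deployed in the cluster. A minimal, illustrative sketch of a caller that enables them is shown below; the module source address, cluster filter, and notification channel are placeholder values, and the full set of module-level inputs lives in variables.tf.

```hcl
# Hypothetical caller enabling the kube-state-metrics based memory monitors.
# The source address, filter value and channel are illustrative only.
module "k8s_monitors" {
  source = "kabisa/kubernetes/datadog" # or a local path to this module

  filter_str           = "kube_cluster_name:example-cluster"
  env                  = "prd"
  service              = "kubernetes"
  notification_channel = "@slack-platform-team"

  # Both default to false; only enable them when kube-state-metrics is running.
  memory_limits_low_perc_state_enabled   = true
  memory_requests_low_perc_state_enabled = true
}
```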
-------------------------------------------------------------------------------- /memory-requests-low-perc-variables.tf: -------------------------------------------------------------------------------- 1 | variable "memory_requests_low_perc_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "memory_requests_low_perc_warning" { 7 | type = number 8 | default = 85 9 | } 10 | 11 | variable "memory_requests_low_perc_critical" { 12 | type = number 13 | default = 95 14 | } 15 | 16 | variable "memory_requests_low_perc_evaluation_period" { 17 | type = string 18 | default = "last_5m" 19 | } 20 | 21 | variable "memory_requests_low_perc_note" { 22 | type = string 23 | default = "" 24 | } 25 | 26 | variable "memory_requests_low_perc_docs" { 27 | type = string 28 | default = "If the node where a Pod is running has enough of a resource available, it's possible (and allowed) for a container to use more of a resource than its request for that resource specifies. However, a container is not allowed to use more than its resource limit. https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/" 29 | } 30 | 31 | variable "memory_requests_low_perc_filter_override" { 32 | type = string 33 | default = "" 34 | } 35 | 36 | variable "memory_requests_low_perc_alerting_enabled" { 37 | type = bool 38 | default = true 39 | } 40 | 41 | variable "memory_requests_low_perc_no_data_timeframe" { 42 | type = number 43 | default = null 44 | } 45 | 46 | variable "memory_requests_low_perc_notify_no_data" { 47 | type = bool 48 | default = false 49 | } 50 | 51 | variable "memory_requests_low_perc_ok_threshold" { 52 | type = number 53 | default = null 54 | } 55 | 56 | variable "memory_requests_low_perc_priority" { 57 | description = "Number from 1 (high) to 5 (low)." 
58 | 59 | type = number 60 | default = 3 61 | } 62 | -------------------------------------------------------------------------------- /memory-requests-low-perc.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | memory_requests_low_perc_filter = coalesce( 3 | var.memory_requests_low_perc_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "memory_requests_low_perc" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Available Memory for Requests in percentage Low" 13 | query = "max(${var.memory_requests_low_perc_evaluation_period}):( max:kubernetes.memory.requests{${local.memory_requests_low_perc_filter}} / max:system.mem.total{${local.memory_requests_low_perc_filter}} ) * 100 > ${var.memory_requests_low_perc_critical}" 14 | alert_message = "Kubernetes cluster memory room for Requests in percentage is too low" 15 | recovery_message = "Kubernetes cluster memory Requests in percentage has recovered" 16 | 17 | # monitor level vars 18 | enabled = var.memory_requests_low_perc_enabled 19 | alerting_enabled = var.memory_requests_low_perc_alerting_enabled 20 | critical_threshold = var.memory_requests_low_perc_critical 21 | warning_threshold = var.memory_requests_low_perc_warning 22 | priority = min(var.memory_requests_low_perc_priority + var.priority_offset, 5) 23 | docs = var.memory_requests_low_perc_docs 24 | note = var.memory_requests_low_perc_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /memory-requests-low-variables.tf: -------------------------------------------------------------------------------- 1 | variable "memory_requests_low_enabled" { 2 | type = bool 3 | default = false 4 | description = "This monitor is based on absolute values and thus less useful. Prefer setting memory_requests_low_perc_enabled to true." 5 | } 6 | 7 | variable "memory_requests_low_warning" { 8 | type = number 9 | default = 4000000000 # bytes, roughly 4 GB 10 | } 11 | 12 | variable "memory_requests_low_critical" { 13 | type = number 14 | default = 3000000000 # bytes, roughly 3 GB 15 | } 16 | 17 | variable "memory_requests_low_evaluation_period" { 18 | type = string 19 | default = "last_5m" 20 | } 21 | 22 | variable "memory_requests_low_note" { 23 | type = string 24 | default = "" 25 | } 26 | 27 | variable "memory_requests_low_docs" { 28 | type = string 29 | default = "If the node where a Pod is running has enough of a resource available, it's possible (and allowed) for a container to use more of a resource than its request for that resource specifies. However, a container is not allowed to use more than its resource limit.
https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/" 30 | } 31 | 32 | variable "memory_requests_low_filter_override" { 33 | type = string 34 | default = "" 35 | } 36 | 37 | variable "memory_requests_low_alerting_enabled" { 38 | type = bool 39 | default = true 40 | } 41 | 42 | variable "memory_requests_low_no_data_timeframe" { 43 | type = number 44 | default = null 45 | } 46 | 47 | variable "memory_requests_low_notify_no_data" { 48 | type = bool 49 | default = false 50 | } 51 | 52 | variable "memory_requests_low_ok_threshold" { 53 | type = number 54 | default = null 55 | } 56 | 57 | variable "memory_requests_low_priority" { 58 | description = "Number from 1 (high) to 5 (low)." 59 | 60 | type = number 61 | default = 3 62 | } 63 | -------------------------------------------------------------------------------- /memory-requests-low.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | memory_requests_low_filter = coalesce( 3 | var.memory_requests_low_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "memory_requests_low" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Available Memory for Requests Low" 13 | query = "avg(${var.memory_requests_low_evaluation_period}):max:system.mem.total{${local.memory_requests_low_filter}} by {host,kube_cluster_name} - max:kubernetes.memory.requests{${local.memory_requests_low_filter}} by {host,kube_cluster_name} < ${var.memory_requests_low_critical}" 14 | alert_message = "Total memory available for requests on {{ host }} is low ({{value}})" 15 | recovery_message = "Total memory available for requests on {{ host }} has recovered ({{value}})" 16 | 17 | # monitor level vars 18 | enabled = var.memory_requests_low_enabled 19 | alerting_enabled = var.memory_requests_low_alerting_enabled 20 | critical_threshold = var.memory_requests_low_critical 21 | warning_threshold = var.memory_requests_low_warning 22 | priority = min(var.memory_requests_low_priority + var.priority_offset, 5) 23 | docs = var.memory_requests_low_docs 24 | note = var.memory_requests_low_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /module_description.md: -------------------------------------------------------------------------------- 1 | This module mainly checks the Kubernetes resource level and cluster health. 2 | System level monitoring can best be implemented with the [system module](https://github.com/kabisa/terraform-datadog-system). 3 | Docker/Container level monitoring can best be implemented with the [docker module](https://github.com/kabisa/terraform-datadog-docker-container).
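Every monitor in this module follows the same pattern: an `<monitor>_enabled` flag, `<monitor>_warning`/`<monitor>_critical` thresholds, a `<monitor>_filter_override` that falls back to the shared `filter_str` via `coalesce()`, and a priority that is capped at 5 via `min(<monitor>_priority + priority_offset, 5)`. As an illustrative sketch of overriding a single monitor from a caller (variable names come from the monitor files in this repository; the values are made up):

```hcl
# Illustrative caller: scope one monitor more narrowly and raise its priority,
# while shifting the priority of every other monitor with priority_offset.
module "k8s_monitors" {
  source = "../terraform-datadog-kubernetes" # path is illustrative

  filter_str           = "kube_cluster_name:example-cluster"
  notification_channel = "@pagerduty-platform"

  # Only the pods_failed monitor uses this narrower filter; all others keep filter_str.
  pods_failed_filter_override = "kube_cluster_name:example-cluster,kube_namespace:payments"
  pods_failed_priority        = 1

  # Added to each monitor's own priority and capped at 5 (lowest urgency).
  priority_offset = 1
}
```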
4 | 5 | # Recent changes: 6 | 7 | - switch from kubernetes_state to kubernetes_state_core as a default https://docs.datadoghq.com/integrations/kubernetes_state_core/?tab=helm 8 | - upgrade provider to ~> 3.12 9 | -------------------------------------------------------------------------------- /network-unavailable-variables.tf: -------------------------------------------------------------------------------- 1 | variable "network_unavailable_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "network_unavailable_critical" { 7 | type = number 8 | default = 0 9 | description = "alert is raised when (desired - running) > network_unavailable_critical" 10 | } 11 | 12 | variable "network_unavailable_evaluation_period" { 13 | type = string 14 | default = "last_5m" 15 | } 16 | 17 | variable "network_unavailable_note" { 18 | type = string 19 | default = "" 20 | } 21 | 22 | variable "network_unavailable_docs" { 23 | type = string 24 | default = "All your nodes need network connections, and this status indicates that there’s something wrong with a node’s network connection. Either it wasn’t set up properly (due to route exhaustion or a misconfiguration), or there’s a physical problem with the network connection to your hardware." 25 | } 26 | 27 | variable "network_unavailable_filter_override" { 28 | type = string 29 | default = "" 30 | } 31 | 32 | variable "network_unavailable_alerting_enabled" { 33 | type = bool 34 | default = true 35 | } 36 | 37 | variable "network_unavailable_no_data_timeframe" { 38 | type = number 39 | default = null 40 | } 41 | 42 | variable "network_unavailable_notify_no_data" { 43 | type = bool 44 | default = false 45 | } 46 | 47 | variable "network_unavailable_ok_threshold" { 48 | type = number 49 | default = null 50 | } 51 | 52 | variable "network_unavailable_priority" { 53 | description = "Number from 1 (high) to 5 (low)." 54 | 55 | type = number 56 | default = 3 57 | } 58 | -------------------------------------------------------------------------------- /network-unavailable.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | network_unavailable_filter = coalesce( 3 | var.network_unavailable_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "network_unavailable" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Nodes with Network Unavailable" 13 | query = "avg(${var.network_unavailable_evaluation_period}):max:kubernetes_state.node.by_condition{${local.network_unavailable_filter} AND condition:networkunavailable AND (status:true OR status:unknown)} by {kube_cluster_name,host} > ${var.network_unavailable_critical}" 14 | alert_message = "Kubernetes cluster node {{node}} has no network. 
Meaning it is not accessible" 15 | recovery_message = "Kubernetes cluster node {{node}} has come back on the network" 16 | 17 | # monitor level vars 18 | enabled = var.state_metrics_monitoring && var.network_unavailable_enabled 19 | alerting_enabled = var.network_unavailable_alerting_enabled 20 | critical_threshold = var.network_unavailable_critical 21 | # no warning threshold for this monitor 22 | priority = min(var.network_unavailable_priority + var.priority_offset, 5) 23 | docs = var.network_unavailable_docs 24 | note = var.network_unavailable_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /node-diskpressure-variables.tf: -------------------------------------------------------------------------------- 1 | variable "node_diskpressure_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "node_diskpressure_critical" { 7 | type = number 8 | default = 0 9 | description = "alert is raised when (desired - running) > node_diskpressure_critical" 10 | } 11 | 12 | variable "node_diskpressure_evaluation_period" { 13 | type = string 14 | default = "last_5m" 15 | } 16 | 17 | variable "node_diskpressure_note" { 18 | type = string 19 | default = "" 20 | } 21 | 22 | variable "node_diskpressure_docs" { 23 | type = string 24 | default = "Disk pressure is a condition indicating that a node is using too much disk space or is using disk space too fast, according to the thresholds you have set in your Kubernetes configuration. This is important to monitor because it might mean that you need to add more disk space, if your application legitimately needs more space. Or it might mean that an application is misbehaving and filling up the disk prematurely in an unanticipated manner. Either way, it’s a condition which needs your attention." 25 | } 26 | 27 | variable "node_diskpressure_filter_override" { 28 | type = string 29 | default = "" 30 | } 31 | 32 | variable "node_diskpressure_alerting_enabled" { 33 | type = bool 34 | default = true 35 | } 36 | 37 | variable "node_diskpressure_no_data_timeframe" { 38 | type = number 39 | default = null 40 | } 41 | 42 | variable "node_diskpressure_notify_no_data" { 43 | type = bool 44 | default = false 45 | } 46 | 47 | variable "node_diskpressure_ok_threshold" { 48 | type = number 49 | default = null 50 | } 51 | 52 | variable "node_diskpressure_priority" { 53 | description = "Number from 1 (high) to 5 (low)." 
54 | 55 | type = number 56 | default = 3 57 | } 58 | -------------------------------------------------------------------------------- /node-diskpressure.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | node_diskpressure_filter = coalesce( 3 | var.node_diskpressure_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "node_diskpressure" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Nodes with Diskpressure" 13 | query = "avg(${var.node_diskpressure_evaluation_period}):max:kubernetes_state.node.by_condition{${local.node_diskpressure_filter} AND condition:diskpressure AND (status:true OR status:unknown)} by {kube_cluster_name,host} > ${var.node_diskpressure_critical}" 14 | alert_message = "Kubernetes cluster node {{node}} has diskpressure. Meaning it is low on disk space (Logging, emptydir volumes, caching, etc)" 15 | recovery_message = "Kubernetes cluster node {{node}} no longer has problems with DiskPressure." 16 | 17 | # monitor level vars 18 | enabled = var.state_metrics_monitoring && var.node_diskpressure_enabled 19 | alerting_enabled = var.node_diskpressure_alerting_enabled 20 | critical_threshold = var.node_diskpressure_critical 21 | # no warning threshold for this monitor 22 | priority = min(var.node_diskpressure_priority + var.priority_offset, 5) 23 | docs = var.node_diskpressure_docs 24 | note = var.node_diskpressure_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /node-memory-used-percent-variables.tf: -------------------------------------------------------------------------------- 1 | variable "node_memory_used_percent_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "node_memory_used_percent_warning" { 7 | type = number 8 | default = 80 9 | # 80 % 10 | } 11 | 12 | variable "node_memory_used_percent_critical" { 13 | type = number 14 | default = 90 15 | # 90 % 16 | } 17 | 18 | variable "node_memory_used_percent_evaluation_period" { 19 | type = string 20 | default = "last_5m" 21 | } 22 | 23 | variable "node_memory_used_percent_note" { 24 | type = string 25 | default = "" 26 | } 27 | 28 | variable "node_memory_used_percent_docs" { 29 | type = string 30 | default = "" 31 | } 32 | 33 | variable "node_memory_used_percent_filter_override" { 34 | type = string 35 | default = "" 36 | } 37 | 38 | variable "node_memory_used_percent_alerting_enabled" { 39 | type = bool 40 | default = true 41 | } 42 | 43 | variable "node_memory_used_percent_no_data_timeframe" { 44 | type = number 45 | default = null 46 | } 47 | 48 | variable "node_memory_used_percent_notify_no_data" { 49 | type = bool 50 | default = false 51 | } 52 | 53 | variable "node_memory_used_percent_ok_threshold" { 54 | type = number 55 | default = null 56 | } 57 | 58 | variable "node_memory_used_percent_priority" { 59 | description = "Number from 1 (high) to 5 (low)." 
60 | 61 | type = number 62 | default = 2 63 | } 64 | -------------------------------------------------------------------------------- /node-memory-used-percent.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | node_memory_used_percent_filter = coalesce( 3 | var.node_memory_used_percent_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "node_memory_used_percent" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Memory Used Percent" 13 | query = "avg(${var.node_memory_used_percent_evaluation_period}):( 100 * max:kubernetes.memory.usage{${local.node_memory_used_percent_filter}} by {host,kube_cluster_name} ) / max:system.mem.total{${local.node_memory_used_percent_filter}} by {host,kube_cluster_name} > ${var.node_memory_used_percent_critical}" 14 | alert_message = "Available memory on ${var.service} Node {{host.name}} has dropped below {{threshold}} and has {{value}}% available" 15 | recovery_message = "Available memory on ${var.service} Node {{host.name}} has recovered {{value}}%" 16 | 17 | # monitor level vars 18 | enabled = var.node_memory_used_percent_enabled 19 | alerting_enabled = var.node_memory_used_percent_alerting_enabled 20 | critical_threshold = var.node_memory_used_percent_critical 21 | warning_threshold = var.node_memory_used_percent_warning 22 | priority = min(var.node_memory_used_percent_priority + var.priority_offset, 5) 23 | docs = var.node_memory_used_percent_docs 24 | note = var.node_memory_used_percent_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /node-memorypressure-variables.tf: -------------------------------------------------------------------------------- 1 | variable "node_memorypressure_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "node_memorypressure_critical" { 7 | type = number 8 | default = 0 9 | description = "alert is raised when (desired - running) > node_memorypressure_critical" 10 | } 11 | 12 | variable "node_memorypressure_evaluation_period" { 13 | type = string 14 | default = "last_5m" 15 | } 16 | 17 | variable "node_memorypressure_note" { 18 | type = string 19 | default = "" 20 | } 21 | 22 | variable "node_memorypressure_docs" { 23 | type = string 24 | default = "Memory pressure is a resourcing condition indicating that your node is running out of memory. Similar to CPU resourcing, you don’t want to run out of memory. You especially need to watch for this condition because it could mean there’s a memory leak in one of your applications." 25 | } 26 | 27 | variable "node_memorypressure_filter_override" { 28 | type = string 29 | default = "" 30 | } 31 | 32 | variable "node_memorypressure_alerting_enabled" { 33 | type = bool 34 | default = true 35 | } 36 | 37 | variable "node_memorypressure_no_data_timeframe" { 38 | type = number 39 | default = null 40 | } 41 | 42 | variable "node_memorypressure_notify_no_data" { 43 | type = bool 44 | default = false 45 | } 46 | 47 | variable "node_memorypressure_ok_threshold" { 48 | type = number 49 | default = null 50 | } 51 | 52 | variable "node_memorypressure_priority" { 53 | description = "Number from 1 (high) to 5 (low)." 
54 | 55 | type = number 56 | default = 3 57 | } 58 | -------------------------------------------------------------------------------- /node-memorypressure.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | node_memorypressure_filter = coalesce( 3 | var.node_memorypressure_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "node_memorypressure" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Nodes with Memorypressure" 13 | query = "avg(${var.node_memorypressure_evaluation_period}):max:kubernetes_state.node.by_condition{${local.node_memorypressure_filter} AND condition:memorypressure AND (status:true OR status:unknown)} by {kube_cluster_name,host} > ${var.node_memorypressure_critical}" 14 | alert_message = "Kubernetes cluster node {{node}} has memorypressure. Meaning it is low on memory" 15 | recovery_message = "Kubernetes cluster node {{node}} no longer has Memory Pressure." 16 | 17 | # monitor level vars 18 | enabled = var.state_metrics_monitoring && var.node_memorypressure_enabled 19 | alerting_enabled = var.node_memorypressure_alerting_enabled 20 | critical_threshold = var.node_memorypressure_critical 21 | # no warning threshold for this monitor 22 | priority = min(var.node_memorypressure_priority + var.priority_offset, 5) 23 | docs = var.node_memorypressure_docs 24 | note = var.node_memorypressure_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /node-ready-variables.tf: -------------------------------------------------------------------------------- 1 | variable "node_ready_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "node_ready_critical" { 7 | type = number 8 | default = 1 9 | } 10 | 11 | variable "node_ready_evaluation_period" { 12 | type = string 13 | default = "last_5m" 14 | } 15 | 16 | variable "node_ready_note" { 17 | type = string 18 | default = "" 19 | } 20 | 21 | variable "node_ready_docs" { 22 | type = string 23 | default = "Checks to see if the node is in ready status or not" 24 | } 25 | 26 | variable "node_ready_filter_override" { 27 | type = string 28 | default = "" 29 | } 30 | 31 | variable "node_ready_alerting_enabled" { 32 | type = bool 33 | default = true 34 | } 35 | 36 | variable "node_ready_no_data_timeframe" { 37 | type = number 38 | default = null 39 | } 40 | 41 | variable "node_ready_notify_no_data" { 42 | type = bool 43 | default = false 44 | } 45 | 46 | variable "node_ready_ok_threshold" { 47 | type = number 48 | default = null 49 | } 50 | 51 | variable "node_ready_priority" { 52 | description = "Number from 1 (high) to 5 (low)." 
53 | 54 | type = number 55 | default = 2 56 | } 57 | -------------------------------------------------------------------------------- /node-ready.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | node_ready_filter = coalesce( 3 | var.node_ready_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "node_ready" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Node Not Ready" 13 | query = "avg(${var.node_ready_evaluation_period}):count_nonzero(sum:kubernetes_state.node.by_condition{${local.node_ready_filter} AND (NOT condition:ready) AND (status:true OR status:unknown)} by {kube_cluster_name,host}) > ${var.node_ready_critical}" 14 | alert_message = "Kubernetes cluster node {{host}} is not ready." 15 | recovery_message = "Kubernetes cluster node {{host}} is ready again." 16 | 17 | # monitor level vars 18 | enabled = var.state_metrics_monitoring && var.node_ready_enabled 19 | alerting_enabled = var.node_ready_alerting_enabled 20 | critical_threshold = var.node_ready_critical 21 | # no warning threshold for this monitor 22 | priority = min(var.node_ready_priority + var.priority_offset, 5) 23 | docs = var.node_ready_docs 24 | note = var.node_ready_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /node-status-variables.tf: -------------------------------------------------------------------------------- 1 | variable "node_status_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "node_status_evaluation_period" { 7 | type = string 8 | default = "last_5m" 9 | } 10 | 11 | variable "node_status_note" { 12 | type = string 13 | default = "" 14 | } 15 | 16 | variable "node_status_docs" { 17 | type = string 18 | default = "This cluster state metric provides a high-level overview of a node’s health and whether the scheduler can place pods on that node. It runs checks on the following node conditions\nhttps://kubernetes.io/docs/concepts/architecture/nodes/#condition" 19 | } 20 | 21 | variable "node_status_filter_override" { 22 | type = string 23 | default = "" 24 | } 25 | 26 | variable "node_status_alerting_enabled" { 27 | type = bool 28 | default = true 29 | } 30 | 31 | variable "node_status_no_data_timeframe" { 32 | type = number 33 | default = null 34 | } 35 | 36 | variable "node_status_notify_no_data" { 37 | type = bool 38 | default = false 39 | } 40 | 41 | variable "node_status_ok_threshold" { 42 | type = number 43 | default = null 44 | } 45 | 46 | variable "node_status_priority" { 47 | description = "Number from 1 (high) to 5 (low)." 
48 | 49 | type = number 50 | default = 2 51 | } 52 | -------------------------------------------------------------------------------- /node-status.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | node_status_filter = coalesce( 3 | var.node_status_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "node_status" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Node Status not OK" 13 | query = "avg(${var.node_status_evaluation_period}):avg:kubernetes_state.node.status{${local.node_status_filter}} by {kube_cluster_name,node} < 1" 14 | alert_message = "Kubernetes Node Status for Node {{node}} is not ok" 15 | recovery_message = "Kubernetes Node Status for Node {{node}} has recovered" 16 | require_full_window = false 17 | 18 | # monitor level vars 19 | enabled = var.state_metrics_monitoring && var.node_status_enabled 20 | alerting_enabled = var.node_status_alerting_enabled 21 | critical_threshold = 1 22 | # No warning possible for status that is either 0 or 1 23 | priority = min(var.node_status_priority + var.priority_offset, 5) 24 | docs = var.node_status_docs 25 | note = var.node_status_note 26 | 27 | # module level vars 28 | env = var.env 29 | service = var.service 30 | service_display_name = var.service_display_name 31 | notification_channel = var.notification_channel 32 | additional_tags = var.additional_tags 33 | locked = var.locked 34 | name_prefix = var.name_prefix 35 | name_suffix = var.name_suffix 36 | } 37 | -------------------------------------------------------------------------------- /persistent-volumes-variables.tf: -------------------------------------------------------------------------------- 1 | variable "persistent_volumes_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "persistent_volumes_warning" { 7 | type = number 8 | default = 0 9 | } 10 | 11 | variable "persistent_volumes_critical" { 12 | type = number 13 | default = 1 14 | } 15 | 16 | variable "persistent_volumes_evaluation_period" { 17 | type = string 18 | default = "last_5m" 19 | } 20 | 21 | variable "persistent_volumes_note" { 22 | type = string 23 | default = "" 24 | } 25 | 26 | variable "persistent_volumes_docs" { 27 | type = string 28 | default = "" 29 | } 30 | 31 | variable "persistent_volumes_filter_override" { 32 | type = string 33 | default = "" 34 | } 35 | 36 | variable "persistent_volumes_alerting_enabled" { 37 | type = bool 38 | default = true 39 | } 40 | 41 | variable "persistent_volumes_no_data_timeframe" { 42 | type = number 43 | default = null 44 | } 45 | 46 | variable "persistent_volumes_notify_no_data" { 47 | type = bool 48 | default = false 49 | } 50 | 51 | variable "persistent_volumes_ok_threshold" { 52 | type = number 53 | default = null 54 | } 55 | 56 | variable "persistent_volumes_priority" { 57 | description = "Number from 1 (high) to 5 (low)." 
58 | 59 | type = number 60 | default = 3 61 | } 62 | -------------------------------------------------------------------------------- /persistent-volumes.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | persistent_volumes_filter = coalesce( 3 | var.persistent_volumes_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "persistent_volumes_low" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Failed Persistent Volume Claims" 13 | query = "avg(${var.persistent_volumes_evaluation_period}):max:kubernetes_state.persistentvolume.by_phase{${local.persistent_volumes_filter} AND phase:failed} > ${var.persistent_volumes_critical}" 14 | alert_message = "There are failed Physical Volume Claims, storage has problems" 15 | recovery_message = "There are no failed Physical Volume Claims" 16 | 17 | # monitor level vars 18 | enabled = var.persistent_volumes_enabled 19 | alerting_enabled = var.persistent_volumes_alerting_enabled 20 | critical_threshold = var.persistent_volumes_critical 21 | warning_threshold = var.persistent_volumes_warning 22 | priority = min(var.persistent_volumes_priority + var.priority_offset, 5) 23 | docs = var.persistent_volumes_docs 24 | note = var.persistent_volumes_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /pid-pressure-variables.tf: -------------------------------------------------------------------------------- 1 | variable "pid_pressure_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "pid_pressure_critical" { 7 | type = number 8 | default = 0 9 | description = "alert is raised when (desired - running) > pid_pressure_critical" 10 | } 11 | 12 | variable "pid_pressure_evaluation_period" { 13 | type = string 14 | default = "last_5m" 15 | } 16 | 17 | variable "pid_pressure_note" { 18 | type = string 19 | default = "" 20 | } 21 | 22 | variable "pid_pressure_docs" { 23 | type = string 24 | default = "PID pressure is a rare condition where a pod or container spawns too many processes and starves the node of available process IDs. Each node has a limited number of process IDs to distribute amongst running processes; and if it runs out of IDs, no other processes can be started. Kubernetes lets you set PID thresholds for pods to limit their ability to perform runaway process-spawning, and a PID pressure condition means that one or more pods are using up their allocated PIDs and need to be examined." 25 | } 26 | 27 | variable "pid_pressure_filter_override" { 28 | type = string 29 | default = "" 30 | } 31 | 32 | variable "pid_pressure_alerting_enabled" { 33 | type = bool 34 | default = true 35 | } 36 | 37 | variable "pid_pressure_no_data_timeframe" { 38 | type = number 39 | default = null 40 | } 41 | 42 | variable "pid_pressure_notify_no_data" { 43 | type = bool 44 | default = false 45 | } 46 | 47 | variable "pid_pressure_ok_threshold" { 48 | type = number 49 | default = null 50 | } 51 | 52 | variable "pid_pressure_priority" { 53 | description = "Number from 1 (high) to 5 (low)." 
54 | 55 | type = number 56 | default = 3 57 | } 58 | -------------------------------------------------------------------------------- /pid-pressure.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | pid_pressure_filter = coalesce( 3 | var.pid_pressure_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "pid_pressure" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Nodes with PID Pressure" 13 | query = "avg(${var.pid_pressure_evaluation_period}):max:kubernetes_state.node.by_condition{${local.pid_pressure_filter} AND condition:pidpressure AND (status:true OR status:unknown)} by {kube_cluster_name,host} > ${var.pid_pressure_critical}" 14 | alert_message = "Kubernetes cluster node {{node}} has PID Pressure, meaning it may not be able to start more containers" 15 | recovery_message = "Kubernetes cluster node {{node}} no longer has PID pressure." 16 | 17 | # monitor level vars 18 | enabled = var.state_metrics_monitoring && var.pid_pressure_enabled 19 | alerting_enabled = var.pid_pressure_alerting_enabled 20 | critical_threshold = var.pid_pressure_critical 21 | # no warning threshold for this monitor 22 | priority = min(var.pid_pressure_priority + var.priority_offset, 5) 23 | docs = var.pid_pressure_docs 24 | note = var.pid_pressure_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /pod-count-per-node-high-variables.tf: -------------------------------------------------------------------------------- 1 | variable "pod_count_per_node_high_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "pod_count_per_node_high_warning" { 7 | type = number 8 | default = 90.0 9 | } 10 | 11 | variable "pod_count_per_node_high_critical" { 12 | type = number 13 | default = 100.0 14 | } 15 | 16 | variable "pod_count_per_node_high_warning_recovery" { 17 | type = number 18 | default = null 19 | } 20 | 21 | variable "pod_count_per_node_high_critical_recovery" { 22 | type = number 23 | default = null 24 | } 25 | 26 | variable "pod_count_per_node_high_evaluation_period" { 27 | type = string 28 | default = "last_10m" 29 | } 30 | 31 | variable "pod_count_per_node_high_note" { 32 | type = string 33 | default = "" 34 | } 35 | 36 | variable "pod_count_per_node_high_docs" { 37 | type = string 38 | default = "" 39 | } 40 | 41 | variable "pod_count_per_node_high_filter_override" { 42 | type = string 43 | default = "" 44 | } 45 | 46 | variable "pod_count_per_node_high_alerting_enabled" { 47 | type = bool 48 | default = true 49 | } 50 | 51 | variable "pod_count_per_node_high_no_data_timeframe" { 52 | type = number 53 | default = null 54 | } 55 | 56 | variable "pod_count_per_node_high_notify_no_data" { 57 | type = bool 58 | default = false 59 | } 60 | 61 | variable "pod_count_per_node_high_ok_threshold" { 62 | type = number 63 | default = null 64 | } 65 | 66 | variable "pod_count_per_node_high_name_prefix" { 67 | type = string 68 | default = "" 69 | } 70 | 71 | variable "pod_count_per_node_high_name_suffix" { 72 | type = string 73 | default = "" 74 | } 75 | 76 | variable "pod_count_per_node_high_priority" { 77 | description = "Number from 1 (high) to 5
(low)." 78 | 79 | type = number 80 | default = 2 81 | } 82 | -------------------------------------------------------------------------------- /pod-count-per-node-high.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | pod_count_per_node_high_filter = coalesce( 3 | var.pod_count_per_node_high_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "pod_count_per_node_high" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Pod count per node high" 13 | query = "min(${var.pod_count_per_node_high_evaluation_period}):sum:kubernetes.pods.running{${local.pod_count_per_node_high_filter}} by {host} > ${var.pod_count_per_node_high_critical}" 14 | 15 | # alert specific configuration 16 | require_full_window = false 17 | alert_message = "Pod count per node high ({{ value }}) in {{ service }} exceeds {{ threshold }}" 18 | recovery_message = "Pod count per node high ({{ value }}) in {{ service }} has recovered" 19 | 20 | # monitor level vars 21 | enabled = var.pod_count_per_node_high_enabled 22 | alerting_enabled = var.pod_count_per_node_high_alerting_enabled 23 | critical_threshold = var.pod_count_per_node_high_critical 24 | critical_recovery = var.pod_count_per_node_high_critical_recovery 25 | warning_threshold = var.pod_count_per_node_high_warning 26 | warning_recovery = var.pod_count_per_node_high_warning_recovery 27 | priority = min(var.pod_count_per_node_high_priority + var.priority_offset, 5) 28 | docs = var.pod_count_per_node_high_docs 29 | note = var.pod_count_per_node_high_note 30 | 31 | # module level vars 32 | env = var.env 33 | service = var.service 34 | service_display_name = var.service_display_name 35 | notification_channel = var.notification_channel 36 | additional_tags = var.additional_tags 37 | locked = var.locked 38 | name_prefix = var.name_prefix 39 | name_suffix = var.name_suffix 40 | } 41 | -------------------------------------------------------------------------------- /pod-ready-variables.tf: -------------------------------------------------------------------------------- 1 | variable "pod_ready_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "pod_ready_evaluation_period" { 7 | type = string 8 | default = "last_30m" 9 | } 10 | 11 | variable "pod_ready_note" { 12 | type = string 13 | default = "" 14 | } 15 | 16 | variable "pod_ready_docs" { 17 | type = string 18 | default = "A pod may be running but not available, meaning it is not ready and able to accept traffic. This is normal during certain circumstances, such as when a pod is newly launched or when a change is made and deployed to the specification of that pod. But if you see spikes in the number of unavailable pods, or pods that are consistently unavailable, it might indicate a problem with their configuration.\nhttps://www.datadoghq.com/blog/monitoring-kubernetes-performance-metrics/" 19 | } 20 | 21 | variable "pod_ready_filter_override" { 22 | type = string 23 | default = "" 24 | } 25 | 26 | variable "pod_ready_alerting_enabled" { 27 | type = bool 28 | default = true 29 | } 30 | 31 | variable "pod_ready_no_data_timeframe" { 32 | type = number 33 | default = null 34 | } 35 | 36 | variable "pod_ready_notify_no_data" { 37 | type = bool 38 | default = false 39 | } 40 | 41 | variable "pod_ready_ok_threshold" { 42 | type = number 43 | default = null 44 | } 45 | 46 | variable "pod_ready_priority" { 47 | description = "Number from 1 (high) to 5 (low)." 
48 | 49 | type = number 50 | default = 3 51 | } 52 | -------------------------------------------------------------------------------- /pod-ready.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | pod_ready_filter = coalesce( 3 | var.pod_ready_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "pod_ready" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Pod status not ready" 13 | query = "min(${var.pod_ready_evaluation_period}):sum:kubernetes_state.pod.count{${local.pod_ready_filter}} by {kube_cluster_name,kube_namespace} - sum:kubernetes_state.pod.ready{${local.pod_ready_filter}} by {kube_cluster_name,kube_namespace} > 0" 14 | alert_message = "Kubernetes Pod {{value}} status not ready in namespace {{kube_namespace}} " 15 | recovery_message = "Kubernetes Pod status recovered in namespace {{kube_namespace}}" 16 | 17 | # monitor level vars 18 | enabled = var.state_metrics_monitoring && var.pod_ready_enabled 19 | alerting_enabled = var.pod_ready_alerting_enabled 20 | critical_threshold = 0 21 | # No warning possible for status that is either 0 or 1 22 | priority = min(var.pod_ready_priority + var.priority_offset, 5) 23 | docs = var.pod_ready_docs 24 | note = var.pod_ready_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /pod-restarts-variables.tf: -------------------------------------------------------------------------------- 1 | variable "pod_restarts_enabled" { 2 | type = bool 3 | description = "Deprecated in favour of multiple restarts monitoring for Daemonset and Deployment" 4 | default = false 5 | } 6 | 7 | variable "pod_restarts_warning" { 8 | type = number 9 | default = 3 10 | } 11 | 12 | variable "pod_restarts_critical" { 13 | type = number 14 | default = 5 15 | } 16 | 17 | variable "pod_restarts_evaluation_period" { 18 | type = string 19 | default = "last_10m" 20 | } 21 | 22 | variable "pod_restarts_note" { 23 | type = string 24 | default = "" 25 | } 26 | 27 | variable "pod_restarts_docs" { 28 | type = string 29 | default = "" 30 | } 31 | 32 | variable "pod_restarts_filter_override" { 33 | type = string 34 | default = "" 35 | } 36 | 37 | variable "pod_restarts_alerting_enabled" { 38 | type = bool 39 | default = true 40 | } 41 | 42 | variable "pod_restarts_no_data_timeframe" { 43 | type = number 44 | default = null 45 | } 46 | 47 | variable "pod_restarts_notify_no_data" { 48 | type = bool 49 | default = false 50 | } 51 | 52 | variable "pod_restarts_ok_threshold" { 53 | type = number 54 | default = null 55 | } 56 | 57 | variable "pod_restarts_priority" { 58 | description = "Number from 1 (high) to 5 (low)." 
59 | 60 | type = number 61 | default = 2 62 | } 63 | -------------------------------------------------------------------------------- /pod-restarts.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | pod_restarts_filter = coalesce( 3 | var.pod_restarts_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "pod_restarts" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Restarting Pods" 13 | query = "change(avg(${var.pod_restarts_evaluation_period}),${var.pod_restarts_evaluation_period}):exclude_null(avg:kubernetes.containers.restarts{${local.pod_restarts_filter}} by {pod_name}) > ${var.pod_restarts_critical}" 14 | alert_message = "Pods are restarting multiple times in the last ${var.pod_restarts_evaluation_period}" 15 | recovery_message = "Pods restarting recovered" 16 | 17 | # monitor level vars 18 | enabled = var.pod_restarts_enabled 19 | alerting_enabled = var.pod_restarts_alerting_enabled 20 | critical_threshold = var.pod_restarts_critical 21 | warning_threshold = var.pod_restarts_warning 22 | priority = min(var.pod_restarts_priority + var.priority_offset, 5) 23 | docs = var.pod_restarts_docs 24 | note = var.pod_restarts_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /pods-failed-variables.tf: -------------------------------------------------------------------------------- 1 | variable "pods_failed_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "pods_failed_warning" { 7 | type = number 8 | default = null 9 | } 10 | 11 | variable "pods_failed_critical" { 12 | type = number 13 | default = 0.0 14 | } 15 | 16 | variable "pods_failed_evaluation_period" { 17 | type = string 18 | default = "last_10m" 19 | } 20 | 21 | variable "pods_failed_note" { 22 | type = string 23 | default = "" 24 | } 25 | 26 | variable "pods_failed_docs" { 27 | type = string 28 | default = "https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/" 29 | } 30 | 31 | variable "pods_failed_filter_override" { 32 | type = string 33 | default = "" 34 | } 35 | 36 | variable "pods_failed_alerting_enabled" { 37 | type = bool 38 | default = true 39 | } 40 | 41 | variable "pods_failed_no_data_timeframe" { 42 | type = number 43 | default = null 44 | } 45 | 46 | variable "pods_failed_notify_no_data" { 47 | type = bool 48 | default = false 49 | } 50 | 51 | variable "pods_failed_ok_threshold" { 52 | type = number 53 | default = null 54 | } 55 | 56 | variable "pods_failed_name_prefix" { 57 | type = string 58 | default = "" 59 | } 60 | 61 | variable "pods_failed_name_suffix" { 62 | type = string 63 | default = "" 64 | } 65 | 66 | variable "pods_failed_priority" { 67 | description = "Number from 1 (high) to 5 (low)." 
68 | 69 | type = number 70 | default = 3 71 | } 72 | -------------------------------------------------------------------------------- /pods-failed.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | pods_failed_filter = coalesce( 3 | var.pods_failed_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "pods_failed" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Pods Failed" 13 | query = "min(${var.pods_failed_evaluation_period}):default_zero(max:kubernetes_state.pod.status_phase{phase:failed${var.filter_str_concatenation}${local.pods_failed_filter}} by {kube_namespace}) > ${var.pods_failed_critical}" 14 | 15 | # alert specific configuration 16 | require_full_window = true 17 | alert_message = "Kubernetes pods failed ({{ value }}) in {{ service }} exceeds {{ threshold }}" 18 | recovery_message = "Kubernetes pods failed ({{ value }}) in {{ service }} has recovered" 19 | 20 | # monitor level vars 21 | enabled = var.pods_failed_enabled 22 | alerting_enabled = var.pods_failed_alerting_enabled 23 | warning_threshold = var.pods_failed_warning 24 | critical_threshold = var.pods_failed_critical 25 | priority = min(var.pods_failed_priority + var.priority_offset, 5) 26 | docs = var.pods_failed_docs 27 | note = var.pods_failed_note 28 | 29 | # module level vars 30 | env = var.env 31 | service = var.service 32 | service_display_name = var.service_display_name 33 | notification_channel = var.notification_channel 34 | additional_tags = var.additional_tags 35 | locked = var.locked 36 | name_prefix = var.name_prefix 37 | name_suffix = var.name_suffix 38 | } 39 | -------------------------------------------------------------------------------- /pods-pending-variables.tf: -------------------------------------------------------------------------------- 1 | variable "pods_pending_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "pods_pending_warning" { 7 | type = number 8 | default = null 9 | } 10 | 11 | variable "pods_pending_critical" { 12 | type = number 13 | default = 0.0 14 | } 15 | 16 | variable "pods_pending_evaluation_period" { 17 | type = string 18 | default = "last_10m" 19 | } 20 | 21 | variable "pods_pending_note" { 22 | type = string 23 | default = "" 24 | } 25 | 26 | variable "pods_pending_docs" { 27 | type = string 28 | default = "https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/" 29 | } 30 | 31 | variable "pods_pending_filter_override" { 32 | type = string 33 | default = "" 34 | } 35 | 36 | variable "pods_pending_alerting_enabled" { 37 | type = bool 38 | default = true 39 | } 40 | 41 | variable "pods_pending_no_data_timeframe" { 42 | type = number 43 | default = null 44 | } 45 | 46 | variable "pods_pending_notify_no_data" { 47 | type = bool 48 | default = false 49 | } 50 | 51 | variable "pods_pending_ok_threshold" { 52 | type = number 53 | default = null 54 | } 55 | 56 | variable "pods_pending_name_prefix" { 57 | type = string 58 | default = "" 59 | } 60 | 61 | variable "pods_pending_name_suffix" { 62 | type = string 63 | default = "" 64 | } 65 | 66 | variable "pods_pending_priority" { 67 | description = "Number from 1 (high) to 5 (low)." 
68 | 69 | type = number 70 | default = 3 71 | } 72 | -------------------------------------------------------------------------------- /pods-pending.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | pods_pending_filter = coalesce( 3 | var.pods_pending_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "pods_pending" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Pods Pending" 13 | query = "min(${var.pods_pending_evaluation_period}):default_zero(max:kubernetes_state.pod.status_phase{phase:pending${var.filter_str_concatenation}${local.pods_pending_filter}} by {kube_namespace}) > ${var.pods_pending_critical}" 14 | 15 | # alert specific configuration 16 | require_full_window = true 17 | alert_message = "Kubernetes pods pending ({{ value }}) in {{ service }} exceeds {{ threshold }}" 18 | recovery_message = "Kubernetes pods pending ({{ value }}) in {{ service }} has recovered" 19 | 20 | # monitor level vars 21 | enabled = var.pods_pending_enabled 22 | alerting_enabled = var.pods_pending_alerting_enabled 23 | warning_threshold = var.pods_pending_warning 24 | critical_threshold = var.pods_pending_critical 25 | priority = min(var.pods_pending_priority + var.priority_offset, 5) 26 | docs = var.pods_pending_docs 27 | note = var.pods_pending_note 28 | 29 | # module level vars 30 | env = var.env 31 | service = var.service 32 | service_display_name = var.service_display_name 33 | notification_channel = var.notification_channel 34 | additional_tags = var.additional_tags 35 | locked = var.locked 36 | name_prefix = var.name_prefix 37 | name_suffix = var.name_suffix 38 | } 39 | -------------------------------------------------------------------------------- /provider.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | datadog = { 4 | source = "DataDog/datadog" 5 | version = "~> 3.12" 6 | } 7 | } 8 | } -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://docs.renovatebot.com/renovate-schema.json", 3 | "extends": [ 4 | "config:base" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /replicaset-incomplete-variables.tf: -------------------------------------------------------------------------------- 1 | variable "replicaset_incomplete_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "replicaset_incomplete_critical" { 7 | type = number 8 | default = 0 9 | description = "alert is raised when (desired - running) > replicaset_incomplete_critical" 10 | } 11 | 12 | variable "replicaset_incomplete_evaluation_period" { 13 | type = string 14 | default = "last_15m" 15 | } 16 | 17 | variable "replicaset_incomplete_note" { 18 | type = string 19 | default = "There's also a monitor defined for when the replicaset is completely unavailable" 20 | } 21 | 22 | variable "replicaset_incomplete_docs" { 23 | type = string 24 | default = "In kubernetes a Replicaset is responsible for making sure a specific number of pods run. 
An example of a reason why that's not the case is when the image cannot be pulled, the pod fails to initialize, or no resources are available on the cluster.\nThis alert is raised when (desired - running) > 0" 25 | } 26 | 27 | variable "replicaset_incomplete_filter_override" { 28 | type = string 29 | default = "" 30 | } 31 | 32 | variable "replicaset_incomplete_alerting_enabled" { 33 | type = bool 34 | default = true 35 | } 36 | 37 | variable "replicaset_incomplete_no_data_timeframe" { 38 | type = number 39 | default = null 40 | } 41 | 42 | variable "replicaset_incomplete_notify_no_data" { 43 | type = bool 44 | default = false 45 | } 46 | 47 | variable "replicaset_incomplete_ok_threshold" { 48 | type = number 49 | default = null 50 | } 51 | 52 | variable "replicaset_incomplete_priority" { 53 | description = "Number from 1 (high) to 5 (low)." 54 | 55 | type = number 56 | default = 3 57 | } 58 | -------------------------------------------------------------------------------- /replicaset-incomplete.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | replicaset_incomplete_filter = coalesce( 3 | var.replicaset_incomplete_filter_override, 4 | var.filter_str 5 | ) 6 | } 7 | 8 | module "replicaset_incomplete" { 9 | source = "kabisa/generic-monitor/datadog" 10 | version = "1.0.0" 11 | 12 | name = "Replicaset Incomplete" 13 | query = "min(${var.replicaset_incomplete_evaluation_period}):max:kubernetes_state.replicaset.replicas_desired{${local.replicaset_incomplete_filter}} by {kube_replica_set,kube_cluster_name} - min:kubernetes_state.replicaset.replicas_ready{${local.replicaset_incomplete_filter}} by {kube_replica_set,kube_cluster_name} > ${var.replicaset_incomplete_critical}" 14 | alert_message = "Kubernetes Replicaset {{kube_replica_set}} is incomplete. 
Missing pod count: {{value}}" 15 | recovery_message = "Kubernetes Replicaset {{kube_replica_set}} has recovered" 16 | 17 | # monitor level vars 18 | enabled = var.state_metrics_monitoring && var.replicaset_incomplete_enabled 19 | alerting_enabled = var.replicaset_incomplete_alerting_enabled 20 | critical_threshold = var.replicaset_incomplete_critical 21 | # No warning threshold for this monitor 22 | priority = min(var.replicaset_incomplete_priority + var.priority_offset, 5) 23 | docs = var.replicaset_incomplete_docs 24 | note = var.replicaset_incomplete_note 25 | 26 | # module level vars 27 | env = var.env 28 | service = var.service 29 | service_display_name = var.service_display_name 30 | notification_channel = var.notification_channel 31 | additional_tags = var.additional_tags 32 | locked = var.locked 33 | name_prefix = var.name_prefix 34 | name_suffix = var.name_suffix 35 | } 36 | -------------------------------------------------------------------------------- /replicaset-unavailable-variables.tf: -------------------------------------------------------------------------------- 1 | variable "replicaset_unavailable_enabled" { 2 | type = bool 3 | default = true 4 | } 5 | 6 | variable "replicaset_unavailable_critical" { 7 | type = number 8 | default = 0 9 | description = "alert is raised when running == 0 and desired > 1" 10 | } 11 | 12 | variable "replicaset_unavailable_evaluation_period" { 13 | type = string 14 | default = "last_5m" 15 | } 16 | 17 | variable "replicaset_unavailable_note" { 18 | type = string 19 | default = "There's also a monitor defined for when the replicaset is only partially available" 20 | } 21 | 22 | variable "replicaset_unavailable_docs" { 23 | type = string 24 | default = "In kubernetes a Replicaset is responsible for making sure a specific number of pods run. An example of a reason why that's not the case is when the image cannot be pulled, the pod fails to initialize, or no resources are available on the cluster.\nThis alert is raised when running == 0 and desired > 1" 25 | } 26 | 27 | variable "replicaset_unavailable_filter_override" { 28 | type = string 29 | default = "" 30 | } 31 | 32 | variable "replicaset_unavailable_alerting_enabled" { 33 | type = bool 34 | default = true 35 | } 36 | 37 | variable "replicaset_unavailable_no_data_timeframe" { 38 | type = number 39 | default = null 40 | } 41 | 42 | variable "replicaset_unavailable_notify_no_data" { 43 | type = bool 44 | default = false 45 | } 46 | 47 | variable "replicaset_unavailable_ok_threshold" { 48 | type = number 49 | default = null 50 | } 51 | 52 | variable "replicaset_unavailable_priority" { 53 | description = "Number from 1 (high) to 5 (low)." 
54 | 55 | type = number 56 | default = 2 57 | } 58 | -------------------------------------------------------------------------------- /replicaset-unavailable.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | replicaset_unavailable_filter = coalesce( 3 | var.replicaset_unavailable_filter_override, 4 | var.filter_str 5 | ) 6 | rs_pods_ready = "min:kubernetes_state.replicaset.replicas_ready{${local.replicaset_unavailable_filter}} by {kube_replica_set,kube_cluster_name}" 7 | rs_pods_desired = "min:kubernetes_state.replicaset.replicas_desired{${local.replicaset_unavailable_filter}} by {kube_replica_set,kube_cluster_name}" 8 | } 9 | 10 | module "replicaset_unavailable" { 11 | source = "kabisa/generic-monitor/datadog" 12 | version = "1.0.0" 13 | 14 | name = "Replicaset Unavailable" 15 | # This (ab)uses a division by zero to make sure we don't get alerts when the number of desired pods < 2 16 | query = "max(${var.replicaset_unavailable_evaluation_period}):( ${local.rs_pods_ready} ) / ${local.rs_pods_desired} / ( ${local.rs_pods_desired} - 1 ) <= 0" 17 | alert_message = "Kubernetes Replicaset {{kube_replica_set}} is unavailable" 18 | recovery_message = "Kubernetes Replicaset {{kube_replica_set}} now has available pods" 19 | 20 | # monitor level vars 21 | enabled = var.state_metrics_monitoring && var.replicaset_unavailable_enabled 22 | alerting_enabled = var.replicaset_unavailable_alerting_enabled 23 | critical_threshold = 0 24 | # No warning threshold for this monitor 25 | priority = min(var.replicaset_unavailable_priority + var.priority_offset, 5) 26 | docs = var.replicaset_unavailable_docs 27 | note = var.replicaset_unavailable_note 28 | 29 | # module level vars 30 | env = var.env 31 | service = var.service 32 | service_display_name = var.service_display_name 33 | notification_channel = var.notification_channel 34 | additional_tags = var.additional_tags 35 | locked = var.locked 36 | name_prefix = var.name_prefix 37 | name_suffix = var.name_suffix 38 | } 39 | -------------------------------------------------------------------------------- /variables.tf: -------------------------------------------------------------------------------- 1 | variable "env" { 2 | type = string 3 | } 4 | 5 | variable "service" { 6 | type = string 7 | default = "Kubernetes" 8 | } 9 | 10 | variable "service_display_name" { 11 | description = "Readable version of service name of what you're monitoring." 12 | type = string 13 | default = null 14 | } 15 | 16 | variable "notification_channel" { 17 | type = string 18 | description = "The @user or @pagerduty parameters that indicate to Datadog where to send the alerts" 19 | } 20 | 21 | variable "additional_tags" { 22 | type = list(string) 23 | default = [] 24 | } 25 | 26 | variable "filter_str" { 27 | type = string 28 | } 29 | 30 | variable "locked" { 31 | type = bool 32 | default = true 33 | description = "Makes sure only the creator or admin can modify the monitor."
34 | } 35 | 36 | variable "state_metrics_monitoring" { 37 | type = bool 38 | default = true 39 | } 40 | 41 | variable "name_prefix" { 42 | type = string 43 | default = "" 44 | } 45 | 46 | variable "name_suffix" { 47 | type = string 48 | default = "" 49 | } 50 | 51 | variable "filter_str_concatenation" { 52 | description = "If filter_str contains an IN expression you need to switch the concatenation from ',' to ' AND '" 53 | default = "," 54 | } 55 | 56 | variable "priority_offset" { 57 | description = "For non-production workloads we can add 1 to the priorities (the result is capped at 5)" 58 | default = 0 59 | } 60 | --------------------------------------------------------------------------------
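A note on the query in replicaset-unavailable.tf: it deliberately divides by (desired - 1) so that ReplicaSets with fewer than two desired pods never trigger the monitor. A worked sketch of how the expression ready / desired / (desired - 1) <= 0 behaves, using illustrative numbers rather than real cluster data:

# desired = 1, ready = 0  ->  0 / 1 / (1 - 1)  ->  division by zero, the monitor receives no data and stays silent
# desired = 3, ready = 0  ->  0 / 3 / (3 - 1)  ->  0, which satisfies <= 0, so the monitor alerts
# desired = 3, ready = 2  ->  2 / 3 / (3 - 1)  ->  roughly 0.33, which is > 0, so no alert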
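Because the pod monitors build their filters as phase:failed${var.filter_str_concatenation}${local.pods_failed_filter}, the default "," only works for plain tag filters. A minimal usage sketch for the IN-expression case; the module source, cluster names, and notification channel below are assumptions for illustration, not values taken from this repository:

module "kubernetes_monitors" {
  # assumed registry address; point this at however you actually consume the module
  source = "kabisa/kubernetes/datadog"

  env                  = "prd"
  notification_channel = "@slack-example-alerts"

  # filter_str uses an IN expression, so the extra tags the monitors append
  # (such as phase:failed) must be joined with " AND " instead of ","
  filter_str               = "kube_cluster_name IN (cluster-a,cluster-b)"
  filter_str_concatenation = " AND "
}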
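Every monitor in this module derives its priority as min(<monitor>_priority + var.priority_offset, 5), so the offset can only push a monitor toward the lowest Datadog priority (5), never past it. A couple of illustrative evaluations using the defaults defined above:

# pods_failed_priority = 3 (default), priority_offset = 1            ->  min(3 + 1, 5) = 4
# replicaset_unavailable_priority = 2 (default), priority_offset = 0 ->  min(2 + 0, 5) = 2
# any combination that would exceed 5 is capped at 5 by the min()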