├── version.py ├── requirements.txt ├── .gitignore ├── requirements-dev.txt ├── k8s-sa-config ├── 00_service_account.yaml ├── 01_clusterrole.yaml ├── 02_clusterrolebinding.yaml └── get_config.sh ├── check_nodes.spec ├── .travis.yml ├── readme.md ├── check_nodes.py ├── check_pods.py ├── test_check_pods.py ├── .github └── workflows │ └── test.yml ├── test_check_nodes.py ├── INSTALL.md └── LICENSE /version.py: -------------------------------------------------------------------------------- 1 | __version__ = 'dev' 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | kubernetes>=11.0.0 2 | nagiosplugin 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /dist 2 | *.rpm 3 | /.idea 4 | /*.iml 5 | *.pyc 6 | /env 7 | /build 8 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | kubernetes>=11.0.0 2 | nagiosplugin 3 | mock 4 | pyinstaller 5 | six>=0.11 6 | setuptools 7 | -------------------------------------------------------------------------------- /k8s-sa-config/00_service_account.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: ServiceAccount 4 | metadata: 5 | name: icinga-monitoring-sa -------------------------------------------------------------------------------- /k8s-sa-config/01_clusterrole.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: icinga-monitoring 6 | rules: 7 | - apiGroups: [""] 8 | resources: ["pods", "nodes"] 9 | verbs: ["get", "watch", "list"] 10 | 
-------------------------------------------------------------------------------- /k8s-sa-config/02_clusterrolebinding.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRoleBinding 4 | metadata: 5 | name: icinga-monitor-pods 6 | subjects: 7 | - kind: ServiceAccount 8 | name: icinga-monitoring-sa 9 | namespace: default 10 | roleRef: 11 | kind: ClusterRole 12 | name: icinga-monitoring 13 | apiGroup: rbac.authorization.k8s.io 14 | -------------------------------------------------------------------------------- /check_nodes.spec: -------------------------------------------------------------------------------- 1 | # -*- mode: python -*- 2 | 3 | block_cipher = None 4 | 5 | 6 | a = Analysis(['check_nodes.py'], 7 | pathex=['/mnt/c/Develop/devops/domon/check_kubernetes'], 8 | binaries=[], 9 | datas=[], 10 | hiddenimports=['nagiosplugin.platform.posix'], 11 | hookspath=[], 12 | runtime_hooks=[], 13 | excludes=[], 14 | win_no_prefer_redirects=False, 15 | win_private_assemblies=False, 16 | cipher=block_cipher) 17 | pyz = PYZ(a.pure, a.zipped_data, 18 | cipher=block_cipher) 19 | exe = EXE(pyz, 20 | a.scripts, 21 | a.binaries, 22 | a.zipfiles, 23 | a.datas, 24 | name='check_nodes', 25 | debug=False, 26 | strip=False, 27 | upx=True, 28 | runtime_tmpdir=None, 29 | console=True ) 30 | -------------------------------------------------------------------------------- /k8s-sa-config/get_config.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | API_SERVER="https://localhost:6443" 4 | SERVICEACCOUNT_NAME=$(kubectl get sa | grep icinga | awk '{ print $1 }') 5 | SECRET_NAME=$(kubectl get secrets | grep "${SERVICEACCOUNT_NAME}-token" | awk '{ print $1 }') 6 | 7 | if [[ ${SERVICEACCOUNT_NAME} == "" ]]; then 8 | >&2 echo "Service account not found!" 
9 | exit 1 10 | else 11 | >&2 echo "Found icinga Service Account: ${SECRET_NAME}" 12 | fi 13 | 14 | CA_CERT=$(kubectl get secret/"${SECRET_NAME}" -o jsonpath='{.data.ca\.crt}') 15 | SA_TOKEN=$(kubectl get secret/"${SECRET_NAME}" -o jsonpath='{.data.token}' | base64 --decode) 16 | NS=$(kubectl get secret/"${SECRET_NAME}" -o jsonpath='{.data.namespace}' | base64 --decode) 17 | 18 | echo " 19 | apiVersion: v1 20 | kind: Config 21 | clusters: 22 | - name: default-cluster 23 | cluster: 24 | certificate-authority-data: ${CA_CERT} 25 | server: ${API_SERVER} 26 | contexts: 27 | - name: default-context 28 | context: 29 | cluster: default-cluster 30 | namespace: ${NS} 31 | user: default-user 32 | current-context: default-context 33 | users: 34 | - name: default-user 35 | user: 36 | token: ${SA_TOKEN} 37 | " 38 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: xenial 2 | language: python 3 | 4 | python: 5 | - '2.7' 6 | - '3.6' 7 | - '3.7' 8 | - '3.8' 9 | - '3.9-dev' 10 | 11 | matrix: 12 | allow_failures: 13 | - python: '3.9-dev' 14 | 15 | install: 16 | - pip install -U -r requirements-dev.txt 17 | 18 | script: 19 | - | 20 | if [ -z "$TRAVIS_TAG" ]; then 21 | export version_number="dev-${TRAVIS_COMMIT}" 22 | else 23 | export version_number="${TRAVIS_TAG}" 24 | fi 25 | 26 | echo "Version ${version_number}" 27 | 28 | echo "__version__ = '${version_number}'" > ./version.py 29 | 30 | - python ./test_check_nodes.py 31 | - python ./test_check_pods.py 32 | - pyinstaller check_pods.py --clean --onefile --hidden-import nagiosplugin.platform.posix 33 | - pyinstaller check_nodes.py --clean --onefile --hidden-import nagiosplugin.platform.posix 34 | 35 | deploy: 36 | provider: releases 37 | api_key: 38 | secure: 
XslaczKUxjcyP2+ooJ6bVMnF/B/mA9WacDxdqB+WW3oVRoxHMrp6ZDPCj4xfH3N/P+sqCgbGskxz1w0Rxt9JEdyeC31uCkPPzaPbVQIryiJewh5XkScZ7DyBgsFCR93NwYUBFeGo81D/sPFWkz+mWXtvZBnvdDboL2OOwxZTLyPYaQjsaxELc1s2LEUrwc+lV6LPWK52bdOUXlgyxLL9QiZJH3Y/KRehNe6ev23VxVLCb9UMnm/VofI7C4L14sV17Fz1VmvriTpWjAsj36m4cG3dacYzAdYaxE6ul88b5sbXIKG7kgkvBn3eU6MYJ2ZLrUjUFBMo4xFve6pwLJ76vTqOz81l+9FxRt+QJ9bq5/Hn5DSuoQZG3u+IL5AmzErS/FmxtC9MpEGDsHHzxLk4USziDx+S6ZYBGRx2QEsx70ut4biPbtC5jCMTOL2GI8EcdEyFZCMALOmF56JJeD1wL63748u1Diy/SvGXJcNySozx64RNflvlMmmBUN9XRtCX6MyAV/90o76nrNI/1u0+lEfwkPJT1ePwWdWeMPhZBxvczJ5QUyp+3prlx7pkyE2aUD9KvNueijTsh47DxX4FDAvpwbT5l/ZiD/VeE/ZdLagGFvPlzxQN6/DgkmKaahODs1VQkRZ4OUr5i8Lsm4HK9zU/IqLU5kfG/gccTcmCOzo= 39 | skip_cleanup: true 40 | file_glob: true 41 | file: dist/* 42 | on: 43 | repo: T-Systems-MMS/check_kubernetes 44 | tags: true 45 | branch: master 46 | python: '3.8' 47 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Nagios/Icinga Checks for Kubernetes 2 | 3 | [![Build Status](https://travis-ci.org/T-Systems-MMS/check_kubernetes.svg?branch=master)](https://travis-ci.org/T-Systems-MMS/check_kubernetes) 4 | 5 | You will need a kubeconfig file for both checks. 6 | 7 | ## Python Compatibility 8 | 9 | Python 2.7.x or Python >= 3.4 10 | 11 | ## check_nodes.py 12 | 13 | Checks the State of your nodes via the Kubernetes API. One node with Problems is a warning, two nodes are critical. Perfdata are supplied. 14 | 15 | ### Usage 16 | ``` 17 | usage: check_nodes.py [-h] [--kube-config KUBE_CONFIG] 18 | 19 | optional arguments: 20 | -h, --help show this help message and exit 21 | --kube-config KUBE_CONFIG 22 | Kubernetes Config File 23 | ``` 24 | 25 | ## check_pods.py 26 | 27 | Checks the State of all pods in the Kubernetes Cluster. 
28 | 29 | ### Usage 30 | ``` 31 | usage: check_pods.py [-h] [--kube-config KUBE_CONFIG] 32 | [--warning-pending WARNING_PENDING] 33 | [--critical-pending CRITICAL_PENDING] 34 | [--warning-running WARNING_RUNNING] 35 | [--critical-running CRITICAL_RUNNING] 36 | [--warning-succeeded WARNING_SUCCEEDED] 37 | [--critical-succeeded CRITICAL_SUCCEEDED] 38 | [--warning-failed WARNING_FAILED] 39 | [--critical-failed CRITICAL_FAILED] 40 | [--warning-unknown WARNING_UNKNOWN] 41 | [--critical-unknown CRITICAL_UNKNOWN] 42 | 43 | optional arguments: 44 | -h, --help show this help message and exit 45 | --kube-config KUBE_CONFIG 46 | Kubernetes Config File 47 | --warning-pending WARNING_PENDING 48 | --critical-pending CRITICAL_PENDING 49 | --warning-running WARNING_RUNNING 50 | --critical-running CRITICAL_RUNNING 51 | --warning-succeeded WARNING_SUCCEEDED 52 | --critical-succeeded CRITICAL_SUCCEEDED 53 | --warning-failed WARNING_FAILED 54 | --critical-failed CRITICAL_FAILED 55 | --warning-unknown WARNING_UNKNOWN 56 | --critical-unknown CRITICAL_UNKNOWN 57 | ``` 58 | -------------------------------------------------------------------------------- /check_nodes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Check for Kubernetes Nodes 5 | """ 6 | 7 | import argparse 8 | 9 | from kubernetes import config, client 10 | import nagiosplugin 11 | 12 | from version import __version__ as version 13 | 14 | class Nodes(nagiosplugin.Resource): 15 | """ 16 | Check for Kubernetes Nodes 17 | """ 18 | 19 | def __init__(self, kube_config): 20 | self.kube_config = kube_config 21 | self.nodes = [] 22 | self.nodes_with_problems = [] 23 | 24 | def probe(self): 25 | config.load_kube_config(self.kube_config) 26 | kube = client.CoreV1Api() 27 | 28 | for node in kube.list_node().items: 29 | self.nodes.append(node) 30 | for condition in node.status.conditions: 31 | # OutOfDisk is not postet in k8s > 1.12, but is still listet in 
class Pods(nagiosplugin.Resource):
    """
    Nagios resource that counts the pods of a Kubernetes cluster per phase.

    ``probe`` emits one metric per entry of ``phases``; the metric name is
    the phase name, so every phase needs a context of the same name.
    """
    phases = [
        'Pending',
        'Running',
        'Succeeded',
        'Failed',
        'Unknown'
    ]

    def __init__(self, kube_config=None):
        """
        :param kube_config: path of the kubeconfig file that is handed to
            ``config.load_kube_config`` (``None`` is passed through as is)
        """
        self.kube_config = kube_config
        self.pods = []
        self.counts = dict.fromkeys(self.phases, 0)

    def probe(self):
        """
        Query all pods in all namespaces and count them per phase.

        :return: list with one ``nagiosplugin.Metric`` per entry of
            ``phases``, in the order of ``phases``
        """
        config.load_kube_config(self.kube_config)
        kube = client.CoreV1Api()
        self.pods = kube.list_pod_for_all_namespaces().items

        # Start from zero on every call so that probing the same resource
        # twice does not add the second result on top of the first one.
        self.counts = dict.fromkeys(self.phases, 0)
        for pod in self.pods:
            phase = pod.status.phase
            # A missing or unexpected phase must not abort the whole check
            # with a KeyError; such pods are reported as 'Unknown'.
            if phase not in self.counts:
                phase = 'Unknown'
            self.counts[phase] += 1

        # Iterate ``phases`` (a list) instead of the dict to get a stable
        # metric order on every Python version.
        return [
            nagiosplugin.Metric(phase, self.counts[phase], min=0)
            for phase in self.phases
        ]


class PodsSummary(nagiosplugin.Summary):
    """
    Status line for the pods check.
    """

    def ok(self, results):
        """
        Build the text shown when the check result is OK.

        :param results: check results, indexed by metric name (= phase name)
        :return: comma separated text with one "<metric> Pods <phase>" part
            per entry of ``Pods.phases``
        """
        return ', '.join(
            "%s Pods %s" % (str(results[phase].metric), phase)
            for phase in Pods.phases
        )


@nagiosplugin.guarded
def main():
    """
    Parse the command line, assemble the check and run it.

    For every phase in ``Pods.phases`` an optional ``--warning-<phase>`` and
    ``--critical-<phase>`` argument is accepted and handed to the
    ``ScalarContext`` of that phase.

    :return:
    """
    argp = argparse.ArgumentParser(description='Nagios/Icinga check for Kubernetes Pods')
    argp.add_argument('-v', '--version', action='version', version='%(prog)s ' + version)
    argp.add_argument('--kube-config', help='Kubernetes Config File')

    for phase in Pods.phases:
        argp.add_argument('--warning-' + phase.lower())
        argp.add_argument('--critical-' + phase.lower())

    args = argp.parse_args()

    checks = [Pods(args.kube_config)]
    for phase in Pods.phases:
        # argparse stores ``--warning-pending`` as ``args.warning_pending``.
        checks.append(nagiosplugin.ScalarContext(
            phase,
            getattr(args, 'warning_' + phase.lower()),
            getattr(args, 'critical_' + phase.lower())
        ))

    checks.append(PodsSummary())

    check = nagiosplugin.Check(*checks)
    check.main()
class TestCheckNodes(unittest.TestCase):
    """
    Unit tests for ``check_pods.Pods.probe``.

    NOTE: the class name is kept as it was; the tests cover the pods check.
    """

    def _assert_phase_counts(self, phases, expected):
        """
        Run ``Pods.probe`` against faked pods and verify the emitted metrics.

        :param phases: one phase name per fake pod returned by the fake API
        :param expected: dict mapping metric name to the expected count
        """
        # Patch through ``check_pods``, the module under test, so the test
        # does not depend on ``check_nodes`` being importable.
        with mock.patch('check_pods.config.load_kube_config') as mock_config, \
                mock.patch('check_pods.client.CoreV1Api') as mock_client, \
                mock.patch('check_pods.nagiosplugin.Metric') as mock_metric:
            mock_kube = mock_client.return_value
            mock_kube.list_pod_for_all_namespaces.return_value.items = [
                mock.Mock(status=mock.Mock(phase=phase)) for phase in phases
            ]

            cls = check_pods.Pods(kube_config='empty')
            cls.probe()

            mock_config.assert_called_with('empty')
            mock_kube.list_pod_for_all_namespaces.assert_called()

            for name, count in expected.items():
                mock_metric.assert_any_call(name, count, min=0)

    def test_pods_all_ok(self):
        """Two running pods are counted as two 'Running' and nothing else."""
        self._assert_phase_counts(
            ['Running', 'Running'],
            {'Pending': 0, 'Running': 2, 'Succeeded': 0, 'Failed': 0, 'Unknown': 0},
        )

    def test_pods_one_failed(self):
        """One failed and one running pod are counted separately."""
        self._assert_phase_counts(
            ['Failed', 'Running'],
            {'Pending': 0, 'Running': 1, 'Succeeded': 0, 'Failed': 1, 'Unknown': 0},
        )
unittest.main() 81 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-16.04 8 | strategy: 9 | matrix: 10 | python-version: [2.7, 3.5, 3.6, 3.7, 3.8] 11 | steps: 12 | - uses: actions/checkout@v2 13 | - name: Set up Python ${{ matrix.python-version }} 14 | uses: actions/setup-python@v2 15 | with: 16 | python-version: ${{ matrix.python-version }} 17 | - name: Get the version 18 | id: vars 19 | run: echo ::set-output name=tag::$(echo ${GITHUB_REF##*/}) 20 | - name: Install dependencies 21 | run: | 22 | pip install -U -r requirements-dev.txt 23 | - name: Test 24 | run: | 25 | echo "__version__ = '${{steps.vars.outputs.tag}}'" > ./version.py 26 | python ./test_check_nodes.py 27 | python ./test_check_pods.py 28 | pyinstaller check_pods.py --clean --onefile --hidden-import nagiosplugin.platform.posix 29 | pyinstaller check_nodes.py --clean --onefile --hidden-import nagiosplugin.platform.posix 30 | - name: Upload Artifacts 31 | if: ${{ matrix.python-version == '3.8' }} 32 | uses: actions/upload-artifact@v2 33 | with: 34 | name: binaries 35 | path: dist/* 36 | release: 37 | needs: build 38 | if: startsWith(github.ref, 'refs/tags/v') 39 | runs-on: ubuntu-latest 40 | steps: 41 | - name: Get the version 42 | id: vars 43 | run: echo ::set-output name=tag::$(echo ${GITHUB_REF##*/}) 44 | - uses: actions/download-artifact@v2 45 | with: 46 | name: binaries 47 | path: dist/ 48 | - name: ls 49 | run: | 50 | ls -la 51 | ls -la dist/ 52 | - name: Create Release 53 | id: create_release 54 | uses: actions/create-release@v1 55 | env: 56 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 57 | with: 58 | tag_name: ${{ steps.vars.outputs.tag }} 59 | release_name: Release ${{ steps.vars.outputs.tag }} 60 | draft: false 61 | prerelease: false 62 | - name: Upload Release 
Asset check_nodes 63 | uses: actions/upload-release-asset@v1 64 | env: 65 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 66 | with: 67 | upload_url: ${{ steps.create_release.outputs.upload_url }} 68 | asset_path: dist/check_nodes 69 | asset_name: check_nodes 70 | asset_content_type: application/octet-stream 71 | - name: Upload Release Asset check_pods 72 | uses: actions/upload-release-asset@v1 73 | env: 74 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 75 | with: 76 | upload_url: ${{ steps.create_release.outputs.upload_url }} 77 | asset_path: dist/check_pods 78 | asset_name: check_pods 79 | asset_content_type: application/octet-stream 80 | -------------------------------------------------------------------------------- /test_check_nodes.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import mock 3 | 4 | import check_nodes 5 | 6 | nodes_all_ok = [ 7 | mock.Mock( 8 | status=mock.Mock( 9 | conditions=[ 10 | mock.Mock(type='Ready', status='True'), 11 | mock.Mock(type='DiskPressure', status='False'), 12 | ], 13 | ) 14 | ), 15 | mock.Mock( 16 | status=mock.Mock( 17 | conditions=[ 18 | mock.Mock(type='Ready', status='True'), 19 | mock.Mock(type='DiskPressure', status='False'), 20 | ], 21 | ) 22 | ), 23 | ] 24 | 25 | nodes_one_problem = [ 26 | mock.Mock( 27 | status=mock.Mock( 28 | conditions=[ 29 | mock.Mock(type='Ready', status='True'), 30 | mock.Mock(type='DiskPressure', status='True'), 31 | ], 32 | ) 33 | ), 34 | mock.Mock( 35 | status=mock.Mock( 36 | conditions=[ 37 | mock.Mock(type='Ready', status='True'), 38 | mock.Mock(type='DiskPressure', status='False'), 39 | ], 40 | ) 41 | ), 42 | ] 43 | 44 | nodes_two_problem = [ 45 | mock.Mock( 46 | status=mock.Mock( 47 | conditions=[ 48 | mock.Mock(type='Ready', status='True'), 49 | mock.Mock(type='DiskPressure', status='True'), 50 | mock.Mock(type='PIDPressure', status='True'), 51 | ], 52 | ) 53 | ), 54 | mock.Mock( 55 | status=mock.Mock( 56 | conditions=[ 57 | 
mock.Mock(type='Ready', status='True'), 58 | mock.Mock(type='DiskPressure', status='False'), 59 | mock.Mock(type='PIDPressure', status='False'), 60 | ], 61 | ) 62 | ), 63 | ] 64 | 65 | class TestCheckNodes(unittest.TestCase): 66 | 67 | @mock.patch('check_nodes.nagiosplugin.Metric') 68 | @mock.patch('check_nodes.client.CoreV1Api') 69 | @mock.patch('check_nodes.config.load_kube_config') 70 | def test_node_all_ok(self, mock_config, mock_client, mock_metric): 71 | mock_kube = mock.Mock() 72 | 73 | type(mock_kube.list_node.return_value).items = mock.PropertyMock(return_value=nodes_all_ok) 74 | 75 | mock_client.return_value = mock_kube 76 | 77 | cls = check_nodes.Nodes(kube_config='empty') 78 | cls.probe() 79 | 80 | mock_config.assert_called_with('empty') 81 | mock_kube.list_node.assert_called() 82 | 83 | mock_metric.assert_any_call('problem_nodes', 0, min=0) 84 | mock_metric.assert_any_call('all_nodes', 2, min=0) 85 | 86 | @mock.patch('check_nodes.nagiosplugin.Metric') 87 | @mock.patch('check_nodes.client.CoreV1Api') 88 | @mock.patch('check_nodes.config.load_kube_config') 89 | def test_node_one_problem(self, mock_config, mock_client, mock_metric): 90 | mock_kube = mock.Mock() 91 | 92 | type(mock_kube.list_node.return_value).items = mock.PropertyMock(return_value=nodes_one_problem) 93 | 94 | mock_client.return_value = mock_kube 95 | 96 | cls = check_nodes.Nodes(kube_config='empty') 97 | cls.probe() 98 | 99 | mock_config.assert_called_with('empty') 100 | mock_kube.list_node.assert_called() 101 | 102 | mock_metric.assert_any_call('problem_nodes', 1, min=0) 103 | mock_metric.assert_any_call('all_nodes', 2, min=0) 104 | 105 | @mock.patch('check_nodes.nagiosplugin.Metric') 106 | @mock.patch('check_nodes.client.CoreV1Api') 107 | @mock.patch('check_nodes.config.load_kube_config') 108 | def test_node_two_problem(self, mock_config, mock_client, mock_metric): 109 | mock_kube = mock.Mock() 110 | 111 | type(mock_kube.list_node.return_value).items = 
mock.PropertyMock(return_value=nodes_two_problem) 112 | 113 | mock_client.return_value = mock_kube 114 | 115 | cls = check_nodes.Nodes(kube_config='empty') 116 | cls.probe() 117 | 118 | mock_config.assert_called_with('empty') 119 | mock_kube.list_node.assert_called() 120 | 121 | mock_metric.assert_any_call('problem_nodes', 1, min=0) 122 | mock_metric.assert_any_call('all_nodes', 2, min=0) 123 | 124 | 125 | if __name__ == '__main__': 126 | unittest.main() 127 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | # Installation Instructions 2 | ## Pre-Requisites 3 | - Monitoring-User (nagios) with Home-Dir (/home/nagios) is set up 4 | - NRPE is working correctly 5 | 6 | ## Install from release 7 | ### download release versions (for release 1.2.1) 8 | ```bash 9 | [nagios@host ~]$ cd /home/nagios 10 | [nagios@host nagios]$ mkdir check_kubernetes 11 | [nagios@host nagios]$ cd check_kubernetes 12 | [nagios@host check_kubernetes]$ RELEASE_URL="https://github.com/T-Systems-MMS/check_kubernetes/releases/download/" 13 | [nagios@host check_kubernetes]$ VERSION="v1.2.1" 14 | [nagios@host check_kubernetes]$ wget ${RELEASE_URL}/${VERSION}/check_pods 15 | [nagios@host check_kubernetes]$ wget ${RELEASE_URL}/${VERSION}/check_nodes 16 | [nagios@host check_kubernetes]$ chmod 0750 check_pods check_nodes 17 | ``` 18 | 19 | ## Install from Source 20 | ### Python environment 21 | The following steps have to be executed as the Nagios/NRPE user (the user who will run the checks). 
22 | 23 | #### venv setup and clone 24 | ```bash 25 | [nagios@host ~]$ cd /home/nagios 26 | [nagios@host nagios]$ python3 -m venv k8s_mon_venv 27 | [nagios@host nagios]$ source k8s_mon_venv/bin/activate 28 | [nagios@host nagios]$ git clone https://github.com/T-Systems-MMS/check_kubernetes.git 29 | [nagios@host nagios]$ cd check_kubernetes 30 | [nagios@host check_kubernetes]$ pip install -r requirements.txt 31 | ``` 32 | 33 | ## Kubernetes Service Account Setup 34 | All files shown here can be found in folder k8s-sa-config. 35 | 36 | ### Service Account - 00_service_account.yaml 37 | ```yaml 38 | apiVersion: v1 39 | kind: ServiceAccount 40 | metadata: 41 | name: icinga-monitoring-sa 42 | ``` 43 | 44 | ### ClusterRole - 01_clusterrole.yaml 45 | ```yaml 46 | --- 47 | apiVersion: rbac.authorization.k8s.io/v1 48 | kind: ClusterRole 49 | metadata: 50 | name: icinga-monitoring 51 | rules: 52 | - apiGroups: [""] 53 | resources: ["pods", "nodes"] 54 | verbs: ["get", "watch", "list"] 55 | ``` 56 | 57 | ### ClusterRoleBinding - 02_clusterrolebinding.yaml 58 | ```yaml 59 | --- 60 | apiVersion: rbac.authorization.k8s.io/v1 61 | kind: ClusterRoleBinding 62 | metadata: 63 | name: icinga-monitor-pods 64 | subjects: 65 | - kind: ServiceAccount 66 | name: icinga-monitoring-sa 67 | namespace: default 68 | roleRef: 69 | kind: ClusterRole 70 | name: icinga-monitoring 71 | apiGroup: rbac.authorization.k8s.io 72 | ``` 73 | 74 | ## Kubernetes - Get kube-config for service Account 75 | ### get_config.sh 76 | We assume here that the script is executed on a master node. If this is not the case you 77 | must change `API_SERVER` here.
78 | If you've used another service account name, also change `SERVICEACCOUNT_NAME` to reflect the change. 79 | 80 | ```bash 81 | #!/bin/bash 82 | 83 | API_SERVER="https://localhost:6443" 84 | SERVICEACCOUNT_NAME=$(kubectl get sa | grep icinga | awk '{ print $1 }') 85 | SECRET_NAME=$(kubectl get secrets | grep "${SERVICEACCOUNT_NAME}-token" | awk '{ print $1 }') 86 | 87 | if [[ ${SERVICEACCOUNT_NAME} == "" ]]; then 88 | >&2 echo "Service account not found!" 89 | exit 1 90 | else 91 | >&2 echo "Found icinga Service Account: ${SECRET_NAME}" 92 | fi 93 | 94 | CA_CERT=$(kubectl get secret/"${SECRET_NAME}" -o jsonpath='{.data.ca\.crt}') 95 | SA_TOKEN=$(kubectl get secret/"${SECRET_NAME}" -o jsonpath='{.data.token}' | base64 --decode) 96 | NS=$(kubectl get secret/"${SECRET_NAME}" -o jsonpath='{.data.namespace}' | base64 --decode) 97 | 98 | echo " 99 | apiVersion: v1 100 | kind: Config 101 | clusters: 102 | - name: default-cluster 103 | cluster: 104 | certificate-authority-data: ${CA_CERT} 105 | server: ${API_SERVER} 106 | contexts: 107 | - name: default-context 108 | context: 109 | cluster: default-cluster 110 | namespace: ${NS} 111 | user: default-user 112 | current-context: default-context 113 | users: 114 | - name: default-user 115 | user: 116 | token: ${SA_TOKEN} 117 | " 118 | ``` 119 | 120 | To generate the kube-config for the service account, run the script and redirect its output to a location that is 121 | accessible for nagios/nrpe. 122 | The user running the script must have a connection to the Kubernetes cluster and the privileges to run kubectl commands 123 | at cluster level, e.g. root. 124 | 125 | ```bash 126 | [root@host ~]# chmod u+x get_config.sh 127 | [root@host ~]# ./get_config.sh > /home/nagios/kube-config 128 | [root@host ~]# chown nagios.nagios /home/nagios/kube-config 129 | [root@host ~]# chmod 0600 /home/nagios/kube-config 130 | ``` 131 | 132 | ## Testing the newly created service account 133 | You can test the service account configuration by running the checks manually. 
134 | 135 | ```bash 136 | [nagios@host ~]$ cd /home/nagios/ 137 | [nagios@host nagios]$ ls 138 | bin check_kubernetes k8s_mon_venv kube-config 139 | [nagios@host nagios]$ cd check_kubernetes/ 140 | [nagios@host check_kubernetes]$ ./check_nodes --kube-config ../kube-config 141 | NODES OK - problem_nodes is 0 | all_nodes=5;;;0 problem_nodes=0;1;2;0 142 | [nagios@host check_kubernetes]$ ./check_pods --kube-config ../kube-config 143 | PODS OK - 0 Pods Pending, 28 Pods Running, 0 Pods Succeeded, 0 Pods Failed, 0 Pods Unknown | Failed=0;;;0 Pending=0;;;0 Running=28;;;0 Succeeded=0;;;0 Unknown=0;;;0 144 | [nagios@host check_kubernetes]$ 145 | ``` 146 | 147 | # Troubleshooting 148 | ## Error: Hostname doesn't match 149 | If you get an exception like this:
150 | ``` 151 | NODES UNKNOWN: urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='localhost', port=6443): Max retries exceeded with url: /api/v1/nodes (Caused by SSLError(SSLCertVerificationError("hostname 'localhost' doesn't match either of '', 'kubernetes', 'kubernetes.default', 'kubernetes.default.svc', 'kubernetes.default.svc.cluster.local', '', ''"))) 152 | ``` 153 | check the "server" line under "cluster" in your ``kube-config`` and replace localhost with one of the names listed in the error message. 154 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------