├── .gitignore ├── requirements.txt ├── checksroot ├── nagios │ ├── info │ ├── pre │ ├── ssh │ ├── checks │ └── utils └── profiles │ ├── pre.txt │ ├── ssh.txt │ ├── checks.txt │ └── info.txt ├── .isort.cfg ├── webreport.png ├── .risu.conf ├── .flake8 ├── .github ├── labels.yml ├── workflows │ ├── label.yml │ ├── broken-link-check.yml │ ├── size.yaml │ └── refresh-checksmd.yml └── dependabot.yml ├── info ├── 03-pods ├── 04-machineset ├── 00-clusterversion ├── 01-clusteroperators ├── container-images-running ├── 02-nodes ├── ovs-hostnames ├── bmh-machine-node ├── node-versions ├── container-images-stored ├── biosversion ├── locks ├── ethtool-firmware-version └── mtu ├── scripts ├── update-checksmd ├── locks.sh ├── recover-northd.sh ├── README.md └── ovn_cleanConntrack.sh ├── pre ├── 00-install-config-valid-yaml └── dns-hostnames ├── checks ├── csr ├── terminating ├── mcp ├── pvc ├── pdb ├── notrunningpods ├── restarts ├── ctrlnodes ├── sriov ├── clusterversion_errors ├── bz1948052 ├── operators ├── port-thrashing ├── ovn-pods-memory-usage ├── alertmanager ├── zombies ├── entropy ├── nodes ├── chronyc ├── iptables-22623-22624 └── mellanox-firmware-version ├── Containerfile ├── cronjob.yaml ├── ssh └── bz1941840 ├── .pre-commit-config.yaml ├── checks.md ├── openshift-checks.sh ├── utils └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .history/ 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | risu 2 | -------------------------------------------------------------------------------- /checksroot/nagios/info: -------------------------------------------------------------------------------- 1 | ../../info -------------------------------------------------------------------------------- /checksroot/nagios/pre: -------------------------------------------------------------------------------- 1 | ../../pre -------------------------------------------------------------------------------- /checksroot/nagios/ssh: -------------------------------------------------------------------------------- 1 | ../../ssh -------------------------------------------------------------------------------- /checksroot/nagios/checks: -------------------------------------------------------------------------------- 1 | ../../checks -------------------------------------------------------------------------------- /checksroot/nagios/utils: -------------------------------------------------------------------------------- 1 | ../../utils -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | known_third_party = 3 | -------------------------------------------------------------------------------- /webreport.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RHsyseng/openshift-checks/HEAD/webreport.png -------------------------------------------------------------------------------- /.risu.conf: -------------------------------------------------------------------------------- 1 | {"web": true, "exclude": ["risuclient"], "extraplugintree": "checksroot/", "title": "OpenShift Checks", "output": "osc.json", "quiet": true} 2 | -------------------------------------------------------------------------------- /.flake8: 
-------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E203, E266, E501, W503, F403, F401, E402, E722, C901 3 | max-line-length = 79 4 | max-complexity = 18 5 | select = B,C,E,F,W,T4,B9 6 | -------------------------------------------------------------------------------- /checksroot/profiles/pre.txt: -------------------------------------------------------------------------------- 1 | # priority: 1000 2 | # 3 | # Defines which plugins to include, exclude, etc 4 | # Syntax 5 | # +keyword : includes keyword in plugin search 6 | # -keyword : excludes keyword in plugin search 7 | 8 | +pre/ 9 | -------------------------------------------------------------------------------- /checksroot/profiles/ssh.txt: -------------------------------------------------------------------------------- 1 | # priority: 1000 2 | # 3 | # Defines which plugins to include, exclude, etc 4 | # Syntax 5 | # +keyword : includes keyword in plugin search 6 | # -keyword : excludes keyword in plugin search 7 | 8 | +ssh/ 9 | -------------------------------------------------------------------------------- /checksroot/profiles/checks.txt: -------------------------------------------------------------------------------- 1 | # priority: 1000 2 | # 3 | # Defines which plugins to include, exclude, etc 4 | # Syntax 5 | # +keyword : includes keyword in plugin search 6 | # -keyword : excludes keyword in plugin search 7 | 8 | +checks/ 9 | -------------------------------------------------------------------------------- /checksroot/profiles/info.txt: -------------------------------------------------------------------------------- 1 | # priority: 1000 2 | # 3 | # Defines which plugins to include, exclude, etc 4 | # Syntax 5 | # +keyword : includes keyword in plugin search 6 | # -keyword : excludes keyword in plugin search 7 | 8 | +info/ 9 | -------------------------------------------------------------------------------- /.github/labels.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Add 'repo' label to any root file changes 3 | repo: 4 | - ./* 5 | 6 | checks: 7 | - checks/**/* 8 | 9 | pre: 10 | - pre/**/* 11 | 12 | info: 13 | - info/**/* 14 | 15 | ssh: 16 | - ssh/**/* 17 | 18 | scripts: 19 | - scripts/**/* 20 | -------------------------------------------------------------------------------- /info/03-pods: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Show the pods running in the cluster 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | if oc auth can-i get pods -A >/dev/null 2>&1; then 7 | msg "Total pods: $(oc get pods -A --no-headers | wc -l)" 8 | exit ${OCINFO} 9 | else 10 | msg "Couldn't get pods, check permissions" 11 | exit ${OCSKIP} 12 | fi 13 | exit ${OCUNKNOWN} 14 | -------------------------------------------------------------------------------- /scripts/update-checksmd: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Updates the checks.md out of scripts in the folders 'headers' 3 | 4 | for kind in info pre ssh checks; do 5 | echo """ 6 | # ${kind} 7 | | Script | Description | 8 | | - | - |""" 9 | 10 | for file in $(find ${kind} -type f -executable|sort -V); do 11 | echo "| [${file}](${file}) | $(grep '^# description:' ${file} | cut -d ":" -f 2-) |" 12 | done 13 | done 14 | -------------------------------------------------------------------------------- 
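For reference, the update-checksmd generator above turns each script's '# description:' header into one markdown table per folder; a minimal sketch of its output, assuming it is run from the repository root and using only the info/03-pods script shown earlier as input:
# info
| Script | Description |
| - | - |
| [info/03-pods](info/03-pods) | Show the pods running in the cluster |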
/info/04-machineset: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Show the machinesets status 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | NS="openshift-machine-api" 7 | 8 | if oc auth can-i get machinesets -n ${NS} >/dev/null 2>&1; then 9 | msg "$(oc get machineset -n ${NS})" 10 | else 11 | msg "Couldn't get machinesets, check permissions" 12 | exit ${OCSKIP} 13 | fi 14 | exit ${OCINFO} 15 | -------------------------------------------------------------------------------- /info/00-clusterversion: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Show the clusterversion 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | if oc auth can-i get clusterversion >/dev/null 2>&1; then 7 | msg "Cluster version:\n$(oc get clusterversion/version)" 8 | exit ${OCINFO} 9 | else 10 | msg "Couldn't get clusterversion, check permissions" 11 | exit ${OCSKIP} 12 | fi 13 | exit ${OCUNKNOWN} 14 | -------------------------------------------------------------------------------- /info/01-clusteroperators: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Show the clusteroperators 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | if oc auth can-i get clusteroperators >/dev/null 2>&1; then 7 | msg "Cluster operators:\n$(oc get clusteroperators)" 8 | exit ${OCINFO} 9 | else 10 | msg "Couldn't get clusteroperators, check permissions" 11 | exit ${OCSKIP} 12 | fi 13 | exit ${OCUNKNOWN} 14 | -------------------------------------------------------------------------------- /.github/workflows/label.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: "Label PRs from globs" 3 | on: 4 | schedule: 5 | - cron: "0 * * * *" 6 | 7 | concurrency: 8 | group: ${{ github.workflow }}-${{ github.ref }} 9 | cancel-in-progress: true 10 | 11 | jobs: 12 | execute: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: jpmcb/prow-github-actions@v1.1.3 16 | with: 17 | jobs: 'pr-labeler' 18 | github-token: "${{ secrets.GITHUB_TOKEN }}" 19 | -------------------------------------------------------------------------------- /info/container-images-running: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Show the images of the containers running in the cluster 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | if oc auth can-i get pods -A >/dev/null 2>&1; then 7 | IMAGES=$(oc get pods -A -o go-template --template='{{range .items}}{{range .spec.containers}}{{printf "%s\n" .image -}} {{end}}{{end}}' | sort -u) 8 | msg "Images:\n${IMAGES}" 9 | exit ${OCINFO} 10 | else 11 | msg "Couldn't get pods, check permissions" 12 | exit ${OCSKIP} 13 | fi 14 | exit ${OCUNKNOWN} 15 | -------------------------------------------------------------------------------- /pre/00-install-config-valid-yaml: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if the install-config.yaml file is a valid yaml file 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | if ! 
command -v yq >/dev/null 2>&1; then 7 | msg "yq command not found" 8 | exit ${OCSKIP} 9 | fi 10 | 11 | if yq eval ${INSTALL_CONFIG_PATH} >/dev/null; then 12 | msg "${INSTALL_CONFIG_PATH} seems valid" 13 | exit ${OCOK} 14 | else 15 | errors=$(("${errors}" + 1)) 16 | msg "${INSTALL_CONFIG_PATH} doesn't seem valid" 17 | if [ ! -z "${ERRORFILE}" ]; then 18 | echo $errors >${ERRORFILE} 19 | fi 20 | exit ${OCERROR} 21 | fi 22 | -------------------------------------------------------------------------------- /.github/workflows/broken-link-check.yml: -------------------------------------------------------------------------------- 1 | --- 2 | on: 3 | schedule: 4 | - cron: "0 0 * * *" # daily 5 | repository_dispatch: # run manually 6 | types: [check-link] 7 | # push: 8 | # ... 9 | 10 | concurrency: 11 | group: ${{ github.workflow }}-${{ github.ref }} 12 | cancel-in-progress: true 13 | 14 | name: Broken Link Check 15 | jobs: 16 | check: 17 | name: Broken Link Check 18 | runs-on: ubuntu-latest 19 | steps: 20 | - name: Broken Link Check 21 | uses: technote-space/broken-link-checker-action@v2.3.1 22 | with: 23 | EXCLUDED_KEYWORDS: | 24 | docs.github.com 25 | camo.githubusercontent.com 26 | github.com/apps/dependabot 27 | -------------------------------------------------------------------------------- /.github/workflows/size.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Size 3 | 4 | on: 5 | pull_request_target: 6 | types: [opened, synchronize] 7 | 8 | jobs: 9 | update_labels: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | 14 | - uses: actions-ecosystem/action-size@v2 15 | id: size 16 | 17 | - uses: actions-ecosystem/action-remove-labels@v1 18 | with: 19 | github_token: ${{ secrets.github_token }} 20 | labels: ${{ steps.size.outputs.stale_labels }} 21 | 22 | - uses: actions-ecosystem/action-add-labels@v1 23 | with: 24 | github_token: ${{ secrets.github_token }} 25 | labels: ${{ steps.size.outputs.new_label }} 26 | -------------------------------------------------------------------------------- /info/02-nodes: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Show the nodes status 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | if oc auth can-i get nodes -A >/dev/null 2>&1; then 7 | msg "Nodes:\n$(oc get nodes -o wide)" 8 | msg "Masters: $(oc get nodes -o name --no-headers --selector='node-role.kubernetes.io/master' | wc -l)" 9 | msg "Workers: $(oc get nodes -o name --no-headers --selector='node-role.kubernetes.io/worker' | wc -l)" 10 | msg "Others: $(oc get nodes -o name --no-headers --selector='!node-role.kubernetes.io/worker,!node-role.kubernetes.io/master' | wc -l)" 11 | msg "Total nodes: $(oc get nodes -o name --no-headers | wc -l)" 12 | exit ${OCINFO} 13 | else 14 | msg "Couldn't get nodes, check permissions" 15 | exit ${OCSKIP} 16 | fi 17 | exit ${OCUNKNOWN} 18 | -------------------------------------------------------------------------------- /checks/csr: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if there are pending csr 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | error=false 7 | 8 | if oc auth can-i get csr >/dev/null 2>&1; then 9 | pending_csr=$(oc get csr --no-headers --ignore-not-found=true | grep -ci 'pending') 10 | if [[ ${pending_csr} -ge 1 ]]; then 11 | PCSR=$(oc get csr
--no-headers | grep -i 'pending') 12 | msg "Pending CSRs (${pending_csr}): ${PCSR}" 13 | errors=$(("${errors}" + 1)) 14 | error=true 15 | fi 16 | if [ ! -z "${ERRORFILE}" ]; then 17 | echo $errors >${ERRORFILE} 18 | fi 19 | if [[ $error == true ]]; then 20 | exit ${OCERROR} 21 | else 22 | exit ${OCOK} 23 | fi 24 | else 25 | msg "Couldn't get csr, check permissions" 26 | exit ${OCSKIP} 27 | fi 28 | exit ${OCUNKNOWN} 29 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # To get started with Dependabot version updates, you'll need to specify which 3 | # package ecosystems to update and where the package manifests are located. 4 | # Please see the documentation for all configuration options: 5 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 6 | 7 | version: 2 8 | updates: 9 | - package-ecosystem: "pip" # See documentation for possible values 10 | directory: "/" # Location of package manifests 11 | schedule: 12 | interval: "daily" 13 | 14 | # Maintain dependencies for GitHub Actions 15 | - package-ecosystem: "github-actions" 16 | directory: "/" 17 | schedule: 18 | interval: "daily" 19 | commit-message: 20 | prefix: build 21 | prefix-development: chore 22 | include: scope 23 | -------------------------------------------------------------------------------- /checks/terminating: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if there are pods terminating 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | error=false 7 | 8 | if oc auth can-i get pods -A >/dev/null 2>&1; then 9 | terminating_pods=$(oc get pods -A | grep -c 'Terminating') 10 | if [[ $terminating_pods -ge 1 ]]; then 11 | TERMPODS=$(oc get pods -A | grep 'Terminating') 12 | msg "Pods in Terminating state ($terminating_pods):\n${RED}${TERMPODS}${NOCOLOR}" 13 | errors=$(("${errors}" + 1)) 14 | error=true 15 | fi 16 | if [ ! -z "${ERRORFILE}" ]; then 17 | echo $errors >${ERRORFILE} 18 | fi 19 | if [[ $error == true ]]; then 20 | exit ${OCERROR} 21 | else 22 | exit ${OCOK} 23 | fi 24 | else 25 | msg "Couldn't get all pods, check permissions" 26 | exit ${OCSKIP} 27 | fi 28 | exit ${OCUNKNOWN} 29 | -------------------------------------------------------------------------------- /checks/mcp: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if there are degraded mcp 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | error=false 7 | 8 | if oc auth can-i get mcp >/dev/null 2>&1; then 9 | degraded_mcps=$(oc get mcp -o json | jq '.items[] | { name: .metadata.name, status: .status } | select (.status.degradedMachineCount >= 1) | { name: .name, status: .status.degradedMachineCount}') 10 | if [[ -n $degraded_mcps ]]; then 11 | DEGRADED=$(echo "${degraded_mcps}" | jq .) 12 | msg "MachineConfigPools in Degraded State: ${RED}${DEGRADED}${NOCOLOR}" 13 | errors=$(("${errors}" + 1)) 14 | error=true 15 | fi 16 | if [ !
-z "${ERRORFILE}" ]; then 17 | echo $errors >${ERRORFILE} 18 | fi 19 | if [[ $error == true ]]; then 20 | exit ${OCERROR} 21 | else 22 | exit ${OCOK} 23 | fi 24 | 25 | else 26 | msg "Couldn't get mcp, check permissions" 27 | exit ${OCSKIP} 28 | fi 29 | exit ${OCUNKNOWN} 30 | -------------------------------------------------------------------------------- /checks/pvc: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if there are persistent volume claims that are not bound 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | error=false 7 | 8 | if oc auth can-i get pvc -A >/dev/null 2>&1; then 9 | pvc_not_bound=$(oc get pvc -A -o json | jq '.items[] | { name: .metadata.name, namespace: .metadata.namespace, phase: .status.phase } | select (.phase!="Bound")') 10 | if [[ -n ${pvc_not_bound} ]]; then 11 | PVCNOTBOUND=$(echo "${pvc_not_bound}" | jq .) 12 | msg "Persistent Volume Claims ${RED}NotBound${NOCOLOR}: ${PVCNOTBOUND}" 13 | errors=$(("${errors}" + 1)) 14 | error=true 15 | fi 16 | if [ ! -z "${ERRORFILE}" ]; then 17 | echo $errors >${ERRORFILE} 18 | fi 19 | if [[ $error == true ]]; then 20 | exit ${OCERROR} 21 | else 22 | exit ${OCOK} 23 | fi 24 | 25 | else 26 | msg "Couldn't get pvc, check permissions" 27 | exit ${OCSKIP} 28 | fi 29 | exit ${OCUNKNOWN} 30 | -------------------------------------------------------------------------------- /info/ovs-hostnames: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Show the ovs database chassis hostnames 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | if oc auth can-i exec pod -n openshift-ovn-kubernetes >/dev/null 2>&1 && oc get ns openshift-ovn-kubernetes >/dev/null 2>&1; then 7 | OVSHOSTNAMES=$(oc -n openshift-ovn-kubernetes exec pod/"$(oc -n openshift-ovn-kubernetes get pod -l app=ovnkube-master,component=network -o jsonpath='{.items[0].metadata.name}')" -c northd -- ovn-sbctl --no-leader-only list chassis | awk '/hostname/ { print $3 }' | sort -n) 8 | if [ -n "${OVSHOSTNAMES}" ]; then 9 | msg "OVS hostnames:\n${OVSHOSTNAMES}" 10 | exit ${OCINFO} 11 | else 12 | msg "Couldn't get ovs-hostnames, check permissions" 13 | exit ${OCSKIP} 14 | fi 15 | else 16 | msg "Couldn't get ovs-hostnames, either the cluster is not using OVN, or the running user has insufficient permissions" 17 | exit ${OCSKIP} 18 | fi 19 | 20 | exit ${OCUNKNOWN} 21 | -------------------------------------------------------------------------------- /checks/pdb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if there are PodDisruptionBudgets with 0 disruptions allowed 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | error=false 7 | 8 | if oc auth can-i get pdb >/dev/null 2>&1; then 9 | if [ $(is_sno) -eq 1 ]; then 10 | exit ${OCSKIP} 11 | fi 12 | wrong_pdb=$(oc get pdb -A -o json | jq '.items[] | { name: .metadata.name, status: .status } | select (.status.disruptionsAllowed == 0) | { name: .name}') 13 | if [[ -n $wrong_pdb ]]; then 14 | DEGRADED=$(echo "${wrong_pdb}" | jq .) 15 | msg "PodDisruptionBudget with 0 disruptions allowed: ${RED}${DEGRADED}${NOCOLOR}" 16 | errors=$((errors + 1)) 17 | error=true 18 | fi 19 | if [ ! 
-z "${ERRORFILE}" ]; then 20 | echo $errors >${ERRORFILE} 21 | fi 22 | if [[ $error == true ]]; then 23 | exit ${OCERROR} 24 | else 25 | exit ${OCOK} 26 | fi 27 | 28 | else 29 | msg "Couldn't get pdb, check permissions" 30 | exit ${OCSKIP} 31 | fi 32 | exit ${OCUNKNOWNN} 33 | -------------------------------------------------------------------------------- /checks/notrunningpods: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if there are not running pods 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | error=false 7 | 8 | if oc auth can-i get pods -A >/dev/null 2>&1; then 9 | # Get all nonrunning pods with headers even if they are not found 10 | notrunning=$(oc get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded --ignore-not-found=true) 11 | HEADER=$(echo "${notrunning}" | head -n1) 12 | PODS=$(echo "${notrunning}" | tail -n +2) 13 | if [[ -n ${PODS} ]]; then 14 | msg "Pods not running ($(echo "${PODS}" | wc -l)):\n${HEADER}\n${RED}${PODS}${NOCOLOR}" 15 | errors=$(("${errors}" + 1)) 16 | error=true 17 | fi 18 | if [ ! -z "${ERRORFILE}" ]; then 19 | echo $errors >${ERRORFILE} 20 | fi 21 | if [[ $error == true ]]; then 22 | exit ${OCERROR} 23 | else 24 | exit ${OCOK} 25 | fi 26 | 27 | else 28 | msg "Couldn't get all pods, check permissions" 29 | exit ${OCSKIP} 30 | fi 31 | exit ${OCUNKNOWNN} 32 | -------------------------------------------------------------------------------- /checks/restarts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if there are pods restarted > n times (10 by default) 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | error=false 7 | 8 | if oc auth can-i get pods -A >/dev/null 2>&1; then 9 | restarts=$(oc get pods -o json -A | jq -r ".items[] | { name: .metadata.name, project: .metadata.namespace, restarts: .status.containerStatuses[].restartCount } | select(.restarts > $RESTART_THRESHOLD)" 2>/dev/null) 10 | if [[ -n $restarts ]]; then 11 | RESTARTS=$(echo "${restarts}" | jq -r '. | "\(.project)\t\(.name)\t\(.restarts)"' | column -t -N "NAMESPACE,NAME,RESTARTS") 12 | msg "Pods that have a high restart count (> $RESTART_THRESHOLD):\n${RED}${RESTARTS}${NOCOLOR}" 13 | errors=$(("${errors}" + 1)) 14 | error=true 15 | fi 16 | if [ ! 
-z "${ERRORFILE}" ]; then 17 | echo $errors >${ERRORFILE} 18 | fi 19 | if [[ $error == true ]]; then 20 | exit ${OCERROR} 21 | else 22 | exit ${OCOK} 23 | fi 24 | else 25 | msg "Couldn't get all pods, check permissions" 26 | exit ${OCSKIP} 27 | fi 28 | exit ${OCUNKNOWNN} 29 | -------------------------------------------------------------------------------- /checks/ctrlnodes: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if any controller nodes have had the NoSchedule taint removed 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | error=false 7 | 8 | if oc auth can-i get nodes >/dev/null 2>&1; then 9 | if [ $(is_sno) -eq 1 ]; then 10 | exit ${OCSKIP} 11 | fi 12 | scheduable_controllers=$(oc get nodes -o json | jq '.items[] | { name: .metadata.name, scheduable: .spec.taints, control: .metadata.labels."node-role.kubernetes.io/master" } | select((.control == "") and (.scheduable == null))') 13 | if [[ -n ${scheduable_controllers} ]]; then 14 | SCHEDCTRL=$(echo "${scheduable_controllers}" | jq '. | { name: .name }') 15 | msg "Controllers ${RED}Scheduable${NOCOLOR}: ${SCHEDCTRL}" 16 | errors=$(("${errors}" + 1)) 17 | error=true 18 | fi 19 | if [ ! -z "${ERRORFILE}" ]; then 20 | echo $errors >${ERRORFILE} 21 | fi 22 | if [[ $error == true ]]; then 23 | exit ${OCERROR} 24 | else 25 | exit ${OCOK} 26 | fi 27 | else 28 | msg "Couldn't get nodes, check permissions" 29 | exit ${OCSKIP} 30 | fi 31 | exit ${OCUNKNOWN} 32 | -------------------------------------------------------------------------------- /info/bmh-machine-node: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Show the node,machine and bmh relationship 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | NS="openshift-machine-api" 7 | 8 | if oc auth can-i get nodes -A >/dev/null 2>&1; then 9 | if oc auth can-i get bmh -n ${NS} >/dev/null 2>&1; then 10 | if oc auth can-i get machines -n ${NS} >/dev/null 2>&1; then 11 | for bmh in $(oc get bmh -n openshift-machine-api -o jsonpath='{.items[*].metadata.name}'); do 12 | MACHINE=$(oc get -n openshift-machine-api bmh/${bmh} -o jsonpath='{.spec.consumerRef.name}') 13 | NODE=$(oc get -n openshift-machine-api machine/${MACHINE} -o jsonpath='{.status.nodeRef.name}') 14 | msg "Node ${NODE} => Machine: ${MACHINE}, BMH: ${bmh}" 15 | done 16 | exit ${OCINFO} 17 | else 18 | msg "Couldn't get machines, check permissions" 19 | exit ${OCSKIP} 20 | fi 21 | else 22 | msg "Couldn't get baremetalhosts, check permissions" 23 | exit ${OCSKIP} 24 | fi 25 | else 26 | msg "Couldn't get nodes, check permissions" 27 | exit ${OCSKIP} 28 | fi 29 | 30 | exit ${OCUNKNOWN} 31 | -------------------------------------------------------------------------------- /info/node-versions: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Show node components versions such as kubelet, crio, kernel, etc. 
3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | if oc auth can-i get nodes -A >/dev/null 2>&1; then 7 | KUBELETVERSIONS=$(oc get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.nodeInfo.kubeletVersion}{"\n"}{end}' | column -t -N "NODE,KUBELET") 8 | CRIOVERSIONS=$(oc get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.nodeInfo.containerRuntimeVersion}{"\n"}{end}' | column -t -N "NODE,CRIO") 9 | KERNELVERSIONS=$(oc get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.nodeInfo.kernelVersion}{"\n"}{end}' | column -t -N "NODE,KERNEL") 10 | OSIMAGEVERSIONS=$(oc get nodes -o jsonpath='{range .items[*]}{.metadata.name}{":"}{.status.nodeInfo.osImage}{"\n"}{end}' | column -t -s ":" -N "NODE,OSIMAGE") 11 | msg "${KUBELETVERSIONS}" 12 | msg "${CRIOVERSIONS}" 13 | msg "${KERNELVERSIONS}" 14 | msg "${OSIMAGEVERSIONS}" 15 | exit ${OCINFO} 16 | else 17 | msg "Couldn't get nodes, check permissions" 18 | exit ${OCSKIP} 19 | fi 20 | exit ${OCUNKNOWN} 21 | -------------------------------------------------------------------------------- /Containerfile: -------------------------------------------------------------------------------- 1 | FROM registry.access.redhat.com/ubi8/ubi:latest 2 | 3 | WORKDIR /opt/openshift-checks 4 | 5 | # Some required binaries 6 | RUN dnf clean all && \ 7 | dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm && \ 8 | dnf update -y && \ 9 | dnf install -y jq curl util-linux bind-utils python38 && \ 10 | dnf clean all 11 | 12 | # YQ doesn't provide a RPM, download the latest 13 | RUN curl -sL $(curl -sL https://api.github.com/repos/mikefarah/yq/releases/latest | jq -r '.assets[] | select(.name == "yq_linux_amd64") | .browser_download_url') -o /usr/local/bin/yq &&\ 14 | chmod a+x /usr/local/bin/yq 15 | 16 | # Download latest oc binary 17 | RUN curl -sL https://mirror.openshift.com/pub/openshift-v4/clients/ocp/latest/openshift-client-linux.tar.gz | tar -C /usr/local/bin -xzf - oc kubectl 18 | 19 | RUN groupadd -g 9999 appuser && \ 20 | useradd -r -u 9999 -g appuser appuser 21 | 22 | COPY . 
/opt/openshift-checks 23 | RUN pip3 install -r requirements.txt 24 | 25 | RUN chown -R appuser.appuser /opt/openshift-checks 26 | 27 | USER appuser 28 | 29 | ENTRYPOINT [ "/opt/openshift-checks/openshift-checks.sh" ] 30 | -------------------------------------------------------------------------------- /info/container-images-stored: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Show the container images stored in the cluster hosts 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | if oc auth can-i debug node >/dev/null 2>&1; then 7 | msg "Checking container images stored in the cluster (${BLUE}using oc debug, it can take a while${NOCOLOR})" 8 | # shellcheck disable=SC2016 9 | for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do 10 | # See https://medium.com/@robert.i.sandor/getting-started-with-parallelization-in-bash-e114f4353691 11 | ((i = i % PARALLELJOBS)) 12 | ((i++ == 0)) && wait 13 | ( 14 | ocdebugorwait # Pause for no OC debug running 15 | oc debug --image="${OCDEBUGIMAGE}" "${node}" -- chroot /host sh -c "crictl images -o json" 2>/dev/null | jq -r .images[].repoTags[] 16 | ) & 17 | done | sort -u 18 | wait 19 | exit ${OCINFO} 20 | else 21 | msg "Couldn't debug nodes, check permissions" 22 | exit ${OCSKIP} 23 | fi 24 | exit ${OCUNKNOWN} 25 | -------------------------------------------------------------------------------- /checks/sriov: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if the SR-IOV network state is synced 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | error=false 7 | 8 | oc get subs sriov-network-operator-subscription -n openshift-sriov-network-operator &>/dev/null 9 | if [ $? -ne 0 ]; then 10 | # SR-IOV operator is not installed 11 | exit ${OCSKIP} 12 | fi 13 | 14 | if oc auth can-i get SriovNetworkNodeState >/dev/null 2>&1; then 15 | sriov_bad_state=$(oc get SriovNetworkNodeState -n openshift-sriov-network-operator -o json | jq '.items[] | { name: .metadata.name, syncStatus: .status.syncStatus } | select (.syncStatus !="Succeeded")') 16 | if [[ -n ${sriov_bad_state} ]]; then 17 | SRIOVBADSTATE=$(echo "${sriov_bad_state}" | jq .) 18 | msg "Nodes ${RED}NotSynced${NOCOLOR}: ${SRIOVBADSTATE}" 19 | errors=$(("${errors}" + 1)) 20 | error=true 21 | fi 22 | if [ ! 
-z "${ERRORFILE}" ]; then 23 | echo $errors >${ERRORFILE} 24 | fi 25 | if [[ $error == true ]]; then 26 | exit ${OCERROR} 27 | else 28 | exit ${OCOK} 29 | fi 30 | 31 | else 32 | msg "Couldn't get SriovNetworkNodeState, check permissions" 33 | exit ${OCSKIP} 34 | fi 35 | exit ${OCUNKNOWN} 36 | -------------------------------------------------------------------------------- /checks/clusterversion_errors: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if there are clusterversion errors 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | error=false 7 | 8 | if oc auth can-i get clusterversion >/dev/null 2>&1; then 9 | clusterversion_msgs=$(oc get clusterversion -o json | jq '.items[].status.conditions[] | select ((.status == "True") and (.type == "Failing") and (.message != null)) | { message: .message }') 10 | count_errors=$(echo "${clusterversion_msgs}" | jq .message | wc -l) 11 | if [[ ${count_errors} -ge 1 ]]; then 12 | final="" 13 | OLDIFS=$IFS 14 | IFS=$'\n' 15 | 16 | for message in $(echo "${clusterversion_msgs}" | jq .message); do 17 | final="${final} ${message}" 18 | done 19 | IFS=${OLDIFS} 20 | msg "Clusterversion error status message: ${RED}${final}${NOCOLOR}" 21 | errors=$(("${errors}" + 1)) 22 | error=true 23 | fi 24 | if [ ! -z "${ERRORFILE}" ]; then 25 | echo $errors >${ERRORFILE} 26 | fi 27 | if [[ $error == true ]]; then 28 | exit ${OCERROR} 29 | else 30 | exit ${OCOK} 31 | fi 32 | else 33 | msg "Couldn't get clusterversion, check permissions" 34 | exit ${OCSKIP} 35 | fi 36 | exit ${OCUNKNOWN} 37 | -------------------------------------------------------------------------------- /checks/bz1948052: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # long_name: Checks for BZ 1948052 3 | # description: Checks for BZ 1948052 based on kernel version 4 | # bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1948052 5 | # priority: 600 6 | 7 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 8 | 9 | BADKERNEL="4.18.0-193.24.1.el8_2.dt1.x86_64" 10 | error=false 11 | 12 | if oc auth can-i get nodes >/dev/null 2>&1; then 13 | for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do 14 | kernel_version=$(oc get ${node} -o jsonpath={.status.nodeInfo.kernelVersion}) 15 | if [[ ${kernel_version} == ${BADKERNEL} ]]; then 16 | msg "${RED}Node ${node} contains ${BADKERNEL} kernel version${NOCOLOR}" 17 | errors=$(("${errors}" + 1)) 18 | error=true 19 | fi 20 | done 21 | if [ ! 
-z "${ERRORFILE}" ]; then 22 | echo $errors >${ERRORFILE} 23 | fi 24 | if [[ $error == true ]]; then 25 | exit ${OCERROR} 26 | else 27 | exit ${OCOK} 28 | fi 29 | else 30 | msg "Couldn't get nodes, check permissions" 31 | exit ${OCSKIP} 32 | fi 33 | exit ${OCUNKNOWN} 34 | -------------------------------------------------------------------------------- /scripts/locks.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | declare -A ns_pods 4 | ORIG_IFS=$IFS 5 | IFS=$(echo -en "\n\b") 6 | 7 | for line in $(sudo lslocks | egrep -v '(unknown)' | awk '{print $2}' | sort -nr | uniq -c | sort -nr | egrep -v 'unknown|-1' | grep -v PID); do 8 | count=$(echo $line | awk '{print $1}') 9 | pid=$(echo $line | awk '{print $2}') 10 | orig_pid=$pid 11 | ppid=$(grep PPid /proc/${pid}/status | awk '{print $2}') 12 | while [[ $ppid -gt 1 ]]; do 13 | pid=$ppid 14 | ppid=$(grep PPid /proc/${pid}/status | awk '{print $2}') 15 | done 16 | if [[ $ppid -eq 1 ]]; then 17 | ppid=$pid 18 | fi 19 | if [[ $(ps -hp $ppid -o cmd | grep -c conmon) -eq 1 ]]; then 20 | ns=$(ps -hp $ppid -o cmd | grep conmon | awk '{print $9}' | awk -F/ '{print $5}' | awk -F_ '{print $1}') 21 | pod=$(ps -hp $ppid -o cmd | grep conmon | awk '{print $9}' | awk -F/ '{print $5}' | awk -F_ '{print $2}') 22 | if [ ${ns_pods["${ns}/${pod}"]} ]; then 23 | ns_pods["${ns}/${pod}"]=$(expr ${ns_pods["${ns}/${pod}"]} + $count) 24 | else 25 | ns_pods["${ns}/${pod}"]=$count 26 | fi 27 | fi 28 | done 29 | for pod in "${!ns_pods[@]}"; do 30 | echo $pod ${ns_pods[$pod]} 31 | done | sort -nr -k2 | column -t 32 | 33 | IFS=$ORIG_IFS 34 | -------------------------------------------------------------------------------- /checks/operators: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if there are operators in 'bad' state 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | error=false 7 | 8 | if oc auth can-i get co >/dev/null 2>&1; then 9 | bad_cluster_operators=$(oc get co --no-headers | grep -E -civ 'True.*False.*False') 10 | if [[ ${bad_cluster_operators} -ge 1 ]]; then 11 | BADCOPS=$(oc get co --no-headers | grep -E -iv 'True.*False.*False') 12 | msg "Cluster Operators in Bad State (${bad_cluster_operators}):\n${RED}${BADCOPS}${NOCOLOR}" 13 | errors=$(("${errors}" + 1)) 14 | fi 15 | bad_operators=$(oc get csv -l \!olm.copiedFrom -A -o json | jq '.items[] | { name: .metadata.name, namespace: .metadata.namespace, phase: .status.phase } | select (.phase!="Succeeded")') 16 | if [[ -n ${bad_operators} ]]; then 17 | BADOPS=$(echo "${bad_operators}" | jq .) 18 | msg "Operators in ${RED}Bad State${NOCOLOR}: ${BADOPS}" 19 | errors=$(("${errors}" + 1)) 20 | error=true 21 | fi 22 | if [ ! 
-z "${ERRORFILE}" ]; then 23 | echo $errors >${ERRORFILE} 24 | fi 25 | if [[ $error == true ]]; then 26 | exit ${OCERROR} 27 | else 28 | exit ${OCOK} 29 | fi 30 | 31 | else 32 | msg "Couldn't get co, check permissions" 33 | exit ${OCSKIP} 34 | fi 35 | exit ${OCUNKNOWNN} 36 | -------------------------------------------------------------------------------- /info/biosversion: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Show the nodes' BIOS version 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | # Check BIOS version to begin with 7 | if oc auth can-i debug node >/dev/null 2>&1; then 8 | msg "Checking bios versions (${BLUE}using oc debug, it can take a while${NOCOLOR})" 9 | # shellcheck disable=SC2016 10 | for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do 11 | # See https://medium.com/@robert.i.sandor/getting-started-with-parallelization-in-bash-e114f4353691 12 | ((i = i % PARALLELJOBS)) 13 | ((i++ == 0)) && wait 14 | ( 15 | ocdebugorwait # Pause for no OC debug running 16 | if ! BIOSVER=$(oc debug --image="${OCDEBUGIMAGE}" "${node}" -- chroot /host sh -c "cat /sys/class/dmi/id/bios_version" 2>/dev/null); then 17 | msg "${ORANGE}Error running oc debug in ${node}${NOCOLOR}" 18 | else 19 | if [ -n "${BIOSVER}" ]; then 20 | msg "${node}: ${BIOSVER}" 21 | else 22 | msg "Couldn't found /sys/class/dmi/id/bios_version in ${node}" 23 | fi 24 | fi 25 | ) & 26 | done 27 | wait 28 | exit ${OCINFO} 29 | else 30 | msg "Couldn't debug nodes, check permissions" 31 | exit ${OCSKIP} 32 | fi 33 | exit ${OCUNKNOWN} 34 | -------------------------------------------------------------------------------- /info/locks: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: List all pods with locks on each node 3 | 4 | ORIG_IFS=$IFS 5 | IFS=$(echo -en "\n\b") 6 | 7 | SCRIPT64=$(cat ./scripts/locks.sh | base64 -w 0) 8 | 9 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 10 | if oc auth can-i debug node >/dev/null 2>&1; then 11 | msg "Checking for locks by pod, per node (${BLUE}using oc debug, it can take a while${NOCOLOR})" 12 | fw_errors=0 13 | # shellcheck disable=SC2016 14 | for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do 15 | ocdebugorwait # Pause for no OC debug running 16 | # shellcheck disable=SC1083 17 | if ! 
FILE_LOCKS=$(oc debug --image="${OCDEBUGIMAGE}" "${node}" -- chroot /host sh -c "echo $SCRIPT64 | base64 -d > /tmp/locks.sh; chmod 755 /tmp/locks.sh; /tmp/locks.sh; rm -f /tmp/locks.sh"); then 18 | msg "${ORANGE}Error running oc debug in ${node}${NOCOLOR}" 19 | else 20 | if [ -n "${FILE_LOCKS}" ]; then 21 | msg "File locks found on ${RED}${node}${NOCOLOR}" 22 | for line in ${FILE_LOCKS}; do 23 | echo $line 24 | done 25 | else 26 | msg "No file locks found on ${node}" 27 | fi 28 | fi 29 | done 30 | exit ${OCINFO} 31 | else 32 | msg "Couldn't debug nodes, check permissions" 33 | exit ${OCSKIP} 34 | fi 35 | exit ${OCUNKNOWN} 36 | -------------------------------------------------------------------------------- /checks/port-thrashing: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if there are OVN pods thrashing 3 | 4 | THRASHINGMSG="Changing chassis for lport" 5 | NAMESPACE="openshift-ovn-kubernetes" 6 | 7 | error=false 8 | 9 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 10 | 11 | if [[ $(oc get network/cluster -o jsonpath={.spec.networkType}) != "OVNKubernetes" ]]; then 12 | msg "This check only works for OVNKubernetes SDN" 13 | exit ${OCSKIP} 14 | else 15 | if oc auth can-i get pods -n ${NAMESPACE} >/dev/null 2>&1; then 16 | if oc auth can-i get pods --subresource=log -n ${NAMESPACE} >/dev/null 2>&1; then 17 | for pod in $(oc get pods -o name -n ${NAMESPACE} -l app=ovnkube-node); do 18 | numerrors=$(oc logs -n ${NAMESPACE} ${pod} -c ovn-controller | grep "${THRASHINGMSG}" -c) 19 | if [[ ${numerrors} -gt ${THRASHING_THRESHOLD} ]]; then 20 | msg "${RED}${pod} port thrashing errors detected${NOCOLOR}" 21 | errors=$(("${errors}" + 1)) 22 | error=true 23 | fi 24 | 25 | done 26 | if [ ! -z "${ERRORFILE}" ]; then 27 | echo $errors >${ERRORFILE} 28 | fi 29 | if [[ $error == true ]]; then 30 | exit ${OCERROR} 31 | else 32 | exit ${OCOK} 33 | fi 34 | else 35 | msg "Couldn't get pod logs, check permissions" 36 | exit ${OCSKIP} 37 | fi 38 | else 39 | msg "Couldn't get pods, check permissions" 40 | exit ${OCSKIP} 41 | fi 42 | fi 43 | exit ${OCUNKNOWN} 44 | -------------------------------------------------------------------------------- /checks/ovn-pods-memory-usage: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if the memory usage of the OVN pods is under the LIMIT threshold 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | error=false 7 | 8 | if oc auth can-i adm top -A >/dev/null 2>&1; then 9 | LIMIT="${OVN_MEMORY_LIMIT:=5000}" 10 | FLAG=0 11 | pods_memory_usage=$(oc adm top pod -n openshift-ovn-kubernetes -l app=ovnkube-node --no-headers | awk '{ print $1 " " $3 }' | awk '{$2 = substr($2,0,length($2)-2)} 1') 12 | MESSAGE="" 13 | 14 | OLDIFS=${IFS} 15 | IFS=$'\n' 16 | for pod_line in ${pods_memory_usage}; do 17 | pod_name=$(echo $pod_line | awk '{ print $1 }') 18 | pod_size=$(echo $pod_line | awk '{ print $2 }') 19 | if [[ ${pod_size} -ge ${LIMIT} ]]; then 20 | MESSAGE="${MESSAGE}The OVN pod memory usage for ${pod_name} is extremely high: ${RED}${pod_size}${NOCOLOR}Mi\n" 21 | FLAG=1 22 | fi 23 | done 24 | IFS=${OLDIFS} 25 | 26 | if [[ ${FLAG} -ne 0 ]]; then 27 | MESSAGE="${MESSAGE}For more information you can check the KCS https://access.redhat.com/solutions/6493321\n" 28 | msg "${MESSAGE}" 29 | errors=$(("${errors}" + 1)) 30 | error=true 31 | fi 32 | 33 | if [ !
-z "${ERRORFILE}" ]; then 34 | echo $errors >${ERRORFILE} 35 | fi 36 | 37 | if [[ $error == true ]]; then 38 | exit ${OCERROR} 39 | else 40 | exit ${OCOK} 41 | fi 42 | else 43 | msg "Couldn't adm top pods, check permissions" 44 | exit ${OCSKIP} 45 | fi 46 | exit ${OCUNKNOWNN} 47 | -------------------------------------------------------------------------------- /info/ethtool-firmware-version: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Show the nodes' NIC firmware version using ethtool 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | if oc auth can-i debug node >/dev/null 2>&1; then 7 | msg "Checking NIC firmware version using ethtool (${BLUE}using oc debug, it can take a while${NOCOLOR})" 8 | # shellcheck disable=SC2016 9 | for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do 10 | # See https://medium.com/@robert.i.sandor/getting-started-with-parallelization-in-bash-e114f4353691 11 | ((i = i % PARALLELJOBS)) 12 | ((i++ == 0)) && wait 13 | ( 14 | ocdebugorwait # Pause for no OC debug running 15 | if ! FIRMWAREVERS=$(oc debug --image="${OSETOOLSIMAGE}" "${node}" -- sh -c "for interface in \$(ls -d /sys/class/net/*/device | cut -d/ -f5); do echo -n \"\${interface} => \"; ethtool -i \${interface} | awk '/firmware-version/ { print substr(\$0, index(\$0,\$2)) }';done" 2>/dev/null); then 16 | msg "${ORANGE}Error running oc debug in ${node}${NOCOLOR}" 17 | else 18 | if [ -n "${FIRMWAREVERS}" ]; then 19 | msg "${node}:\n${FIRMWAREVERS}" 20 | else 21 | msg "Couldn't find NIC firmware version in ${node}" 22 | fi 23 | fi 24 | ) & 25 | done 26 | wait 27 | exit ${OCINFO} 28 | else 29 | msg "Couldn't debug nodes, check permissions" 30 | exit ${OCSKIP} 31 | fi 32 | exit ${OCUNKNOWN} 33 | -------------------------------------------------------------------------------- /checks/alertmanager: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if there are warning or error alerts firing 3 | # kb: https://access.redhat.com/solutions/4250221 4 | 5 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 6 | if oc auth can-i get routes -n openshift-monitoring >/dev/null 2>&1; then 7 | alert_url=$(oc -n openshift-monitoring get routes/alertmanager-main -o json | jq -r .spec.host) 8 | raw_alerts=$(curl -s -k -H "Authorization: Bearer $(oc -n openshift-monitoring sa get-token prometheus-k8s)" https://$alert_url/api/v2/alerts) 9 | if [ $? -eq 35 ]; then 10 | # Error code 35 might mean an issue with a proxy server 11 | raw_alerts=$(curl --noproxy '*' -s -k -H "Authorization: Bearer $(oc -n openshift-monitoring sa get-token prometheus-k8s)" https://$alert_url/api/v2/alerts) 12 | fi 13 | alerts=$(echo $raw_alerts | jq '.[] | {alert:.labels.alertname, severity:.labels.severity, namespace:.labels.namespace, instance:.labels.instance, message:(.annotations.message // .annotations.summary)} | select((.severity == "warning") or (.severity == "critical"))') 14 | if [[ -n ${alerts} ]]; then 15 | ALERTS=$(echo "${alerts}" | jq -r '. 
| "\(.severity)\t\(.alert)\t\(.namespace)\t\(.instance)\t\(.message)"' | column -t -s $'\t' -N "SEVERITY,ALERT,NAMESPACE,INSTANCE,MESSAGE") 16 | msg "Alerts currently firing:\n${RED}${ALERTS}${NOCOLOR}\n" 17 | errors=$(("${errors}" + 1)) 18 | if [ ! -z "${ERRORFILE}" ]; then 19 | echo $errors >${ERRORFILE} 20 | fi 21 | exit ${OCERROR} 22 | fi 23 | exit ${OCOK} 24 | else 25 | msg "Couldn't get routes, check permissions" 26 | exit ${OCSKIP} 27 | fi 28 | exit ${OCUNKNOWN} 29 | -------------------------------------------------------------------------------- /cronjob.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: Namespace 4 | metadata: 5 | name: checks-openshift 6 | --- 7 | apiVersion: v1 8 | kind: ServiceAccount 9 | metadata: 10 | name: checks-openshift 11 | namespace: checks-openshift 12 | --- 13 | apiVersion: rbac.authorization.k8s.io/v1 14 | kind: ClusterRoleBinding 15 | metadata: 16 | name: checks-openshift 17 | roleRef: 18 | apiGroup: rbac.authorization.k8s.io 19 | kind: ClusterRole 20 | name: cluster-admin 21 | subjects: 22 | - kind: ServiceAccount 23 | name: checks-openshift 24 | namespace: checks-openshift 25 | --- 26 | apiVersion: batch/v1beta1 27 | kind: CronJob 28 | metadata: 29 | name: checks-openshift 30 | namespace: checks-openshift 31 | spec: 32 | concurrencyPolicy: Forbid 33 | failedJobsHistoryLimit: 3 34 | jobTemplate: 35 | spec: 36 | template: 37 | spec: 38 | tolerations: 39 | - effect: NoSchedule 40 | key: node-role.kubernetes.io/master 41 | operator: Exists 42 | affinity: {} 43 | containers: 44 | - name: checks-openshift 45 | image: quay.io/rhsysdeseng/openshift-checks:latest 46 | imagePullPolicy: IfNotPresent 47 | command: ["/bin/sh", "-c", "/opt/openshift-checks/openshift-checks.sh"] 48 | resources: 49 | requests: 50 | cpu: 100m 51 | memory: 256Mi 52 | serviceAccountName: checks-openshift 53 | restartPolicy: Never 54 | terminationGracePeriodSeconds: 30 55 | backoffLimit: 0 56 | schedule: "53 * * * *" 57 | successfulJobsHistoryLimit: 3 58 | suspend: false 59 | -------------------------------------------------------------------------------- /checks/zombies: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if more than 5 zombie processes exist on the hosts 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | tmperrorfile=$(mktemp) 7 | trap "rm ${errorfile}" EXIT 8 | echo 0 >$tmperrorfile 9 | 10 | if oc auth can-i debug node >/dev/null 2>&1; then 11 | msg "Collecting zombie processes... 
(${BLUE}using oc debug, it can take a while${NOCOLOR})" 12 | # shellcheck disable=SC2016 13 | for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do 14 | # See https://medium.com/@robert.i.sandor/getting-started-with-parallelization-in-bash-e114f4353691 15 | ((i = i % PARALLELJOBS)) 16 | ((i++ == 0)) && wait 17 | ( 18 | ocdebugorwait # Pause for no OC debug running 19 | ZOMBIES=$(oc debug --image="${OCDEBUGIMAGE}" "${node}" -- chroot /host sh -c 'ps -ef | grep -c "[d]efunct"' 2>/dev/null) 20 | if [ -n "${ZOMBIES}" ] && [ "${ZOMBIES}" -gt 0 ]; then 21 | msg "${ORANGE}${ZOMBIES}${NOCOLOR} zombie processes found in ${node}" 22 | if [ "${ZOMBIES}" -ge 5 ]; then 23 | echo 1 >$tmperrorfile 24 | fi 25 | fi 26 | ) & 27 | done 28 | wait 29 | if [ "$(cat $tmperrorfile)" -eq 1 ]; then 30 | errors=$(("${errors}" + 1)) 31 | if [ ! -z "${ERRORFILE}" ]; then 32 | echo $errors >${ERRORFILE} 33 | fi 34 | exit ${OCERROR} 35 | else 36 | exit ${OCOK} 37 | fi 38 | else 39 | msg "Couldn't debug nodes, check permissions" 40 | exit ${OCSKIP} 41 | fi 42 | exit ${OCUNKNOWN} 43 | -------------------------------------------------------------------------------- /checks/entropy: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if the workers have enough entropy 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | tmperrorfile=$(mktemp) 7 | trap "rm ${tmperrorfile}" EXIT 8 | echo 0 >$tmperrorfile 9 | 10 | if oc auth can-i debug node >/dev/null 2>&1; then 11 | msg "Collecting entropy data... (${BLUE}using oc debug, it can take a while${NOCOLOR})" 12 | # shellcheck disable=SC2016 13 | for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do 14 | # See https://medium.com/@robert.i.sandor/getting-started-with-parallelization-in-bash-e114f4353691 15 | ((i = i % PARALLELJOBS)) 16 | ((i++ == 0)) && wait 17 | ( 18 | ocdebugorwait # Pause for no OC debug running 19 | if ! ENTROPY=$(oc debug --image="${OCDEBUGIMAGE}" "${node}" -- chroot /host sh -c 'cat /proc/sys/kernel/random/entropy_avail' 2>/dev/null); then 20 | msg "${ORANGE}Error running oc debug in ${node}${NOCOLOR}" 21 | else 22 | if [ -n "${ENTROPY}" ] && [ "${ENTROPY}" -lt 200 ]; then 23 | msg "${RED}Low entropy in ${node}${NOCOLOR}" 24 | echo 1 >$tmperrorfile 25 | fi 26 | fi 27 | ) & 28 | done 29 | wait 30 | if [ "$(cat $tmperrorfile)" -eq 1 ]; then 31 | errors=$(("${errors}" + 1)) 32 | if [ !
-z "${ERRORFILE}" ]; then 33 | echo $errors >${ERRORFILE} 34 | fi 35 | exit ${OCERROR} 36 | else 37 | exit ${OCOK} 38 | fi 39 | else 40 | msg "Couldn't debug nodes, check permissions" 41 | exit ${OCSKIP} 42 | fi 43 | exit ${OCUNKNOWN} 44 | -------------------------------------------------------------------------------- /ssh/bz1941840: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if the authentication-operator is using excessive RAM -> hung kubelet BZ1941840 3 | # bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1948052 4 | 5 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 6 | 7 | error=false 8 | 9 | if oc auth can-i get pods -n openshift-authentication-operator >/dev/null 2>&1; then 10 | msg "Checking for a hung kubelet..." 11 | # shellcheck disable=SC2016 12 | node=$(oc get pods -n openshift-authentication-operator -l app=authentication-operator -o json | jq -r .items[0].spec.nodeName) 13 | container_id=$(oc get pods -n openshift-authentication-operator -l app=authentication-operator -o json | jq -r .items[0].status.containerStatuses[0].containerID | awk -F// '{print $2}' | cut -c-13) 14 | if ! AUTH_OPERATOR_MEMORY=$(ssh -q core@$node "sudo crictl stats --id ${container_id} -o json | jq -r .stats[0].memory.workingSetBytes.value"); then 15 | msg "${ORANGE}Error running crictl stats openshift-authentication-operator/${pod}${NOCOLOR}" 16 | else 17 | if [ -n "${AUTH_OPERATOR_MEMORY}" ] && [ "${AUTH_OPERATOR_MEMORY}" -gt 2147483648 ]; then # more than 2GB is a bad sign 18 | msg "${RED}High memory usage detected for openshift-authentication-operator, which likely means that kubelet on ${node} is hung. Terminate the pod to remediate${NOCOLOR}" 19 | errors=$(("${errors}" + 1)) 20 | error=true 21 | fi 22 | fi 23 | if [ ! -z "${ERRORFILE}" ]; then 24 | echo $errors >${ERRORFILE} 25 | fi 26 | if [[ $error == true ]]; then 27 | exit ${OCERROR} 28 | else 29 | exit ${OCOK} 30 | fi 31 | else 32 | msg "Couldn't get pods, check permissions" 33 | exit ${OCSKIP} 34 | fi 35 | exit ${OCUNKNOWN} 36 | -------------------------------------------------------------------------------- /checks/nodes: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if there are not ready or not schedulable nodes 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | error=false 7 | 8 | if oc auth can-i get nodes >/dev/null 2>&1; then 9 | nodes_not_ready=$(oc get nodes -o json | jq '.items[] | { name: .metadata.name, type: .status.conditions[] } | select ((.type.type == "Ready") and (.type.status != "True"))') 10 | if [[ -n ${nodes_not_ready} ]]; then 11 | NODESNOTREADY=$(echo "${nodes_not_ready}" | jq .) 12 | msg "Nodes ${RED}NotReady${NOCOLOR}: ${NODESNOTREADY}" 13 | errors=$(("${errors}" + 1)) 14 | error=true 15 | fi 16 | disabled_nodes=$(oc get nodes -o json | jq '.items[] | { name: .metadata.name, status: .spec.unschedulable } | select (.status == true)') 17 | if [[ -n ${disabled_nodes} ]]; then 18 | NODESDISABLED=$(echo "${disabled_nodes}" | jq .) 
19 | msg "Nodes ${RED}Disabled${NOCOLOR}: ${NODESDISABLED}" 20 | errors=$(("${errors}" + 1)) 21 | error=true 22 | fi 23 | pressure_nodes=$(oc get node -o json | jq '.items[] | { name: .metadata.name, conditions: .status.conditions[] } | select ((.conditions.type | contains("Pressure")) and .conditions.status != "False")') 24 | if [[ -n ${pressure_nodes} ]]; then 25 | NODESPRESSURE=$(echo "${pressure_nodes}" | jq .) 26 | msg "Nodes with ${RED}Pressure${NOCOLOR}: ${NODESPRESSURE}" 27 | errors=$(("${errors}" + 1)) 28 | fi 29 | if [ ! -z "${ERRORFILE}" ]; then 30 | echo $errors >${ERRORFILE} 31 | fi 32 | if [[ $error == true ]]; then 33 | exit ${OCERROR} 34 | else 35 | exit ${OCOK} 36 | fi 37 | 38 | else 39 | msg "Couldn't get nodes, check permissions" 40 | exit ${OCSKIP} 41 | fi 42 | exit ${OCUNKNOWNN} 43 | -------------------------------------------------------------------------------- /checks/chronyc: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if the worker clocks are synced using chronyc 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | tmperrorfile=$(mktemp) 7 | trap "rm ${errorfile}" EXIT 8 | echo 0 >$tmperrorfile 9 | 10 | if oc auth can-i debug node >/dev/null 2>&1; then 11 | msg "Collecting NTP data... (${BLUE}using oc debug, it can take a while${NOCOLOR})" 12 | # shellcheck disable=SC2016 13 | for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do 14 | # See https://medium.com/@robert.i.sandor/getting-started-with-parallelization-in-bash-e114f4353691 15 | ((i = i % PARALLELJOBS)) 16 | ((i++ == 0)) && wait 17 | ( 18 | # shellcheck disable=2016 19 | ocdebugorwait # Pause for no OC debug running 20 | if ! SOURCES=$(oc debug --image="${OCDEBUGIMAGE}" "${node}" -- chroot /host sh -c 'chronyc activity' 2>/dev/null | awk '/sources online/ { print $1 }'); then 21 | msg "${ORANGE}Error running oc debug in ${node}${NOCOLOR}" 22 | else 23 | if [ -n "${SOURCES}" ] && [ "${SOURCES}" -lt 1 ]; then 24 | msg "${RED}Clock doesn't seem to be synced in ${node}${NOCOLOR}" 25 | echo 1 >$tmperrorfile 26 | fi 27 | fi 28 | ) & 29 | done 30 | wait 31 | if [ "$(cat $tmperrorfile)" -eq 1 ]; then 32 | errors=$(("${errors}" + 1)) 33 | if [ ! 
-z "${ERRORFILE}" ]; then 34 | echo $errors >${ERRORFILE} 35 | fi 36 | exit ${OCERROR} 37 | else 38 | exit ${OCOK} 39 | fi 40 | else 41 | msg "Couldn't debug nodes, check permissions" 42 | exit ${OCSKIP} 43 | fi 44 | exit ${OCUNKNOWN} 45 | -------------------------------------------------------------------------------- /pre/dns-hostnames: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if the api and wildcard DNS entries are correct 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | error=false 7 | 8 | BASEDOMAIN=$(yq e '.baseDomain' ${INSTALL_CONFIG_PATH} 2>/dev/null) 9 | 10 | if [ -z ${BASEDOMAIN} ]; then 11 | errors=$(("${errors}" + 1)) 12 | error=true 13 | msg ".baseDomain not found in ${INSTALL_CONFIG_PATH}" 14 | fi 15 | 16 | CLUSTERNAME=$(yq e '.metadata.name' ${INSTALL_CONFIG_PATH} 2>/dev/null) 17 | if [ -z ${CLUSTERNAME} ]; then 18 | errors=$(("${errors}" + 1)) 19 | error=true 20 | msg ".metadata.name not found in ${INSTALL_CONFIG_PATH}" 21 | fi 22 | 23 | #MASTERS=$(yq e '.platform.baremetal.hosts[] | select(.role == "master") | .name' ${INSTALL_CONFIG_PATH} 2> /dev/null) 24 | #WORKERS=$(yq e '.platform.baremetal.hosts[] | select(.role == "worker") | .name' ${INSTALL_CONFIG_PATH} 2> /dev/null) 25 | # NS1="ns1."${CLUSTERNAME}"."${BASEDOMAIN} 26 | 27 | API="api."${CLUSTERNAME}"."${BASEDOMAIN}"." 28 | WILDCARD="foobar.apps."${CLUSTERNAME}"."${BASEDOMAIN}"." 29 | 30 | IP_API=$(dig +short ${API}) 31 | if [ -z ${IP_API} ]; then 32 | errors=$(("${errors}" + 1)) 33 | error=true 34 | msg "${RED}${API} doesn't resolve${NOCOLOR}" 35 | fi 36 | 37 | IP_WILDCARD=$(dig +short ${WILDCARD}) 38 | if [ -z ${IP_WILDCARD} ]; then 39 | errors=$(("${errors}" + 1)) 40 | error=true 41 | msg "${RED}${WILDCARD} doesn't resolve${NOCOLOR}" 42 | fi 43 | 44 | IP_API_REVERSE=$(dig +short -x ${IP_API}) 45 | if [ -z ${IP_API_REVERSE} ]; then 46 | errors=$(("${errors}" + 1)) 47 | error=true 48 | msg "${YELLOW}api reverse not found${NOCOLOR}" 49 | else 50 | if [ ${IP_API_REVERSE} != ${API} ]; then 51 | errors=$(("${errors}" + 1)) 52 | error=true 53 | msg "${YELLOW}${API} doesn't match the reverse ${IP_API_REVERSE}${NOCOLOR}" 54 | fi 55 | fi 56 | 57 | # Wildcard reverse DNS doesn't seem to be a thing 58 | #IP_WILDCARD_REVERSE=$(dig +short -x ${IP_WILDCARD}) 59 | #if [ -z ${IP_WILDCARD_REVERSE} ]; then 60 | # msg "${YELLOW}wildcard reverse not found${NOCOLOR}" 61 | # else 62 | # if [ ${IP_WILDCARD_REVERSE} != ${WILDCARD} ]; then 63 | # msg "${YELLOW}${WILDCARD} doesn't match the reverse ${IP_WILDCARD_REVERSE}${NOCOLOR}" 64 | # fi 65 | #fi 66 | 67 | if [ ! -z "${ERRORFILE}" ]; then 68 | echo $errors >${ERRORFILE} 69 | fi 70 | if [[ $error == true ]]; then 71 | exit ${OCERROR} 72 | else 73 | exit ${OCOK} 74 | fi 75 | -------------------------------------------------------------------------------- /.github/workflows/refresh-checksmd.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Checks.md updater 3 | 4 | on: 5 | # Compare the preceeding commit of main -> to the current commit of the main branch. 6 | # (Note: To compare changes between the last pushed commit to the remote main branch set `since_last_remote_commit: true`) 7 | push: 8 | branches: 9 | - main 10 | # Compare the last commit of main -> to the current commit of a PR branch. 
11 | # (Note: To compare changes between the last pushed commit to the remote PR branch set `since_last_remote_commit: true`) 12 | pull_request: 13 | branches: 14 | - main 15 | 16 | jobs: 17 | build: 18 | runs-on: ubuntu-latest # windows-latest | macos-latest 19 | name: Test changed-files 20 | steps: 21 | - uses: actions/checkout@v4 22 | with: 23 | fetch-depth: 0 # OR "2" -> To retrieve the preceding commit. 24 | 25 | - name: Get changed files in the checks folder 26 | id: changed-files-specific 27 | uses: tj-actions/changed-files@v40 28 | with: 29 | files: | 30 | checks/** 31 | info/** 32 | pre/** 33 | ssh/** 34 | 35 | - name: Run step if any file(s) in the watched folder change 36 | if: steps.changed-files-specific.outputs.any_changed == 'true' 37 | run: | 38 | echo "One or more files in the scripts folder has changed, updating checks.md" 39 | echo "List all the files that have changed: ${{ steps.changed-files-specific.outputs.all_changed_files }}" 40 | ./scripts/update-checksmd > checks.md 41 | 42 | - name: Commit back the checks.md to the repository 43 | if: steps.changed-files-specific.outputs.any_changed == 'true' 44 | uses: stefanzweifel/git-auto-commit-action@v5 45 | with: 46 | # Optional. Commit message for the created commit. 47 | # Defaults to "Apply automatic changes" 48 | commit_message: "[skip ci] Autoupdate Checks.md on change" 49 | 50 | # Optional. Local and remote branch name where commit is going to be pushed 51 | # to. Defaults to the current branch. 52 | # You might need to set `create_branch: true` if the branch does not exist. 53 | branch: main 54 | 55 | # Optional. Options used by `git-commit`. 56 | # See https://git-scm.com/docs/git-commit#_options 57 | commit_options: '--no-verify --signoff' 58 | 59 | # Optional glob pattern of files which should be added to the commit 60 | # Defaults to all (.) 61 | # See the `pathspec`-documentation for git 62 | # - https://git-scm.com/docs/git-add#Documentation/git-add.txt-ltpathspecgt82308203 63 | # - https://git-scm.com/docs/gitglossary#Documentation/gitglossary.txt-aiddefpathspecapathspec 64 | file_pattern: 'checks.md' 65 | 66 | # Optional. Local file path to the repository. 67 | # Defaults to the root of the repository. 68 | repository: . 
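      # For reference (illustrative): the same refresh can be reproduced locally from the
      # repository root with
      #   ./scripts/update-checksmd > checks.md
      # and committing the regenerated file; the auto-commit step above simply automates that.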
69 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | fail_fast: true 3 | repos: 4 | - hooks: 5 | - id: commitizen 6 | stages: 7 | - commit-msg 8 | repo: https://github.com/commitizen-tools/commitizen 9 | rev: v2.42.0 10 | - hooks: 11 | - id: check-useless-excludes 12 | repo: meta 13 | - hooks: 14 | - files: \.(css|js|md|markdown|json) 15 | id: prettier 16 | repo: https://github.com/pre-commit/mirrors-prettier 17 | rev: v3.0.0-alpha.4 18 | - hooks: 19 | - id: seed-isort-config 20 | repo: https://github.com/asottile/seed-isort-config 21 | rev: v2.2.0 22 | - hooks: 23 | - id: isort 24 | repo: https://github.com/pre-commit/mirrors-isort 25 | rev: v5.10.1 26 | - hooks: 27 | - id: black 28 | repo: https://github.com/python/black 29 | rev: 23.1.0 30 | - hooks: 31 | - id: check-added-large-files 32 | - id: check-ast 33 | - id: check-case-conflict 34 | - id: check-executables-have-shebangs 35 | - id: check-json 36 | - id: check-merge-conflict 37 | - id: check-symlinks 38 | - id: check-vcs-permalinks 39 | - id: debug-statements 40 | - id: check-xml 41 | - args: 42 | - --unsafe 43 | id: check-yaml 44 | - id: end-of-file-fixer 45 | - id: forbid-new-submodules 46 | - args: 47 | - --branch 48 | - gh-pages 49 | id: no-commit-to-branch 50 | - id: requirements-txt-fixer 51 | - id: sort-simple-yaml 52 | - id: trailing-whitespace 53 | - id: mixed-line-ending 54 | - id: detect-private-key 55 | - id: check-byte-order-marker 56 | - id: check-docstring-first 57 | repo: https://github.com/pre-commit/pre-commit-hooks 58 | rev: v4.4.0 59 | - hooks: 60 | - id: flake8 61 | repo: https://github.com/pycqa/flake8 62 | rev: 6.0.0 63 | - hooks: 64 | - additional_dependencies: 65 | - mvdan.cc/sh/v3/cmd/shfmt@v3.1.1 66 | args: 67 | - -w 68 | - -i 69 | - "2" 70 | - -s 71 | entry: shfmt 72 | id: shfmt 73 | language: golang 74 | minimum_pre_commit_version: 2.4.0 75 | name: shfmt 76 | types: 77 | - shell 78 | repo: local 79 | - hooks: 80 | - id: blacken-docs 81 | repo: https://github.com/asottile/blacken-docs 82 | rev: 1.13.0 83 | 84 | # - repo: https://github.com/asottile/pyupgrade 85 | # rev: v2.38.0 86 | # hooks: 87 | # - id: pyupgrade 88 | # args: [--py39-plus] 89 | 90 | - repo: https://github.com/jumanjihouse/pre-commit-hook-yamlfmt 91 | rev: 0.2.2 # or other specific tag 92 | hooks: 93 | - id: yamlfmt 94 | args: [--mapping, '2', --sequence, '4', --offset, '2', '--preserve-quotes'] 95 | 96 | 97 | - repo: https://github.com/hcodes/yaspeller.git 98 | rev: v8.0.1 99 | hooks: 100 | - id: yaspeller 101 | types: 102 | - markdown 103 | -------------------------------------------------------------------------------- /checks/iptables-22623-22624: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # kb: https://access.redhat.com/solutions/5709711 3 | # description: Checks if the nodes iptables rules are blocking 22623/tpc or 22624/tcp 4 | 5 | # 6 | # To check if the rule exist, we use iptables -C, it returns 0 if the rule exist 7 | # and if it doesn't exist, it exits 1 with the following message: 8 | # "iptables: Bad rule (does a matching rule exist in that chain?)." 9 | # 10 | # To save cycles, we run every command in the same oc debug session. 
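# Illustrative example of that behaviour (run directly on a node, same rule spec as below):
#   iptables -C OUTPUT -p tcp --dport 22623 -j REJECT --reject-with icmp-port-unreachable \
#     && echo 'rule present' || echo 'rule absent'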
11 | # We concatenate all commands with || so the chain stops as soon as one of 12 | # them succeeds (returns 0, i.e. the rule exists) 13 | 14 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 15 | 16 | tmperrorfile=$(mktemp) 17 | trap "rm ${tmperrorfile}" EXIT 18 | echo 0 >$tmperrorfile 19 | 20 | if oc auth can-i debug node >/dev/null 2>&1; then 21 | msg "Checking if ports 22623/tcp and 22624/tcp are blocked (${BLUE}using oc debug, it can take a while${NOCOLOR})" 22 | # shellcheck disable=SC2016 23 | for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do 24 | # See https://medium.com/@robert.i.sandor/getting-started-with-parallelization-in-bash-e114f4353691 25 | ((i = i % PARALLELJOBS)) 26 | ((i++ == 0)) && wait 27 | ( 28 | ocdebugorwait # Pause for no OC debug running 29 | # shellcheck disable=2016 30 | OUTPUT=$(oc debug --image="${OCDEBUGIMAGE}" "${node}" -- chroot /host sh -c \ 31 | "iptables -C FORWARD -p tcp --dport 22623 -j REJECT --reject-with icmp-port-unreachable &>/dev/null || \ 32 | iptables -C FORWARD -p tcp --dport 22624 -j REJECT --reject-with icmp-port-unreachable &>/dev/null || \ 33 | iptables -C OUTPUT -p tcp --dport 22623 -j REJECT --reject-with icmp-port-unreachable &>/dev/null || \ 34 | iptables -C OUTPUT -p tcp --dport 22624 -j REJECT --reject-with icmp-port-unreachable &>/dev/null || \ 35 | echo 'allok'" 2>&1) 36 | # The command stderr and stdout are captured 37 | # If the command output is 'allok', it is because every other command 38 | # failed, meaning the iptables rules weren't found 39 | if [[ ${OUTPUT} =~ "allok" ]]; then 40 | # Do nothing 41 | : 42 | elif [[ ${OUTPUT} =~ "Back-off" ]]; then 43 | msg "${ORANGE}Error pulling the oc debug image in ${node}${NOCOLOR}" 44 | elif [[ ${OUTPUT} =~ "unable to create" ]]; then 45 | msg "${ORANGE}Unable to create debug pod in ${node}${NOCOLOR}" 46 | else 47 | msg "${RED}iptables rules for 22623/tcp or 22624/tcp found in ${node}${NOCOLOR}" 48 | echo 1 >$tmperrorfile 49 | fi 50 | ) & 51 | done 52 | wait 53 | if [ "$(cat $tmperrorfile)" -eq 1 ]; then 54 | errors=$(("${errors}" + 1)) 55 | if [ ! -z "${ERRORFILE}" ]; then 56 | echo $errors >${ERRORFILE} 57 | fi 58 | exit ${OCERROR} 59 | else 60 | exit ${OCOK} 61 | fi 62 | 63 | else 64 | msg "Couldn't debug nodes, check permissions" 65 | exit ${OCSKIP} 66 | fi 67 | exit ${OCUNKNOWN} 68 | -------------------------------------------------------------------------------- /checks/mellanox-firmware-version: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if the nodes' Mellanox Connect-4 firmware version is below the recommended version.
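# For reference (illustrative, <interface> is a placeholder): the same data can be read
# manually on a node with "lspci -nn -d 15b3:" to list Mellanox devices and
# "ethtool -i <interface> | grep firmware-version" to read a NIC's firmware; the check
# below automates this through 'oc debug' for every Ready node.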
3 | 4 | # lspci -nn shows PCI vendor and device codes (and names) 5 | # Mellanox MT27710 Family [ConnectX-4 Lx] 25GbE dual-port SFP28 with **vendor ID 0x15b3 and device ID 0x1015** 6 | # Mellanox MT27800 Family [ConnectX-5] 25GbE dual-port SFP28 with **vendor ID 0x15b3 and device ID 0x1017** 7 | # Mellanox MT27800 Family [ConnectX-5] 100GbE with **vendor ID 0x15b3 and device ID 0x1017** 8 | # Mellanox MT27700 Family [ConnectX-4] VPI adapter card, EDR IB (100Gb/s), single-port QSFP28 with **vendor ID 0x15b3 and device ID 0x1013** 9 | # Mellanox MT27800 Family [ConnectX-5] VPI adapter card, EDR IB (100Gb/s), single-port QSFP28 with **vendor ID 0x15b3 and device ID 0x1017** 10 | # Mellanox MT28908 Family [ConnectX-6] VPI adapter card, 100Gb/s (HDR100, EDR IB), single-port QSFP56 with **vendor ID 0x15b3 and device ID 0x101b** 11 | # Mellanox MT28908 Family [ConnectX-6] VPI adapter card, HDR200 IB (200Gb/s), single-port QSFP56 with vendor ID **0x15b3 and device ID 0x101b** 12 | 13 | IDS="15b3:1015 15b3:1017 15b3:1013 15b3:101b" 14 | MIN_VERS=16.28 15 | 16 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 17 | if oc auth can-i debug node >/dev/null 2>&1; then 18 | msg "Checking Mellanox firmware version (${BLUE}using oc debug, it can take a while${NOCOLOR})" 19 | fw_errors=0 20 | # shellcheck disable=SC2016 21 | for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do 22 | # shellcheck disable=SC1083 23 | ocdebugorwait # Pause for no OC debug running 24 | if ! FIRMWAREVERS=$(oc debug --image="${OCDEBUGIMAGE}" "${node}" -- chroot /host sh -c "for id in ${IDS}; do for device in \$(lspci -D -d "\${id}" | awk '{ print \$1 }'); do echo -n \"\${device},\" ; ethtool -i \$(ls /sys/bus/pci/devices/\${device}/net/)|grep firmware-version|cut -d: -f2-|xargs echo|awk '{ print \$1 }';done;done" 2>/dev/null); then 25 | msg "${ORANGE}Error running oc debug in ${node}${NOCOLOR}" 26 | else 27 | if [ -n "${FIRMWAREVERS}" ]; then 28 | for result in ${FIRMWAREVERS}; do 29 | dev=$(echo ${result} | awk -F, '{print $1}') 30 | fw=$(echo ${result} | awk -F, '{print $2}' | awk -F. '{print $1"."$2}') 31 | if [[ $(expr ${fw} \< ${MIN_VERS}) -eq 1 ]]; then 32 | msg "Firmware for Mellanox card ${RED}${dev}${NOCOLOR} (${fw}) on ${RED}${node}${NOCOLOR} is below the minimum recommended version. Please upgrade to at least ${GREEN}${MIN_VERS}${NOCOLOR}." 33 | errors=$(("${errors}" + 1)) 34 | fw_errors=$(("${fw_errors}" + 1)) 35 | if [ ! 
-z "${ERRORFILE}" ]; then 36 | echo $errors >${ERRORFILE} 37 | fi 38 | fi 39 | done 40 | else 41 | msg "Couldn't find Mellanox firmware version in ${node}" 42 | fi 43 | fi 44 | done 45 | if [[ $fw_errors -gt 0 ]]; then 46 | exit ${OCERROR} 47 | fi 48 | exit ${OCINFO} 49 | else 50 | msg "Couldn't debug nodes, check permissions" 51 | exit ${OCSKIP} 52 | fi 53 | exit ${OCUNKNOWN} 54 | -------------------------------------------------------------------------------- /info/mtu: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Show the nodes' MTU for some interfaces 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | if oc auth can-i get network/cluster >/dev/null 2>&1; then 7 | network_type=$(oc get network/cluster -ojson | jq -r .status.networkType) 8 | if [[ $network_type != "OVNKubernetes" ]]; then 9 | msg "MTU checks not supported yet for clusters using $network_type" 10 | exit ${OCSKIP} 11 | else 12 | # If the crd doesn't exist it will return 0 if you are cluster-admin as you have permissions to get * 13 | # So it is needed to 'get' the object as well to verify it does exist 14 | if oc auth can-i get nodenetworkstates.nmstate.io -A >/dev/null 2>&1 && oc get nodenetworkstates.nmstate.io -o jsonpath='{.items[*].metadata.name}' >/dev/null 2>&1; then 15 | # We need to split the next command using spaces, hence, using standard IFS 16 | OLDIFS=$IFS 17 | IFS=$' \t\n' 18 | for nns in $(oc get nodenetworkstates.nmstate.io -o jsonpath='{.items[*].metadata.name}'); do 19 | NNS=$(oc get nodenetworkstates.nmstate.io "${nns}" -o json 2>/dev/null) 20 | BREXMTU=$(echo "${NNS}" | jq '.status.currentState.interfaces[] | select(.name == "br-ex" and .type == "ovs-interface") | .mtu') 21 | BREXPHYSINT=$(echo "${NNS}" | jq -r '.status.currentState.interfaces[] | select(.name == "br-ex" and .type == "ovs-bridge") | .bridge.port[] | select(.name != "br-ex") | .name') 22 | PHYSINTMTU=$(echo "${NNS}" | jq ".status.currentState.interfaces[] | select(.name == \"${BREXPHYSINT}\") | .mtu") 23 | OVNK8SMP0MTU=$(echo "${NNS}" | jq '.status.currentState.interfaces[] | select(.name == "ovn-k8s-mp0") | .mtu') 24 | msg "${nns} => br-ex:${BREXMTU}, ${BREXPHYSINT}:${PHYSINTMTU}, ovn-k8s-mp0:${OVNK8SMP0MTU}" 25 | done 26 | IFS=${OLDIFS} 27 | else 28 | if oc auth can-i debug node -A >/dev/null 2>&1 && oc auth can-i get nodes >/dev/null 2>&1; then 29 | msg "Collecting MTUs... 
(${BLUE}using oc debug, it can take a while${NOCOLOR}))" 30 | # shellcheck disable=SC2016 31 | for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do 32 | # See https://medium.com/@robert.i.sandor/getting-started-with-parallelization-in-bash-e114f4353691 33 | ((i = i % PARALLELJOBS)) 34 | ((i++ == 0)) && wait 35 | ( 36 | # Get all the information in a single debug to avoid rescheduling unneeded pods 37 | # then convert the output into an array for easily consumption 38 | ocdebugorwait # Pause for no OC debug running 39 | # shellcheck disable=2016 40 | mapfile -t MTUS < <(oc debug --image="${OCDEBUGIMAGE}" "${node}" -- chroot /host sh -c 'export EXTBR="br-ex"; export OVNBR="ovn-k8s-mp0"; export BMINTERFACE=$(ovs-vsctl list-ports "${EXTBR}" | grep -v patch) ; echo "${BMINTERFACE}"; nmcli -g GENERAL.MTU dev show "${BMINTERFACE}"; nmcli -g GENERAL.MTU dev show "${EXTBR}"; nmcli -g GENERAL.MTU dev show "${OVNBR}"' 2>/dev/null) 41 | # If the array is empty, something has happened 42 | if [ ${#MTUS[@]} -eq 0 ]; then 43 | msg "${YELLOW}Couldn't get MTU settings in ${node}${NOCOLOR}" 44 | else 45 | # MTUS[0] = Baremetal interface name 46 | # MTUS[1] = Baremetal interface MTU 47 | # MTUS[2] = br-ex MTU 48 | # MTUS[3] = ovn-k8s-mp0 MTU 49 | msg "${node} => br-ex:${MTUS[2]}, ${MTUS[0]}:${MTUS[1]}, ovn-k8s-mp0: ${MTUS[3]}" 50 | fi 51 | ) & 52 | done 53 | wait 54 | fi 55 | fi 56 | fi 57 | exit ${OCINFO} 58 | else 59 | msg "Couldn't debug nodes, check permissions" 60 | exit ${OCSKIP} 61 | fi 62 | exit ${OCUNKNOWN} 63 | -------------------------------------------------------------------------------- /scripts/recover-northd.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | ########################################################### 3 | # recover-northd.sh script to unwedge northd in the event # 4 | # of a node failure # 5 | ########################################################### 6 | 7 | # Timestamp to be used in the logfile name 8 | NOW=$(date +"%Y-%m-%d_%H-%M-%S") 9 | # Logfile to save some DEBUG output 10 | LOG="/tmp/recover-northd.sh.${NOW}.log" 11 | # Debug var to write DEBUG lines into the log 12 | DEBUG=false 13 | # Whether to intervene if northd is wedged 14 | REMDIATE=false 15 | 16 | ########################################################### 17 | # usage(): prints the usage of the script 18 | ########################################################### 19 | function usage() { 20 | echo "This script checks if northd is stuck and optionally intervene" 21 | echo -e 22 | echo -e "\tUsage: $(basename "$0")" 23 | echo -e "\tHelp: $(basename "$0") -h" 24 | echo -e "\tSave extra DEBUG lines into the log: $(basename "$0") -d" 25 | echo -e "\tSet the KUBECONFIG env var to /kubeconfig/file: $(basename "$0") -k /kubeconfig/file" 26 | echo -e "\tRemediate the issue: $(basename "$0") -r" 27 | echo -e 28 | echo "After the execution a logfile will be generated with the name recover-northd.DATE.log" 29 | } 30 | 31 | ########################################################### 32 | # check_northd(): check the current status of northd 33 | ########################################################### 34 | function check_northd() { 35 | 36 | pods=$(oc get pods -n openshift-ovn-kubernetes -l app=ovnkube-master --no-headers | grep Running | awk '{print $1}') 37 | for pod in ${pods}; do 38 | 
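# Assumed output format (illustrative): "ovn-appctl -t ovn-northd status" prints a single
# line such as "Status: active" or "Status: standby", so the awk '{print $2}' below keeps
# only the state word.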
pod_status=$(oc exec -n openshift-ovn-kubernetes -c northd "${pod}" -- ovn-appctl -t ovn-northd status | awk '{print $2}') 39 | if [[ ${pod_status} == 'active' ]]; then 40 | active_pod=${pod} 41 | node=$(oc get pod/"$active_pod" -n openshift-ovn-kubernetes -o json | jq .spec.nodeName | sed -e 's/\"//g') 42 | date=$(date +"%Y-%m-%d %H:%M:%S") 43 | if eval "${DEBUG}"; then echo "[check_northd:${date}] pod ${pod} is active" >>"${LOG}"; fi 44 | else 45 | date=$(date +"%Y-%m-%d %H:%M:%S") 46 | if eval "${DEBUG}"; then echo "[check_northd:${date}] pod ${pod} NOT active, status:${pod_status}" >>"${LOG}"; fi 47 | fi 48 | done 49 | 50 | if [[ -z ${active_pod} ]]; then 51 | date=$(date +"%Y-%m-%d %H:%M:%S") 52 | if eval "${DEBUG}"; then echo "[check_northd:${date}] no active northd leader found" >>"${LOG}"; else 53 | echo "no active northd leader found..." 54 | fi 55 | if eval "${REMDIATE}"; then 56 | if eval "${DEBUG}"; then echo "[check_northd:${date}] ...recovering northd" >>"${LOG}"; else 57 | echo "...recovering northd" 58 | fi 59 | for pod in ${pods}; do 60 | oc exec -n openshift-ovn-kubernetes -c northd "${pod}" -- ovn-appctl -t ovn-northd exit 61 | date=$(date +"%Y-%m-%d %H:%M:%S") 62 | if eval "${DEBUG}"; then echo "[check_northd:${date}] recovering pod ${pod}" >>"${LOG}"; else 63 | echo "recovering pod ${pod}" 64 | fi 65 | done 66 | fi 67 | else 68 | date=$(date +"%Y-%m-%d %H:%M:%S") 69 | if eval "${DEBUG}"; then echo "[check_northd:${date}] found active northd leader (${active_pod}) on ${node}" >>"${LOG}"; else 70 | echo "found active northd leader (${active_pod}) on ${node}" 71 | fi 72 | fi 73 | 74 | } 75 | 76 | # Main 77 | while getopts "dhk:r" flag; do 78 | case "${flag}" in 79 | d) 80 | DEBUG=true 81 | ;; 82 | h) 83 | usage 84 | exit 1 85 | ;; 86 | k) 87 | export KUBECONFIG="${OPTARG}" 88 | echo "Exported KUBECONFIG=${KUBECONFIG}" >>"${LOG}" 89 | ;; 90 | r) 91 | REMDIATE=true 92 | ;; 93 | *) 94 | echo >&2 "Invalid option: $*" 95 | usage 96 | exit 1 97 | ;; 98 | esac 99 | done 100 | 101 | check_northd 102 | 103 | if [[ -f ${LOG} ]]; then 104 | echo "# Logged operations into the file ${LOG}" 105 | fi 106 | -------------------------------------------------------------------------------- /checks.md: -------------------------------------------------------------------------------- 1 | 2 | # info 3 | | Script | Description | 4 | | - | - | 5 | | [info/mtu](info/mtu) | Show the nodes' MTU for some interfaces | 6 | | [info/node-versions](info/node-versions) | Show node components versions such as kubelet, crio, kernel, etc. 
| 7 | | [info/04-machineset](info/04-machineset) | Show the machinesets status | 8 | | [info/00-clusterversion](info/00-clusterversion) | Show the clusterversion | 9 | | [info/biosversion](info/biosversion) | Show the nodes' BIOS version | 10 | | [info/locks](info/locks) | List all pods with locks on each node | 11 | | [info/01-clusteroperators](info/01-clusteroperators) | Show the clusteroperators | 12 | | [info/03-pods](info/03-pods) | Show the pods running in the cluster | 13 | | [info/ethtool-firmware-version](info/ethtool-firmware-version) | Show the nodes' NIC firmware version using ethtool | 14 | | [info/ovs-hostnames](info/ovs-hostnames) | Show the ovs database chassis hostnames | 15 | | [info/02-nodes](info/02-nodes) | Show the nodes status | 16 | | [info/bmh-machine-node](info/bmh-machine-node) | Show the node,machine and bmh relationship | 17 | | [info/container-images-running](info/container-images-running) | Show the images of the containers running in the cluster | 18 | | [info/container-images-stored](info/container-images-stored) | Show the container images stored in the cluster hosts | 19 | 20 | # pre 21 | | Script | Description | 22 | | - | - | 23 | | [pre/00-install-config-valid-yaml](pre/00-install-config-valid-yaml) | Checks if the install-config.yaml file is a valid yaml file | 24 | | [pre/dns-hostnames](pre/dns-hostnames) | Checks if the api and wildcard DNS entries are correct | 25 | 26 | # ssh 27 | | Script | Description | 28 | | - | - | 29 | | [ssh/bz1941840](ssh/bz1941840) | Checks if the authentication-operator is using excessive RAM -> hung kubelet BZ1941840 | 30 | 31 | # checks 32 | | Script | Description | 33 | | - | - | 34 | | [checks/port-thrashing](checks/port-thrashing) | Checks if there are OVN pods thrashing | 35 | | [checks/entropy](checks/entropy) | Checks if the workers have enough entropy | 36 | | [checks/chronyc](checks/chronyc) | Checks if the worker clocks are synced using chronyc | 37 | | [checks/pdb](checks/pdb) | Checks if there are PodDisruptionBudgets with 0 disruptions allowed | 38 | | [checks/clusterversion_errors](checks/clusterversion_errors) | Checks if there are clusterversion errors | 39 | | [checks/mellanox-firmware-version](checks/mellanox-firmware-version) | Checks if the nodes' Mellanox Connect-4 firmware version is below the recommended version. 
| 40 | | [checks/pvc](checks/pvc) | Checks if there are persistent volume claims that are not bound | 41 | | [checks/flow-control](checks/flow-control) | Checks if either TX or RX flow control is enabled on a NIC | 42 | | [checks/ctrlnodes](checks/ctrlnodes) | Checks if any controller nodes have had the NoSchedule taint removed | 43 | | [checks/iptables-22623-22624](checks/iptables-22623-22624) | Checks if the nodes iptables rules are blocking 22623/tpc or 22624/tcp | 44 | | [checks/mcp](checks/mcp) | Checks if there are degraded mcp | 45 | | [checks/zombies](checks/zombies) | Checks if more than 5 zombie processes exist on the hosts | 46 | | [checks/notrunningpods](checks/notrunningpods) | Checks if there are not running pods | 47 | | [checks/alertmanager](checks/alertmanager) | Checks if there are warning or error alerts firing | 48 | | [checks/ovn-pods-memory-usage](checks/ovn-pods-memory-usage) | Checks if the memory usage of the OVN pods is under the LIMIT threshold | 49 | | [checks/operators](checks/operators) | Checks if there are operators in 'bad' state | 50 | | [checks/bz1948052](checks/bz1948052) | Checks for BZ 1948052 based on kernel version | 51 | | [checks/csr](checks/csr) | Checks if there are pending csr | 52 | | [checks/sriov](checks/sriov) | Checks if the SR-IOV network state is synced | 53 | | [checks/nodes](checks/nodes) | Checks if there are not ready or not schedulable nodes | 54 | | [checks/restarts](checks/restarts) | Checks if there are pods restarted > n times (10 by default) | 55 | | [checks/terminating](checks/terminating) | Checks if there are pods terminating | 56 | -------------------------------------------------------------------------------- /openshift-checks.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Inherit the shell variables in the subprocesses 4 | # Useful for the -v flag 5 | export SHELLOPTS 6 | 7 | # https://betterdev.blog/minimal-safe-bash-script-template/ 8 | 9 | #set -Eeuo pipefail 10 | 11 | # http://redsymbol.net/articles/unofficial-bash-strict-mode/ 12 | IFS=$'\n\t' 13 | 14 | # shellcheck disable=2164 15 | cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 16 | 17 | # shellcheck disable=SC1091 18 | source $(pwd)/utils 19 | 20 | #trap cleanup SIGINT SIGTERM ERR EXIT 21 | export ERRORFILE=$(mktemp) 22 | trap "rm ${ERRORFILE}" EXIT 23 | 24 | errors=0 25 | # Flags 26 | INFO=1 27 | CHECKS=1 28 | PRE=0 29 | SSH=1 30 | LIST=0 31 | SINGLE=0 32 | RESULTS_ONLY=0 33 | SCRIPT_PROVIDED='' 34 | RESTART_THRESHOLD=${RESTART_THRESHOLD:=10} #arbitray 35 | THRASHING_THRESHOLD=${THRASHING_THRESHOLD:=10} 36 | 37 | OCDEBUGIMAGE=${OCDEBUGIMAGE:=registry.redhat.io/rhel8/support-tools:latest} 38 | OSETOOLSIMAGE=${OSETOOLSIMAGE:=registry.redhat.io/openshift4/ose-tools-rhel8:latest} 39 | 40 | parse_params "$@" 41 | setup_colors 42 | 43 | main() { 44 | # Check if only list is needed 45 | if [ "${LIST}" -ne 0 ]; then 46 | msg "${GREEN}Available scripts:${NOCOLOR}" 47 | find checks/ info/ pre/ ssh/ -type f | sort -n 48 | exit 0 49 | else 50 | # Check binaries availability 51 | for i in oc yq jq curl column; do 52 | check_command ${i} 53 | done 54 | # If only prechecks are needed: 55 | if [ "${PRE}" -gt 0 ]; then 56 | INSTALL_CONFIG_PATH=${INSTALL_CONFIG_PATH:=./install-config.yaml} 57 | if [ ! 
-f ${INSTALL_CONFIG_PATH} ]; then 58 | die "${RED}install-config.yaml not found${NOCOLOR}" 59 | fi 60 | msg "Running prechecks:" 61 | for pre in ./pre/*; do 62 | # shellcheck disable=SC1090,SC1091 63 | "${pre}" 64 | done 65 | else 66 | # Check kubeconfig and current user 67 | kubeconfig 68 | OCUSER=$(oc_whoami) 69 | # If only a single script is needed: 70 | if [ "${SINGLE}" -ne 0 ]; then 71 | # Disable all the other checks 72 | INFO=0 73 | CHECKS=0 74 | PRE=0 75 | SSH=0 76 | # shellcheck disable=SC1090,SC1091 77 | "${SCRIPT_PROVIDED}" 78 | fi 79 | # If only info data is needed: 80 | if [ "${INFO}" -gt 0 ]; then 81 | msg "Gathering cluster information as ${GREEN}${OCUSER}${NOCOLOR}:" 82 | for info in ./info/*; do 83 | # shellcheck disable=SC1090,SC1091 84 | "${info}" 85 | done 86 | fi 87 | # If only checks are needed: 88 | if [ "${CHECKS}" -gt 0 ]; then 89 | msg "Running basic health checks as ${GREEN}${OCUSER}${NOCOLOR}" 90 | for check in ./checks/*; do 91 | # Refresh error count before execution 92 | export errors=$(expr $(cat ${ERRORFILE}) + 0) 93 | # shellcheck disable=SC1090,SC1091 94 | if [ "${RESULTS_ONLY}" -gt 0 ]; then 95 | "${check}" &>/dev/null 96 | case $? in 97 | 0 | 1) msg "${check:2} ${GREEN}PASS${NOCOLOR}" ;; 98 | 2) msg "${check:2} ${RED}FAIL${NOCOLOR}" ;; 99 | 3) msg "${check:2} ${ORANGE}SKIPPED${NOCOLOR}" ;; 100 | 4) msg "${check:2} ${YELLOW}UNKNOWN${NOCOLOR}" ;; 101 | *) msg "${check:2} ${RED}UNKNOWN RETURN CODE${NOCOLOR}" ;; 102 | esac 103 | else 104 | "${check}" 105 | fi 106 | done 107 | fi 108 | # If only ssh checks are needed: 109 | if [ "${SSH}" -gt 0 ]; then 110 | msg "Running ssh-based health checks as ${GREEN}${OCUSER}${NOCOLOR}" 111 | for ssh in ./ssh/*; do 112 | # Refresh error count before execution 113 | export errors=$(expr $(cat ${ERRORFILE}) + 0) 114 | # shellcheck disable=SC1090,SC1091 115 | if [ "${RESULTS_ONLY}" -gt 0 ]; then 116 | "${ssh}" &>/dev/null 117 | case $? in 118 | 0 | 1) msg "${ssh:2} ${GREEN}PASS${NOCOLOR}" ;; 119 | 2) msg "${ssh:2} ${RED}FAIL${NOCOLOR}" ;; 120 | 3) msg "${ssh:2} ${ORANGE}SKIPPED${NOCOLOR}" ;; 121 | 4) msg "${ssh:2} ${YELLOW}UNKNOWN${NOCOLOR}" ;; 122 | *) msg "${ssh:2} ${RED}UNKNOWN RETURN CODE${NOCOLOR}" ;; 123 | esac 124 | else 125 | "${ssh}" 126 | fi 127 | done 128 | fi 129 | fi 130 | fi 131 | export errors=$(expr $(cat ${ERRORFILE}) + 0) 132 | if [ ${errors} -gt 0 ]; then 133 | die "${RED}Total issues found: ${errors}${NOCOLOR}" 134 | else 135 | msg "${GREEN}No issues found${NOCOLOR}" 136 | fi 137 | } 138 | 139 | main "$@" 140 | -------------------------------------------------------------------------------- /utils: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Global vars 4 | 5 | # Set error codes for scripts https://tecadmin.net/how-to-create-own-nagios-plugin-using-bash-shell-script/ 6 | # This should probably masked to other values to hilight issues in bash coding, etc 7 | OCOK=0 8 | OCINFO=1 9 | OCERROR=2 10 | OCSKIP=3 11 | OCUNKNOWN=4 12 | 13 | PARALLELJOBS="${PARALLELJOBS:=1}" 14 | 15 | usage() { 16 | cat <, --single