├── .gitignore ├── requirements.txt ├── checksroot ├── nagios │ ├── info │ ├── pre │ ├── ssh │ ├── checks │ └── utils └── profiles │ ├── pre.txt │ ├── ssh.txt │ ├── checks.txt │ └── info.txt ├── .isort.cfg ├── webreport.png ├── .risu.conf ├── .flake8 ├── .github ├── labels.yml ├── workflows │ ├── label.yml │ ├── broken-link-check.yml │ ├── size.yaml │ └── refresh-checksmd.yml └── dependabot.yml ├── info ├── 03-pods ├── 04-machineset ├── 00-clusterversion ├── 01-clusteroperators ├── container-images-running ├── 02-nodes ├── ovs-hostnames ├── bmh-machine-node ├── node-versions ├── container-images-stored ├── biosversion ├── locks ├── ethtool-firmware-version └── mtu ├── scripts ├── update-checksmd ├── locks.sh ├── recover-northd.sh ├── README.md └── ovn_cleanConntrack.sh ├── pre ├── 00-install-config-valid-yaml └── dns-hostnames ├── checks ├── csr ├── terminating ├── mcp ├── pvc ├── pdb ├── notrunningpods ├── restarts ├── ctrlnodes ├── sriov ├── clusterversion_errors ├── bz1948052 ├── operators ├── port-thrashing ├── ovn-pods-memory-usage ├── alertmanager ├── zombies ├── entropy ├── nodes ├── chronyc ├── iptables-22623-22624 └── mellanox-firmware-version ├── Containerfile ├── cronjob.yaml ├── ssh └── bz1941840 ├── .pre-commit-config.yaml ├── checks.md ├── openshift-checks.sh ├── utils └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .history/ 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | risu 2 | -------------------------------------------------------------------------------- /checksroot/nagios/info: -------------------------------------------------------------------------------- 1 | ../../info -------------------------------------------------------------------------------- /checksroot/nagios/pre: -------------------------------------------------------------------------------- 1 | ../../pre -------------------------------------------------------------------------------- /checksroot/nagios/ssh: -------------------------------------------------------------------------------- 1 | ../../ssh -------------------------------------------------------------------------------- /checksroot/nagios/checks: -------------------------------------------------------------------------------- 1 | ../../checks -------------------------------------------------------------------------------- /checksroot/nagios/utils: -------------------------------------------------------------------------------- 1 | ../../utils -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | known_third_party = 3 | -------------------------------------------------------------------------------- /webreport.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RHsyseng/openshift-checks/HEAD/webreport.png -------------------------------------------------------------------------------- /.risu.conf: -------------------------------------------------------------------------------- 1 | {"web": true, "exclude": ["risuclient"], "extraplugintree": "checksroot/", "title": "OpenShift Checks", "output": "osc.json", "quiet": true} 2 | -------------------------------------------------------------------------------- /.flake8: 
-------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E203, E266, E501, W503, F403, F401, E402, E722, C901 3 | max-line-length = 79 4 | max-complexity = 18 5 | select = B,C,E,F,W,T4,B9 6 | -------------------------------------------------------------------------------- /checksroot/profiles/pre.txt: -------------------------------------------------------------------------------- 1 | # priority: 1000 2 | # 3 | # Defines which plugins to include, exclude, etc 4 | # Syntax 5 | # +keyword : includes keyword in plugin search 6 | # -keyword : excludes keyword in plugin search 7 | 8 | +pre/ 9 | -------------------------------------------------------------------------------- /checksroot/profiles/ssh.txt: -------------------------------------------------------------------------------- 1 | # priority: 1000 2 | # 3 | # Defines which plugins to include, exclude, etc 4 | # Syntax 5 | # +keyword : includes keyword in plugin search 6 | # -keyword : excludes keyword in plugin search 7 | 8 | +ssh/ 9 | -------------------------------------------------------------------------------- /checksroot/profiles/checks.txt: -------------------------------------------------------------------------------- 1 | # priority: 1000 2 | # 3 | # Defines which plugins to include, exclude, etc 4 | # Syntax 5 | # +keyword : includes keyword in plugin search 6 | # -keyword : excludes keyword in plugin search 7 | 8 | +checks/ 9 | -------------------------------------------------------------------------------- /checksroot/profiles/info.txt: -------------------------------------------------------------------------------- 1 | # priority: 1000 2 | # 3 | # Defines which plugins to include, exclude, etc 4 | # Syntax 5 | # +keyword : includes keyword in plugin search 6 | # -keyword : excludes keyword in plugin search 7 | 8 | +info/ 9 | -------------------------------------------------------------------------------- /.github/labels.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Add 'repo' label to any root file changes 3 | repo: 4 | - ./* 5 | 6 | checks: 7 | - checks/**/* 8 | 9 | pre: 10 | - pre/**/* 11 | 12 | info: 13 | - info/**/* 14 | 15 | ssh: 16 | - ssh/**/* 17 | 18 | scripts: 19 | - scripts/**/* 20 | -------------------------------------------------------------------------------- /info/03-pods: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Show the pods running in the cluster 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | if oc auth can-i get pods -A >/dev/null 2>&1; then 7 | msg "Total pods: $(oc get pods -A --no-headers | wc -l)" 8 | exit ${OCINFO} 9 | else 10 | msg "Couldn't get pods, check permissions" 11 | exit ${OCSKIP} 12 | fi 13 | exit ${OCUNKNOWN} 14 | -------------------------------------------------------------------------------- /scripts/update-checksmd: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Updates the checks.md out of scripts in the folders 'headers' 3 | 4 | for kind in info pre ssh checks; do 5 | echo """ 6 | # ${kind} 7 | | Script | Description | 8 | | - | - |""" 9 | 10 | for file in $(find ${kind} -type f -executable|sort -V); do 11 | echo "| [${file}](${file}) | $(grep '^# description:' ${file} | cut -d ":" -f 2-) |" 12 | done 13 | done 14 | -------------------------------------------------------------------------------- 
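For reference, the update-checksmd generator above turns each script's '# description:' header into one markdown table per folder; a minimal sketch of its output, assuming it is run from the repository root and using only the info/03-pods script shown earlier as input:
# info
| Script | Description |
| - | - |
| [info/03-pods](info/03-pods) | Show the pods running in the cluster |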
/info/04-machineset: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Show the machinesets status 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | NS="openshift-machine-api" 7 | 8 | if oc auth can-i get machinesets -n ${NS} >/dev/null 2>&1; then 9 | msg "$(oc get machineset -n ${NS})" 10 | else 11 | msg "Couldn't get machinesets, check permissions" 12 | exit ${OCSKIP} 13 | fi 14 | exit ${OCINFO} 15 | -------------------------------------------------------------------------------- /info/00-clusterversion: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Show the clusterversion 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | if oc auth can-i get clusterversion >/dev/null 2>&1; then 7 | msg "Cluster version:\n$(oc get clusterversion/version)" 8 | exit ${OCINFO} 9 | else 10 | msg "Couldn't get clusterversion, check permissions" 11 | exit ${OCSKIP} 12 | fi 13 | exit ${OCUNKNOWN} 14 | -------------------------------------------------------------------------------- /info/01-clusteroperators: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Show the clusteroperators 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | if oc auth can-i get clusteroperators >/dev/null 2>&1; then 7 | msg "Cluster operators:\n$(oc get clusteroperators)" 8 | exit ${OCINFO} 9 | else 10 | msg "Couldn't get clusteroperators, check permissions" 11 | exit ${OCSKIP} 12 | fi 13 | exit ${OCUNKNOWN} 14 | -------------------------------------------------------------------------------- /.github/workflows/label.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: "Label PRs from globs" 3 | on: 4 | schedule: 5 | - cron: "0 * * * *" 6 | 7 | concurrency: 8 | group: ${{ github.workflow }}-${{ github.ref }} 9 | cancel-in-progress: true 10 | 11 | jobs: 12 | execute: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: jpmcb/prow-github-actions@v1.1.3 16 | with: 17 | jobs: 'pr-labeler' 18 | github-token: "${{ secrets.GITHUB_TOKEN }}" 19 | -------------------------------------------------------------------------------- /info/container-images-running: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Show the images of the containers running in the cluster 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | if oc auth can-i get pods -A >/dev/null 2>&1; then 7 | IMAGES=$(oc get pods -A -o go-template --template='{{range .items}}{{range .spec.containers}}{{printf "%s\n" .image -}} {{end}}{{end}}' | sort -u) 8 | msg "Images:\n${IMAGES}" 9 | exit ${OCINFO} 10 | else 11 | msg "Couldn't get pods, check permissions" 12 | exit ${OCSKIP} 13 | fi 14 | exit ${OCUNKNOWN} 15 | -------------------------------------------------------------------------------- /pre/00-install-config-valid-yaml: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if the install-config.yaml file is a valid yaml file 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | if ! 
command -v yq >/dev/null 2>&1; then 7 | msg "yq command not found" 8 | exit ${OCSKIP} 9 | fi 10 | 11 | if yq eval ${INSTALL_CONFIG_PATH} >/dev/null; then 12 | msg "${INSTALL_CONFIG_PATH} seems valid" 13 | exit ${OCOK} 14 | else 15 | errors=$(("${errors}" + 1)) 16 | msg "${INSTALL_CONFIG_PATH} doesn't seem valid" 17 | if [ ! -z "${ERRORFILE}" ]; then 18 | echo $errors >${ERRORFILE} 19 | fi 20 | exit ${OCERROR} 21 | fi 22 | -------------------------------------------------------------------------------- /.github/workflows/broken-link-check.yml: -------------------------------------------------------------------------------- 1 | --- 2 | on: 3 | schedule: 4 | - cron: "0 0 * * *" # daily 5 | repository_dispatch: # run manually 6 | types: [check-link] 7 | # push: 8 | # ... 9 | 10 | concurrency: 11 | group: ${{ github.workflow }}-${{ github.ref }} 12 | cancel-in-progress: true 13 | 14 | name: Broken Link Check 15 | jobs: 16 | check: 17 | name: Broken Link Check 18 | runs-on: ubuntu-latest 19 | steps: 20 | - name: Broken Link Check 21 | uses: technote-space/broken-link-checker-action@v2.3.1 22 | with: 23 | EXCLUDED_KEYWORDS: | 24 | docs.github.com 25 | camo.githubusercontent.com 26 | github.com/apps/dependabot 27 | -------------------------------------------------------------------------------- /.github/workflows/size.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Size 3 | 4 | on: 5 | pull_request_target: 6 | types: [opened, synchronize] 7 | 8 | jobs: 9 | update_labels: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | 14 | - uses: actions-ecosystem/action-size@v2 15 | id: size 16 | 17 | - uses: actions-ecosystem/action-remove-labels@v1 18 | with: 19 | github_token: ${{ secrets.github_token }} 20 | labels: ${{ steps.size.outputs.stale_labels }} 21 | 22 | - uses: actions-ecosystem/action-add-labels@v1 23 | with: 24 | github_token: ${{ secrets.github_token }} 25 | labels: ${{ steps.size.outputs.new_label }} 26 | -------------------------------------------------------------------------------- /info/02-nodes: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Show the nodes status 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | if oc auth can-i get nodes -A >/dev/null 2>&1; then 7 | msg "Nodes:\n$(oc get nodes -o wide)" 8 | msg "Masters: $(oc get nodes -o name --no-headers --selector='node-role.kubernetes.io/master' | wc -l)" 9 | msg "Workers: $(oc get nodes -o name --no-headers --selector='node-role.kubernetes.io/worker' | wc -l)" 10 | msg "Others: $(oc get nodes -o name --no-headers --selector='!node-role.kubernetes.io/worker,!node-role.kubernetes.io/master' | wc -l)" 11 | msg "Total nodes: $(oc get nodes -o name --no-headers | wc -l)" 12 | exit ${OCINFO} 13 | else 14 | msg "Couldn't get nodes, check permissions" 15 | exit ${OCSKIP} 16 | fi 17 | exit ${OCUNKNOWN} 18 | -------------------------------------------------------------------------------- /checks/csr: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if there are pending csr 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | error=false 7 | 8 | if oc auth can-i get csr >/dev/null 2>&1; then 9 | pending_csr=$(oc get csr --no-headers --ignore-not-found=true | grep -ci 'pending') 10 | if [[ ${pending_csr} -ge 1 ]]; then 11 | PCSR=$(oc get csr
--no-headers | grep -i 'pending') 12 | msg "Pending CSRs (${pending_csr}): ${PCSR}" 13 | errors=$(("${errors}" + 1)) 14 | error=true 15 | fi 16 | if [ ! -z "${ERRORFILE}" ]; then 17 | echo $errors >${ERRORFILE} 18 | fi 19 | if [[ $error == true ]]; then 20 | exit ${OCERROR} 21 | else 22 | exit ${OCOK} 23 | fi 24 | else 25 | msg "Couldn't get csr, check permissions" 26 | exit ${OCSKIP} 27 | fi 28 | exit ${OCUNKNOWN} 29 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # To get started with Dependabot version updates, you'll need to specify which 3 | # package ecosystems to update and where the package manifests are located. 4 | # Please see the documentation for all configuration options: 5 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 6 | 7 | version: 2 8 | updates: 9 | - package-ecosystem: "pip" # See documentation for possible values 10 | directory: "/" # Location of package manifests 11 | schedule: 12 | interval: "daily" 13 | 14 | # Maintain dependencies for GitHub Actions 15 | - package-ecosystem: "github-actions" 16 | directory: "/" 17 | schedule: 18 | interval: "daily" 19 | commit-message: 20 | prefix: build 21 | prefix-development: chore 22 | include: scope 23 | -------------------------------------------------------------------------------- /checks/terminating: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if there are pods terminating 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | error=false 7 | 8 | if oc auth can-i get pods -A >/dev/null 2>&1; then 9 | terminating_pods=$(oc get pods -A | grep -c 'Terminating') 10 | if [[ $terminating_pods -ge 1 ]]; then 11 | TERMPODS=$(oc get pods -A | grep 'Terminating') 12 | msg "Pods in Terminating state ($terminating_pods):\n${RED}${TERMPODS}${NOCOLOR}" 13 | errors=$(("${errors}" + 1)) 14 | error=true 15 | fi 16 | if [ ! -z "${ERRORFILE}" ]; then 17 | echo $errors >${ERRORFILE} 18 | fi 19 | if [[ $error == true ]]; then 20 | exit ${OCERROR} 21 | else 22 | exit ${OCOK} 23 | fi 24 | else 25 | msg "Couldn't get all pods, check permissions" 26 | exit ${OCSKIP} 27 | fi 28 | exit ${OCUNKNOWN} 29 | -------------------------------------------------------------------------------- /checks/mcp: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if there are degraded mcp 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | error=false 7 | 8 | if oc auth can-i get mcp >/dev/null 2>&1; then 9 | degraded_mcps=$(oc get mcp -o json | jq '.items[] | { name: .metadata.name, status: .status } | select (.status.degradedMachineCount >= 1) | { name: .name, status: .status.degradedMachineCount}') 10 | if [[ -n $degraded_mcps ]]; then 11 | DEGRADED=$(echo "${degraded_mcps}" | jq .) 12 | msg "MachineConfigPools in Degraded State: ${RED}${DEGRADED}${NOCOLOR}" 13 | errors=$(("${errors}" + 1)) 14 | error=true 15 | fi 16 | if [ !
-z "${ERRORFILE}" ]; then 17 | echo $errors >${ERRORFILE} 18 | fi 19 | if [[ $error == true ]]; then 20 | exit ${OCERROR} 21 | else 22 | exit ${OCOK} 23 | fi 24 | 25 | else 26 | msg "Couldn't get mcp, check permissions" 27 | exit ${OCSKIP} 28 | fi 29 | exit ${OCUNKNOWN} 30 | -------------------------------------------------------------------------------- /checks/pvc: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if there are persistent volume claims that are not bound 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | error=false 7 | 8 | if oc auth can-i get pvc -A >/dev/null 2>&1; then 9 | pvc_not_bound=$(oc get pvc -A -o json | jq '.items[] | { name: .metadata.name, namespace: .metadata.namespace, phase: .status.phase } | select (.phase!="Bound")') 10 | if [[ -n ${pvc_not_bound} ]]; then 11 | PVCNOTBOUND=$(echo "${pvc_not_bound}" | jq .) 12 | msg "Persistent Volume Claims ${RED}NotBound${NOCOLOR}: ${PVCNOTBOUND}" 13 | errors=$(("${errors}" + 1)) 14 | error=true 15 | fi 16 | if [ ! -z "${ERRORFILE}" ]; then 17 | echo $errors >${ERRORFILE} 18 | fi 19 | if [[ $error == true ]]; then 20 | exit ${OCERROR} 21 | else 22 | exit ${OCOK} 23 | fi 24 | 25 | else 26 | msg "Couldn't get pvc, check permissions" 27 | exit ${OCSKIP} 28 | fi 29 | exit ${OCUNKNOWN} 30 | -------------------------------------------------------------------------------- /info/ovs-hostnames: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Show the ovs database chassis hostnames 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | if oc auth can-i exec pod -n openshift-ovn-kubernetes >/dev/null 2>&1 && oc get ns openshift-ovn-kubernetes >/dev/null 2>&1; then 7 | OVSHOSTNAMES=$(oc -n openshift-ovn-kubernetes exec pod/"$(oc -n openshift-ovn-kubernetes get pod -l app=ovnkube-master,component=network -o jsonpath='{.items[0].metadata.name}')" -c northd -- ovn-sbctl --no-leader-only list chassis | awk '/hostname/ { print $3 }' | sort -n) 8 | if [ -n "${OVSHOSTNAMES}" ]; then 9 | msg "OVS hostnames:\n${OVSHOSTNAMES}" 10 | exit ${OCINFO} 11 | else 12 | msg "Couldn't get ovs-hostnames, check permissions" 13 | exit ${OCSKIP} 14 | fi 15 | else 16 | msg "Couldn't get ovs-hostnames, either the cluster is not using OVN, or the running user has insufficient permissions" 17 | exit ${OCSKIP} 18 | fi 19 | 20 | exit ${OCUNKNOWN} 21 | -------------------------------------------------------------------------------- /checks/pdb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if there are PodDisruptionBudgets with 0 disruptions allowed 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | error=false 7 | 8 | if oc auth can-i get pdb >/dev/null 2>&1; then 9 | if [ $(is_sno) -eq 1 ]; then 10 | exit ${OCSKIP} 11 | fi 12 | wrong_pdb=$(oc get pdb -A -o json | jq '.items[] | { name: .metadata.name, status: .status } | select (.status.disruptionsAllowed == 0) | { name: .name}') 13 | if [[ -n $wrong_pdb ]]; then 14 | DEGRADED=$(echo "${wrong_pdb}" | jq .) 15 | msg "PodDisruptionBudget with 0 disruptions allowed: ${RED}${DEGRADED}${NOCOLOR}" 16 | errors=$((errors + 1)) 17 | error=true 18 | fi 19 | if [ ! 
-z "${ERRORFILE}" ]; then 20 | echo $errors >${ERRORFILE} 21 | fi 22 | if [[ $error == true ]]; then 23 | exit ${OCERROR} 24 | else 25 | exit ${OCOK} 26 | fi 27 | 28 | else 29 | msg "Couldn't get pdb, check permissions" 30 | exit ${OCSKIP} 31 | fi 32 | exit ${OCUNKNOWNN} 33 | -------------------------------------------------------------------------------- /checks/notrunningpods: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if there are not running pods 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | error=false 7 | 8 | if oc auth can-i get pods -A >/dev/null 2>&1; then 9 | # Get all nonrunning pods with headers even if they are not found 10 | notrunning=$(oc get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded --ignore-not-found=true) 11 | HEADER=$(echo "${notrunning}" | head -n1) 12 | PODS=$(echo "${notrunning}" | tail -n +2) 13 | if [[ -n ${PODS} ]]; then 14 | msg "Pods not running ($(echo "${PODS}" | wc -l)):\n${HEADER}\n${RED}${PODS}${NOCOLOR}" 15 | errors=$(("${errors}" + 1)) 16 | error=true 17 | fi 18 | if [ ! -z "${ERRORFILE}" ]; then 19 | echo $errors >${ERRORFILE} 20 | fi 21 | if [[ $error == true ]]; then 22 | exit ${OCERROR} 23 | else 24 | exit ${OCOK} 25 | fi 26 | 27 | else 28 | msg "Couldn't get all pods, check permissions" 29 | exit ${OCSKIP} 30 | fi 31 | exit ${OCUNKNOWNN} 32 | -------------------------------------------------------------------------------- /checks/restarts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if there are pods restarted > n times (10 by default) 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | error=false 7 | 8 | if oc auth can-i get pods -A >/dev/null 2>&1; then 9 | restarts=$(oc get pods -o json -A | jq -r ".items[] | { name: .metadata.name, project: .metadata.namespace, restarts: .status.containerStatuses[].restartCount } | select(.restarts > $RESTART_THRESHOLD)" 2>/dev/null) 10 | if [[ -n $restarts ]]; then 11 | RESTARTS=$(echo "${restarts}" | jq -r '. | "\(.project)\t\(.name)\t\(.restarts)"' | column -t -N "NAMESPACE,NAME,RESTARTS") 12 | msg "Pods that have a high restart count (> $RESTART_THRESHOLD):\n${RED}${RESTARTS}${NOCOLOR}" 13 | errors=$(("${errors}" + 1)) 14 | error=true 15 | fi 16 | if [ ! 
-z "${ERRORFILE}" ]; then 17 | echo $errors >${ERRORFILE} 18 | fi 19 | if [[ $error == true ]]; then 20 | exit ${OCERROR} 21 | else 22 | exit ${OCOK} 23 | fi 24 | else 25 | msg "Couldn't get all pods, check permissions" 26 | exit ${OCSKIP} 27 | fi 28 | exit ${OCUNKNOWNN} 29 | -------------------------------------------------------------------------------- /checks/ctrlnodes: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if any controller nodes have had the NoSchedule taint removed 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | error=false 7 | 8 | if oc auth can-i get nodes >/dev/null 2>&1; then 9 | if [ $(is_sno) -eq 1 ]; then 10 | exit ${OCSKIP} 11 | fi 12 | scheduable_controllers=$(oc get nodes -o json | jq '.items[] | { name: .metadata.name, scheduable: .spec.taints, control: .metadata.labels."node-role.kubernetes.io/master" } | select((.control == "") and (.scheduable == null))') 13 | if [[ -n ${scheduable_controllers} ]]; then 14 | SCHEDCTRL=$(echo "${scheduable_controllers}" | jq '. | { name: .name }') 15 | msg "Controllers ${RED}Scheduable${NOCOLOR}: ${SCHEDCTRL}" 16 | errors=$(("${errors}" + 1)) 17 | error=true 18 | fi 19 | if [ ! -z "${ERRORFILE}" ]; then 20 | echo $errors >${ERRORFILE} 21 | fi 22 | if [[ $error == true ]]; then 23 | exit ${OCERROR} 24 | else 25 | exit ${OCOK} 26 | fi 27 | else 28 | msg "Couldn't get nodes, check permissions" 29 | exit ${OCSKIP} 30 | fi 31 | exit ${OCUNKNOWN} 32 | -------------------------------------------------------------------------------- /info/bmh-machine-node: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Show the node,machine and bmh relationship 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | NS="openshift-machine-api" 7 | 8 | if oc auth can-i get nodes -A >/dev/null 2>&1; then 9 | if oc auth can-i get bmh -n ${NS} >/dev/null 2>&1; then 10 | if oc auth can-i get machines -n ${NS} >/dev/null 2>&1; then 11 | for bmh in $(oc get bmh -n openshift-machine-api -o jsonpath='{.items[*].metadata.name}'); do 12 | MACHINE=$(oc get -n openshift-machine-api bmh/${bmh} -o jsonpath='{.spec.consumerRef.name}') 13 | NODE=$(oc get -n openshift-machine-api machine/${MACHINE} -o jsonpath='{.status.nodeRef.name}') 14 | msg "Node ${NODE} => Machine: ${MACHINE}, BMH: ${bmh}" 15 | done 16 | exit ${OCINFO} 17 | else 18 | msg "Couldn't get machines, check permissions" 19 | exit ${OCSKIP} 20 | fi 21 | else 22 | msg "Couldn't get baremetalhosts, check permissions" 23 | exit ${OCSKIP} 24 | fi 25 | else 26 | msg "Couldn't get nodes, check permissions" 27 | exit ${OCSKIP} 28 | fi 29 | 30 | exit ${OCUNKNOWN} 31 | -------------------------------------------------------------------------------- /info/node-versions: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Show node components versions such as kubelet, crio, kernel, etc. 
3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | if oc auth can-i get nodes -A >/dev/null 2>&1; then 7 | KUBELETVERSIONS=$(oc get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.nodeInfo.kubeletVersion}{"\n"}{end}' | column -t -N "NODE,KUBELET") 8 | CRIOVERSIONS=$(oc get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.nodeInfo.containerRuntimeVersion}{"\n"}{end}' | column -t -N "NODE,CRIO") 9 | KERNELVERSIONS=$(oc get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.nodeInfo.kernelVersion}{"\n"}{end}' | column -t -N "NODE,KERNEL") 10 | OSIMAGEVERSIONS=$(oc get nodes -o jsonpath='{range .items[*]}{.metadata.name}{":"}{.status.nodeInfo.osImage}{"\n"}{end}' | column -t -s ":" -N "NODE,OSIMAGE") 11 | msg "${KUBELETVERSIONS}" 12 | msg "${CRIOVERSIONS}" 13 | msg "${KERNELVERSIONS}" 14 | msg "${OSIMAGEVERSIONS}" 15 | exit ${OCINFO} 16 | else 17 | msg "Couldn't get nodes, check permissions" 18 | exit ${OCSKIP} 19 | fi 20 | exit ${OCUNKNOWN} 21 | -------------------------------------------------------------------------------- /Containerfile: -------------------------------------------------------------------------------- 1 | FROM registry.access.redhat.com/ubi8/ubi:latest 2 | 3 | WORKDIR /opt/openshift-checks 4 | 5 | # Some required binaries 6 | RUN dnf clean all && \ 7 | dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm && \ 8 | dnf update -y && \ 9 | dnf install -y jq curl util-linux bind-utils python38 && \ 10 | dnf clean all 11 | 12 | # YQ doesn't provide a RPM, download the latest 13 | RUN curl -sL $(curl -sL https://api.github.com/repos/mikefarah/yq/releases/latest | jq -r '.assets[] | select(.name == "yq_linux_amd64") | .browser_download_url') -o /usr/local/bin/yq &&\ 14 | chmod a+x /usr/local/bin/yq 15 | 16 | # Download latest oc binary 17 | RUN curl -sL https://mirror.openshift.com/pub/openshift-v4/clients/ocp/latest/openshift-client-linux.tar.gz | tar -C /usr/local/bin -xzf - oc kubectl 18 | 19 | RUN groupadd -g 9999 appuser && \ 20 | useradd -r -u 9999 -g appuser appuser 21 | 22 | COPY . 
/opt/openshift-checks 23 | RUN pip3 install -r requirements.txt 24 | 25 | RUN chown -R appuser.appuser /opt/openshift-checks 26 | 27 | USER appuser 28 | 29 | ENTRYPOINT [ "/opt/openshift-checks/openshift-checks.sh" ] 30 | -------------------------------------------------------------------------------- /info/container-images-stored: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Show the container images stored in the cluster hosts 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | if oc auth can-i debug node >/dev/null 2>&1; then 7 | msg "Checking container images stored in the cluster (${BLUE}using oc debug, it can take a while${NOCOLOR})" 8 | # shellcheck disable=SC2016 9 | for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do 10 | # See https://medium.com/@robert.i.sandor/getting-started-with-parallelization-in-bash-e114f4353691 11 | ((i = i % PARALLELJOBS)) 12 | ((i++ == 0)) && wait 13 | ( 14 | ocdebugorwait # Pause for no OC debug running 15 | oc debug --image="${OCDEBUGIMAGE}" "${node}" -- chroot /host sh -c "crictl images -o json" 2>/dev/null | jq -r .images[].repoTags[] 16 | ) & 17 | done | sort -u 18 | wait 19 | exit ${OCINFO} 20 | else 21 | msg "Couldn't debug nodes, check permissions" 22 | exit ${OCSKIP} 23 | fi 24 | exit ${OCUNKNOWN} 25 | -------------------------------------------------------------------------------- /checks/sriov: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if the SR-IOV network state is synced 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | error=false 7 | 8 | oc get subs sriov-network-operator-subscription -n openshift-sriov-network-operator &>/dev/null 9 | if [ $? -ne 0 ]; then 10 | # SR-IOV operator is not installed 11 | exit ${OCSKIP} 12 | fi 13 | 14 | if oc auth can-i get SriovNetworkNodeState >/dev/null 2>&1; then 15 | sriov_bad_state=$(oc get SriovNetworkNodeState -n openshift-sriov-network-operator -o json | jq '.items[] | { name: .metadata.name, syncStatus: .status.syncStatus } | select (.syncStatus !="Succeeded")') 16 | if [[ -n ${sriov_bad_state} ]]; then 17 | SRIOVBADSTATE=$(echo "${sriov_bad_state}" | jq .) 18 | msg "Nodes ${RED}NotSynced${NOCOLOR}: ${SRIOVBADSTATE}" 19 | errors=$(("${errors}" + 1)) 20 | error=true 21 | fi 22 | if [ ! 
-z "${ERRORFILE}" ]; then 23 | echo $errors >${ERRORFILE} 24 | fi 25 | if [[ $error == true ]]; then 26 | exit ${OCERROR} 27 | else 28 | exit ${OCOK} 29 | fi 30 | 31 | else 32 | msg "Couldn't get SriovNetworkNodeState, check permissions" 33 | exit ${OCSKIP} 34 | fi 35 | exit ${OCUNKNOWN} 36 | -------------------------------------------------------------------------------- /checks/clusterversion_errors: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if there are clusterversion errors 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | error=false 7 | 8 | if oc auth can-i get clusterversion >/dev/null 2>&1; then 9 | clusterversion_msgs=$(oc get clusterversion -o json | jq '.items[].status.conditions[] | select ((.status == "True") and (.type == "Failing") and (.message != null)) | { message: .message }') 10 | count_errors=$(echo "${clusterversion_msgs}" | jq .message | wc -l) 11 | if [[ ${count_errors} -ge 1 ]]; then 12 | final="" 13 | OLDIFS=$IFS 14 | IFS=$'\n' 15 | 16 | for message in $(echo "${clusterversion_msgs}" | jq .message); do 17 | final="${final} ${message}" 18 | done 19 | IFS=${OLDIFS} 20 | msg "Clusterversion error status message: ${RED}${final}${NOCOLOR}" 21 | errors=$(("${errors}" + 1)) 22 | error=true 23 | fi 24 | if [ ! -z "${ERRORFILE}" ]; then 25 | echo $errors >${ERRORFILE} 26 | fi 27 | if [[ $error == true ]]; then 28 | exit ${OCERROR} 29 | else 30 | exit ${OCOK} 31 | fi 32 | else 33 | msg "Couldn't get clusterversion, check permissions" 34 | exit ${OCSKIP} 35 | fi 36 | exit ${OCUNKNOWN} 37 | -------------------------------------------------------------------------------- /checks/bz1948052: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # long_name: Checks for BZ 1948052 3 | # description: Checks for BZ 1948052 based on kernel version 4 | # bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1948052 5 | # priority: 600 6 | 7 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 8 | 9 | BADKERNEL="4.18.0-193.24.1.el8_2.dt1.x86_64" 10 | error=false 11 | 12 | if oc auth can-i get nodes >/dev/null 2>&1; then 13 | for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do 14 | kernel_version=$(oc get ${node} -o jsonpath={.status.nodeInfo.kernelVersion}) 15 | if [[ ${kernel_version} == ${BADKERNEL} ]]; then 16 | msg "${RED}Node ${node} contains ${BADKERNEL} kernel version${NOCOLOR}" 17 | errors=$(("${errors}" + 1)) 18 | error=true 19 | fi 20 | done 21 | if [ ! 
-z "${ERRORFILE}" ]; then 22 | echo $errors >${ERRORFILE} 23 | fi 24 | if [[ $error == true ]]; then 25 | exit ${OCERROR} 26 | else 27 | exit ${OCOK} 28 | fi 29 | else 30 | msg "Couldn't get nodes, check permissions" 31 | exit ${OCSKIP} 32 | fi 33 | exit ${OCUNKNOWN} 34 | -------------------------------------------------------------------------------- /scripts/locks.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | declare -A ns_pods 4 | ORIG_IFS=$IFS 5 | IFS=$(echo -en "\n\b") 6 | 7 | for line in $(sudo lslocks | egrep -v '(unknown)' | awk '{print $2}' | sort -nr | uniq -c | sort -nr | egrep -v 'unknown|-1' | grep -v PID); do 8 | count=$(echo $line | awk '{print $1}') 9 | pid=$(echo $line | awk '{print $2}') 10 | orig_pid=$pid 11 | ppid=$(grep PPid /proc/${pid}/status | awk '{print $2}') 12 | while [[ $ppid -gt 1 ]]; do 13 | pid=$ppid 14 | ppid=$(grep PPid /proc/${pid}/status | awk '{print $2}') 15 | done 16 | if [[ $ppid -eq 1 ]]; then 17 | ppid=$pid 18 | fi 19 | if [[ $(ps -hp $ppid -o cmd | grep -c conmon) -eq 1 ]]; then 20 | ns=$(ps -hp $ppid -o cmd | grep conmon | awk '{print $9}' | awk -F/ '{print $5}' | awk -F_ '{print $1}') 21 | pod=$(ps -hp $ppid -o cmd | grep conmon | awk '{print $9}' | awk -F/ '{print $5}' | awk -F_ '{print $2}') 22 | if [ ${ns_pods["${ns}/${pod}"]} ]; then 23 | ns_pods["${ns}/${pod}"]=$(expr ${ns_pods["${ns}/${pod}"]} + $count) 24 | else 25 | ns_pods["${ns}/${pod}"]=$count 26 | fi 27 | fi 28 | done 29 | for pod in "${!ns_pods[@]}"; do 30 | echo $pod ${ns_pods[$pod]} 31 | done | sort -nr -k2 | column -t 32 | 33 | IFS=$ORIG_IFS 34 | -------------------------------------------------------------------------------- /checks/operators: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if there are operators in 'bad' state 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | error=false 7 | 8 | if oc auth can-i get co >/dev/null 2>&1; then 9 | bad_cluster_operators=$(oc get co --no-headers | grep -E -civ 'True.*False.*False') 10 | if [[ ${bad_cluster_operators} -ge 1 ]]; then 11 | BADCOPS=$(oc get co --no-headers | grep -E -iv 'True.*False.*False') 12 | msg "Cluster Operators in Bad State (${bad_cluster_operators}):\n${RED}${BADCOPS}${NOCOLOR}" 13 | errors=$(("${errors}" + 1)) 14 | fi 15 | bad_operators=$(oc get csv -l \!olm.copiedFrom -A -o json | jq '.items[] | { name: .metadata.name, namespace: .metadata.namespace, phase: .status.phase } | select (.phase!="Succeeded")') 16 | if [[ -n ${bad_operators} ]]; then 17 | BADOPS=$(echo "${bad_operators}" | jq .) 18 | msg "Operators in ${RED}Bad State${NOCOLOR}: ${BADOPS}" 19 | errors=$(("${errors}" + 1)) 20 | error=true 21 | fi 22 | if [ ! 
-z "${ERRORFILE}" ]; then 23 | echo $errors >${ERRORFILE} 24 | fi 25 | if [[ $error == true ]]; then 26 | exit ${OCERROR} 27 | else 28 | exit ${OCOK} 29 | fi 30 | 31 | else 32 | msg "Couldn't get co, check permissions" 33 | exit ${OCSKIP} 34 | fi 35 | exit ${OCUNKNOWNN} 36 | -------------------------------------------------------------------------------- /info/biosversion: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Show the nodes' BIOS version 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | # Check BIOS version to begin with 7 | if oc auth can-i debug node >/dev/null 2>&1; then 8 | msg "Checking bios versions (${BLUE}using oc debug, it can take a while${NOCOLOR})" 9 | # shellcheck disable=SC2016 10 | for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do 11 | # See https://medium.com/@robert.i.sandor/getting-started-with-parallelization-in-bash-e114f4353691 12 | ((i = i % PARALLELJOBS)) 13 | ((i++ == 0)) && wait 14 | ( 15 | ocdebugorwait # Pause for no OC debug running 16 | if ! BIOSVER=$(oc debug --image="${OCDEBUGIMAGE}" "${node}" -- chroot /host sh -c "cat /sys/class/dmi/id/bios_version" 2>/dev/null); then 17 | msg "${ORANGE}Error running oc debug in ${node}${NOCOLOR}" 18 | else 19 | if [ -n "${BIOSVER}" ]; then 20 | msg "${node}: ${BIOSVER}" 21 | else 22 | msg "Couldn't found /sys/class/dmi/id/bios_version in ${node}" 23 | fi 24 | fi 25 | ) & 26 | done 27 | wait 28 | exit ${OCINFO} 29 | else 30 | msg "Couldn't debug nodes, check permissions" 31 | exit ${OCSKIP} 32 | fi 33 | exit ${OCUNKNOWN} 34 | -------------------------------------------------------------------------------- /info/locks: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: List all pods with locks on each node 3 | 4 | ORIG_IFS=$IFS 5 | IFS=$(echo -en "\n\b") 6 | 7 | SCRIPT64=$(cat ./scripts/locks.sh | base64 -w 0) 8 | 9 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 10 | if oc auth can-i debug node >/dev/null 2>&1; then 11 | msg "Checking for locks by pod, per node (${BLUE}using oc debug, it can take a while${NOCOLOR})" 12 | fw_errors=0 13 | # shellcheck disable=SC2016 14 | for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do 15 | ocdebugorwait # Pause for no OC debug running 16 | # shellcheck disable=SC1083 17 | if ! 
FILE_LOCKS=$(oc debug --image="${OCDEBUGIMAGE}" "${node}" -- chroot /host sh -c "echo $SCRIPT64 | base64 -d > /tmp/locks.sh; chmod 755 /tmp/locks.sh; /tmp/locks.sh; rm -f /tmp/locks.sh"); then 18 | msg "${ORANGE}Error running oc debug in ${node}${NOCOLOR}" 19 | else 20 | if [ -n "${FILE_LOCKS}" ]; then 21 | msg "File locks found on ${RED}${node}${NOCOLOR}" 22 | for line in ${FILE_LOCKS}; do 23 | echo $line 24 | done 25 | else 26 | msg "No file locks found on ${node}" 27 | fi 28 | fi 29 | done 30 | exit ${OCINFO} 31 | else 32 | msg "Couldn't debug nodes, check permissions" 33 | exit ${OCSKIP} 34 | fi 35 | exit ${OCUNKNOWN} 36 | -------------------------------------------------------------------------------- /checks/port-thrashing: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if there are OVN pods thrashing 3 | 4 | THRASHINGMSG="Changing chassis for lport" 5 | NAMESPACE="openshift-ovn-kubernetes" 6 | 7 | error=false 8 | 9 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 10 | 11 | if [[ $(oc get network/cluster -o jsonpath={.spec.networkType}) != "OVNKubernetes" ]]; then 12 | msg "This check only works for OVNKubernetes SDN" 13 | exit ${OCSKIP} 14 | else 15 | if oc auth can-i get pods -n ${NAMESPACE} >/dev/null 2>&1; then 16 | if oc auth can-i get pods --subresource=log -n ${NAMESPACE} >/dev/null 2>&1; then 17 | for pod in $(oc get pods -o name -n ${NAMESPACE} -l app=ovnkube-node); do 18 | numerrors=$(oc logs -n ${NAMESPACE} ${pod} -c ovn-controller | grep "${THRASHINGMSG}" -c) 19 | if [[ ${numerrors} -gt ${THRASHING_THRESHOLD} ]]; then 20 | msg "${RED}${pod} port thrashing errors detected${NOCOLOR}" 21 | errors=$(("${errors}" + 1)) 22 | error=true 23 | fi 24 | 25 | done 26 | if [ ! -z "${ERRORFILE}" ]; then 27 | echo $errors >${ERRORFILE} 28 | fi 29 | if [[ $error == true ]]; then 30 | exit ${OCERROR} 31 | else 32 | exit ${OCOK} 33 | fi 34 | else 35 | msg "Couldn't get pod logs, check permissions" 36 | exit ${OCSKIP} 37 | fi 38 | else 39 | msg "Couldn't get pods, check permissions" 40 | exit ${OCSKIP} 41 | fi 42 | fi 43 | exit ${OCUNKNOWN} 44 | -------------------------------------------------------------------------------- /checks/ovn-pods-memory-usage: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if the memory usage of the OVN pods is under the LIMIT threshold 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | error=false 7 | 8 | if oc auth can-i adm top -A >/dev/null 2>&1; then 9 | LIMIT="${OVN_MEMORY_LIMIT:=5000}" 10 | FLAG=0 11 | pods_memory_usage=$(oc adm top pod -n openshift-ovn-kubernetes -l app=ovnkube-node --no-headers | awk '{ print $1 " " $3 }' | awk '{$2 = substr($2,0,length($2)-2)} 1') 12 | MESSAGE="" 13 | 14 | OLDIFS=${IFS} 15 | IFS=$'\n' 16 | for pod_line in ${pods_memory_usage}; do 17 | pod_name=$(echo $pod_line | awk '{ print $1 }') 18 | pod_size=$(echo $pod_line | awk '{ print $2 }') 19 | if [[ ${pod_size} -ge ${LIMIT} ]]; then 20 | MESSAGE="${MESSAGE}The OVN pod memory usage for ${pod_name} is extremely high: ${RED}${pod_size}${NOCOLOR}Mi\n" 21 | FLAG=1 22 | fi 23 | done 24 | IFS=${OLDIFS} 25 | 26 | if [[ ${FLAG} -ne 0 ]]; then 27 | MESSAGE="${MESSAGE}For more information you can check the KCS https://access.redhat.com/solutions/6493321\n" 28 | msg "${MESSAGE}" 29 | errors=$(("${errors}" + 1)) 30 | error=true 31 | fi 32 | 33 | if [ !
-z "${ERRORFILE}" ]; then 34 | echo $errors >${ERRORFILE} 35 | fi 36 | 37 | if [[ $error == true ]]; then 38 | exit ${OCERROR} 39 | else 40 | exit ${OCOK} 41 | fi 42 | else 43 | msg "Couldn't adm top pods, check permissions" 44 | exit ${OCSKIP} 45 | fi 46 | exit ${OCUNKNOWNN} 47 | -------------------------------------------------------------------------------- /info/ethtool-firmware-version: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Show the nodes' NIC firmware version using ethtool 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | if oc auth can-i debug node >/dev/null 2>&1; then 7 | msg "Checking NIC firmware version using ethtool (${BLUE}using oc debug, it can take a while${NOCOLOR})" 8 | # shellcheck disable=SC2016 9 | for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do 10 | # See https://medium.com/@robert.i.sandor/getting-started-with-parallelization-in-bash-e114f4353691 11 | ((i = i % PARALLELJOBS)) 12 | ((i++ == 0)) && wait 13 | ( 14 | ocdebugorwait # Pause for no OC debug running 15 | if ! FIRMWAREVERS=$(oc debug --image="${OSETOOLSIMAGE}" "${node}" -- sh -c "for interface in \$(ls -d /sys/class/net/*/device | cut -d/ -f5); do echo -n \"\${interface} => \"; ethtool -i \${interface} | awk '/firmware-version/ { print substr(\$0, index(\$0,\$2)) }';done" 2>/dev/null); then 16 | msg "${ORANGE}Error running oc debug in ${node}${NOCOLOR}" 17 | else 18 | if [ -n "${FIRMWAREVERS}" ]; then 19 | msg "${node}:\n${FIRMWAREVERS}" 20 | else 21 | msg "Couldn't find NIC firmware version in ${node}" 22 | fi 23 | fi 24 | ) & 25 | done 26 | wait 27 | exit ${OCINFO} 28 | else 29 | msg "Couldn't debug nodes, check permissions" 30 | exit ${OCSKIP} 31 | fi 32 | exit ${OCUNKNOWN} 33 | -------------------------------------------------------------------------------- /checks/alertmanager: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if there are warning or error alerts firing 3 | # kb: https://access.redhat.com/solutions/4250221 4 | 5 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 6 | if oc auth can-i get routes -n openshift-monitoring >/dev/null 2>&1; then 7 | alert_url=$(oc -n openshift-monitoring get routes/alertmanager-main -o json | jq -r .spec.host) 8 | raw_alerts=$(curl -s -k -H "Authorization: Bearer $(oc -n openshift-monitoring sa get-token prometheus-k8s)" https://$alert_url/api/v2/alerts) 9 | if [ $? -eq 35 ]; then 10 | # Error code 35 might mean an issue with a proxy server 11 | raw_alerts=$(curl --noproxy '*' -s -k -H "Authorization: Bearer $(oc -n openshift-monitoring sa get-token prometheus-k8s)" https://$alert_url/api/v2/alerts) 12 | fi 13 | alerts=$(echo $raw_alerts | jq '.[] | {alert:.labels.alertname, severity:.labels.severity, namespace:.labels.namespace, instance:.labels.instance, message:(.annotations.message // .annotations.summary)} | select((.severity == "warning") or (.severity == "critical"))') 14 | if [[ -n ${alerts} ]]; then 15 | ALERTS=$(echo "${alerts}" | jq -r '. 
| "\(.severity)\t\(.alert)\t\(.namespace)\t\(.instance)\t\(.message)"' | column -t -s $'\t' -N "SEVERITY,ALERT,NAMESPACE,INSTANCE,MESSAGE") 16 | msg "Alerts currently firing:\n${RED}${ALERTS}${NOCOLOR}\n" 17 | errors=$(("${errors}" + 1)) 18 | if [ ! -z "${ERRORFILE}" ]; then 19 | echo $errors >${ERRORFILE} 20 | fi 21 | exit ${OCERROR} 22 | fi 23 | exit ${OCOK} 24 | else 25 | msg "Couldn't get routes, check permissions" 26 | exit ${OCSKIP} 27 | fi 28 | exit ${OCUNKNOWN} 29 | -------------------------------------------------------------------------------- /cronjob.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: Namespace 4 | metadata: 5 | name: checks-openshift 6 | --- 7 | apiVersion: v1 8 | kind: ServiceAccount 9 | metadata: 10 | name: checks-openshift 11 | namespace: checks-openshift 12 | --- 13 | apiVersion: rbac.authorization.k8s.io/v1 14 | kind: ClusterRoleBinding 15 | metadata: 16 | name: checks-openshift 17 | roleRef: 18 | apiGroup: rbac.authorization.k8s.io 19 | kind: ClusterRole 20 | name: cluster-admin 21 | subjects: 22 | - kind: ServiceAccount 23 | name: checks-openshift 24 | namespace: checks-openshift 25 | --- 26 | apiVersion: batch/v1beta1 27 | kind: CronJob 28 | metadata: 29 | name: checks-openshift 30 | namespace: checks-openshift 31 | spec: 32 | concurrencyPolicy: Forbid 33 | failedJobsHistoryLimit: 3 34 | jobTemplate: 35 | spec: 36 | template: 37 | spec: 38 | tolerations: 39 | - effect: NoSchedule 40 | key: node-role.kubernetes.io/master 41 | operator: Exists 42 | affinity: {} 43 | containers: 44 | - name: checks-openshift 45 | image: quay.io/rhsysdeseng/openshift-checks:latest 46 | imagePullPolicy: IfNotPresent 47 | command: ["/bin/sh", "-c", "/opt/openshift-checks/openshift-checks.sh"] 48 | resources: 49 | requests: 50 | cpu: 100m 51 | memory: 256Mi 52 | serviceAccountName: checks-openshift 53 | restartPolicy: Never 54 | terminationGracePeriodSeconds: 30 55 | backoffLimit: 0 56 | schedule: "53 * * * *" 57 | successfulJobsHistoryLimit: 3 58 | suspend: false 59 | -------------------------------------------------------------------------------- /checks/zombies: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if more than 5 zombie processes exist on the hosts 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | tmperrorfile=$(mktemp) 7 | trap "rm ${errorfile}" EXIT 8 | echo 0 >$tmperrorfile 9 | 10 | if oc auth can-i debug node >/dev/null 2>&1; then 11 | msg "Collecting zombie processes... 
(${BLUE}using oc debug, it can take a while${NOCOLOR})" 12 | # shellcheck disable=SC2016 13 | for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do 14 | # See https://medium.com/@robert.i.sandor/getting-started-with-parallelization-in-bash-e114f4353691 15 | ((i = i % PARALLELJOBS)) 16 | ((i++ == 0)) && wait 17 | ( 18 | ocdebugorwait # Pause for no OC debug running 19 | ZOMBIES=$(oc debug --image="${OCDEBUGIMAGE}" "${node}" -- chroot /host sh -c 'ps -ef | grep -c "[d]efunct"' 2>/dev/null) 20 | if [ -n "${ZOMBIES}" ] && [ "${ZOMBIES}" -gt 0 ]; then 21 | msg "${ORANGE}${ZOMBIES}${NOCOLOR} zombie processes found in ${node}" 22 | if [ "${ZOMBIES}" -ge 5 ]; then 23 | echo 1 >$tmperrorfile 24 | fi 25 | fi 26 | ) & 27 | done 28 | wait 29 | if [ "$(cat $tmperrorfile)" -eq 1 ]; then 30 | errors=$(("${errors}" + 1)) 31 | if [ ! -z "${ERRORFILE}" ]; then 32 | echo $errors >${ERRORFILE} 33 | fi 34 | exit ${OCERROR} 35 | else 36 | exit ${OCOK} 37 | fi 38 | else 39 | msg "Couldn't debug nodes, check permissions" 40 | exit ${OCSKIP} 41 | fi 42 | exit ${OCUNKNOWN} 43 | -------------------------------------------------------------------------------- /checks/entropy: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if the workers have enough entropy 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | tmperrorfile=$(mktemp) 7 | trap "rm ${tmperrorfile}" EXIT 8 | echo 0 >$tmperrorfile 9 | 10 | if oc auth can-i debug node >/dev/null 2>&1; then 11 | msg "Collecting entropy data... (${BLUE}using oc debug, it can take a while${NOCOLOR})" 12 | # shellcheck disable=SC2016 13 | for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do 14 | # See https://medium.com/@robert.i.sandor/getting-started-with-parallelization-in-bash-e114f4353691 15 | ((i = i % PARALLELJOBS)) 16 | ((i++ == 0)) && wait 17 | ( 18 | ocdebugorwait # Pause for no OC debug running 19 | if ! ENTROPY=$(oc debug --image="${OCDEBUGIMAGE}" "${node}" -- chroot /host sh -c 'cat /proc/sys/kernel/random/entropy_avail' 2>/dev/null); then 20 | msg "${ORANGE}Error running oc debug in ${node}${NOCOLOR}" 21 | else 22 | if [ -n "${ENTROPY}" ] && [ "${ENTROPY}" -lt 200 ]; then 23 | msg "${RED}Low entropy in ${node}${NOCOLOR}" 24 | echo 1 >$tmperrorfile 25 | fi 26 | fi 27 | ) & 28 | done 29 | wait 30 | if [ "$(cat $tmperrorfile)" -eq 1 ]; then 31 | errors=$(("${errors}" + 1)) 32 | if [ !
-z "${ERRORFILE}" ]; then 33 | echo $errors >${ERRORFILE} 34 | fi 35 | exit ${OCERROR} 36 | else 37 | exit ${OCOK} 38 | fi 39 | else 40 | msg "Couldn't debug nodes, check permissions" 41 | exit ${OCSKIP} 42 | fi 43 | exit ${OCUNKNOWN} 44 | -------------------------------------------------------------------------------- /ssh/bz1941840: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if the authentication-operator is using excessive RAM -> hung kubelet BZ1941840 3 | # bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1948052 4 | 5 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 6 | 7 | error=false 8 | 9 | if oc auth can-i get pods -n openshift-authentication-operator >/dev/null 2>&1; then 10 | msg "Checking for a hung kubelet..." 11 | # shellcheck disable=SC2016 12 | node=$(oc get pods -n openshift-authentication-operator -l app=authentication-operator -o json | jq -r .items[0].spec.nodeName) 13 | container_id=$(oc get pods -n openshift-authentication-operator -l app=authentication-operator -o json | jq -r .items[0].status.containerStatuses[0].containerID | awk -F// '{print $2}' | cut -c-13) 14 | if ! AUTH_OPERATOR_MEMORY=$(ssh -q core@$node "sudo crictl stats --id ${container_id} -o json | jq -r .stats[0].memory.workingSetBytes.value"); then 15 | msg "${ORANGE}Error running crictl stats openshift-authentication-operator/${pod}${NOCOLOR}" 16 | else 17 | if [ -n "${AUTH_OPERATOR_MEMORY}" ] && [ "${AUTH_OPERATOR_MEMORY}" -gt 2147483648 ]; then # more than 2GB is a bad sign 18 | msg "${RED}High memory usage detected for openshift-authentication-operator, which likely means that kubelet on ${node} is hung. Terminate the pod to remediate${NOCOLOR}" 19 | errors=$(("${errors}" + 1)) 20 | error=true 21 | fi 22 | fi 23 | if [ ! -z "${ERRORFILE}" ]; then 24 | echo $errors >${ERRORFILE} 25 | fi 26 | if [[ $error == true ]]; then 27 | exit ${OCERROR} 28 | else 29 | exit ${OCOK} 30 | fi 31 | else 32 | msg "Couldn't get pods, check permissions" 33 | exit ${OCSKIP} 34 | fi 35 | exit ${OCUNKNOWN} 36 | -------------------------------------------------------------------------------- /checks/nodes: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if there are not ready or not schedulable nodes 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | error=false 7 | 8 | if oc auth can-i get nodes >/dev/null 2>&1; then 9 | nodes_not_ready=$(oc get nodes -o json | jq '.items[] | { name: .metadata.name, type: .status.conditions[] } | select ((.type.type == "Ready") and (.type.status != "True"))') 10 | if [[ -n ${nodes_not_ready} ]]; then 11 | NODESNOTREADY=$(echo "${nodes_not_ready}" | jq .) 12 | msg "Nodes ${RED}NotReady${NOCOLOR}: ${NODESNOTREADY}" 13 | errors=$(("${errors}" + 1)) 14 | error=true 15 | fi 16 | disabled_nodes=$(oc get nodes -o json | jq '.items[] | { name: .metadata.name, status: .spec.unschedulable } | select (.status == true)') 17 | if [[ -n ${disabled_nodes} ]]; then 18 | NODESDISABLED=$(echo "${disabled_nodes}" | jq .) 
19 | msg "Nodes ${RED}Disabled${NOCOLOR}: ${NODESDISABLED}" 20 | errors=$(("${errors}" + 1)) 21 | error=true 22 | fi 23 | pressure_nodes=$(oc get node -o json | jq '.items[] | { name: .metadata.name, conditions: .status.conditions[] } | select ((.conditions.type | contains("Pressure")) and .conditions.status != "False")') 24 | if [[ -n ${pressure_nodes} ]]; then 25 | NODESPRESSURE=$(echo "${pressure_nodes}" | jq .) 26 | msg "Nodes with ${RED}Pressure${NOCOLOR}: ${NODESPRESSURE}" 27 | errors=$(("${errors}" + 1)) 28 | fi 29 | if [ ! -z "${ERRORFILE}" ]; then 30 | echo $errors >${ERRORFILE} 31 | fi 32 | if [[ $error == true ]]; then 33 | exit ${OCERROR} 34 | else 35 | exit ${OCOK} 36 | fi 37 | 38 | else 39 | msg "Couldn't get nodes, check permissions" 40 | exit ${OCSKIP} 41 | fi 42 | exit ${OCUNKNOWNN} 43 | -------------------------------------------------------------------------------- /checks/chronyc: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if the worker clocks are synced using chronyc 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | tmperrorfile=$(mktemp) 7 | trap "rm ${errorfile}" EXIT 8 | echo 0 >$tmperrorfile 9 | 10 | if oc auth can-i debug node >/dev/null 2>&1; then 11 | msg "Collecting NTP data... (${BLUE}using oc debug, it can take a while${NOCOLOR})" 12 | # shellcheck disable=SC2016 13 | for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do 14 | # See https://medium.com/@robert.i.sandor/getting-started-with-parallelization-in-bash-e114f4353691 15 | ((i = i % PARALLELJOBS)) 16 | ((i++ == 0)) && wait 17 | ( 18 | # shellcheck disable=2016 19 | ocdebugorwait # Pause for no OC debug running 20 | if ! SOURCES=$(oc debug --image="${OCDEBUGIMAGE}" "${node}" -- chroot /host sh -c 'chronyc activity' 2>/dev/null | awk '/sources online/ { print $1 }'); then 21 | msg "${ORANGE}Error running oc debug in ${node}${NOCOLOR}" 22 | else 23 | if [ -n "${SOURCES}" ] && [ "${SOURCES}" -lt 1 ]; then 24 | msg "${RED}Clock doesn't seem to be synced in ${node}${NOCOLOR}" 25 | echo 1 >$tmperrorfile 26 | fi 27 | fi 28 | ) & 29 | done 30 | wait 31 | if [ "$(cat $tmperrorfile)" -eq 1 ]; then 32 | errors=$(("${errors}" + 1)) 33 | if [ ! 
-z "${ERRORFILE}" ]; then 34 | echo $errors >${ERRORFILE} 35 | fi 36 | exit ${OCERROR} 37 | else 38 | exit ${OCOK} 39 | fi 40 | else 41 | msg "Couldn't debug nodes, check permissions" 42 | exit ${OCSKIP} 43 | fi 44 | exit ${OCUNKNOWN} 45 | -------------------------------------------------------------------------------- /pre/dns-hostnames: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if the api and wildcard DNS entries are correct 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | error=false 7 | 8 | BASEDOMAIN=$(yq e '.baseDomain' ${INSTALL_CONFIG_PATH} 2>/dev/null) 9 | 10 | if [ -z ${BASEDOMAIN} ]; then 11 | errors=$(("${errors}" + 1)) 12 | error=true 13 | msg ".baseDomain not found in ${INSTALL_CONFIG_PATH}" 14 | fi 15 | 16 | CLUSTERNAME=$(yq e '.metadata.name' ${INSTALL_CONFIG_PATH} 2>/dev/null) 17 | if [ -z ${CLUSTERNAME} ]; then 18 | errors=$(("${errors}" + 1)) 19 | error=true 20 | msg ".metadata.name not found in ${INSTALL_CONFIG_PATH}" 21 | fi 22 | 23 | #MASTERS=$(yq e '.platform.baremetal.hosts[] | select(.role == "master") | .name' ${INSTALL_CONFIG_PATH} 2> /dev/null) 24 | #WORKERS=$(yq e '.platform.baremetal.hosts[] | select(.role == "worker") | .name' ${INSTALL_CONFIG_PATH} 2> /dev/null) 25 | # NS1="ns1."${CLUSTERNAME}"."${BASEDOMAIN} 26 | 27 | API="api."${CLUSTERNAME}"."${BASEDOMAIN}"." 28 | WILDCARD="foobar.apps."${CLUSTERNAME}"."${BASEDOMAIN}"." 29 | 30 | IP_API=$(dig +short ${API}) 31 | if [ -z ${IP_API} ]; then 32 | errors=$(("${errors}" + 1)) 33 | error=true 34 | msg "${RED}${API} doesn't resolve${NOCOLOR}" 35 | fi 36 | 37 | IP_WILDCARD=$(dig +short ${WILDCARD}) 38 | if [ -z ${IP_WILDCARD} ]; then 39 | errors=$(("${errors}" + 1)) 40 | error=true 41 | msg "${RED}${WILDCARD} doesn't resolve${NOCOLOR}" 42 | fi 43 | 44 | IP_API_REVERSE=$(dig +short -x ${IP_API}) 45 | if [ -z ${IP_API_REVERSE} ]; then 46 | errors=$(("${errors}" + 1)) 47 | error=true 48 | msg "${YELLOW}api reverse not found${NOCOLOR}" 49 | else 50 | if [ ${IP_API_REVERSE} != ${API} ]; then 51 | errors=$(("${errors}" + 1)) 52 | error=true 53 | msg "${YELLOW}${API} doesn't match the reverse ${IP_API_REVERSE}${NOCOLOR}" 54 | fi 55 | fi 56 | 57 | # Wildcard reverse DNS doesn't seem to be a thing 58 | #IP_WILDCARD_REVERSE=$(dig +short -x ${IP_WILDCARD}) 59 | #if [ -z ${IP_WILDCARD_REVERSE} ]; then 60 | # msg "${YELLOW}wildcard reverse not found${NOCOLOR}" 61 | # else 62 | # if [ ${IP_WILDCARD_REVERSE} != ${WILDCARD} ]; then 63 | # msg "${YELLOW}${WILDCARD} doesn't match the reverse ${IP_WILDCARD_REVERSE}${NOCOLOR}" 64 | # fi 65 | #fi 66 | 67 | if [ ! -z "${ERRORFILE}" ]; then 68 | echo $errors >${ERRORFILE} 69 | fi 70 | if [[ $error == true ]]; then 71 | exit ${OCERROR} 72 | else 73 | exit ${OCOK} 74 | fi 75 | -------------------------------------------------------------------------------- /.github/workflows/refresh-checksmd.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Checks.md updater 3 | 4 | on: 5 | # Compare the preceeding commit of main -> to the current commit of the main branch. 6 | # (Note: To compare changes between the last pushed commit to the remote main branch set `since_last_remote_commit: true`) 7 | push: 8 | branches: 9 | - main 10 | # Compare the last commit of main -> to the current commit of a PR branch. 
11 | # (Note: To compare changes between the last pushed commit to the remote PR branch set `since_last_remote_commit: true`) 12 | pull_request: 13 | branches: 14 | - main 15 | 16 | jobs: 17 | build: 18 | runs-on: ubuntu-latest # windows-latest | macos-latest 19 | name: Test changed-files 20 | steps: 21 | - uses: actions/checkout@v4 22 | with: 23 | fetch-depth: 0 # OR "2" -> To retrieve the preceding commit. 24 | 25 | - name: Get changed files in the checks folder 26 | id: changed-files-specific 27 | uses: tj-actions/changed-files@v40 28 | with: 29 | files: | 30 | checks/** 31 | info/** 32 | pre/** 33 | ssh/** 34 | 35 | - name: Run step if any file(s) in the watched folder change 36 | if: steps.changed-files-specific.outputs.any_changed == 'true' 37 | run: | 38 | echo "One or more files in the scripts folder has changed, updating checks.md" 39 | echo "List all the files that have changed: ${{ steps.changed-files-specific.outputs.all_changed_files }}" 40 | ./scripts/update-checksmd > checks.md 41 | 42 | - name: Commit back the checks.md to the repository 43 | if: steps.changed-files-specific.outputs.any_changed == 'true' 44 | uses: stefanzweifel/git-auto-commit-action@v5 45 | with: 46 | # Optional. Commit message for the created commit. 47 | # Defaults to "Apply automatic changes" 48 | commit_message: "[skip ci] Autoupdate Checks.md on change" 49 | 50 | # Optional. Local and remote branch name where commit is going to be pushed 51 | # to. Defaults to the current branch. 52 | # You might need to set `create_branch: true` if the branch does not exist. 53 | branch: main 54 | 55 | # Optional. Options used by `git-commit`. 56 | # See https://git-scm.com/docs/git-commit#_options 57 | commit_options: '--no-verify --signoff' 58 | 59 | # Optional glob pattern of files which should be added to the commit 60 | # Defaults to all (.) 61 | # See the `pathspec`-documentation for git 62 | # - https://git-scm.com/docs/git-add#Documentation/git-add.txt-ltpathspecgt82308203 63 | # - https://git-scm.com/docs/gitglossary#Documentation/gitglossary.txt-aiddefpathspecapathspec 64 | file_pattern: 'checks.md' 65 | 66 | # Optional. Local file path to the repository. 67 | # Defaults to the root of the repository. 68 | repository: . 
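      # For reference (illustrative): the same refresh can be reproduced locally from the
      # repository root with
      #   ./scripts/update-checksmd > checks.md
      # and committing the regenerated file; the auto-commit step above simply automates that.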
69 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | fail_fast: true 3 | repos: 4 | - hooks: 5 | - id: commitizen 6 | stages: 7 | - commit-msg 8 | repo: https://github.com/commitizen-tools/commitizen 9 | rev: v2.42.0 10 | - hooks: 11 | - id: check-useless-excludes 12 | repo: meta 13 | - hooks: 14 | - files: \.(css|js|md|markdown|json) 15 | id: prettier 16 | repo: https://github.com/pre-commit/mirrors-prettier 17 | rev: v3.0.0-alpha.4 18 | - hooks: 19 | - id: seed-isort-config 20 | repo: https://github.com/asottile/seed-isort-config 21 | rev: v2.2.0 22 | - hooks: 23 | - id: isort 24 | repo: https://github.com/pre-commit/mirrors-isort 25 | rev: v5.10.1 26 | - hooks: 27 | - id: black 28 | repo: https://github.com/python/black 29 | rev: 23.1.0 30 | - hooks: 31 | - id: check-added-large-files 32 | - id: check-ast 33 | - id: check-case-conflict 34 | - id: check-executables-have-shebangs 35 | - id: check-json 36 | - id: check-merge-conflict 37 | - id: check-symlinks 38 | - id: check-vcs-permalinks 39 | - id: debug-statements 40 | - id: check-xml 41 | - args: 42 | - --unsafe 43 | id: check-yaml 44 | - id: end-of-file-fixer 45 | - id: forbid-new-submodules 46 | - args: 47 | - --branch 48 | - gh-pages 49 | id: no-commit-to-branch 50 | - id: requirements-txt-fixer 51 | - id: sort-simple-yaml 52 | - id: trailing-whitespace 53 | - id: mixed-line-ending 54 | - id: detect-private-key 55 | - id: check-byte-order-marker 56 | - id: check-docstring-first 57 | repo: https://github.com/pre-commit/pre-commit-hooks 58 | rev: v4.4.0 59 | - hooks: 60 | - id: flake8 61 | repo: https://github.com/pycqa/flake8 62 | rev: 6.0.0 63 | - hooks: 64 | - additional_dependencies: 65 | - mvdan.cc/sh/v3/cmd/shfmt@v3.1.1 66 | args: 67 | - -w 68 | - -i 69 | - "2" 70 | - -s 71 | entry: shfmt 72 | id: shfmt 73 | language: golang 74 | minimum_pre_commit_version: 2.4.0 75 | name: shfmt 76 | types: 77 | - shell 78 | repo: local 79 | - hooks: 80 | - id: blacken-docs 81 | repo: https://github.com/asottile/blacken-docs 82 | rev: 1.13.0 83 | 84 | # - repo: https://github.com/asottile/pyupgrade 85 | # rev: v2.38.0 86 | # hooks: 87 | # - id: pyupgrade 88 | # args: [--py39-plus] 89 | 90 | - repo: https://github.com/jumanjihouse/pre-commit-hook-yamlfmt 91 | rev: 0.2.2 # or other specific tag 92 | hooks: 93 | - id: yamlfmt 94 | args: [--mapping, '2', --sequence, '4', --offset, '2', '--preserve-quotes'] 95 | 96 | 97 | - repo: https://github.com/hcodes/yaspeller.git 98 | rev: v8.0.1 99 | hooks: 100 | - id: yaspeller 101 | types: 102 | - markdown 103 | -------------------------------------------------------------------------------- /checks/iptables-22623-22624: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # kb: https://access.redhat.com/solutions/5709711 3 | # description: Checks if the nodes iptables rules are blocking 22623/tpc or 22624/tcp 4 | 5 | # 6 | # To check if the rule exist, we use iptables -C, it returns 0 if the rule exist 7 | # and if it doesn't exist, it exits 1 with the following message: 8 | # "iptables: Bad rule (does a matching rule exist in that chain?)." 9 | # 10 | # To save cycles, we run every command in the same oc debug session. 
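# Illustrative example of that behaviour (run directly on a node, same rule spec as below):
#   iptables -C OUTPUT -p tcp --dport 22623 -j REJECT --reject-with icmp-port-unreachable \
#     && echo 'rule present' || echo 'rule absent'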
11 | # We concatenate all commands with || so the chain stops as soon as one of 12 | # them succeeds (returns 0, i.e. the rule exists) 13 | 14 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 15 | 16 | tmperrorfile=$(mktemp) 17 | trap "rm ${tmperrorfile}" EXIT 18 | echo 0 >$tmperrorfile 19 | 20 | if oc auth can-i debug node >/dev/null 2>&1; then 21 | msg "Checking if ports 22623/tcp and 22624/tcp are blocked (${BLUE}using oc debug, it can take a while${NOCOLOR})" 22 | # shellcheck disable=SC2016 23 | for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do 24 | # See https://medium.com/@robert.i.sandor/getting-started-with-parallelization-in-bash-e114f4353691 25 | ((i = i % PARALLELJOBS)) 26 | ((i++ == 0)) && wait 27 | ( 28 | ocdebugorwait # Pause for no OC debug running 29 | # shellcheck disable=2016 30 | OUTPUT=$(oc debug --image="${OCDEBUGIMAGE}" "${node}" -- chroot /host sh -c \ 31 | "iptables -C FORWARD -p tcp --dport 22623 -j REJECT --reject-with icmp-port-unreachable &>/dev/null || \ 32 | iptables -C FORWARD -p tcp --dport 22624 -j REJECT --reject-with icmp-port-unreachable &>/dev/null || \ 33 | iptables -C OUTPUT -p tcp --dport 22623 -j REJECT --reject-with icmp-port-unreachable &>/dev/null || \ 34 | iptables -C OUTPUT -p tcp --dport 22624 -j REJECT --reject-with icmp-port-unreachable &>/dev/null || \ 35 | echo 'allok'" 2>&1) 36 | # The command stderr and stdout are captured 37 | # If the command output is 'allok', it is because every other command 38 | # failed, meaning the iptables rules weren't found 39 | if [[ ${OUTPUT} =~ "allok" ]]; then 40 | # Do nothing 41 | : 42 | elif [[ ${OUTPUT} =~ "Back-off" ]]; then 43 | msg "${ORANGE}Error pulling the oc debug image in ${node}${NOCOLOR}" 44 | elif [[ ${OUTPUT} =~ "unable to create" ]]; then 45 | msg "${ORANGE}Unable to create debug pod in ${node}${NOCOLOR}" 46 | else 47 | msg "${RED}iptables rules for 22623/tcp or 22624/tcp found in ${node}${NOCOLOR}" 48 | echo 1 >$tmperrorfile 49 | fi 50 | ) & 51 | done 52 | wait 53 | if [ "$(cat $tmperrorfile)" -eq 1 ]; then 54 | errors=$(("${errors}" + 1)) 55 | if [ ! -z "${ERRORFILE}" ]; then 56 | echo $errors >${ERRORFILE} 57 | fi 58 | exit ${OCERROR} 59 | else 60 | exit ${OCOK} 61 | fi 62 | 63 | else 64 | msg "Couldn't debug nodes, check permissions" 65 | exit ${OCSKIP} 66 | fi 67 | exit ${OCUNKNOWN} 68 | -------------------------------------------------------------------------------- /checks/mellanox-firmware-version: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Checks if the nodes' Mellanox Connect-4 firmware version is below the recommended version.
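# For reference (illustrative, <interface> is a placeholder): the same data can be read
# manually on a node with "lspci -nn -d 15b3:" to list Mellanox devices and
# "ethtool -i <interface> | grep firmware-version" to read a NIC's firmware; the check
# below automates this through 'oc debug' for every Ready node.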
3 | 4 | # lspci -nn shows PCI vendor and device codes (and names) 5 | # Mellanox MT27710 Family [ConnectX-4 Lx] 25GbE dual-port SFP28 with **vendor ID 0x15b3 and device ID 0x1015** 6 | # Mellanox MT27800 Family [ConnectX-5] 25GbE dual-port SFP28 with **vendor ID 0x15b3 and device ID 0x1017** 7 | # Mellanox MT27800 Family [ConnectX-5] 100GbE with **vendor ID 0x15b3 and device ID 0x1017** 8 | # Mellanox MT27700 Family [ConnectX-4] VPI adapter card, EDR IB (100Gb/s), single-port QSFP28 with **vendor ID 0x15b3 and device ID 0x1013** 9 | # Mellanox MT27800 Family [ConnectX-5] VPI adapter card, EDR IB (100Gb/s), single-port QSFP28 with **vendor ID 0x15b3 and device ID 0x1017** 10 | # Mellanox MT28908 Family [ConnectX-6] VPI adapter card, 100Gb/s (HDR100, EDR IB), single-port QSFP56 with **vendor ID 0x15b3 and device ID 0x101b** 11 | # Mellanox MT28908 Family [ConnectX-6] VPI adapter card, HDR200 IB (200Gb/s), single-port QSFP56 with vendor ID **0x15b3 and device ID 0x101b** 12 | 13 | IDS="15b3:1015 15b3:1017 15b3:1013 15b3:101b" 14 | MIN_VERS=16.28 15 | 16 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 17 | if oc auth can-i debug node >/dev/null 2>&1; then 18 | msg "Checking Mellanox firmware version (${BLUE}using oc debug, it can take a while${NOCOLOR})" 19 | fw_errors=0 20 | # shellcheck disable=SC2016 21 | for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do 22 | # shellcheck disable=SC1083 23 | ocdebugorwait # Pause for no OC debug running 24 | if ! FIRMWAREVERS=$(oc debug --image="${OCDEBUGIMAGE}" "${node}" -- chroot /host sh -c "for id in ${IDS}; do for device in \$(lspci -D -d "\${id}" | awk '{ print \$1 }'); do echo -n \"\${device},\" ; ethtool -i \$(ls /sys/bus/pci/devices/\${device}/net/)|grep firmware-version|cut -d: -f2-|xargs echo|awk '{ print \$1 }';done;done" 2>/dev/null); then 25 | msg "${ORANGE}Error running oc debug in ${node}${NOCOLOR}" 26 | else 27 | if [ -n "${FIRMWAREVERS}" ]; then 28 | for result in ${FIRMWAREVERS}; do 29 | dev=$(echo ${result} | awk -F, '{print $1}') 30 | fw=$(echo ${result} | awk -F, '{print $2}' | awk -F. '{print $1"."$2}') 31 | if [[ $(expr ${fw} \< ${MIN_VERS}) -eq 1 ]]; then 32 | msg "Firmware for Mellanox card ${RED}${dev}${NOCOLOR} (${fw}) on ${RED}${node}${NOCOLOR} is below the minimum recommended version. Please upgrade to at least ${GREEN}${MIN_VERS}${NOCOLOR}." 33 | errors=$(("${errors}" + 1)) 34 | fw_errors=$(("${fw_errors}" + 1)) 35 | if [ ! 
-z "${ERRORFILE}" ]; then 36 | echo $errors >${ERRORFILE} 37 | fi 38 | fi 39 | done 40 | else 41 | msg "Couldn't find Mellanox firmware version in ${node}" 42 | fi 43 | fi 44 | done 45 | if [[ $fw_errors -gt 0 ]]; then 46 | exit ${OCERROR} 47 | fi 48 | exit ${OCINFO} 49 | else 50 | msg "Couldn't debug nodes, check permissions" 51 | exit ${OCSKIP} 52 | fi 53 | exit ${OCUNKNOWN} 54 | -------------------------------------------------------------------------------- /info/mtu: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # description: Show the nodes' MTU for some interfaces 3 | 4 | [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") 5 | 6 | if oc auth can-i get network/cluster >/dev/null 2>&1; then 7 | network_type=$(oc get network/cluster -ojson | jq -r .status.networkType) 8 | if [[ $network_type != "OVNKubernetes" ]]; then 9 | msg "MTU checks not supported yet for clusters using $network_type" 10 | exit ${OCSKIP} 11 | else 12 | # If the crd doesn't exist it will return 0 if you are cluster-admin as you have permissions to get * 13 | # So it is needed to 'get' the object as well to verify it does exist 14 | if oc auth can-i get nodenetworkstates.nmstate.io -A >/dev/null 2>&1 && oc get nodenetworkstates.nmstate.io -o jsonpath='{.items[*].metadata.name}' >/dev/null 2>&1; then 15 | # We need to split the next command using spaces, hence, using standard IFS 16 | OLDIFS=$IFS 17 | IFS=$' \t\n' 18 | for nns in $(oc get nodenetworkstates.nmstate.io -o jsonpath='{.items[*].metadata.name}'); do 19 | NNS=$(oc get nodenetworkstates.nmstate.io "${nns}" -o json 2>/dev/null) 20 | BREXMTU=$(echo "${NNS}" | jq '.status.currentState.interfaces[] | select(.name == "br-ex" and .type == "ovs-interface") | .mtu') 21 | BREXPHYSINT=$(echo "${NNS}" | jq -r '.status.currentState.interfaces[] | select(.name == "br-ex" and .type == "ovs-bridge") | .bridge.port[] | select(.name != "br-ex") | .name') 22 | PHYSINTMTU=$(echo "${NNS}" | jq ".status.currentState.interfaces[] | select(.name == \"${BREXPHYSINT}\") | .mtu") 23 | OVNK8SMP0MTU=$(echo "${NNS}" | jq '.status.currentState.interfaces[] | select(.name == "ovn-k8s-mp0") | .mtu') 24 | msg "${nns} => br-ex:${BREXMTU}, ${BREXPHYSINT}:${PHYSINTMTU}, ovn-k8s-mp0:${OVNK8SMP0MTU}" 25 | done 26 | IFS=${OLDIFS} 27 | else 28 | if oc auth can-i debug node -A >/dev/null 2>&1 && oc auth can-i get nodes >/dev/null 2>&1; then 29 | msg "Collecting MTUs... 
(${BLUE}using oc debug, it can take a while${NOCOLOR}))" 30 | # shellcheck disable=SC2016 31 | for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do 32 | # See https://medium.com/@robert.i.sandor/getting-started-with-parallelization-in-bash-e114f4353691 33 | ((i = i % PARALLELJOBS)) 34 | ((i++ == 0)) && wait 35 | ( 36 | # Get all the information in a single debug to avoid rescheduling unneeded pods 37 | # then convert the output into an array for easily consumption 38 | ocdebugorwait # Pause for no OC debug running 39 | # shellcheck disable=2016 40 | mapfile -t MTUS < <(oc debug --image="${OCDEBUGIMAGE}" "${node}" -- chroot /host sh -c 'export EXTBR="br-ex"; export OVNBR="ovn-k8s-mp0"; export BMINTERFACE=$(ovs-vsctl list-ports "${EXTBR}" | grep -v patch) ; echo "${BMINTERFACE}"; nmcli -g GENERAL.MTU dev show "${BMINTERFACE}"; nmcli -g GENERAL.MTU dev show "${EXTBR}"; nmcli -g GENERAL.MTU dev show "${OVNBR}"' 2>/dev/null) 41 | # If the array is empty, something has happened 42 | if [ ${#MTUS[@]} -eq 0 ]; then 43 | msg "${YELLOW}Couldn't get MTU settings in ${node}${NOCOLOR}" 44 | else 45 | # MTUS[0] = Baremetal interface name 46 | # MTUS[1] = Baremetal interface MTU 47 | # MTUS[2] = br-ex MTU 48 | # MTUS[3] = ovn-k8s-mp0 MTU 49 | msg "${node} => br-ex:${MTUS[2]}, ${MTUS[0]}:${MTUS[1]}, ovn-k8s-mp0: ${MTUS[3]}" 50 | fi 51 | ) & 52 | done 53 | wait 54 | fi 55 | fi 56 | fi 57 | exit ${OCINFO} 58 | else 59 | msg "Couldn't debug nodes, check permissions" 60 | exit ${OCSKIP} 61 | fi 62 | exit ${OCUNKNOWN} 63 | -------------------------------------------------------------------------------- /scripts/recover-northd.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | ########################################################### 3 | # recover-northd.sh script to unwedge northd in the event # 4 | # of a node failure # 5 | ########################################################### 6 | 7 | # Timestamp to be used in the logfile name 8 | NOW=$(date +"%Y-%m-%d_%H-%M-%S") 9 | # Logfile to save some DEBUG output 10 | LOG="/tmp/recover-northd.sh.${NOW}.log" 11 | # Debug var to write DEBUG lines into the log 12 | DEBUG=false 13 | # Whether to intervene if northd is wedged 14 | REMDIATE=false 15 | 16 | ########################################################### 17 | # usage(): prints the usage of the script 18 | ########################################################### 19 | function usage() { 20 | echo "This script checks if northd is stuck and optionally intervene" 21 | echo -e 22 | echo -e "\tUsage: $(basename "$0")" 23 | echo -e "\tHelp: $(basename "$0") -h" 24 | echo -e "\tSave extra DEBUG lines into the log: $(basename "$0") -d" 25 | echo -e "\tSet the KUBECONFIG env var to /kubeconfig/file: $(basename "$0") -k /kubeconfig/file" 26 | echo -e "\tRemediate the issue: $(basename "$0") -r" 27 | echo -e 28 | echo "After the execution a logfile will be generated with the name recover-northd.DATE.log" 29 | } 30 | 31 | ########################################################### 32 | # check_northd(): check the current status of northd 33 | ########################################################### 34 | function check_northd() { 35 | 36 | pods=$(oc get pods -n openshift-ovn-kubernetes -l app=ovnkube-master --no-headers | grep Running | awk '{print $1}') 37 | for pod in ${pods}; do 38 | 
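# Assumed output format (illustrative): "ovn-appctl -t ovn-northd status" prints a single
# line such as "Status: active" or "Status: standby", so the awk '{print $2}' below keeps
# only the state word.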
pod_status=$(oc exec -n openshift-ovn-kubernetes -c northd "${pod}" -- ovn-appctl -t ovn-northd status | awk '{print $2}') 39 | if [[ ${pod_status} == 'active' ]]; then 40 | active_pod=${pod} 41 | node=$(oc get pod/"$active_pod" -n openshift-ovn-kubernetes -o json | jq .spec.nodeName | sed -e 's/\"//g') 42 | date=$(date +"%Y-%m-%d %H:%M:%S") 43 | if eval "${DEBUG}"; then echo "[check_northd:${date}] pod ${pod} is active" >>"${LOG}"; fi 44 | else 45 | date=$(date +"%Y-%m-%d %H:%M:%S") 46 | if eval "${DEBUG}"; then echo "[check_northd:${date}] pod ${pod} NOT active, status:${pod_status}" >>"${LOG}"; fi 47 | fi 48 | done 49 | 50 | if [[ -z ${active_pod} ]]; then 51 | date=$(date +"%Y-%m-%d %H:%M:%S") 52 | if eval "${DEBUG}"; then echo "[check_northd:${date}] no active northd leader found" >>"${LOG}"; else 53 | echo "no active northd leader found..." 54 | fi 55 | if eval "${REMDIATE}"; then 56 | if eval "${DEBUG}"; then echo "[check_northd:${date}] ...recovering northd" >>"${LOG}"; else 57 | echo "...recovering northd" 58 | fi 59 | for pod in ${pods}; do 60 | oc exec -n openshift-ovn-kubernetes -c northd "${pod}" -- ovn-appctl -t ovn-northd exit 61 | date=$(date +"%Y-%m-%d %H:%M:%S") 62 | if eval "${DEBUG}"; then echo "[check_northd:${date}] recovering pod ${pod}" >>"${LOG}"; else 63 | echo "recovering pod ${pod}" 64 | fi 65 | done 66 | fi 67 | else 68 | date=$(date +"%Y-%m-%d %H:%M:%S") 69 | if eval "${DEBUG}"; then echo "[check_northd:${date}] found active northd leader (${active_pod}) on ${node}" >>"${LOG}"; else 70 | echo "found active northd leader (${active_pod}) on ${node}" 71 | fi 72 | fi 73 | 74 | } 75 | 76 | # Main 77 | while getopts "dhk:r" flag; do 78 | case "${flag}" in 79 | d) 80 | DEBUG=true 81 | ;; 82 | h) 83 | usage 84 | exit 1 85 | ;; 86 | k) 87 | export KUBECONFIG="${OPTARG}" 88 | echo "Exported KUBECONFIG=${KUBECONFIG}" >>"${LOG}" 89 | ;; 90 | r) 91 | REMDIATE=true 92 | ;; 93 | *) 94 | echo >&2 "Invalid option: $*" 95 | usage 96 | exit 1 97 | ;; 98 | esac 99 | done 100 | 101 | check_northd 102 | 103 | if [[ -f ${LOG} ]]; then 104 | echo "# Logged operations into the file ${LOG}" 105 | fi 106 | -------------------------------------------------------------------------------- /checks.md: -------------------------------------------------------------------------------- 1 | 2 | # info 3 | | Script | Description | 4 | | - | - | 5 | | [info/mtu](info/mtu) | Show the nodes' MTU for some interfaces | 6 | | [info/node-versions](info/node-versions) | Show node components versions such as kubelet, crio, kernel, etc. 
| 7 | | [info/04-machineset](info/04-machineset) | Show the machinesets status | 8 | | [info/00-clusterversion](info/00-clusterversion) | Show the clusterversion | 9 | | [info/biosversion](info/biosversion) | Show the nodes' BIOS version | 10 | | [info/locks](info/locks) | List all pods with locks on each node | 11 | | [info/01-clusteroperators](info/01-clusteroperators) | Show the clusteroperators | 12 | | [info/03-pods](info/03-pods) | Show the pods running in the cluster | 13 | | [info/ethtool-firmware-version](info/ethtool-firmware-version) | Show the nodes' NIC firmware version using ethtool | 14 | | [info/ovs-hostnames](info/ovs-hostnames) | Show the ovs database chassis hostnames | 15 | | [info/02-nodes](info/02-nodes) | Show the nodes status | 16 | | [info/bmh-machine-node](info/bmh-machine-node) | Show the node,machine and bmh relationship | 17 | | [info/container-images-running](info/container-images-running) | Show the images of the containers running in the cluster | 18 | | [info/container-images-stored](info/container-images-stored) | Show the container images stored in the cluster hosts | 19 | 20 | # pre 21 | | Script | Description | 22 | | - | - | 23 | | [pre/00-install-config-valid-yaml](pre/00-install-config-valid-yaml) | Checks if the install-config.yaml file is a valid yaml file | 24 | | [pre/dns-hostnames](pre/dns-hostnames) | Checks if the api and wildcard DNS entries are correct | 25 | 26 | # ssh 27 | | Script | Description | 28 | | - | - | 29 | | [ssh/bz1941840](ssh/bz1941840) | Checks if the authentication-operator is using excessive RAM -> hung kubelet BZ1941840 | 30 | 31 | # checks 32 | | Script | Description | 33 | | - | - | 34 | | [checks/port-thrashing](checks/port-thrashing) | Checks if there are OVN pods thrashing | 35 | | [checks/entropy](checks/entropy) | Checks if the workers have enough entropy | 36 | | [checks/chronyc](checks/chronyc) | Checks if the worker clocks are synced using chronyc | 37 | | [checks/pdb](checks/pdb) | Checks if there are PodDisruptionBudgets with 0 disruptions allowed | 38 | | [checks/clusterversion_errors](checks/clusterversion_errors) | Checks if there are clusterversion errors | 39 | | [checks/mellanox-firmware-version](checks/mellanox-firmware-version) | Checks if the nodes' Mellanox Connect-4 firmware version is below the recommended version. 
| 40 | | [checks/pvc](checks/pvc) | Checks if there are persistent volume claims that are not bound | 41 | | [checks/flow-control](checks/flow-control) | Checks if either TX or RX flow control is enabled on a NIC | 42 | | [checks/ctrlnodes](checks/ctrlnodes) | Checks if any controller nodes have had the NoSchedule taint removed | 43 | | [checks/iptables-22623-22624](checks/iptables-22623-22624) | Checks if the nodes iptables rules are blocking 22623/tpc or 22624/tcp | 44 | | [checks/mcp](checks/mcp) | Checks if there are degraded mcp | 45 | | [checks/zombies](checks/zombies) | Checks if more than 5 zombie processes exist on the hosts | 46 | | [checks/notrunningpods](checks/notrunningpods) | Checks if there are not running pods | 47 | | [checks/alertmanager](checks/alertmanager) | Checks if there are warning or error alerts firing | 48 | | [checks/ovn-pods-memory-usage](checks/ovn-pods-memory-usage) | Checks if the memory usage of the OVN pods is under the LIMIT threshold | 49 | | [checks/operators](checks/operators) | Checks if there are operators in 'bad' state | 50 | | [checks/bz1948052](checks/bz1948052) | Checks for BZ 1948052 based on kernel version | 51 | | [checks/csr](checks/csr) | Checks if there are pending csr | 52 | | [checks/sriov](checks/sriov) | Checks if the SR-IOV network state is synced | 53 | | [checks/nodes](checks/nodes) | Checks if there are not ready or not schedulable nodes | 54 | | [checks/restarts](checks/restarts) | Checks if there are pods restarted > n times (10 by default) | 55 | | [checks/terminating](checks/terminating) | Checks if there are pods terminating | 56 | -------------------------------------------------------------------------------- /openshift-checks.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Inherit the shell variables in the subprocesses 4 | # Useful for the -v flag 5 | export SHELLOPTS 6 | 7 | # https://betterdev.blog/minimal-safe-bash-script-template/ 8 | 9 | #set -Eeuo pipefail 10 | 11 | # http://redsymbol.net/articles/unofficial-bash-strict-mode/ 12 | IFS=$'\n\t' 13 | 14 | # shellcheck disable=2164 15 | cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 16 | 17 | # shellcheck disable=SC1091 18 | source $(pwd)/utils 19 | 20 | #trap cleanup SIGINT SIGTERM ERR EXIT 21 | export ERRORFILE=$(mktemp) 22 | trap "rm ${ERRORFILE}" EXIT 23 | 24 | errors=0 25 | # Flags 26 | INFO=1 27 | CHECKS=1 28 | PRE=0 29 | SSH=1 30 | LIST=0 31 | SINGLE=0 32 | RESULTS_ONLY=0 33 | SCRIPT_PROVIDED='' 34 | RESTART_THRESHOLD=${RESTART_THRESHOLD:=10} #arbitray 35 | THRASHING_THRESHOLD=${THRASHING_THRESHOLD:=10} 36 | 37 | OCDEBUGIMAGE=${OCDEBUGIMAGE:=registry.redhat.io/rhel8/support-tools:latest} 38 | OSETOOLSIMAGE=${OSETOOLSIMAGE:=registry.redhat.io/openshift4/ose-tools-rhel8:latest} 39 | 40 | parse_params "$@" 41 | setup_colors 42 | 43 | main() { 44 | # Check if only list is needed 45 | if [ "${LIST}" -ne 0 ]; then 46 | msg "${GREEN}Available scripts:${NOCOLOR}" 47 | find checks/ info/ pre/ ssh/ -type f | sort -n 48 | exit 0 49 | else 50 | # Check binaries availability 51 | for i in oc yq jq curl column; do 52 | check_command ${i} 53 | done 54 | # If only prechecks are needed: 55 | if [ "${PRE}" -gt 0 ]; then 56 | INSTALL_CONFIG_PATH=${INSTALL_CONFIG_PATH:=./install-config.yaml} 57 | if [ ! 
-f ${INSTALL_CONFIG_PATH} ]; then 58 | die "${RED}install-config.yaml not found${NOCOLOR}" 59 | fi 60 | msg "Running prechecks:" 61 | for pre in ./pre/*; do 62 | # shellcheck disable=SC1090,SC1091 63 | "${pre}" 64 | done 65 | else 66 | # Check kubeconfig and current user 67 | kubeconfig 68 | OCUSER=$(oc_whoami) 69 | # If only a single script is needed: 70 | if [ "${SINGLE}" -ne 0 ]; then 71 | # Disable all the other checks 72 | INFO=0 73 | CHECKS=0 74 | PRE=0 75 | SSH=0 76 | # shellcheck disable=SC1090,SC1091 77 | "${SCRIPT_PROVIDED}" 78 | fi 79 | # If only info data is needed: 80 | if [ "${INFO}" -gt 0 ]; then 81 | msg "Gathering cluster information as ${GREEN}${OCUSER}${NOCOLOR}:" 82 | for info in ./info/*; do 83 | # shellcheck disable=SC1090,SC1091 84 | "${info}" 85 | done 86 | fi 87 | # If only checks are needed: 88 | if [ "${CHECKS}" -gt 0 ]; then 89 | msg "Running basic health checks as ${GREEN}${OCUSER}${NOCOLOR}" 90 | for check in ./checks/*; do 91 | # Refresh error count before execution 92 | export errors=$(expr $(cat ${ERRORFILE}) + 0) 93 | # shellcheck disable=SC1090,SC1091 94 | if [ "${RESULTS_ONLY}" -gt 0 ]; then 95 | "${check}" &>/dev/null 96 | case $? in 97 | 0 | 1) msg "${check:2} ${GREEN}PASS${NOCOLOR}" ;; 98 | 2) msg "${check:2} ${RED}FAIL${NOCOLOR}" ;; 99 | 3) msg "${check:2} ${ORANGE}SKIPPED${NOCOLOR}" ;; 100 | 4) msg "${check:2} ${YELLOW}UNKNOWN${NOCOLOR}" ;; 101 | *) msg "${check:2} ${RED}UNKNOWN RETURN CODE${NOCOLOR}" ;; 102 | esac 103 | else 104 | "${check}" 105 | fi 106 | done 107 | fi 108 | # If only ssh checks are needed: 109 | if [ "${SSH}" -gt 0 ]; then 110 | msg "Running ssh-based health checks as ${GREEN}${OCUSER}${NOCOLOR}" 111 | for ssh in ./ssh/*; do 112 | # Refresh error count before execution 113 | export errors=$(expr $(cat ${ERRORFILE}) + 0) 114 | # shellcheck disable=SC1090,SC1091 115 | if [ "${RESULTS_ONLY}" -gt 0 ]; then 116 | "${ssh}" &>/dev/null 117 | case $? in 118 | 0 | 1) msg "${ssh:2} ${GREEN}PASS${NOCOLOR}" ;; 119 | 2) msg "${ssh:2} ${RED}FAIL${NOCOLOR}" ;; 120 | 3) msg "${ssh:2} ${ORANGE}SKIPPED${NOCOLOR}" ;; 121 | 4) msg "${ssh:2} ${YELLOW}UNKNOWN${NOCOLOR}" ;; 122 | *) msg "${ssh:2} ${RED}UNKNOWN RETURN CODE${NOCOLOR}" ;; 123 | esac 124 | else 125 | "${ssh}" 126 | fi 127 | done 128 | fi 129 | fi 130 | fi 131 | export errors=$(expr $(cat ${ERRORFILE}) + 0) 132 | if [ ${errors} -gt 0 ]; then 133 | die "${RED}Total issues found: ${errors}${NOCOLOR}" 134 | else 135 | msg "${GREEN}No issues found${NOCOLOR}" 136 | fi 137 | } 138 | 139 | main "$@" 140 | -------------------------------------------------------------------------------- /utils: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Global vars 4 | 5 | # Set error codes for scripts https://tecadmin.net/how-to-create-own-nagios-plugin-using-bash-shell-script/ 6 | # This should probably masked to other values to hilight issues in bash coding, etc 7 | OCOK=0 8 | OCINFO=1 9 | OCERROR=2 10 | OCSKIP=3 11 | OCUNKNOWN=4 12 | 13 | PARALLELJOBS="${PARALLELJOBS:=1}" 14 | 15 | usage() { 16 | cat <, --single