├── Jenkinsfile ├── README.md ├── assets └── drain-node.svg ├── chart └── drain-node-on-crash │ ├── .helmignore │ ├── Chart.yaml │ ├── questions.yml │ ├── templates │ ├── _helpers.tpl │ ├── deployment.yaml │ └── rbac.yaml │ └── values.yaml ├── manager ├── Dockerfile ├── kubectl └── run.sh └── worker ├── Dockerfile ├── kubectl └── run.sh /Jenkinsfile: -------------------------------------------------------------------------------- 1 | pipeline { 2 | agent any 3 | stages { 4 | stage('Login to Docker repo') { 5 | steps { 6 | sh '''cat ~/.GH_TOKEN | docker login docker.pkg.github.com -u mattmattox --password-stdin 7 | ''' 8 | } 9 | } 10 | 11 | stage('Docker') { 12 | parallel { 13 | stage('Build Docker image and push - Manager') { 14 | steps { 15 | dir(path: './manager') { 16 | sh '''docker build -t drainnode/manager:"$BRANCH_NAME"-rc"$BUILD_NUMBER" . 17 | docker push drainnode/manager:"$BRANCH_NAME"-rc"$BUILD_NUMBER"''' 18 | } 19 | 20 | } 21 | } 22 | 23 | stage('Build Docker image and push - Worker') { 24 | steps { 25 | dir(path: './worker') { 26 | sh '''docker build -t drainnode/worker:"$BRANCH_NAME"-rc"$BUILD_NUMBER" . 27 | docker push drainnode/worker:"$BRANCH_NAME"-rc"$BUILD_NUMBER"''' 28 | } 29 | 30 | } 31 | } 32 | 33 | stage('Build Docker image and push - Leader') { 34 | steps { 35 | dir(path: './worker') { 36 | sh '''docker pull fredrikjanssonse/leader-elector:0.6 37 | docker tag fredrikjanssonse/leader-elector:0.6 drainnode/leader:"$BRANCH_NAME"-rc"$BUILD_NUMBER"''' 38 | } 39 | 40 | } 41 | } 42 | 43 | } 44 | } 45 | 46 | stage('Packaging') { 47 | steps { 48 | dir(path: './chart') { 49 | sh '''echo "Removing old packages..." 50 | rm -f drain-node-on-crash-*.tgz 51 | 52 | echo "Packing chart using helm..." 53 | helm package ./drain-node-on-crash/ \\ 54 | --app-version="$BRANCH_NAME"-rc"$BUILD_NUMBER" \\ 55 | --version="$BRANCH_NAME"-rc"$BUILD_NUMBER" 56 | 57 | echo "Moving package..." 58 | mv drain-node-on-crash-*.tgz ~/helm-chart/''' 59 | } 60 | 61 | } 62 | } 63 | 64 | stage('Publishing') { 65 | steps { 66 | sh '''cd ~/helm-chart/ 67 | helm repo index ~/helm-chart/ --url https://mattmattox.github.io/helm-chart/ 68 | git add . 69 | git commit -m "Jenkins Import" 70 | git push''' 71 | } 72 | } 73 | 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Drain-Node-On-Crash 2 | This app is designed to automatically Drain a node after a crash where the node fails to recover after 5mins. 3 | 4 | ## Install 5 | ``` 6 | git clone https://github.com/mattmattox/drain-node-on-crash.git 7 | cd drain-node-on-crash 8 | kubectl apply -f . 9 | ``` 10 | 11 | ## Upgrade 12 | ``` 13 | rm -rf drain-node-on-crash 14 | git clone https://github.com/mattmattox/drain-node-on-crash.git 15 | cd drain-node-on-crash 16 | kubectl apply -f . 17 | ``` 18 | 19 | 20 | ## Default settings 21 | NODE_TIMEOUT = 360 (seconds) 22 | 23 | AUTO_UNCORDON = true (This setting will automatically uncordon a node that was drained by the script. NOTE: Nodes that have cordon outside this app will not be changed.) 24 | 25 | REMOVE_PODS = true (This setting will automatically delete all pods from the node after draining.) 26 | 27 | CATTLE_CLUSTER_AGENT = true (This setting will apply a fix for cattle-cluster-agent.) 28 | -------------------------------------------------------------------------------- /assets/drain-node.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /chart/drain-node-on-crash/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /chart/drain-node-on-crash/Chart.yaml: -------------------------------------------------------------------------------- 1 | 2 | apiVersion: v1 3 | name: drain-node-on-crash 4 | description: A simple little for fixing kubernetes issues. 5 | version: PlaceHolder_VERSION 6 | appVersion: PlaceHolder_APPVERSION 7 | home: https://Support.Tools 8 | icon: https://raw.githubusercontent.com/mattmattox/drain-node-on-crash/v1.0/assets/drain-node.svg 9 | keywords: 10 | - drain-node-on-crash 11 | sources: 12 | - https://github.com/mattmattox/drain-node-on-crash 13 | maintainers: 14 | - name: Matthew Mattox 15 | email: mmattox@support.tools 16 | -------------------------------------------------------------------------------- /chart/drain-node-on-crash/questions.yml: -------------------------------------------------------------------------------- 1 | labels: 2 | io.cattle.role: project 3 | categories: 4 | - Monitoring 5 | namespace: drain-node-on-crash 6 | questions: 7 | - variable: defaultImage 8 | default: true 9 | description: "Use default Docker image" 10 | label: Use Default Image 11 | type: boolean 12 | show_subquestion_if: false 13 | group: "Container Images" 14 | subquestions: 15 | - variable: managerImage 16 | default: "docker.io/drainnode/manager" 17 | description: "Docker Image Manager" 18 | type: string 19 | label: Docker Image Manager 20 | - variable: workerImage 21 | default: "docker.io/drainnode/worker" 22 | description: "Docker Image worker" 23 | type: string 24 | label: Docker Image Worker 25 | - variable: leaderImage 26 | default: "docker.io/drainnode/leader" 27 | description: "Docker Image Leader" 28 | type: string 29 | label: Docker Image Leader 30 | - variable: nodeTimeout 31 | required: true 32 | default: '360' 33 | description: "Node timeout in seconds" 34 | type: string 35 | label: "nodeTimeout" 36 | - variable: autoUncordon 37 | default: true 38 | description: "Uncordon after node recovery" 39 | label: Auto Uncordon 40 | type: boolean 41 | - variable: managerReplicaCount 42 | required: true 43 | default: '3' 44 | description: "Number of Manager replicas" 45 | type: string 46 | label: "Replicas" 47 | - variable: serviceAccountCreate 48 | default: true 49 | description: "Uncordon after node recovery" 50 | label: Create Service Account 51 | type: boolean 52 | show_subquestion_if: false 53 | group: "Service Account and Permissions" 54 | subquestions: 55 | - variable: serviceAccountName 56 | default: "drain-node-on-crash" 57 | description: "Service Account Name" 58 | type: string 59 | label: Service Account Name 60 | -------------------------------------------------------------------------------- /chart/drain-node-on-crash/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* vim: set filetype=mustache: */}} 2 | {{/* 3 | Expand the name of the chart. 4 | */}} 5 | {{- define "drain-node-on-crash.name" -}} 6 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} 7 | {{- end -}} 8 | 9 | {{/* 10 | Create a default fully qualified app name. 11 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 12 | If release name contains chart name it will be used as a full name. 13 | */}} 14 | {{- define "drain-node-on-crash.fullname" -}} 15 | {{- if .Values.fullnameOverride -}} 16 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} 17 | {{- else -}} 18 | {{- $name := default .Chart.Name .Values.nameOverride -}} 19 | {{- if contains $name .Release.Name -}} 20 | {{- .Release.Name | trunc 63 | trimSuffix "-" -}} 21 | {{- else -}} 22 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} 23 | {{- end -}} 24 | {{- end -}} 25 | {{- end -}} 26 | 27 | {{/* 28 | Create chart name and version as used by the chart label. 29 | */}} 30 | {{- define "drain-node-on-crash.chart" -}} 31 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} 32 | {{- end -}} 33 | 34 | {{/* 35 | Common labels 36 | */}} 37 | {{- define "drain-node-on-crash.labels" -}} 38 | helm.sh/chart: {{ include "drain-node-on-crash.chart" . }} 39 | {{ include "drain-node-on-crash.selectorLabels" . }} 40 | {{- if .Chart.AppVersion }} 41 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 42 | {{- end }} 43 | app.kubernetes.io/managed-by: {{ .Release.Service }} 44 | {{- end -}} 45 | 46 | {{/* 47 | Selector labels 48 | */}} 49 | {{- define "drain-node-on-crash.selectorLabels" -}} 50 | app.kubernetes.io/name: {{ include "drain-node-on-crash.name" . }} 51 | app.kubernetes.io/instance: {{ .Release.Name }} 52 | {{- end -}} 53 | 54 | {{/* 55 | Create the name of the service account to use 56 | */}} 57 | {{- define "drain-node-on-crash.serviceAccountName" -}} 58 | {{- if .Values.serviceAccount.create -}} 59 | {{ default (include "drain-node-on-crash.fullname" .) .Values.serviceAccount.name }} 60 | {{- else -}} 61 | {{ default "default" .Values.serviceAccount.name }} 62 | {{- end -}} 63 | {{- end -}} 64 | -------------------------------------------------------------------------------- /chart/drain-node-on-crash/templates/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: drain-node-manager 5 | namespace: {{ .Values.Namespace }} 6 | labels: 7 | app: drain-node-on-crash 8 | spec: 9 | replicas: {{ .Values.managerReplicaCount }} 10 | selector: 11 | matchLabels: 12 | app: drain-node-on-crash 13 | strategy: 14 | type: Recreate 15 | template: 16 | metadata: 17 | labels: 18 | app: drain-node-on-crash 19 | spec: 20 | {{- with .Values.imagePullSecrets }} 21 | imagePullSecrets: 22 | {{- toYaml . | nindent 8 }} 23 | {{- end }} 24 | affinity: 25 | podAntiAffinity: 26 | requiredDuringSchedulingIgnoredDuringExecution: 27 | - labelSelector: 28 | matchExpressions: 29 | - key: app 30 | operator: In 31 | values: 32 | - drain-node-on-crash 33 | topologyKey: "kubernetes.io/hostname" 34 | serviceAccountName: {{ .Values.serviceAccountName }} 35 | securityContext: 36 | {{- toYaml .Values.podSecurityContext | nindent 8 }} 37 | containers: 38 | - name: drain 39 | securityContext: 40 | {{- toYaml .Values.securityContext | nindent 12 }} 41 | image: "{{ .Values.managerImage }}:{{ .Chart.AppVersion }}" 42 | imagePullPolicy: {{ .Values.imagePullPolicy }} 43 | env: 44 | - name: NODE_TIMEOUT 45 | value: "{{ .Values.nodeTimeout }}" 46 | - name: AUTO_UNCORDON 47 | value: "{{ .Values.autoUncordon }}" 48 | resources: 49 | - name: leader 50 | image: "{{ .Values.leaderImage }}:{{ .Chart.AppVersion }}" 51 | args: 52 | - --election=drain-node-on-crash 53 | - --http=0.0.0.0:4040 54 | imagePullPolicy: IfNotPresent 55 | ports: 56 | - containerPort: 4040 57 | -------------------------------------------------------------------------------- /chart/drain-node-on-crash/templates/rbac.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.serviceAccountCreate -}} 2 | --- 3 | apiVersion: v1 4 | kind: ServiceAccount 5 | metadata: 6 | name: {{ .Values.serviceAccountName }} 7 | namespace: {{ .Values.Namespace }} 8 | labels: 9 | app: drain-node-on-crash 10 | --- 11 | apiVersion: rbac.authorization.k8s.io/v1beta1 12 | kind: ClusterRole 13 | metadata: 14 | name: drain-node-on-crash 15 | labels: 16 | app: drain-node-on-crash 17 | rules: 18 | - apiGroups: 19 | - '*' 20 | resources: 21 | - '*' 22 | verbs: 23 | - '*' 24 | --- 25 | apiVersion: rbac.authorization.k8s.io/v1 26 | kind: ClusterRoleBinding 27 | metadata: 28 | name: drain-node-on-crash 29 | labels: 30 | app: drain-node-on-crash 31 | roleRef: 32 | apiGroup: rbac.authorization.k8s.io 33 | kind: ClusterRole 34 | name: drain-node-on-crash 35 | subjects: 36 | - kind: ServiceAccount 37 | name: {{ .Values.serviceAccountName }} 38 | namespace: {{ .Values.Namespace }} 39 | {{- end -}} 40 | -------------------------------------------------------------------------------- /chart/drain-node-on-crash/values.yaml: -------------------------------------------------------------------------------- 1 | Name: Drain-Node-On-Crash 2 | Namespace: drain-node-on-crash 3 | 4 | serviceAccountCreate: true 5 | serviceAccountName: drain-node-on-crash 6 | 7 | # Add debug flag to Manager and worker pods 8 | debug: false 9 | 10 | ## Optional array of imagePullSecrets containing private registry credentials 11 | ## Ref: https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/ 12 | imagePullSecrets: [] 13 | # - name: secretName 14 | imagePullPolicy: IfNotPresent 15 | 16 | # Override rancher image location for Air Gap installs 17 | managerImage: drainnode/manager 18 | workerImage: drainnode/worker 19 | leaderImage: drainnode/leader 20 | 21 | # Manager settings 22 | nodeTimeout: 360 23 | autoUncordon: true 24 | 25 | # Number of Manager replicas. 26 | managerReplicaCount: 3 27 | -------------------------------------------------------------------------------- /manager/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | MAINTAINER matthew.mattox@rancher.com 4 | 5 | ENV DEBIAN_FRONTEND=noninteractive 6 | 7 | RUN apt-get update && apt-get install -yq --no-install-recommends \ 8 | apt-utils \ 9 | curl \ 10 | && apt-get clean && rm -rf /var/lib/apt/lists/* 11 | 12 | ## Install kubectl 13 | ADD kubectl /usr/local/bin/kubectl 14 | RUN chmod +x /usr/local/bin/kubectl 15 | #RUN curl -LO "https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl" && chmod u+x kubectl && mv kubectl /usr/local/bin/kubectl 16 | 17 | ## Setup run script 18 | WORKDIR /root 19 | ADD run.sh /root/run.sh 20 | 21 | CMD /root/run.sh 22 | -------------------------------------------------------------------------------- /manager/kubectl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mattmattox/drain-node-on-crash/8535e0ac94bae4c813591fa99909257a4401aeba/manager/kubectl -------------------------------------------------------------------------------- /manager/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ -z $nodeTimeout ]] 4 | then 5 | nodeTimeout=360 6 | fi 7 | echo "nodeTimeout: $nodeTimeout" 8 | 9 | if [[ -z $AUTO_UNCORDON ]] 10 | then 11 | AUTO_UNCORDON=true 12 | fi 13 | echo "Auto uncordon node on recovery is $AUTO_UNCORDON" 14 | 15 | if [[ -z $REMOVE_PODS ]] 16 | then 17 | REMOVE_PODS=true 18 | fi 19 | echo "Remove all pods from drained node is $REMOVE_PODS" 20 | 21 | if [[ -z $CATTLE_CLUSTER_AGENT ]] 22 | then 23 | CATTLE_CLUSTER_AGENT=true 24 | fi 25 | 26 | touch ~/drained_nodes 27 | 28 | while true 29 | do 30 | if curl -v --silent http://localhost:4040/ 2>&1 | grep $HOSTNAME 31 | then 32 | echo "Leader" 33 | for node in $(kubectl get nodes --no-headers --output=name) 34 | do 35 | echo "#########################################################" 36 | echo "Checking $node" 37 | current_status="$(kubectl get --no-headers $node | awk '{print $2}')" 38 | if [[ "$current_status" == "Ready" ]] || [[ "$current_status" == "Ready,SchedulingDisabled" ]] 39 | then 40 | echo "$node is ready" 41 | if cat ~/drained_nodes | grep -x $node 42 | then 43 | echo "$node has recovered" 44 | cat ~/drained_nodes | grep -v -x $node > ~/drained_nodes.tmp 45 | mv ~/drained_nodes.tmp ~/drained_nodes 46 | if [[ "$AUTO_UNCORDON" == "true" ]] 47 | then 48 | echo "uncordon $node" 49 | kubectl uncordon $node 50 | kubectl patch node $node -p '{"spec":{"unschedulable":false}}' 51 | fi 52 | fi 53 | 54 | else 55 | if cat ~/drained_nodes | grep -x $node 56 | then 57 | echo "$node is already drained, skipping..." 58 | else 59 | echo "$node in Not ready, rechecking..." 60 | count=0 61 | while true 62 | do 63 | current_status="$(kubectl get --no-headers $node | awk '{print $2}')" 64 | if [[ ! "$current_status" == "Ready" ]] || [[ "$current_status" == "Ready,SchedulingDisabled" ]] 65 | then 66 | echo "Sleeping for $count seconds" 67 | sleep 1 68 | count=$((count+1)) 69 | else 70 | echo "$node is now ready" 71 | cat ~/drained_nodes | grep -v -x $node > ~/drained_nodes.tmp 72 | mv ~/drained_nodes.tmp ~/drained_nodes 73 | break 74 | fi 75 | if [ $count -gt $nodeTimeout ] 76 | then 77 | echo "$node has been down for greater than 5Mins, assuming node is down for good." 78 | echo "Starting drain of node..." 79 | kubectl drain $node --ignore-daemonsets --delete-local-data --force --grace-period=5 --timeout=60s 80 | kubectl patch $node -p '{"spec":{"unschedulable":true}}' 81 | echo $node >> ~/drained_nodes 82 | echo "Sleeping for 60 seconds..." 83 | sleep 60 84 | if [[ "$REMOVE_PODS" == "true" ]] 85 | then 86 | echo "Getting all pods on node..." 87 | node_short="$(echo $node | awk -F '/' '{print $2}')" 88 | kubectl get pods --all-namespaces -o wide --field-selector spec.nodeName="$node_short" --no-headers | awk '{print $1 "," $2}' > /tmp/pods.csv 89 | while IFS=, read -r namespace podname 90 | do 91 | echo "Removing $podname from $namespace" 92 | podcount=0 93 | while ! kubectl delete pods "$podname" -n "$namespace" --grace-period=0 --force 94 | do 95 | sleep 1 96 | podcount=$((podcount+1)) 97 | if [ $podcount -gt 60 ] 98 | then 99 | break 100 | fi 101 | done 102 | done < /tmp/pods.csv 103 | fi 104 | if [[ "$CATTLE_CLUSTER_AGENT" == "true" ]] 105 | then 106 | echo "Checking if cattle-cluster-agent is already running..." 107 | if [[ ! "$(kubectl get pods -n cattle-system | grep ^'cattle-cluster-agent-' | awk '{print $3}')" == "Running" ]] 108 | then 109 | echo "Scaling up to force pod to new node..." 110 | kubectl scale --replicas=5 deployment/cattle-cluster-agent -n cattle-system 111 | cattlecount=0 112 | while ! kubectl get pods -n cattle-system | grep ^'cattle-cluster-agent-' | awk '{print $3}' | grep "Running" 113 | do 114 | sleep 1 115 | cattlecount=$((cattlecount+1)) 116 | if [ $cattlecount -gt 60 ] 117 | then 118 | break 119 | fi 120 | done 121 | echo "Scaling back down to 1..." 122 | kubectl scale --replicas=1 deployment/cattle-cluster-agent -n cattle-system 123 | else 124 | echo "cattle-cluster-agent is alreayd running..." 125 | fi 126 | fi 127 | break 128 | fi 129 | done 130 | fi 131 | fi 132 | echo "#########################################################" 133 | done 134 | else 135 | echo "Standby" 136 | fi 137 | echo "Sleeping for 5s before rechecking..." 138 | sleep 5 139 | done 140 | -------------------------------------------------------------------------------- /worker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | MAINTAINER matthew.mattox@rancher.com 4 | 5 | ENV DEBIAN_FRONTEND=noninteractive 6 | 7 | RUN apt-get update && apt-get install -yq --no-install-recommends \ 8 | apt-utils \ 9 | curl \ 10 | && apt-get clean && rm -rf /var/lib/apt/lists/* 11 | 12 | ## Install kubectl 13 | ADD kubectl /usr/local/bin/kubectl 14 | RUN chmod +x /usr/local/bin/kubectl 15 | #RUN curl -LO "https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl" && chmod u+x kubectl && mv kubectl /usr/local/bin/kubectl 16 | 17 | ## Setup run script 18 | WORKDIR /root 19 | ADD run.sh /root/run.sh 20 | 21 | CMD /root/run.sh 22 | -------------------------------------------------------------------------------- /worker/kubectl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mattmattox/drain-node-on-crash/8535e0ac94bae4c813591fa99909257a4401aeba/worker/kubectl -------------------------------------------------------------------------------- /worker/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | function check_node { 4 | current_status=`kubectl get node --no-headers "$1" |awk '{print $2}'` 5 | if [[ "$current_status" == "Ready" ]] || [[ "$current_status" == "Ready,SchedulingDisabled" ]] 6 | then 7 | return 0 8 | else 9 | return 1 10 | fi 11 | } 12 | 13 | if [[ -z $nodeTimeout ]] 14 | then 15 | nodeTimeout=60 16 | fi 17 | echo "nodeTimeout: $nodeTimeout" 18 | 19 | echo "nodeName: $nodeName" 20 | if [[ -z "$nodeName" ]] 21 | then 22 | echo "Missing nodeName" 23 | exit 1 24 | fi 25 | 26 | echo "Verifing Docker CLI access..." 27 | if ! docker info 28 | then 29 | echo "Problem accessing Docker CLI" 30 | exit 2 31 | fi 32 | 33 | while true; 34 | do 35 | echo "Checking node status..." 36 | if check_node $nodeName 37 | then 38 | echo "Node is ready" 39 | else 40 | echo "Node is Not ready, rechecking..." 41 | count=0 42 | while true 43 | do 44 | if ! check_node $nodeName 45 | then 46 | echo "Sleeping for $count seconds" 47 | sleep 1 48 | count=$((count+1)) 49 | else 50 | echo "Node is now ready" 51 | break 52 | fi 53 | if [ $count -gt $nodeTimeout ] 54 | then 55 | echo "Node has been down for greater then $nodeTimeout seconds, assuming node is down" 56 | echo "Attempting node recovery" 57 | echo "Restarting kubelet" 58 | docker restart kubelet 59 | echo "Sleeping..." 60 | sleep 15 61 | if check_node $nodeName 62 | then 63 | echo "Node has recovered" 64 | break 65 | fi 66 | fi 67 | done 68 | fi 69 | done 70 | --------------------------------------------------------------------------------