├── Jenkinsfile
├── README.md
├── assets
    └── drain-node.svg
├── chart
    └── drain-node-on-crash
    │   ├── .helmignore
    │   ├── Chart.yaml
    │   ├── questions.yml
    │   ├── templates
    │       ├── _helpers.tpl
    │       ├── deployment.yaml
    │       └── rbac.yaml
    │   └── values.yaml
├── manager
    ├── Dockerfile
    ├── kubectl
    └── run.sh
└── worker
    ├── Dockerfile
    ├── kubectl
    └── run.sh


/Jenkinsfile:
--------------------------------------------------------------------------------
 1 | pipeline {
 2 |   agent any
 3 |   stages {
 4 |     stage('Login to Docker repo') {
 5 |       steps {
 6 |         sh '''cat ~/.GH_TOKEN | docker login docker.pkg.github.com -u mattmattox --password-stdin
 7 | '''
 8 |       }
 9 |     }
10 | 
11 |     stage('Docker') {
12 |       parallel {
13 |         stage('Build Docker image and push - Manager') {
14 |           steps {
15 |             dir(path: './manager') {
16 |               sh '''docker build -t drainnode/manager:"$BRANCH_NAME"-rc"$BUILD_NUMBER" .
17 | docker push drainnode/manager:"$BRANCH_NAME"-rc"$BUILD_NUMBER"'''
18 |             }
19 | 
20 |           }
21 |         }
22 | 
23 |         stage('Build Docker image and push - Worker') {
24 |           steps {
25 |             dir(path: './worker') {
26 |               sh '''docker build -t drainnode/worker:"$BRANCH_NAME"-rc"$BUILD_NUMBER" .
27 | docker push drainnode/worker:"$BRANCH_NAME"-rc"$BUILD_NUMBER"'''
28 |             }
29 | 
30 |           }
31 |         }
32 | 
33 |         stage('Build Docker image and push - Leader') {
34 |           steps {
35 |             dir(path: './worker') {
36 |               sh '''docker pull fredrikjanssonse/leader-elector:0.6
37 | docker tag fredrikjanssonse/leader-elector:0.6 drainnode/leader:"$BRANCH_NAME"-rc"$BUILD_NUMBER"'''
38 |             }
39 | 
40 |           }
41 |         }
42 | 
43 |       }
44 |     }
45 | 
46 |     stage('Packaging') {
47 |       steps {
48 |         dir(path: './chart') {
49 |           sh '''echo "Removing old packages..."
50 | rm -f drain-node-on-crash-*.tgz
51 | 
52 | echo "Packing chart using helm..."
53 | helm package ./drain-node-on-crash/ \\
54 | --app-version="$BRANCH_NAME"-rc"$BUILD_NUMBER" \\
55 | --version="$BRANCH_NAME"-rc"$BUILD_NUMBER"
56 | 
57 | echo "Moving package..."
58 | mv drain-node-on-crash-*.tgz ~/helm-chart/'''
59 |         }
60 | 
61 |       }
62 |     }
63 | 
64 |     stage('Publishing') {
65 |       steps {
66 |         sh '''cd ~/helm-chart/
67 | helm repo index ~/helm-chart/ --url https://mattmattox.github.io/helm-chart/
68 | git add .
69 | git commit -m "Jenkins Import"
70 | git push'''
71 |       }
72 |     }
73 | 
74 |   }
75 | }
76 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Drain-Node-On-Crash
 2 | This app is designed to automatically Drain a node after a crash where the node fails to recover after 5mins.
 3 | 
 4 | ## Install
 5 | ```
 6 | git clone https://github.com/mattmattox/drain-node-on-crash.git
 7 | cd drain-node-on-crash
 8 | kubectl apply -f .
 9 | ```
10 | 
11 | ## Upgrade
12 | ```
13 | rm -rf drain-node-on-crash
14 | git clone https://github.com/mattmattox/drain-node-on-crash.git
15 | cd drain-node-on-crash
16 | kubectl apply -f .
17 | ```
18 | 
19 | 
20 | ## Default settings
21 | NODE_TIMEOUT = 360  (seconds)
22 | 
23 | AUTO_UNCORDON = true (This setting will automatically uncordon a node that was drained by the script. NOTE: Nodes that have cordon outside this app will not be changed.)
24 | 
25 | REMOVE_PODS = true (This setting will automatically delete all pods from the node after draining.)
26 | 
27 | CATTLE_CLUSTER_AGENT = true (This setting will apply a fix for cattle-cluster-agent.)
28 | 


--------------------------------------------------------------------------------
/assets/drain-node.svg:
--------------------------------------------------------------------------------
1 | <svg id="Layer_1" data-name="Layer 1" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 378.03 420.9"><defs><style>.cls-1{fill:#326ce5;}</style></defs><path class="cls-1" d="M43.59,174.33A66.39,66.39,0,1,1,110,240.72,66.46,66.46,0,0,1,43.59,174.33Zm8.11,0A58.28,58.28,0,1,0,110,116.06,58.34,58.34,0,0,0,51.7,174.33Z"/><path class="cls-1" d="M122,177.26a20.92,20.92,0,1,0,20.92-20.9A20.92,20.92,0,0,0,122,177.26Zm5,5.84a6.77,6.77,0,1,1,6.77,6.77A6.76,6.76,0,0,1,127,183.1Z"/><path class="cls-1" d="M201.67,181.35a66.39,66.39,0,1,1,66.39,66.39A66.46,66.46,0,0,1,201.67,181.35Zm8.11,0a58.28,58.28,0,1,0,58.28-58.27A58.35,58.35,0,0,0,209.78,181.35Z"/><path class="cls-1" d="M280.08,184.28A20.92,20.92,0,1,0,301,163.38,20.92,20.92,0,0,0,280.08,184.28Zm5,5.84a6.77,6.77,0,1,1,6.76,6.77A6.76,6.76,0,0,1,285.08,190.12Z"/><path class="cls-1" d="M198.49,325a16.34,16.34,0,0,0,16.32-16.32V281.42H210v27.21a11.46,11.46,0,0,1-22.91,0V277a11.34,11.34,0,0,1,.83-4.29l-4.52-1.83a16.34,16.34,0,0,0-1.18,6.12v31.64A16.34,16.34,0,0,0,198.49,325Z"/><path class="cls-1" d="M170.72,325A16.34,16.34,0,0,0,187,308.63V277a16.12,16.12,0,0,0-1.15-6l-4.52,1.82a11.17,11.17,0,0,1,.8,4.2v31.64a11.46,11.46,0,0,1-22.91,0v-28.3h-4.87v28.3A16.35,16.35,0,0,0,170.72,325Z"/><path class="cls-1" d="M215.7,236.66c0-17.84-20.28-19.74-30.42-19.74s-28.46,4.6-28.46,18.53S168.31,254,186.26,254,215.7,249.64,215.7,236.66Z"/><path class="cls-1" d="M218.22,284.41c8.26,0,14.65-2.12,18.52-6.17,3.21-3.36,4.64-7.83,4.24-13.3-1-13.64-9.38-22.79-24.9-27.2l-1.33,4.68c13.53,3.85,20.52,11.33,21.37,22.87.3,4-.68,7.27-2.9,9.59-3.11,3.25-8.75,4.85-16.29,4.64-5.82-.16-9.19-2.23-13.09-4.62-4.78-2.93-10.2-6.26-20.4-5.45-8.34.65-12.13,3-15.48,5.13-3.71,2.33-6.65,4.17-16.31,2.93-5.87-.75-10.07-2.75-12.48-6a11.92,11.92,0,0,1-2.26-8.08l0-.22c0-.67,1.4-16.69,20.82-21.6l-1.19-4.72c-22,5.55-24.3,24.14-24.47,25.89a16.72,16.72,0,0,0,3.19,11.65c3.24,4.3,8.54,6.95,15.76,7.87,11.39,1.45,15.33-1,19.51-3.64a25,25,0,0,1,13.27-4.39c8.65-.68,12.93,1.95,17.48,4.74,4.12,2.52,8.38,5.14,15.49,5.34Z"/><path class="cls-1" d="M0,130.37V290.53A50.44,50.44,0,0,0,25.16,334.1l138.7,80.08a50.46,50.46,0,0,0,50.32,0L352.87,334.1A50.44,50.44,0,0,0,378,290.53V130.37A50.46,50.46,0,0,0,352.87,86.8L214.18,6.72a50.46,50.46,0,0,0-50.32,0L25.16,86.8A50.46,50.46,0,0,0,0,130.37Zm21.93,0a28.47,28.47,0,0,1,14.2-24.58L174.82,25.71a28.45,28.45,0,0,1,28.39,0l138.7,80.07a28.46,28.46,0,0,1,14.19,24.58V290.53a28.46,28.46,0,0,1-14.19,24.58l-138.7,80.08a28.48,28.48,0,0,1-28.39,0L36.13,315.11a28.45,28.45,0,0,1-14.2-24.58Z"/></svg>


--------------------------------------------------------------------------------
/chart/drain-node-on-crash/.helmignore:
--------------------------------------------------------------------------------
 1 | # Patterns to ignore when building packages.
 2 | # This supports shell glob matching, relative path matching, and
 3 | # negation (prefixed with !). Only one pattern per line.
 4 | .DS_Store
 5 | # Common VCS dirs
 6 | .git/
 7 | .gitignore
 8 | .bzr/
 9 | .bzrignore
10 | .hg/
11 | .hgignore
12 | .svn/
13 | # Common backup files
14 | *.swp
15 | *.bak
16 | *.tmp
17 | *.orig
18 | *~
19 | # Various IDEs
20 | .project
21 | .idea/
22 | *.tmproj
23 | .vscode/
24 | 


--------------------------------------------------------------------------------
/chart/drain-node-on-crash/Chart.yaml:
--------------------------------------------------------------------------------
 1 | 
 2 | apiVersion: v1
 3 | name: drain-node-on-crash
 4 | description: A simple little for fixing kubernetes issues.
 5 | version: PlaceHolder_VERSION
 6 | appVersion: PlaceHolder_APPVERSION
 7 | home: https://Support.Tools
 8 | icon: https://raw.githubusercontent.com/mattmattox/drain-node-on-crash/v1.0/assets/drain-node.svg
 9 | keywords:
10 |   - drain-node-on-crash
11 | sources:
12 |   - https://github.com/mattmattox/drain-node-on-crash
13 | maintainers:
14 |   - name: Matthew Mattox
15 |     email: mmattox@support.tools
16 | 


--------------------------------------------------------------------------------
/chart/drain-node-on-crash/questions.yml:
--------------------------------------------------------------------------------
 1 | labels:
 2 |   io.cattle.role: project
 3 | categories:
 4 |   - Monitoring
 5 | namespace: drain-node-on-crash
 6 | questions:
 7 | - variable: defaultImage
 8 |   default: true
 9 |   description: "Use default Docker image"
10 |   label: Use Default Image
11 |   type: boolean
12 |   show_subquestion_if: false
13 |   group: "Container Images"
14 |   subquestions:
15 |   - variable: managerImage
16 |     default: "docker.io/drainnode/manager"
17 |     description: "Docker Image Manager"
18 |     type: string
19 |     label: Docker Image Manager
20 |   - variable: workerImage
21 |     default: "docker.io/drainnode/worker"
22 |     description: "Docker Image worker"
23 |     type: string
24 |     label: Docker Image Worker
25 |   - variable: leaderImage
26 |     default: "docker.io/drainnode/leader"
27 |     description: "Docker Image Leader"
28 |     type: string
29 |     label: Docker Image Leader
30 | - variable: nodeTimeout
31 |   required: true
32 |   default: '360'
33 |   description: "Node timeout in seconds"
34 |   type: string
35 |   label: "nodeTimeout"
36 | - variable: autoUncordon
37 |   default: true
38 |   description: "Uncordon after node recovery"
39 |   label: Auto Uncordon
40 |   type: boolean
41 | - variable: managerReplicaCount
42 |   required: true
43 |   default: '3'
44 |   description: "Number of Manager replicas"
45 |   type: string
46 |   label: "Replicas"
47 | - variable: serviceAccountCreate
48 |   default: true
49 |   description: "Uncordon after node recovery"
50 |   label: Create Service Account
51 |   type: boolean
52 |   show_subquestion_if: false
53 |   group: "Service Account and Permissions"
54 |   subquestions:
55 |   - variable: serviceAccountName
56 |     default: "drain-node-on-crash"
57 |     description: "Service Account Name"
58 |     type: string
59 |     label: Service Account Name
60 | 


--------------------------------------------------------------------------------
/chart/drain-node-on-crash/templates/_helpers.tpl:
--------------------------------------------------------------------------------
 1 | {{/* vim: set filetype=mustache: */}}
 2 | {{/*
 3 | Expand the name of the chart.
 4 | */}}
 5 | {{- define "drain-node-on-crash.name" -}}
 6 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
 7 | {{- end -}}
 8 | 
 9 | {{/*
10 | Create a default fully qualified app name.
11 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
12 | If release name contains chart name it will be used as a full name.
13 | */}}
14 | {{- define "drain-node-on-crash.fullname" -}}
15 | {{- if .Values.fullnameOverride -}}
16 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
17 | {{- else -}}
18 | {{- $name := default .Chart.Name .Values.nameOverride -}}
19 | {{- if contains $name .Release.Name -}}
20 | {{- .Release.Name | trunc 63 | trimSuffix "-" -}}
21 | {{- else -}}
22 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
23 | {{- end -}}
24 | {{- end -}}
25 | {{- end -}}
26 | 
27 | {{/*
28 | Create chart name and version as used by the chart label.
29 | */}}
30 | {{- define "drain-node-on-crash.chart" -}}
31 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
32 | {{- end -}}
33 | 
34 | {{/*
35 | Common labels
36 | */}}
37 | {{- define "drain-node-on-crash.labels" -}}
38 | helm.sh/chart: {{ include "drain-node-on-crash.chart" . }}
39 | {{ include "drain-node-on-crash.selectorLabels" . }}
40 | {{- if .Chart.AppVersion }}
41 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
42 | {{- end }}
43 | app.kubernetes.io/managed-by: {{ .Release.Service }}
44 | {{- end -}}
45 | 
46 | {{/*
47 | Selector labels
48 | */}}
49 | {{- define "drain-node-on-crash.selectorLabels" -}}
50 | app.kubernetes.io/name: {{ include "drain-node-on-crash.name" . }}
51 | app.kubernetes.io/instance: {{ .Release.Name }}
52 | {{- end -}}
53 | 
54 | {{/*
55 | Create the name of the service account to use
56 | */}}
57 | {{- define "drain-node-on-crash.serviceAccountName" -}}
58 | {{- if .Values.serviceAccount.create -}}
59 |     {{ default (include "drain-node-on-crash.fullname" .) .Values.serviceAccount.name }}
60 | {{- else -}}
61 |     {{ default "default" .Values.serviceAccount.name }}
62 | {{- end -}}
63 | {{- end -}}
64 | 


--------------------------------------------------------------------------------
/chart/drain-node-on-crash/templates/deployment.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: apps/v1
 2 | kind: Deployment
 3 | metadata:
 4 |   name: drain-node-manager
 5 |   namespace: {{ .Values.Namespace }}
 6 |   labels:
 7 |     app: drain-node-on-crash
 8 | spec:
 9 |   replicas: {{ .Values.managerReplicaCount }}
10 |   selector:
11 |     matchLabels:
12 |       app: drain-node-on-crash
13 |   strategy:
14 |     type: Recreate
15 |   template:
16 |     metadata:
17 |       labels:
18 |         app: drain-node-on-crash
19 |     spec:
20 |     {{- with .Values.imagePullSecrets }}
21 |       imagePullSecrets:
22 |         {{- toYaml . | nindent 8 }}
23 |     {{- end }}
24 |       affinity:
25 |         podAntiAffinity:
26 |           requiredDuringSchedulingIgnoredDuringExecution:
27 |           - labelSelector:
28 |               matchExpressions:
29 |               - key: app
30 |                 operator: In
31 |                 values:
32 |                 - drain-node-on-crash
33 |             topologyKey: "kubernetes.io/hostname"
34 |       serviceAccountName: {{ .Values.serviceAccountName }}
35 |       securityContext:
36 |         {{- toYaml .Values.podSecurityContext | nindent 8 }}
37 |       containers:
38 |         - name: drain
39 |           securityContext:
40 |             {{- toYaml .Values.securityContext | nindent 12 }}
41 |           image: "{{ .Values.managerImage }}:{{ .Chart.AppVersion }}"
42 |           imagePullPolicy: {{ .Values.imagePullPolicy }}
43 |           env:
44 |             - name: NODE_TIMEOUT
45 |               value: "{{ .Values.nodeTimeout }}"
46 |             - name: AUTO_UNCORDON
47 |               value: "{{ .Values.autoUncordon }}"
48 |           resources:
49 |         - name: leader
50 |           image: "{{ .Values.leaderImage }}:{{ .Chart.AppVersion }}"
51 |           args:
52 |           - --election=drain-node-on-crash
53 |           - --http=0.0.0.0:4040
54 |           imagePullPolicy: IfNotPresent
55 |           ports:
56 |           - containerPort: 4040
57 | 


--------------------------------------------------------------------------------
/chart/drain-node-on-crash/templates/rbac.yaml:
--------------------------------------------------------------------------------
 1 | {{- if .Values.serviceAccountCreate -}}
 2 | ---
 3 | apiVersion: v1
 4 | kind: ServiceAccount
 5 | metadata:
 6 |   name: {{ .Values.serviceAccountName }}
 7 |   namespace: {{ .Values.Namespace }}
 8 |   labels:
 9 |     app: drain-node-on-crash
10 | ---
11 | apiVersion: rbac.authorization.k8s.io/v1beta1
12 | kind: ClusterRole
13 | metadata:
14 |   name: drain-node-on-crash
15 |   labels:
16 |     app: drain-node-on-crash
17 | rules:
18 | - apiGroups:
19 |   - '*'
20 |   resources:
21 |     - '*'
22 |   verbs:
23 |     - '*'
24 | ---
25 | apiVersion: rbac.authorization.k8s.io/v1
26 | kind: ClusterRoleBinding
27 | metadata:
28 |   name: drain-node-on-crash
29 |   labels:
30 |     app: drain-node-on-crash  
31 | roleRef:
32 |   apiGroup: rbac.authorization.k8s.io
33 |   kind: ClusterRole
34 |   name: drain-node-on-crash
35 | subjects:
36 | - kind: ServiceAccount
37 |   name: {{ .Values.serviceAccountName }}
38 |   namespace: {{ .Values.Namespace }}
39 | {{- end -}}
40 | 


--------------------------------------------------------------------------------
/chart/drain-node-on-crash/values.yaml:
--------------------------------------------------------------------------------
 1 | Name: Drain-Node-On-Crash
 2 | Namespace: drain-node-on-crash
 3 | 
 4 | serviceAccountCreate: true
 5 | serviceAccountName: drain-node-on-crash
 6 | 
 7 | # Add debug flag to Manager and worker pods
 8 | debug: false
 9 | 
10 | ## Optional array of imagePullSecrets containing private registry credentials
11 | ## Ref: https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/
12 | imagePullSecrets: []
13 | # - name: secretName
14 | imagePullPolicy: IfNotPresent
15 | 
16 | # Override rancher image location for Air Gap installs
17 | managerImage: drainnode/manager
18 | workerImage: drainnode/worker
19 | leaderImage: drainnode/leader
20 | 
21 | # Manager settings
22 | nodeTimeout: 360
23 | autoUncordon: true
24 | 
25 | # Number of Manager replicas.
26 | managerReplicaCount: 3
27 | 


--------------------------------------------------------------------------------
/manager/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM ubuntu:18.04
 2 | 
 3 | MAINTAINER matthew.mattox@rancher.com
 4 | 
 5 | ENV DEBIAN_FRONTEND=noninteractive
 6 | 
 7 | RUN apt-get update && apt-get install -yq --no-install-recommends \
 8 |     apt-utils \
 9 |     curl \
10 |     && apt-get clean && rm -rf /var/lib/apt/lists/*
11 | 
12 | ## Install kubectl
13 | ADD kubectl /usr/local/bin/kubectl
14 | RUN chmod +x /usr/local/bin/kubectl
15 | #RUN curl -LO "https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl" && chmod u+x kubectl && mv kubectl /usr/local/bin/kubectl
16 | 
17 | ## Setup run script
18 | WORKDIR /root
19 | ADD run.sh /root/run.sh
20 | 
21 | CMD /root/run.sh
22 | 


--------------------------------------------------------------------------------
/manager/kubectl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mattmattox/drain-node-on-crash/8535e0ac94bae4c813591fa99909257a4401aeba/manager/kubectl


--------------------------------------------------------------------------------
/manager/run.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | if [[ -z $nodeTimeout ]]
  4 | then
  5 | 	nodeTimeout=360
  6 | fi
  7 | echo "nodeTimeout: $nodeTimeout"
  8 | 
  9 | if [[ -z $AUTO_UNCORDON ]]
 10 | then
 11 | 	AUTO_UNCORDON=true
 12 | fi
 13 | echo "Auto uncordon node on recovery is $AUTO_UNCORDON"
 14 | 
 15 | if [[ -z $REMOVE_PODS ]]
 16 | then
 17 |         REMOVE_PODS=true
 18 | fi
 19 | echo "Remove all pods from drained node is $REMOVE_PODS"
 20 | 
 21 | if [[ -z $CATTLE_CLUSTER_AGENT ]]
 22 | then
 23 |         CATTLE_CLUSTER_AGENT=true
 24 | fi
 25 | 
 26 | touch ~/drained_nodes
 27 | 
 28 | while true
 29 | do
 30 | 	if curl -v --silent http://localhost:4040/ 2>&1 | grep $HOSTNAME
 31 | 	then
 32 | 		echo "Leader"
 33 | 		for node in $(kubectl get nodes --no-headers --output=name)
 34 | 		do
 35 | 			echo "#########################################################"
 36 | 			echo "Checking $node"
 37 | 			current_status="$(kubectl get --no-headers $node | awk '{print $2}')"
 38 | 			if [[ "$current_status" == "Ready" ]] || [[ "$current_status" == "Ready,SchedulingDisabled" ]]
 39 | 			then
 40 | 				echo "$node is ready"
 41 | 				if cat ~/drained_nodes | grep -x $node
 42 | 				then
 43 | 					echo "$node has recovered"
 44 | 					cat ~/drained_nodes | grep -v -x $node > ~/drained_nodes.tmp
 45 | 					mv ~/drained_nodes.tmp ~/drained_nodes
 46 | 					if [[ "$AUTO_UNCORDON" == "true" ]]
 47 | 					then
 48 | 						echo "uncordon $node"
 49 | 						kubectl uncordon $node
 50 | 						kubectl patch node $node -p '{"spec":{"unschedulable":false}}'
 51 | 					fi
 52 | 				fi
 53 | 
 54 | 			else
 55 | 				if cat ~/drained_nodes | grep -x $node
 56 | 				then
 57 | 					echo "$node is already drained, skipping..."
 58 | 				else
 59 | 					echo "$node in Not ready, rechecking..."
 60 | 					count=0
 61 | 					while true
 62 | 					do
 63 | 						current_status="$(kubectl get --no-headers $node | awk '{print $2}')"
 64 | 						if [[ ! "$current_status" == "Ready" ]] || [[ "$current_status" == "Ready,SchedulingDisabled" ]]
 65 | 						then
 66 | 							echo "Sleeping for $count seconds"
 67 | 							sleep 1
 68 | 							count=$((count+1))
 69 | 						else
 70 | 							echo "$node is now ready"
 71 | 							cat ~/drained_nodes | grep -v -x $node > ~/drained_nodes.tmp
 72 | 			                                mv ~/drained_nodes.tmp ~/drained_nodes
 73 | 							break
 74 | 						fi
 75 | 						if [ $count -gt $nodeTimeout ]
 76 | 						then
 77 | 							echo "$node has been down for greater than 5Mins, assuming node is down for good."
 78 | 							echo "Starting drain of node..."
 79 | 							kubectl drain $node --ignore-daemonsets --delete-local-data --force --grace-period=5 --timeout=60s
 80 | 							kubectl patch $node -p '{"spec":{"unschedulable":true}}'
 81 | 							echo $node >> ~/drained_nodes
 82 | 							echo "Sleeping for 60 seconds..."
 83 | 							sleep 60
 84 | 							if [[ "$REMOVE_PODS" == "true" ]]
 85 | 							then
 86 | 								echo "Getting all pods on node..."
 87 | 								node_short="$(echo $node | awk -F '/' '{print $2}')"
 88 | 								kubectl get pods --all-namespaces -o wide --field-selector spec.nodeName="$node_short" --no-headers | awk '{print $1 "," $2}' > /tmp/pods.csv
 89 | 								while IFS=, read -r namespace podname
 90 | 								do
 91 | 									echo "Removing $podname from $namespace"
 92 | 									podcount=0
 93 | 									while ! kubectl delete pods "$podname" -n "$namespace" --grace-period=0 --force
 94 | 									do
 95 | 										sleep 1
 96 | 										podcount=$((podcount+1))
 97 | 										if [ $podcount -gt 60 ]
 98 | 										then
 99 | 											break
100 | 										fi
101 | 									done
102 | 								done < /tmp/pods.csv
103 | 							fi
104 | 							if [[ "$CATTLE_CLUSTER_AGENT" == "true" ]]
105 | 							then
106 | 								echo "Checking if cattle-cluster-agent is already running..."
107 | 								if [[ ! "$(kubectl get pods -n cattle-system | grep ^'cattle-cluster-agent-' | awk '{print $3}')" == "Running" ]]
108 | 								then
109 | 									echo "Scaling up to force pod to new node..."
110 | 									kubectl scale --replicas=5 deployment/cattle-cluster-agent -n cattle-system
111 | 									cattlecount=0
112 | 									while ! kubectl get pods -n cattle-system | grep ^'cattle-cluster-agent-' | awk '{print $3}' | grep "Running"
113 | 									do
114 | 										sleep 1
115 |                     cattlecount=$((cattlecount+1))
116 |                     if [ $cattlecount -gt 60 ]
117 |                     then
118 |                       break
119 |                     fi
120 | 									done
121 | 									echo "Scaling back down to 1..."
122 | 									kubectl scale --replicas=1 deployment/cattle-cluster-agent -n cattle-system
123 | 								else
124 | 									echo "cattle-cluster-agent is alreayd running..."
125 | 								fi
126 | 							fi
127 | 							break
128 | 						fi
129 | 					done
130 | 				fi
131 | 			fi
132 | 			echo "#########################################################"
133 | 		done
134 | 	else
135 | 		echo "Standby"
136 | 	fi
137 | 	echo "Sleeping for 5s before rechecking..."
138 | 	sleep 5
139 | done
140 | 


--------------------------------------------------------------------------------
/worker/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM ubuntu:18.04
 2 | 
 3 | MAINTAINER matthew.mattox@rancher.com
 4 | 
 5 | ENV DEBIAN_FRONTEND=noninteractive
 6 | 
 7 | RUN apt-get update && apt-get install -yq --no-install-recommends \
 8 |     apt-utils \
 9 |     curl \
10 |     && apt-get clean && rm -rf /var/lib/apt/lists/*
11 | 
12 | ## Install kubectl
13 | ADD kubectl /usr/local/bin/kubectl
14 | RUN chmod +x /usr/local/bin/kubectl
15 | #RUN curl -LO "https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl" && chmod u+x kubectl && mv kubectl /usr/local/bin/kubectl
16 | 
17 | ## Setup run script
18 | WORKDIR /root
19 | ADD run.sh /root/run.sh
20 | 
21 | CMD /root/run.sh
22 | 


--------------------------------------------------------------------------------
/worker/kubectl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mattmattox/drain-node-on-crash/8535e0ac94bae4c813591fa99909257a4401aeba/worker/kubectl


--------------------------------------------------------------------------------
/worker/run.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | function check_node {
 4 |   current_status=`kubectl get node --no-headers "$1" |awk '{print $2}'`
 5 |   if [[ "$current_status" == "Ready" ]] || [[ "$current_status" == "Ready,SchedulingDisabled" ]]
 6 |   then
 7 |     return 0
 8 |   else
 9 |     return 1
10 |   fi
11 | }
12 | 
13 | if [[ -z $nodeTimeout ]]
14 | then
15 |   nodeTimeout=60
16 | fi
17 | echo "nodeTimeout: $nodeTimeout"
18 | 
19 | echo "nodeName: $nodeName"
20 | if [[ -z "$nodeName" ]]
21 | then
22 |   echo "Missing nodeName"
23 |   exit 1
24 | fi
25 | 
26 | echo "Verifing Docker CLI access..."
27 | if ! docker info
28 | then
29 |   echo "Problem accessing Docker CLI"
30 |   exit 2
31 | fi
32 | 
33 | while true;
34 | do
35 |   echo "Checking node status..."
36 |   if check_node $nodeName
37 |   then
38 |     echo "Node is ready"
39 |   else
40 |     echo "Node is Not ready, rechecking..."
41 |     count=0
42 |     while true
43 |     do
44 |       if ! check_node $nodeName
45 |       then
46 |         echo "Sleeping for $count seconds"
47 |         sleep 1
48 |         count=$((count+1))
49 |       else
50 |         echo "Node is now ready"
51 |         break
52 |       fi
53 |       if [ $count -gt $nodeTimeout ]
54 |       then
55 |         echo "Node has been down for greater then $nodeTimeout seconds, assuming node is down"
56 |         echo "Attempting node recovery"
57 |         echo "Restarting kubelet"
58 |         docker restart kubelet
59 |         echo "Sleeping..."
60 |         sleep 15
61 |         if check_node $nodeName
62 |         then
63 |           echo "Node has recovered"
64 |           break
65 |         fi
66 |       fi
67 |     done
68 |   fi
69 | done
70 | 


--------------------------------------------------------------------------------