├── node-sysctls.yaml
├── README.md
├── systemd-cgroup-gc.yaml
└── regular-reboot.yaml

/node-sysctls.yaml:
--------------------------------------------------------------------------------
# This DaemonSet runs a script once an hour that applies sysctl settings on each node. Right now it
# raises fs.inotify.max_user_watches, since inotify watch exhaustion appears to be another symptom of
# bugs https://github.com/Azure/AKS/issues/750 & https://github.com/kubernetes/kubernetes/issues/64137.
# Note that these pods run in privileged mode, which can pose a security risk. Use with caution.
apiVersion: v1
kind: ConfigMap
metadata:
  name: node-sysctls
data:
  node-sysctls: |
    #!/bin/bash
    sysctl -w fs.inotify.max_user_watches=525000
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-sysctls
  labels:
    tier: management
    app: node-sysctls
spec:
  selector:
    matchLabels:
      name: node-sysctls
  template:
    metadata:
      labels:
        name: node-sysctls
    spec:
      hostPID: true
      containers:
      - resources:
          requests:
            cpu: 5m
            memory: 500Ki
          limits:
            cpu: 5m
            memory: 50Mi # This crashes on startup with a 5Mi limit, but only uses about 320Ki after that.
        securityContext:
          privileged: true
        image: ubuntu:16.04
        name: node-sysctls
        command: ["/bin/bash", "-c"]
        args:
        - |
          while true; do
            /usr/local/bin/node-sysctls
            sleep 3600 # Run hourly.
          done
        volumeMounts:
        - name: scriptsrc
          mountPath: /usr/local/bin
      volumes:
      - name: scriptsrc
        configMap:
          name: node-sysctls
          defaultMode: 0755
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# k8s-hacks
Scripts Used to Work Around K8s Problems

I'm attaching some code here so that I can share it.

## systemd-cgroup-gc.yaml
This works around:
* https://github.com/kubernetes/kubernetes/issues/64137
* https://github.com/Azure/AKS/issues/750
* https://github.com/kubernetes/kubernetes/pull/73799
* https://github.com/kubernetes/kubernetes/issues/60987
* https://github.com/rancher/k3s/issues/294
* https://github.com/kubernetes/kubernetes/issues/70324

and probably some other tickets that aren't properly marked as duplicates yet. Basically, K8s leaves orphaned mounts behind when pods are cleaned up, which causes CPU usage to increase linearly until the whole node is taken down. This is particularly exacerbated by K8s CronJobs, since they have such a short lifecycle. With a few cronjobs running every minute, I am able to take down a two-core AKS cluster node in about a week. Depending on which ticket you look at, this problem is attributed either to a failure of K8s to clean up mounts or to a bad interaction between certain kernel versions and certain versions of systemd. My money is on the latter since, on my system, the pod directories (and, thus, their mount subdirs) are already gone but systemd still has the cgroup watches registered.

Anyhow, this DaemonSet works around the problem by running a script hourly that searches for the orphaned mounts and asks systemd to stop watching them.
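If you want to confirm that a node is affected before installing this, here is a minimal sketch that reuses the same detection step as the DaemonSet's script; run it directly on the node (e.g. over SSH):

    # Count the transient "run-rXXXX.scope" mount units that systemd is still tracking.
    # On an affected node this number grows steadily as pods come and go.
    ls /sys/fs/cgroup/systemd/system.slice | grep -c "^run-r"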
### Installing
    kubectl apply -f systemd-cgroup-gc.yaml

### Uninstalling
    kubectl delete -f systemd-cgroup-gc.yaml

## node-sysctls.yaml
Simple DaemonSet to set sysctl settings on nodes. Right now it only sets a higher limit for `fs.inotify.max_user_watches`. I think that the only reason we are encountering watch limit issues right now is the same set of bugs that the `systemd-cgroup-gc` package works around.
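Once the DaemonSet has run on a node, you can verify the setting from any shell on that node (a quick check, assuming shell access to the node):

    # Should print 525000 once the script has run; the kernel default is much lower.
    sysctl fs.inotify.max_user_watches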
### Installing
    kubectl apply -f node-sysctls.yaml

### Uninstalling
    kubectl delete -f node-sysctls.yaml

## regular-reboot.yaml
Simple DaemonSet to reboot nodes regularly. Uses [kured](https://github.com/weaveworks/kured) to safely trigger a rolling reboot during a specified window, if the node has not been rebooted within a specified timeframe. If kured is not running, then marking the node for reboot will have no effect.
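The "mark" is just a file on the node's filesystem. To see whether a node has already been flagged (a sketch, assuming shell access to the node):

    # kured watches this path; lines appended by this DaemonSet record when and why the node was marked.
    test -f /var/run/reboot-required && cat /var/run/reboot-required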
### Installing
    kubectl apply -f regular-reboot.yaml

### Uninstalling
    kubectl delete -f regular-reboot.yaml
--------------------------------------------------------------------------------
/systemd-cgroup-gc.yaml:
--------------------------------------------------------------------------------
# This DaemonSet runs a systemd cgroup garbage collection job once an hour to work around
# bugs https://github.com/Azure/AKS/issues/750 & https://github.com/kubernetes/kubernetes/issues/64137.
# Note that these pods run in privileged mode, which can pose a security risk. Use with caution.
apiVersion: v1
kind: ConfigMap
metadata:
  name: systemd-cgroup-gc
data:
  systemd-cgroup-gc: |
    #!/bin/bash
    function runhost() {
      # Runs a process on the host K8s node.
      nsenter -m/proc/1/ns/mnt "$@"
    }
    count=0
    for i in $(runhost ls /sys/fs/cgroup/systemd/system.slice | grep "^run-r"); do
      # Extract the pod UID from the scope's "Kubernetes transient mount" description.
      pod=$(runhost systemctl list-units --type scope --state running $i | cat | sed -n 's/\(.*\)Kubernetes transient mount for \/var\/lib\/kubelet\/pods\/\(.*\)\/volumes\(.*\)/\2/p')
      # If the pod directory no longer exists, the scope is orphaned and can be stopped.
      if [ ! -e "/var/lib/kubelet/pods/$pod" ]; then
        echo -n "Trying to stop systemd scope '$i'... "
        runhost systemctl stop $i
        echo "Stopped."
        count=$((count + 1))
      fi
    done
    echo "Total ${count} systemd scopes stopped."
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: systemd-cgroup-gc
  labels:
    tier: management
    app: systemd-cgroup-gc
spec:
  selector:
    matchLabels:
      name: systemd-cgroup-gc
  template:
    metadata:
      labels:
        name: systemd-cgroup-gc
    spec:
      hostPID: true
      containers:
      - resources:
          requests:
            cpu: 5m
            memory: 500Ki
          limits:
            cpu: 5m
            memory: 50Mi # This crashes on startup with a 5Mi limit, but only uses about 320Ki after that.
        securityContext:
          privileged: true
        image: ubuntu:16.04
        name: systemd-cgroup-gc
        command: ["/bin/bash", "-c"]
        args:
        - |
          while true; do
            /usr/local/bin/systemd-cgroup-gc
            sleep 3600 # Run hourly.
          done
        volumeMounts:
        - name: scriptsrc
          mountPath: /usr/local/bin
        - name: kubeletpath
          mountPath: /var/lib/kubelet/pods
      volumes:
      - name: scriptsrc
        configMap:
          name: systemd-cgroup-gc
          defaultMode: 0755
      - name: kubeletpath
        hostPath:
          path: /var/lib/kubelet/pods
          type: Directory
--------------------------------------------------------------------------------
/regular-reboot.yaml:
--------------------------------------------------------------------------------
# This DaemonSet reboots K8s nodes via kured approximately once a week to work around
# bugs https://github.com/Azure/AKS/issues/750 & https://github.com/kubernetes/kubernetes/issues/64137.
# Note that you should also be running the systemd-cgroup-gc DaemonSet, or a weekly reboot may not
# be sufficient to prevent node failure due to these bugs.
#
# This DaemonSet relies on kured, the KUbernetes REboot Daemon (https://github.com/weaveworks/kured),
# watching /var/run/reboot-required as the signal that a reboot is required. If kured is not installed,
# the reboot marker file will still be created, but no reboot will happen.
#
# Note that these pods run in privileged mode, which can pose a security risk. Use with caution.
apiVersion: v1
kind: ConfigMap
metadata:
  name: regular-reboot
data:
  regular-reboot: |
    #!/bin/bash

    # Default Configuration.
    window_start="Sunday 0000"
    window_end="Sunday 0500"

    # This is just large enough not to reboot twice within the same maintenance window.
    # Removing "Sunday" from window_start & window_end, then changing max_uptime to
    # "7 days", would reboot more exactly every 7 days, at the cost of potentially
    # rebooting on any day of the week.
    max_uptime="6 hours"

    # The maintenance window is interpreted in this time zone.
    # The script also will not output the same message twice in a row on the same day in this time zone.
    reference_TZ=America/New_York

    function runhost() {
      # Runs a process on the host K8s node.
      nsenter -m/proc/1/ns/mnt "$@"
    }

    # Don't output the same log message twice in a row on the same day.
    function echo_daily() {
      echo $(date +%Y-%m-%d): "$@" >/var/run/new-regular-reboot-message

      if ! diff -s /var/run/last-regular-reboot-message /var/run/new-regular-reboot-message >/dev/null 2>&1; then
        cat /var/run/new-regular-reboot-message
        mv /var/run/new-regular-reboot-message /var/run/last-regular-reboot-message
      fi
    }

    # Compare the uptime to our configuration and reboot as requested.
    export TZ=$reference_TZ
    now=$(date +%s)
    if test $(date -d "$window_start" +%s) -lt $now -a $(date -d "$window_end" +%s) -gt $now; then
      # Get the boot time in UTC, without clobbering TZ for the rest of the script.
      up_since="$(TZ=UTC runhost uptime -s) UTC"
      if test $(date -d "$up_since + $max_uptime" +%s) -lt $now; then
        message="Marking node for reboot. Up since $up_since, which exceeds $max_uptime."
        echo_daily "$message"
        echo "$(date) regular-reboot: $message" >>/host/var/run/reboot-required
      else
        echo_daily "Skipping reboot for node up since $up_since, which is less than $max_uptime ago."
      fi
    else
      echo_daily "Skipping uptime check for node since we are outside its maintenance window."
    fi
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: regular-reboot
  labels:
    tier: management
    app: regular-reboot
spec:
  selector:
    matchLabels:
      name: regular-reboot
  template:
    metadata:
      labels:
        name: regular-reboot
    spec:
      hostPID: true
      containers:
      - resources:
          requests:
            cpu: 5m
            memory: 500Ki
          limits:
            cpu: 5m
            memory: 50Mi # This crashes on startup with a 5Mi limit, but only uses about 320Ki after that.
        securityContext:
          privileged: true
        image: debian:buster-slim
        name: regular-reboot
        command: ["/bin/bash", "-c"]
        args:
        - |
          while true; do
            /usr/local/bin/regular-reboot
            sleep 3600 # Run hourly.
          done
        volumeMounts:
        - name: scriptsrc
          mountPath: /usr/local/bin
        - name: varrun
          mountPath: /host/var/run
      volumes:
      - name: scriptsrc
        configMap:
          name: regular-reboot
          defaultMode: 0755
      - name: varrun
        hostPath:
          path: /var/run
          type: Directory
--------------------------------------------------------------------------------