├── node-sysctls.yaml
├── README.md
├── systemd-cgroup-gc.yaml
└── regular-reboot.yaml

/node-sysctls.yaml:
--------------------------------------------------------------------------------
# This DaemonSet runs a script once an hour that applies sysctl settings on each node. Right now it
# raises fs.inotify.max_user_watches, since inotify watch exhaustion appears to be another symptom of
# bugs https://github.com/Azure/AKS/issues/750 & https://github.com/kubernetes/kubernetes/issues/64137.
# Note that these pods run in privileged mode, which can pose a security risk. Use with caution.
apiVersion: v1
kind: ConfigMap
metadata:
  name: node-sysctls
data:
  node-sysctls: |
    #!/bin/bash
    sysctl -w fs.inotify.max_user_watches=525000
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-sysctls
  labels:
    tier: management
    app: node-sysctls
spec:
  selector:
    matchLabels:
      name: node-sysctls
  template:
    metadata:
      labels:
        name: node-sysctls
    spec:
      hostPID: true
      containers:
      - resources:
          requests:
            cpu: 5m
            memory: 500Ki
          limits:
            cpu: 5m
            memory: 50Mi # This crashes on startup with a 5Mi limit, but only uses about 320Ki after that.
        securityContext:
          privileged: true
        image: ubuntu:16.04
        name: node-sysctls
        command: ["/bin/bash", "-c"]
        args:
        - |
          while true; do
            /usr/local/bin/node-sysctls
            sleep 3600 # Run hourly.
          done
        volumeMounts:
        - name: scriptsrc
          mountPath: /usr/local/bin
      volumes:
      - name: scriptsrc
        configMap:
          name: node-sysctls
          defaultMode: 0755
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# k8s-hacks
Scripts Used to Work Around K8s Problems

I'm attaching some code here so that I can share it.

## systemd-cgroup-gc.yaml
This works around:
* https://github.com/kubernetes/kubernetes/issues/64137
* https://github.com/Azure/AKS/issues/750
* https://github.com/kubernetes/kubernetes/pull/73799
* https://github.com/kubernetes/kubernetes/issues/60987
* https://github.com/rancher/k3s/issues/294
* https://github.com/kubernetes/kubernetes/issues/70324

and probably some other tickets that aren't properly marked as duplicates yet. Basically, K8s leaves orphaned mounts behind when pods are cleaned up, which causes CPU usage to increase linearly until the whole node is taken down. This is particularly exacerbated by K8s CronJobs, since they have such a short lifecycle. With a few cronjobs running every minute, I am able to take down a two-core AKS cluster node in about a week. Depending on which ticket you look at, this problem is attributed either to a failure of K8s to clean up mounts or to a bad interaction between certain kernel versions and certain versions of systemd. My money is on the latter since, on my system, the pod directories (and, thus, their mount subdirs) are already gone but systemd still has the cgroup watches registered.

Anyhow, this DaemonSet works around the problem by running a script hourly that searches for the orphaned mounts and asks systemd to stop watching them.
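If you want to confirm that a node is affected before installing this, here is a minimal sketch that reuses the same detection step as the DaemonSet's script; run it directly on the node (e.g. over SSH):

    # Count the transient "run-rXXXX.scope" mount units that systemd is still tracking.
    # On an affected node this number grows steadily as pods come and go.
    ls /sys/fs/cgroup/systemd/system.slice | grep -c "^run-r"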
### Installing
    kubectl apply -f systemd-cgroup-gc.yaml

### Uninstalling
    kubectl delete -f systemd-cgroup-gc.yaml

## node-sysctls.yaml
Simple DaemonSet to set sysctl settings on nodes. Right now it only sets a higher limit for `fs.inotify.max_user_watches`. I think that the only reason we are encountering watch limit issues right now is the same set of bugs that the `systemd-cgroup-gc` package works around.
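Once the DaemonSet has run on a node, you can verify the setting from any shell on that node (a quick check, assuming shell access to the node):

    # Should print 525000 once the script has run; the kernel default is much lower.
    sysctl fs.inotify.max_user_watches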
### Installing
    kubectl apply -f node-sysctls.yaml

### Uninstalling
    kubectl delete -f node-sysctls.yaml

## regular-reboot.yaml
Simple DaemonSet to reboot nodes regularly. Uses [kured](https://github.com/weaveworks/kured) to safely trigger a rolling reboot during a specified window, if the node has not been rebooted within a specified timeframe. If kured is not running, then marking the node for reboot will have no effect.
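The "mark" is just a file on the node's filesystem. To see whether a node has already been flagged (a sketch, assuming shell access to the node):

    # kured watches this path; lines appended by this DaemonSet record when and why the node was marked.
    test -f /var/run/reboot-required && cat /var/run/reboot-required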
### Installing
    kubectl apply -f regular-reboot.yaml

### Uninstalling
    kubectl delete -f regular-reboot.yaml
--------------------------------------------------------------------------------
/systemd-cgroup-gc.yaml:
--------------------------------------------------------------------------------
# This DaemonSet runs a systemd cgroup garbage collection job once an hour to work around
# bugs https://github.com/Azure/AKS/issues/750 & https://github.com/kubernetes/kubernetes/issues/64137.
# Note that these pods run in privileged mode, which can pose a security risk. Use with caution.
apiVersion: v1
kind: ConfigMap
metadata:
  name: systemd-cgroup-gc
data:
  systemd-cgroup-gc: |
    #!/bin/bash
    function runhost() {
      # Runs a process on the host K8s node.
      nsenter -m/proc/1/ns/mnt "$@"
    }
    count=0
    for i in $(runhost ls /sys/fs/cgroup/systemd/system.slice | grep "^run-r"); do
      # Extract the pod UID from the scope's "Kubernetes transient mount" description.
      pod=$(runhost systemctl list-units --type scope --state running $i | cat | sed -n 's/\(.*\)Kubernetes transient mount for \/var\/lib\/kubelet\/pods\/\(.*\)\/volumes\(.*\)/\2/p')
      # If the pod directory no longer exists, the scope is orphaned and can be stopped.
      if [ ! -e "/var/lib/kubelet/pods/$pod" ]; then
        echo -n "Trying to stop systemd scope '$i'... "
        runhost systemctl stop $i
        echo "Stopped."
        count=$((count + 1))
      fi
    done
    echo "Total ${count} systemd scopes stopped."
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: systemd-cgroup-gc
  labels:
    tier: management
    app: systemd-cgroup-gc
spec:
  selector:
    matchLabels:
      name: systemd-cgroup-gc
  template:
    metadata:
      labels:
        name: systemd-cgroup-gc
    spec:
      hostPID: true
      containers:
      - resources:
          requests:
            cpu: 5m
            memory: 500Ki
          limits:
            cpu: 5m
            memory: 50Mi # This crashes on startup with a 5Mi limit, but only uses about 320Ki after that.
        securityContext:
          privileged: true
        image: ubuntu:16.04
        name: systemd-cgroup-gc
        command: ["/bin/bash", "-c"]
        args:
        - |
          while true; do
            /usr/local/bin/systemd-cgroup-gc
            sleep 3600 # Run hourly.
          done
        volumeMounts:
        - name: scriptsrc
          mountPath: /usr/local/bin
        - name: kubeletpath
          mountPath: /var/lib/kubelet/pods
      volumes:
      - name: scriptsrc
        configMap:
          name: systemd-cgroup-gc
          defaultMode: 0755
      - name: kubeletpath
        hostPath:
          path: /var/lib/kubelet/pods
          type: Directory
--------------------------------------------------------------------------------
/regular-reboot.yaml:
--------------------------------------------------------------------------------
# This DaemonSet reboots K8s nodes via kured approximately once a week to work around
# bugs https://github.com/Azure/AKS/issues/750 & https://github.com/kubernetes/kubernetes/issues/64137.
# Note that you should also be running the systemd-cgroup-gc DaemonSet, or a weekly reboot may not
# be sufficient to prevent node failure due to these bugs.
#
# This DaemonSet relies on kured, the KUbernetes REboot Daemon (https://github.com/weaveworks/kured),
# watching /var/run/reboot-required as the signal that a reboot is required. If kured is not installed,
# the reboot marker file will still be created, but no reboot will happen.
#
# Note that these pods run in privileged mode, which can pose a security risk. Use with caution.
apiVersion: v1
kind: ConfigMap
metadata:
  name: regular-reboot
data:
  regular-reboot: |
    #!/bin/bash

    # Default Configuration.
    window_start="Sunday 0000"
    window_end="Sunday 0500"

    # This is just large enough not to reboot twice within the same maintenance window.
    # Removing "Sunday" from window_start & window_end, then changing max_uptime to
    # "7 days", would reboot more exactly every 7 days, at the cost of potentially
    # rebooting on any day of the week.
    max_uptime="6 hours"

    # The maintenance window is interpreted in this time zone.
    # The script also will not output the same message twice in a row on the same day in this time zone.
    reference_TZ=America/New_York

    function runhost() {
      # Runs a process on the host K8s node.
      nsenter -m/proc/1/ns/mnt "$@"
    }

    # Don't output the same log message twice in a row on the same day.
    function echo_daily() {
      echo $(date +%Y-%m-%d): "$@" >/var/run/new-regular-reboot-message

      if ! diff -s /var/run/last-regular-reboot-message /var/run/new-regular-reboot-message >/dev/null 2>&1; then
        cat /var/run/new-regular-reboot-message
        mv /var/run/new-regular-reboot-message /var/run/last-regular-reboot-message
      fi
    }

    # Compare the uptime to our configuration and reboot as requested.
    export TZ=$reference_TZ
    now=$(date +%s)
    if test $(date -d "$window_start" +%s) -lt $now -a $(date -d "$window_end" +%s) -gt $now; then
      # Get the boot time in UTC, without clobbering TZ for the rest of the script.
      up_since="$(TZ=UTC runhost uptime -s) UTC"
      if test $(date -d "$up_since + $max_uptime" +%s) -lt $now; then
        message="Marking node for reboot. Up since $up_since, which exceeds $max_uptime."
        echo_daily "$message"
        echo "$(date) regular-reboot: $message" >>/host/var/run/reboot-required
      else
        echo_daily "Skipping reboot for node up since $up_since, which is less than $max_uptime ago."
      fi
    else
      echo_daily "Skipping uptime check for node since we are outside its maintenance window."
    fi
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: regular-reboot
  labels:
    tier: management
    app: regular-reboot
spec:
  selector:
    matchLabels:
      name: regular-reboot
  template:
    metadata:
      labels:
        name: regular-reboot
    spec:
      hostPID: true
      containers:
      - resources:
          requests:
            cpu: 5m
            memory: 500Ki
          limits:
            cpu: 5m
            memory: 50Mi # This crashes on startup with a 5Mi limit, but only uses about 320Ki after that.
        securityContext:
          privileged: true
        image: debian:buster-slim
        name: regular-reboot
        command: ["/bin/bash", "-c"]
        args:
        - |
          while true; do
            /usr/local/bin/regular-reboot
            sleep 3600 # Run hourly.
          done
        volumeMounts:
        - name: scriptsrc
          mountPath: /usr/local/bin
        - name: varrun
          mountPath: /host/var/run
      volumes:
      - name: scriptsrc
        configMap:
          name: regular-reboot
          defaultMode: 0755
      - name: varrun
        hostPath:
          path: /var/run
          type: Directory
--------------------------------------------------------------------------------