├── .github ├── CODEOWNERS ├── workflows │ ├── release-chart.yaml │ ├── release.yaml │ ├── ci-chart.yaml │ └── ci.yaml ├── ISSUE_TEMPLATE.md ├── dependabot.yml └── PULL_REQUEST_TEMPLATE.md ├── docs ├── k8s-shredder.gif ├── loop_diagram.png ├── k8s-shredder-logo.png ├── shredder_firefly.png ├── architecture.md ├── loop-diagram.md ├── node-parking.md ├── metrics.md └── e2e_tests.md ├── renovate.json ├── pyproject.toml ├── .gitignore ├── .yamlfix ├── charts └── k8s-shredder │ ├── templates │ ├── service-account.yaml │ ├── NOTES.txt │ ├── cluster-role-binding.yaml │ ├── service.yaml │ ├── cluster-role.yaml │ ├── podmonitor.yaml │ ├── configmap.yaml │ ├── _helpers.tpl │ └── deployment.yaml │ ├── .helmignore │ ├── Chart.yaml │ ├── values.yaml │ └── README.md ├── Dockerfile ├── main.go ├── internal ├── testing │ ├── rollback_cluster_upgrade.sh │ ├── cluster_upgrade.sh │ ├── kind.yaml │ ├── kind-karpenter.yaml │ ├── kind-node-labels.yaml │ ├── rbac.yaml │ ├── test_eviction_safety_check.sh │ ├── park_node.go │ ├── prometheus_stuffs.yaml │ ├── prometheus_stuffs_karpenter.yaml │ ├── prometheus_stuffs_node_labels.yaml │ ├── k8s-shredder.yaml │ ├── k8s-shredder-karpenter.yaml │ ├── k8s-shredder-node-labels.yaml │ ├── karpenter-manifests.yaml │ ├── local_env_prep_helm.sh │ ├── local_env_prep_node_labels_helm.sh │ ├── cluster_upgrade_node_labels.sh │ ├── test_apps.yaml │ └── local_env_prep_karpenter_helm.sh └── check_license.sh ├── cmd └── park-node │ └── main.go ├── pkg ├── utils │ ├── signal.go │ ├── context.go │ ├── k8s.go │ └── node_label_detection.go ├── metrics │ ├── types.go │ └── metrics.go ├── config │ └── config.go └── schedule │ └── schedule.go ├── RELEASE.md ├── CONTRIBUTING.md ├── .goreleaser.yml ├── config.yaml ├── CODE_OF_CONDUCT.md ├── go.mod └── Makefile /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Maintainers 2 | * @adobe/ethos -------------------------------------------------------------------------------- /docs/k8s-shredder.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adobe/k8s-shredder/HEAD/docs/k8s-shredder.gif -------------------------------------------------------------------------------- /docs/loop_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adobe/k8s-shredder/HEAD/docs/loop_diagram.png -------------------------------------------------------------------------------- /docs/k8s-shredder-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adobe/k8s-shredder/HEAD/docs/k8s-shredder-logo.png -------------------------------------------------------------------------------- /docs/shredder_firefly.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adobe/k8s-shredder/HEAD/docs/shredder_firefly.png -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://docs.renovatebot.com/renovate-schema.json", 3 | "extends": [ 4 | "config:recommended" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /docs/architecture.md: -------------------------------------------------------------------------------- 1 | # Architecture 2 | 3 |

4 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.yamlfix] 2 | # Preserve block scalar format and prevent conversion to quoted strings 3 | # This configuration helps maintain readable YAML block scalars 4 | 5 | # Don't convert multiline strings to quoted format 6 | # This preserves the |- and | block scalar indicators 7 | preserve_block_scalars = true 8 | 9 | # Maintain original formatting where possible 10 | preserve_formatting = true 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # OSX os files 2 | .DS_Store 3 | .DS_Store? 4 | 5 | # build artifacts 6 | kubeconfig* 7 | dist 8 | /k8s-shredder 9 | my-k8s-shredder-values.yaml 10 | /park-node 11 | 12 | # Test binary, build with `go test -c` 13 | *.test 14 | 15 | # Output of the go coverage tool, specifically when used with LiteIDE 16 | *.out 17 | 18 | # editor and IDE paraphernalia 19 | .idea 20 | *.swp 21 | *.swo 22 | *~ 23 | .vscode 24 | -------------------------------------------------------------------------------- /.yamlfix: -------------------------------------------------------------------------------- 1 | # yamlfix configuration 2 | # Preserve block scalar format and prevent conversion to quoted strings 3 | 4 | # Configuration options for yamlfix 5 | # This file should be in the root directory of the project 6 | 7 | # Preserve block scalar format (|- and |) 8 | # This prevents conversion to quoted strings with escaped newlines 9 | preserve_block_scalars: true 10 | 11 | # Maintain original formatting 12 | preserve_formatting: true 13 | -------------------------------------------------------------------------------- /charts/k8s-shredder/templates/service-account.yaml: -------------------------------------------------------------------------------- 1 | {{ if .Values.serviceAccount.create }} 2 | apiVersion: v1 3 | kind: ServiceAccount 4 | metadata: 5 | name: {{ include "k8s-shredder.serviceAccountName" . }} 6 | namespace: {{ .Release.Namespace }} 7 | labels: 8 | {{ include "k8s-shredder.labels" . | indent 4 }} 9 | {{- with .Values.serviceAccount.annotations }} 10 | annotations: 11 | {{- toYaml . | nindent 4 }} 12 | {{- end }} 13 | {{ end }} -------------------------------------------------------------------------------- /charts/k8s-shredder/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *~ 18 | # Various IDEs 19 | .project 20 | .idea/ 21 | *.tmproj 22 | .vscode/ -------------------------------------------------------------------------------- /charts/k8s-shredder/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | 1. Get the application metrics URL by running these commands: 2 | 3 | export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "k8s-shredder.name" . 
}},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") 4 | echo "Visit http://127.0.0.1:8080/metrics to get shredder metrics" 5 | kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080 6 | -------------------------------------------------------------------------------- /charts/k8s-shredder/Chart.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v2 3 | name: k8s-shredder 4 | description: a novel way of dealing with kubernetes nodes blocked from draining 5 | type: application 6 | home: https://github.com/adobe/k8s-shredder 7 | icon: https://raw.githubusercontent.com/adobe/k8s-shredder/main/docs/k8s-shredder_logo.jpg 8 | maintainers: 9 | - name: adriananeci 10 | email: aneci@adobe.com 11 | url: https://adobe.com 12 | - name: sfotony 13 | email: gosselin@adobe.com 14 | url: https://adobe.com 15 | version: 0.2.8 16 | appVersion: v0.3.8 17 | -------------------------------------------------------------------------------- /charts/k8s-shredder/templates/cluster-role-binding.yaml: -------------------------------------------------------------------------------- 1 | {{ if .Values.rbac.create}} 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRoleBinding 4 | metadata: 5 | name: {{ include "k8s-shredder.fullname" . }} 6 | labels: 7 | {{ include "k8s-shredder.labels" . | indent 4 }} 8 | roleRef: 9 | apiGroup: rbac.authorization.k8s.io 10 | kind: ClusterRole 11 | name: {{ include "k8s-shredder.fullname" . }} 12 | subjects: 13 | - kind: ServiceAccount 14 | name: {{ include "k8s-shredder.serviceAccountName" . }} 15 | namespace: {{ .Release.Namespace }} 16 | {{ end }} -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM alpine AS show_me_your_security 2 | # Install the Certificate-Authority certificates for the app to be able to make 3 | # calls to HTTPS endpoints. 4 | RUN apk add --no-cache ca-certificates 5 | 6 | # The second stage, create a small final image 7 | FROM scratch 8 | # Copy the /etc/passwd file we created in the builder stage. This creates a new 9 | # non-root user as a security best practice. 
10 | COPY --from=show_me_your_security /etc/passwd /etc/passwd 11 | # Copy the certs from the builder stage 12 | COPY --from=show_me_your_security /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ 13 | # copy our binary 14 | COPY k8s-shredder /k8s-shredder 15 | ENTRYPOINT ["/k8s-shredder"] 16 | -------------------------------------------------------------------------------- /.github/workflows/release-chart.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Release Charts 3 | on: 4 | push: 5 | branches: [main] 6 | jobs: 7 | release: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout 11 | uses: actions/checkout@v6 12 | with: 13 | fetch-depth: 0 14 | - name: Configure Git 15 | run: | 16 | git config user.name "$GITHUB_ACTOR" 17 | git config user.email "$GITHUB_ACTOR@users.noreply.github.com" 18 | - name: Run chart-releaser 19 | uses: helm/chart-releaser-action@v1.7.0 20 | env: 21 | CR_TOKEN: ${{ secrets.GITHUB_TOKEN }} 22 | CR_SKIP_EXISTING: true 23 | CR_RELEASE_NAME_TEMPLATE: Helm-Chart-v{{ .Version }} 24 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ### Expected Behaviour 5 | 6 | ### Actual Behaviour 7 | 8 | ### Reproduce Scenario (including but not limited to) 9 | 10 | #### Steps to Reproduce 11 | 12 | #### Platform and Version 13 | 14 | #### Sample Code that illustrates the problem 15 | 16 | #### Logs taken while reproducing problem 17 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 Adobe. All rights reserved. 3 | This file is licensed to you under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. You may obtain a copy 5 | of the License at http://www.apache.org/licenses/LICENSE-2.0 6 | Unless required by applicable law or agreed to in writing, software distributed under 7 | the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS 8 | OF ANY KIND, either express or implied. See the License for the specific language 9 | governing permissions and limitations under the License. 10 | */ 11 | 12 | package main 13 | 14 | import "github.com/adobe/k8s-shredder/cmd" 15 | 16 | func main() { 17 | cmd.Execute() 18 | } 19 | -------------------------------------------------------------------------------- /charts/k8s-shredder/templates/service.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.service.create }} 2 | apiVersion: v1 3 | kind: Service 4 | metadata: 5 | name: {{ include "k8s-shredder.fullname" . }} 6 | namespace: {{ .Release.Namespace }} 7 | labels: 8 | {{ include "k8s-shredder.labels" . | indent 4 }} 9 | {{- if .Values.service.labels }} 10 | {{ toYaml .Values.service.labels | indent 4 }} 11 | {{- end }} 12 | {{- if .Values.service.annotations }} 13 | annotations: 14 | {{ toYaml .Values.service.annotations | indent 4 }} 15 | {{- end }} 16 | spec: 17 | type: {{ .Values.service.type }} 18 | selector: 19 | {{ include "k8s-shredder.matchLabels" . 
| indent 4 }} 20 | ports: 21 | - name: metrics 22 | port: {{ .Values.service.port }} 23 | targetPort: {{ .Values.service.targetPort }} 24 | protocol: TCP 25 | {{- end }} 26 | -------------------------------------------------------------------------------- /charts/k8s-shredder/templates/cluster-role.yaml: -------------------------------------------------------------------------------- 1 | {{ if .Values.rbac.create }} 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: {{ include "k8s-shredder.fullname" . }} 6 | labels: 7 | {{ include "k8s-shredder.labels" . | indent 4 }} 8 | rules: 9 | - apiGroups: ["*"] 10 | resources: [nodes] 11 | verbs: [get, list, watch, update, patch] 12 | - apiGroups: ["*"] 13 | resources: [pods, pods/eviction] 14 | verbs: ["*"] 15 | - apiGroups: [apps, extensions] 16 | resources: [statefulsets, deployments, replicasets] 17 | verbs: [get, list, watch, update, patch] 18 | - apiGroups: [ "argoproj.io" ] 19 | resources: [ rollouts ] 20 | verbs: [ get, list, watch, update, patch ] 21 | - apiGroups: [ "karpenter.sh" ] 22 | resources: [ nodeclaims ] 23 | verbs: [ get, list, watch ] 24 | {{ end }} 25 | -------------------------------------------------------------------------------- /internal/testing/rollback_cluster_upgrade.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | K8S_CLUSTER_NAME=$1 5 | 6 | # For moving node back as active, useful during debug process 7 | export K8S_CLUSTER_NAME=k8s-shredder-test-cluster 8 | kubectl uncordon "${K8S_CLUSTER_NAME}-worker" --kubeconfig=kubeconfig 9 | kubectl label node "${K8S_CLUSTER_NAME}-worker" --kubeconfig=kubeconfig shredder.ethos.adobe.net/upgrade-status- 10 | kubectl label node "${K8S_CLUSTER_NAME}-worker" --kubeconfig=kubeconfig --overwrite shredder.ethos.adobe.net/parked-node-expires-on- 11 | kubectl delete -n ns-k8s-shredder-test $(kubectl get pods -n ns-k8s-shredder-test -oname) --force --wait=0 --timeout=0 12 | kubectl delete -n ns-team-k8s-shredder-test $(kubectl get pods -n ns-team-k8s-shredder-test -oname) --force --wait=0 --timeout=0 13 | kubectl get po -A --field-selector=spec.nodeName=k8s-shredder-test-cluster-worker -------------------------------------------------------------------------------- /docs/loop-diagram.md: -------------------------------------------------------------------------------- 1 | # Loop diagram 2 | 3 | 1. Get all nodes with `labels=shredder.ethos.adobe.net/upgrade-status=parked` and `taints!=ToBeDeletedByClusterAutoscaler`. 4 | 2. Loop through all parked nodes. 5 | 3. Check if the node has reached `shredder.ethos.adobe.net/parked-node-expires-on` time. 6 | 4. Force delete all pods from the node. 7 | 5. Get all pods from the parked node. 8 | 6. Loop through each pod. 9 | 7. Check if the pod is part of a skipped eviction namespace. 10 | 8. Check if the controller object that owns the pod has a rollout restart already in progress. 11 | 9. Check if the elapsed time is greater than the time node needs to be parked. 12 | 10. Perform a rollout restart of the controller object which owns the pod. 13 | 11. Check if the pod has the label `shredder.ethos.adobe.net/allow-eviction=false` attached. 14 | 12. Evict the pod. 15 | 16 |

17 | -------------------------------------------------------------------------------- /charts/k8s-shredder/templates/podmonitor.yaml: -------------------------------------------------------------------------------- 1 | {{ if .Values.podMonitor.enabled }} 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: PodMonitor 4 | metadata: 5 | name: {{ include "k8s-shredder.fullname" . }} 6 | namespace: {{ .Release.Namespace }} 7 | labels: 8 | {{ include "k8s-shredder.labels" . | indent 4 }} 9 | {{- with .Values.podMonitor.labels }} 10 | {{- toYaml . | nindent 4 }} 11 | {{- end }} 12 | spec: 13 | jobLabel: "k8s-shredder" 14 | namespaceSelector: 15 | matchNames: 16 | - {{ .Release.Namespace }} 17 | podMetricsEndpoints: 18 | - interval: {{ .Values.podMonitor.interval }} 19 | path: /metrics 20 | port: metrics 21 | scheme: http 22 | honorLabels: {{ .Values.podMonitor.honorLabels }} 23 | {{- if .Values.podMonitor.relabelings }} 24 | relabelings: 25 | {{- toYaml .Values.podMonitor.relabelings | nindent 8 }} 26 | {{- end }} 27 | selector: 28 | matchLabels: 29 | {{ include "k8s-shredder.matchLabels" . | indent 6 }} 30 | {{- end }} -------------------------------------------------------------------------------- /cmd/park-node/main.go: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Adobe. All rights reserved. 2 | package main 3 | 4 | import ( 5 | "flag" 6 | "log" 7 | "os" 8 | 9 | e2e "github.com/adobe/k8s-shredder/internal/testing" 10 | ) 11 | 12 | func main() { 13 | var nodeName, kubeconfigPath string 14 | 15 | // Use a custom flag set to avoid conflicts with client-go flags 16 | fs := flag.NewFlagSet("park-node", flag.ExitOnError) 17 | fs.StringVar(&nodeName, "node", "", "Name of the node to park") 18 | fs.StringVar(&kubeconfigPath, "park-kubeconfig", "", "Path to kubeconfig file") 19 | if err := fs.Parse(os.Args[1:]); err != nil { 20 | log.Fatal(err) 21 | } 22 | 23 | if nodeName == "" { 24 | log.Fatal("Node name is required. Use -node flag") 25 | } 26 | if kubeconfigPath == "" { 27 | log.Fatal("Kubeconfig path is required. Use -park-kubeconfig flag") 28 | } 29 | 30 | if err := e2e.ParkNodeForTesting(nodeName, kubeconfigPath); err != nil { 31 | log.Fatalf("Failed to park node: %v", err) 32 | } 33 | 34 | log.Printf("Successfully parked node %s", nodeName) 35 | os.Exit(0) 36 | } 37 | -------------------------------------------------------------------------------- /pkg/utils/signal.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 Adobe. All rights reserved. 3 | This file is licensed to you under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. You may obtain a copy 5 | of the License at http://www.apache.org/licenses/LICENSE-2.0 6 | Unless required by applicable law or agreed to in writing, software distributed under 7 | the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS 8 | OF ANY KIND, either express or implied. See the License for the specific language 9 | governing permissions and limitations under the License. 
10 | */ 11 | 12 | package utils 13 | 14 | import ( 15 | "context" 16 | "os" 17 | "os/signal" 18 | "syscall" 19 | 20 | log "github.com/sirupsen/logrus" 21 | ) 22 | 23 | // HandleOsSignals gracefully handles OS signals 24 | func HandleOsSignals(cancel context.CancelFunc) { 25 | c := make(chan os.Signal, 1) 26 | signal.Notify(c, 27 | syscall.SIGHUP, 28 | syscall.SIGINT, 29 | syscall.SIGTERM, 30 | syscall.SIGQUIT, 31 | ) 32 | 33 | sig := <-c 34 | log.Debugf("Got signal %s, terminating gracefully", sig.String()) 35 | cancel() 36 | os.Exit(0) 37 | } 38 | -------------------------------------------------------------------------------- /internal/testing/cluster_upgrade.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | K8S_CLUSTER_NAME=$1 5 | KUBECONFIG_FILE=${2:-kubeconfig} 6 | 7 | echo "K8S_SHREDDER: Simulating cluster upgrade..." 8 | echo "K8S_SHREDDER: Parking k8s-shredder-worker with proper pod labeling and a TTL of 1 minute!" 9 | 10 | # Use the park-node binary to properly park the node (labels both node and pods) 11 | ./park-node -node "${K8S_CLUSTER_NAME}-worker" -park-kubeconfig "${KUBECONFIG_FILE}" 12 | 13 | if [[ ${WAIT_FOR_PODS:-false} == "true" ]] 14 | then 15 | while [[ $pod_status != "No resources found" ]] 16 | do 17 | echo "Info: Waiting for all pods to be evicted from the node..." 18 | sleep 10 19 | pod_status=$(kubectl get pods -A --field-selector metadata.namespace!=kube-system,metadata.namespace!=local-path-storage,spec.nodeName=k8s-shredder-test-cluster-worker 2>&1 >/dev/null) 20 | done 21 | 22 | # This is to simulate the upgrade process. We are going to wait for 1 minute and then uncordon the node. 23 | kubectl label node "${K8S_CLUSTER_NAME}-worker" --kubeconfig=${KUBECONFIG_FILE} shredder.ethos.adobe.net/upgrade-status- 24 | kubectl label node "${K8S_CLUSTER_NAME}-worker" --kubeconfig=${KUBECONFIG_FILE} --overwrite shredder.ethos.adobe.net/parked-node-expires-on- 25 | fi 26 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Please see the documentation for all configuration options: https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 3 | version: 2 4 | updates: 5 | # github-actions 6 | - directory: / 7 | package-ecosystem: github-actions 8 | schedule: 9 | interval: weekly 10 | time: 09:00 11 | # Use Europe/Bucharest Standard Time (UTC +02:00) 12 | timezone: Europe/Bucharest 13 | commit-message: 14 | prefix: dependabot 15 | include: scope 16 | labels: 17 | - kind/cleanup 18 | - dependabot 19 | # Go 20 | - directory: / 21 | package-ecosystem: gomod 22 | schedule: 23 | interval: weekly 24 | time: 09:00 25 | # Use Europe/Bucharest Standard Time (UTC +02:00) 26 | timezone: Europe/Bucharest 27 | commit-message: 28 | prefix: dependabot 29 | include: scope 30 | # TODO decide if we should enable ignore 31 | # ignore: 32 | # # Ignore controller-runtime as its upgraded manually. 33 | # - dependency-name: "sigs.k8s.io/controller-runtime" 34 | # # Ignore k8s and its transitives modules as they are upgraded manually together with controller-runtime. 
35 | # - dependency-name: "k8s.io/*" 36 | labels: [kind/cleanup, dependabot] 37 | -------------------------------------------------------------------------------- /internal/testing/kind.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | kind: Cluster 3 | apiVersion: kind.x-k8s.io/v1alpha4 4 | networking: 5 | apiServerPort: 6443 6 | apiServerAddress: 0.0.0.0 7 | nodes: 8 | - role: control-plane 9 | extraPortMappings: 10 | - containerPort: 30007 11 | hostPort: 30007 12 | kubeadmConfigPatches: 13 | - | 14 | kind: InitConfiguration 15 | nodeRegistration: 16 | kubeletExtraArgs: 17 | node-labels: "node.kubernetes.io/role=etcd,node.kubernetes.io/role=master" 18 | - role: worker 19 | kubeadmConfigPatches: 20 | - | 21 | kind: JoinConfiguration 22 | nodeRegistration: 23 | kubeletExtraArgs: 24 | node-labels: "node.kubernetes.io/role=worker,will-be-parked=yes" 25 | - role: worker 26 | kubeadmConfigPatches: 27 | - |- 28 | kind: JoinConfiguration 29 | nodeRegistration: 30 | kubeletExtraArgs: 31 | node-labels: "node.kubernetes.io/role=worker" 32 | - role: worker 33 | kubeadmConfigPatches: 34 | - |- 35 | kind: JoinConfiguration 36 | nodeRegistration: 37 | kubeletExtraArgs: 38 | node-labels: "node.kubernetes.io/role=worker,monitoring=dedicated" 39 | taints: 40 | - key: "monitoring" 41 | value: "dedicated" 42 | effect: "NoSchedule" 43 | -------------------------------------------------------------------------------- /internal/testing/kind-karpenter.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | kind: Cluster 3 | apiVersion: kind.x-k8s.io/v1alpha4 4 | networking: 5 | apiServerPort: 6444 6 | apiServerAddress: 0.0.0.0 7 | nodes: 8 | - role: control-plane 9 | extraPortMappings: 10 | - containerPort: 30007 11 | hostPort: 30008 12 | kubeadmConfigPatches: 13 | - | 14 | kind: InitConfiguration 15 | nodeRegistration: 16 | kubeletExtraArgs: 17 | node-labels: "node.kubernetes.io/role=etcd,node.kubernetes.io/role=master" 18 | - role: worker 19 | kubeadmConfigPatches: 20 | - | 21 | kind: JoinConfiguration 22 | nodeRegistration: 23 | kubeletExtraArgs: 24 | node-labels: "node.kubernetes.io/role=worker,will-be-parked=yes" 25 | - role: worker 26 | kubeadmConfigPatches: 27 | - |- 28 | kind: JoinConfiguration 29 | nodeRegistration: 30 | kubeletExtraArgs: 31 | node-labels: "node.kubernetes.io/role=worker" 32 | - role: worker 33 | kubeadmConfigPatches: 34 | - |- 35 | kind: JoinConfiguration 36 | nodeRegistration: 37 | kubeletExtraArgs: 38 | node-labels: "node.kubernetes.io/role=worker,monitoring=dedicated" 39 | taints: 40 | - key: "monitoring" 41 | value: "dedicated" 42 | effect: "NoSchedule" 43 | -------------------------------------------------------------------------------- /.github/workflows/release.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | name: release 3 | on: 4 | push: 5 | tags: [v*] 6 | permissions: 7 | contents: write # needed to write releases 8 | id-token: write # needed for keyless signing 9 | packages: write # needed for ghcr access 10 | jobs: 11 | release: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v6 15 | with: 16 | fetch-depth: 0 # this is important, otherwise it won't checkout the full tree (i.e. 
no previous tags) 17 | # Add support for more platforms with QEMU (optional) 18 | # https://github.com/docker/setup-qemu-action 19 | - name: Set up QEMU 20 | uses: docker/setup-qemu-action@v3 21 | - name: Set up Docker Buildx 22 | uses: docker/setup-buildx-action@v3 23 | - uses: actions/setup-go@v6 24 | with: 25 | go-version: '1.25' 26 | cache: true 27 | - uses: sigstore/cosign-installer@v4.0.0 # installs cosign 28 | # - uses: anchore/sbom-action/download-syft@v0.14.1 # installs syft 29 | - uses: docker/login-action@v3 # login to ghcr 30 | with: 31 | registry: ghcr.io 32 | username: ${{ github.repository_owner }} 33 | password: ${{ secrets.GITHUB_TOKEN }} 34 | - uses: goreleaser/goreleaser-action@v6 # run goreleaser 35 | with: 36 | version: latest 37 | args: release --clean 38 | env: 39 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 40 | -------------------------------------------------------------------------------- /internal/testing/kind-node-labels.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | kind: Cluster 3 | apiVersion: kind.x-k8s.io/v1alpha4 4 | networking: 5 | apiServerPort: 6445 6 | apiServerAddress: 0.0.0.0 7 | nodes: 8 | - role: control-plane 9 | extraPortMappings: 10 | - containerPort: 30007 11 | hostPort: 30009 12 | kubeadmConfigPatches: 13 | - | 14 | kind: InitConfiguration 15 | nodeRegistration: 16 | kubeletExtraArgs: 17 | node-labels: "node.kubernetes.io/role=etcd,node.kubernetes.io/role=master" 18 | - role: worker 19 | kubeadmConfigPatches: 20 | - | 21 | kind: JoinConfiguration 22 | nodeRegistration: 23 | kubeletExtraArgs: 24 | node-labels: "node.kubernetes.io/role=worker" 25 | - role: worker 26 | kubeadmConfigPatches: 27 | - |- 28 | kind: JoinConfiguration 29 | nodeRegistration: 30 | kubeletExtraArgs: 31 | node-labels: "node.kubernetes.io/role=worker" 32 | - role: worker 33 | kubeadmConfigPatches: 34 | - |- 35 | kind: JoinConfiguration 36 | nodeRegistration: 37 | kubeletExtraArgs: 38 | node-labels: "node.kubernetes.io/role=worker" 39 | - role: worker 40 | kubeadmConfigPatches: 41 | - |- 42 | kind: JoinConfiguration 43 | nodeRegistration: 44 | kubeletExtraArgs: 45 | node-labels: "node.kubernetes.io/role=worker,monitoring=dedicated" 46 | taints: 47 | - key: "monitoring" 48 | value: "dedicated" 49 | effect: "NoSchedule" 50 | -------------------------------------------------------------------------------- /.github/workflows/ci-chart.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Lint and Test Helm Chart 3 | on: pull_request 4 | jobs: 5 | lint-test: 6 | runs-on: ubuntu-latest 7 | steps: 8 | - name: Checkout 9 | uses: actions/checkout@v6 10 | with: 11 | fetch-depth: 0 12 | - name: Set up Helm 13 | uses: azure/setup-helm@v4 14 | with: 15 | version: v3.12.1 16 | # Python is required because `ct lint` runs Yamale (https://github.com/23andMe/Yamale) and 17 | # yamllint (https://github.com/adrienverge/yamllint) which require Python 18 | - name: Set up Python 19 | uses: actions/setup-python@v6 20 | with: 21 | python-version: '3.12' 22 | check-latest: true 23 | - name: Set up chart-testing 24 | uses: helm/chart-testing-action@v2.8.0 25 | - name: Run chart-testing (list-changed) 26 | id: list-changed 27 | run: | 28 | changed=$(ct list-changed --target-branch ${{ github.event.repository.default_branch }}) 29 | if [[ -n "$changed" ]]; then 30 | echo "changed=true" >> "$GITHUB_OUTPUT" 31 | fi 32 | - name: Run chart-testing (lint) 33 | if: steps.list-changed.outputs.changed == 'true' 
34 | run: ct lint --target-branch ${{ github.event.repository.default_branch }} 35 | - name: Create kind cluster 36 | if: steps.list-changed.outputs.changed == 'true' 37 | uses: helm/kind-action@v1.13.0 38 | with: 39 | version: v0.29.0 40 | - name: Run chart-testing (install) 41 | if: steps.list-changed.outputs.changed == 'true' 42 | run: ct install --target-branch ${{ github.event.repository.default_branch }} 43 | -------------------------------------------------------------------------------- /internal/testing/rbac.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: ServiceAccount 4 | metadata: 5 | name: k8s-shredder 6 | namespace: kube-system 7 | --- 8 | apiVersion: rbac.authorization.k8s.io/v1 9 | kind: ClusterRoleBinding 10 | metadata: 11 | name: k8s-shredder 12 | roleRef: 13 | apiGroup: rbac.authorization.k8s.io 14 | kind: ClusterRole 15 | name: k8s-shredder 16 | subjects: 17 | - kind: ServiceAccount 18 | name: k8s-shredder 19 | namespace: kube-system 20 | --- 21 | apiVersion: rbac.authorization.k8s.io/v1 22 | kind: ClusterRole 23 | metadata: 24 | name: k8s-shredder 25 | rules: 26 | - apiGroups: ['*'] 27 | resources: [nodes] 28 | verbs: [get, list, watch, update, patch] 29 | - apiGroups: ['*'] 30 | resources: [pods, pods/eviction] 31 | verbs: ['*'] 32 | - apiGroups: [apps, extensions] 33 | resources: [statefulsets, deployments, replicasets] 34 | verbs: [get, list, watch, update, patch] 35 | - apiGroups: [argoproj.io] 36 | resources: [rollouts] 37 | verbs: [get, list, watch, update, patch] 38 | - apiGroups: [karpenter.sh] 39 | resources: [nodeclaims] 40 | verbs: [get, list, watch] 41 | --- 42 | apiVersion: rbac.authorization.k8s.io/v1 43 | kind: ClusterRole 44 | metadata: 45 | name: edit-debug-flags-v 46 | rules: 47 | - apiGroups: [''] 48 | resources: [nodes/proxy] 49 | verbs: [update] 50 | - nonResourceURLs: [/debug/flags/v] 51 | verbs: [put] 52 | --- 53 | apiVersion: rbac.authorization.k8s.io/v1 54 | kind: ClusterRoleBinding 55 | metadata: 56 | name: edit-debug-flags-v 57 | roleRef: 58 | apiGroup: rbac.authorization.k8s.io 59 | kind: ClusterRole 60 | name: edit-debug-flags-v 61 | subjects: 62 | - kind: ServiceAccount 63 | name: default 64 | namespace: default 65 | -------------------------------------------------------------------------------- /RELEASE.md: -------------------------------------------------------------------------------- 1 | ## Creating a new release of `k8s-shredder` 2 | 3 | All k8s-shredder binaries, container image and helm chart are released using github actions workflows. 4 | See [release workflow](.github/workflows/release.yaml) and [release chart workflow](.github/workflows/release-chart.yaml) for more details. 5 | 6 | For publishing a new release follow below steps: 7 | 8 | ``` 9 | export NEW_VERSION=vX.Y.Z 10 | git tag -a ${NEW_VERSION} -m "Release ${NEW_VERSION}" 11 | git push origin ${NEW_VERSION} 12 | ``` 13 | 14 | ## Manually Releasing new `k8s-shredder` version 15 | 16 | For release process we're using [`goreleaser`](https://goreleaser.com/). You must install it first before being able to 17 | release a new version. 18 | Config file for `goreleaser` can be found in [goreleaser file](.goreleaser.yml) 19 | 20 | GoReleaser requires an API token with the `repo` scope selected to deploy the artifacts to GitHub. 21 | For generating a new token, you can create one from [tokens section](https://github.com/settings/tokens/new). 
For more details see 22 | [creating-a-personal-access-token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) 23 | 24 | For publishing a new release follow below steps: 25 | 26 | ``` 27 | export NEW_VERSION=vX.Y.Z 28 | git tag -a ${NEW_VERSION} -m "Release ${NEW_VERSION}" 29 | git push origin ${NEW_VERSION} 30 | 31 | export GITHUB_TOKEN= 32 | 33 | docker login ghcr.io 34 | Username: 35 | Password: 36 | 37 | make publish 38 | ``` 39 | 40 | You can check if the new release and associated artifacts were properly pushed into GitHub by accessing 41 | [k8s-shredder releases](https://github.com/adobe/k8s-shredder/releases) -------------------------------------------------------------------------------- /internal/testing/test_eviction_safety_check.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # EvictionSafetyCheck E2E Test Script 4 | # 5 | # This script runs the EvictionSafetyCheck e2e tests for k8s-shredder. 6 | # The tests verify that k8s-shredder properly validates that all pods on a parked node 7 | # have the required parking labels before proceeding with force eviction. 8 | # 9 | # Test Cases: 10 | # 1. TestEvictionSafetyCheck (Failure Case): Tests that nodes are unparked when pods lack proper labels 11 | # 2. TestEvictionSafetyCheckPasses (Success Case): Tests that force eviction proceeds when all pods are properly labeled 12 | # 13 | # The failure test includes a PodDisruptionBudget step to prevent soft eviction of the unlabeled pod, 14 | # ensuring the pod remains on the node when the safety check runs. 15 | # 16 | # Prerequisites: 17 | # - A running kind cluster with k8s-shredder deployed 18 | # - EvictionSafetyCheck enabled in the k8s-shredder configuration 19 | # - The park-node binary built and available 20 | # 21 | # Usage: 22 | # ./test_eviction_safety_check.sh 23 | # 24 | # The tests will automatically skip if: 25 | # - EvictionSafetyCheck is disabled in the k8s-shredder configuration 26 | # - Running in Karpenter or node-labels test environments (different node structures) 27 | 28 | set -e 29 | 30 | echo "Running EvictionSafetyCheck E2E Tests..." 31 | 32 | # Check if we're in the right directory 33 | if [ ! -f "internal/testing/e2e_test.go" ]; then 34 | echo "Error: This script must be run from the k8s-shredder project root" 35 | exit 1 36 | fi 37 | 38 | # Build the park-node binary if it doesn't exist 39 | if [ ! -f "park-node" ]; then 40 | echo "Building park-node binary..." 41 | make build 42 | fi 43 | 44 | # Run the e2e tests 45 | echo "Running e2e tests..." 46 | make e2e-tests 47 | 48 | echo "EvictionSafetyCheck E2E Tests completed!" 49 | -------------------------------------------------------------------------------- /pkg/utils/context.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 Adobe. All rights reserved. 3 | This file is licensed to you under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. You may obtain a copy 5 | of the License at http://www.apache.org/licenses/LICENSE-2.0 6 | Unless required by applicable law or agreed to in writing, software distributed under 7 | the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS 8 | OF ANY KIND, either express or implied. See the License for the specific language 9 | governing permissions and limitations under the License. 
10 | */ 11 | 12 | package utils 13 | 14 | import ( 15 | "context" 16 | 17 | "github.com/adobe/k8s-shredder/pkg/config" 18 | "k8s.io/client-go/dynamic" 19 | 20 | "k8s.io/client-go/kubernetes" 21 | ) 22 | 23 | // AppContext struct stores a context and a k8s client 24 | type AppContext struct { 25 | Context context.Context 26 | K8sClient kubernetes.Interface 27 | DynamicK8SClient dynamic.Interface 28 | Config config.Config 29 | dryRun bool 30 | } 31 | 32 | // NewAppContext creates a new AppContext object 33 | func NewAppContext(cfg config.Config, dryRun bool) (*AppContext, error) { 34 | client, err := getK8SClient() 35 | if err != nil { 36 | return nil, err 37 | } 38 | 39 | dynamicClient, err := getDynamicK8SClient() 40 | if err != nil { 41 | return nil, err 42 | } 43 | 44 | ctx, cancel := context.WithCancel(context.Background()) 45 | 46 | go HandleOsSignals(cancel) 47 | 48 | return &AppContext{ 49 | Context: ctx, 50 | K8sClient: client, 51 | DynamicK8SClient: dynamicClient, 52 | Config: cfg, 53 | dryRun: dryRun, 54 | }, nil 55 | } 56 | 57 | // IsDryRun returns true if the "--dry-run" flag was provided 58 | func (ac *AppContext) IsDryRun() bool { 59 | return ac.dryRun 60 | } 61 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Description 4 | 5 | 6 | 7 | ## Related Issue 8 | 9 | 10 | 11 | 12 | 13 | 14 | ## Motivation and Context 15 | 16 | 17 | 18 | ## How Has This Been Tested? 19 | 20 | 21 | 22 | 23 | 24 | ## Screenshots (if appropriate): 25 | 26 | ## Types of changes 27 | 28 | 29 | 30 | - [ ] Bug fix (non-breaking change which fixes an issue) 31 | - [ ] New feature (non-breaking change which adds functionality) 32 | - [ ] Breaking change (fix or feature that would cause existing functionality to change) 33 | 34 | ## Checklist: 35 | 36 | 37 | 38 | 39 | - [ ] I have signed the [Adobe Open Source CLA](https://opensource.adobe.com/cla.html). 40 | - [ ] My code follows the code style of this project. 41 | - [ ] My change requires a change to the documentation. 42 | - [ ] I have updated the documentation accordingly. 43 | - [ ] I have read the **CONTRIBUTING** document. 44 | - [ ] I have added tests to cover my changes. 45 | - [ ] All new and existing tests passed. 46 | -------------------------------------------------------------------------------- /charts/k8s-shredder/templates/configmap.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: {{ include "k8s-shredder.fullname" . }}-config 5 | namespace: {{ .Release.Namespace }} 6 | labels: 7 | {{ include "k8s-shredder.labels" . 
| indent 4 }} 8 | data: 9 | config.yaml: |- 10 | EvictionLoopInterval: "{{.Values.shredder.EvictionLoopInterval}}" 11 | {{- if .Values.shredder.EvictionLoopSchedule }} 12 | EvictionLoopSchedule: "{{.Values.shredder.EvictionLoopSchedule}}" 13 | {{- end }} 14 | {{- if .Values.shredder.EvictionLoopDuration }} 15 | EvictionLoopDuration: "{{.Values.shredder.EvictionLoopDuration}}" 16 | {{- end }} 17 | ParkedNodeTTL: "{{.Values.shredder.ParkedNodeTTL}}" 18 | RollingRestartThreshold: "{{.Values.shredder.RollingRestartThreshold}}" 19 | UpgradeStatusLabel: "{{.Values.shredder.UpgradeStatusLabel}}" 20 | ExpiresOnLabel: "{{.Values.shredder.ExpiresOnLabel}}" 21 | NamespacePrefixSkipInitialEviction: "{{.Values.shredder.NamespacePrefixSkipInitialEviction}}" 22 | RestartedAtAnnotation: "{{.Values.shredder.RestartedAtAnnotation}}" 23 | AllowEvictionLabel: "{{.Values.shredder.AllowEvictionLabel}}" 24 | ToBeDeletedTaint: "{{.Values.shredder.ToBeDeletedTaint}}" 25 | ArgoRolloutsAPIVersion: "{{.Values.shredder.ArgoRolloutsAPIVersion}}" 26 | EnableKarpenterDriftDetection: {{.Values.shredder.EnableKarpenterDriftDetection}} 27 | EnableKarpenterDisruptionDetection: {{.Values.shredder.EnableKarpenterDisruptionDetection}} 28 | ParkedByLabel: "{{.Values.shredder.ParkedByLabel}}" 29 | ParkedByValue: "{{.Values.shredder.ParkedByValue}}" 30 | ParkedNodeTaint: "{{.Values.shredder.ParkedNodeTaint}}" 31 | EnableNodeLabelDetection: {{.Values.shredder.EnableNodeLabelDetection}} 32 | NodeLabelsToDetect: {{.Values.shredder.NodeLabelsToDetect | toJson}} 33 | MaxParkedNodes: {{.Values.shredder.MaxParkedNodes}} 34 | EvictionSafetyCheck: {{.Values.shredder.EvictionSafetyCheck}} 35 | ParkingReasonLabel: "{{.Values.shredder.ParkingReasonLabel}}" 36 | ExtraParkingLabels: {{.Values.shredder.ExtraParkingLabels | toJson}} 37 | -------------------------------------------------------------------------------- /internal/check_license.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ueo pipefail 3 | 4 | CURRENT_YEAR=$(date +%Y) 5 | export CURRENT_YEAR 6 | 7 | # Function to update copyright year in Go files 8 | update_go_copyright_year() { 9 | local file=$1 10 | local temp_file=$(mktemp) 11 | 12 | # Check if file has a copyright header 13 | if head -n3 "$file" | grep -q "Copyright.*20[0-9]\{2\}"; then 14 | # Update the year to current year 15 | echo "Processing file: $file" 16 | # Now do the replacement 17 | sed "s/202[0-9]/$CURRENT_YEAR/g" "$file" > "$temp_file" 18 | else 19 | # Add copyright header if missing 20 | echo "// Copyright $CURRENT_YEAR Adobe. All rights reserved." > "$temp_file" 21 | cat "$file" >> "$temp_file" 22 | fi 23 | 24 | # Replace original file with modified content 25 | mv "$temp_file" "$file" 26 | } 27 | 28 | # Function to update copyright year in LICENSE file 29 | update_license_copyright_year() { 30 | local file=$1 31 | local temp_file=$(mktemp) 32 | 33 | echo "Processing LICENSE file" 34 | 35 | # Update only the line containing "Copyright 2022 Adobe" 36 | sed "s/Copyright 202[0-9] Adobe/Copyright $CURRENT_YEAR Adobe/g" "$file" > "$temp_file" 37 | 38 | # Replace original file with modified content 39 | mv "$temp_file" "$file" 40 | } 41 | 42 | export -f update_go_copyright_year 43 | export -f update_license_copyright_year 44 | 45 | # Update LICENSE file if it exists 46 | if [ -f "LICENSE" ]; then 47 | update_license_copyright_year "LICENSE" 48 | fi 49 | 50 | # Find all Go files and update their copyright headers 51 | find . -type f -iname '*.go' ! 
-path '*/vendor/*' -exec bash -c 'update_go_copyright_year "$1"' _ {} \; 52 | 53 | # Check if any files are missing the license header 54 | licRes=$( 55 | find . -type f -iname '*.go' ! -path '*/vendor/*' -exec \ 56 | sh -c 'head -n3 $1 | grep -Eq "(Copyright|generated|GENERATED)" || echo "$1"' {} {} \; 57 | ) 58 | 59 | if [ -n "${licRes}" ]; then 60 | echo -e "License header is missing in:\\n${licRes}" 61 | exit 255 62 | fi 63 | -------------------------------------------------------------------------------- /internal/testing/park_node.go: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Adobe. All rights reserved. 2 | package e2e 3 | 4 | import ( 5 | "context" 6 | "time" 7 | 8 | "github.com/adobe/k8s-shredder/pkg/config" 9 | "github.com/adobe/k8s-shredder/pkg/utils" 10 | log "github.com/sirupsen/logrus" 11 | "k8s.io/client-go/kubernetes" 12 | "k8s.io/client-go/tools/clientcmd" 13 | ) 14 | 15 | // ParkNodeForTesting properly parks a node using the ParkNodes function 16 | func ParkNodeForTesting(nodeName string, kubeconfigPath string) error { 17 | // Load kubeconfig from file without registering flags 18 | kubeconfig, err := clientcmd.LoadFromFile(kubeconfigPath) 19 | if err != nil { 20 | return err 21 | } 22 | 23 | k8sConfig, err := clientcmd.NewDefaultClientConfig(*kubeconfig, &clientcmd.ConfigOverrides{}).ClientConfig() 24 | if err != nil { 25 | return err 26 | } 27 | 28 | // Create Kubernetes client 29 | clientset, err := kubernetes.NewForConfig(k8sConfig) 30 | if err != nil { 31 | return err 32 | } 33 | 34 | // Create test configuration 35 | cfg := config.Config{ 36 | ParkedNodeTTL: 1 * time.Minute, // 1 minute TTL for testing 37 | UpgradeStatusLabel: "shredder.ethos.adobe.net/upgrade-status", 38 | ExpiresOnLabel: "shredder.ethos.adobe.net/parked-node-expires-on", 39 | ParkedByLabel: "shredder.ethos.adobe.net/parked-by", 40 | ParkedByValue: "k8s-shredder", 41 | ParkedNodeTaint: "shredder.ethos.adobe.net/upgrade-status=parked:NoSchedule", 42 | EvictionSafetyCheck: true, // Keep safety check enabled 43 | ExtraParkingLabels: map[string]string{}, 44 | ParkingReasonLabel: "shredder.ethos.adobe.net/parked-reason", 45 | } 46 | 47 | // Create logger 48 | logEntry := log.NewEntry(log.New()) 49 | 50 | // Create node info for parking 51 | nodesToPark := []utils.NodeInfo{ 52 | { 53 | Name: nodeName, 54 | Labels: map[string]string{}, 55 | }, 56 | } 57 | 58 | // Park the node (this will label both node and pods) 59 | ctx := context.Background() 60 | err = utils.ParkNodes(ctx, clientset, nodesToPark, cfg, false, "e2e-test", logEntry) 61 | if err != nil { 62 | return err 63 | } 64 | 65 | logEntry.Infof("Successfully parked node %s with proper pod labeling", nodeName) 66 | return nil 67 | } 68 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Thanks for choosing to contribute! 4 | 5 | The following are a set of guidelines to follow when contributing to this project. 6 | 7 | ## Code Of Conduct 8 | 9 | This project adheres to the Adobe [code of conduct](../CODE_OF_CONDUCT.md). By participating, 10 | you are expected to uphold this code. Please report unacceptable behavior to 11 | [Grp-opensourceoffice@adobe.com](mailto:Grp-opensourceoffice@adobe.com). 12 | 13 | ## Have A Question? 14 | 15 | Start by filing an issue. 
The existing committers on this project work to reach 16 | consensus around project direction and issue solutions within issue threads 17 | (when appropriate). 18 | 19 | ## Contributor License Agreement 20 | 21 | All third-party contributions to this project must be accompanied by a signed contributor 22 | license agreement. This gives Adobe permission to redistribute your contributions 23 | as part of the project. [Sign our CLA](https://opensource.adobe.com/cla.html). You 24 | only need to submit an Adobe CLA one time, so if you have submitted one previously, 25 | you are good to go! 26 | 27 | ## Code Reviews 28 | 29 | All submissions should come in the form of pull requests and need to be reviewed 30 | by project committers. Read [GitHub's pull request documentation](https://help.github.com/articles/about-pull-requests/) 31 | for more information on sending pull requests. 32 | 33 | Lastly, please follow the [pull request template](.github/PULL_REQUEST_TEMPLATE.md) when 34 | submitting a pull request! 35 | 36 | ## From Contributor To Committer 37 | 38 | We love contributions from our community! If you'd like to go a step beyond contributor 39 | and become a committer with full write access and a say in the project, you must 40 | be invited to the project. The existing committers employ an internal nomination 41 | process that must reach lazy consensus (silence is approval) before invitations 42 | are issued. If you feel you are qualified and want to get more deeply involved, 43 | feel free to reach out to existing committers to have a conversation about that. 44 | 45 | ## Security Issues 46 | 47 | Security issues shouldn't be reported on this issue tracker. Instead, [file an issue to our security experts](https://helpx.adobe.com/security/alertus.html). 
-------------------------------------------------------------------------------- /internal/testing/prometheus_stuffs.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: apps/v1 3 | kind: Deployment 4 | metadata: 5 | name: prometheus 6 | namespace: kube-system 7 | labels: 8 | app: prometheus 9 | spec: 10 | replicas: 1 11 | selector: 12 | matchLabels: 13 | app: prometheus 14 | template: 15 | metadata: 16 | labels: 17 | app: prometheus 18 | spec: 19 | nodeSelector: 20 | monitoring: dedicated 21 | tolerations: 22 | - key: monitoring 23 | value: dedicated 24 | effect: NoSchedule 25 | containers: 26 | - name: prometheus 27 | image: prom/prometheus:v2.54.1 28 | args: 29 | - --storage.tsdb.retention.time=1h 30 | - --config.file=/etc/prometheus/prometheus.yml 31 | - --storage.tsdb.path=/prometheus/ 32 | ports: 33 | - containerPort: 9090 34 | resources: 35 | requests: 36 | cpu: 500m 37 | memory: 500M 38 | limits: 39 | cpu: '1' 40 | memory: 1Gi 41 | volumeMounts: 42 | - name: prometheus-config-volume 43 | mountPath: /etc/prometheus/ 44 | - name: prometheus-storage-volume 45 | mountPath: /prometheus/ 46 | volumes: 47 | - name: prometheus-config-volume 48 | configMap: 49 | defaultMode: 420 50 | name: prometheus-server-conf 51 | - name: prometheus-storage-volume 52 | emptyDir: {} 53 | --- 54 | apiVersion: v1 55 | kind: Service 56 | metadata: 57 | name: prometheus 58 | namespace: kube-system 59 | spec: 60 | type: NodePort 61 | selector: 62 | app: prometheus 63 | ports: 64 | - port: 9090 65 | targetPort: 9090 66 | nodePort: 30007 67 | --- 68 | apiVersion: v1 69 | kind: ConfigMap 70 | metadata: 71 | name: prometheus-server-conf 72 | labels: 73 | name: prometheus-server-conf 74 | namespace: kube-system 75 | data: 76 | prometheus.yml: |- 77 | global: 78 | scrape_interval: 5s 79 | evaluation_interval: 5s 80 | scrape_configs: 81 | - job_name: 'k8s-shredder' 82 | static_configs: 83 | - targets: ['k8s-shredder.kube-system.svc.cluster.local:8080'] 84 | -------------------------------------------------------------------------------- /internal/testing/prometheus_stuffs_karpenter.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: apps/v1 3 | kind: Deployment 4 | metadata: 5 | name: prometheus 6 | namespace: kube-system 7 | labels: 8 | app: prometheus 9 | spec: 10 | replicas: 1 11 | selector: 12 | matchLabels: 13 | app: prometheus 14 | template: 15 | metadata: 16 | labels: 17 | app: prometheus 18 | spec: 19 | nodeSelector: 20 | monitoring: dedicated 21 | tolerations: 22 | - key: monitoring 23 | value: dedicated 24 | effect: NoSchedule 25 | containers: 26 | - name: prometheus 27 | image: prom/prometheus:v2.54.1 28 | args: 29 | - --storage.tsdb.retention.time=1h 30 | - --config.file=/etc/prometheus/prometheus.yml 31 | - --storage.tsdb.path=/prometheus/ 32 | ports: 33 | - containerPort: 9090 34 | resources: 35 | requests: 36 | cpu: 500m 37 | memory: 500M 38 | limits: 39 | cpu: '1' 40 | memory: 1Gi 41 | volumeMounts: 42 | - name: prometheus-config-volume 43 | mountPath: /etc/prometheus/ 44 | - name: prometheus-storage-volume 45 | mountPath: /prometheus/ 46 | volumes: 47 | - name: prometheus-config-volume 48 | configMap: 49 | defaultMode: 420 50 | name: prometheus-server-conf 51 | - name: prometheus-storage-volume 52 | emptyDir: {} 53 | --- 54 | apiVersion: v1 55 | kind: Service 56 | metadata: 57 | name: prometheus 58 | namespace: kube-system 59 | spec: 60 | type: NodePort 61 | selector: 62 | app: 
prometheus 63 | ports: 64 | - port: 9090 65 | targetPort: 9090 66 | nodePort: 30008 67 | --- 68 | apiVersion: v1 69 | kind: ConfigMap 70 | metadata: 71 | name: prometheus-server-conf 72 | labels: 73 | name: prometheus-server-conf 74 | namespace: kube-system 75 | data: 76 | prometheus.yml: |- 77 | global: 78 | scrape_interval: 5s 79 | evaluation_interval: 5s 80 | scrape_configs: 81 | - job_name: 'k8s-shredder' 82 | static_configs: 83 | - targets: ['k8s-shredder.kube-system.svc.cluster.local:8080'] 84 | -------------------------------------------------------------------------------- /internal/testing/prometheus_stuffs_node_labels.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: apps/v1 3 | kind: Deployment 4 | metadata: 5 | name: prometheus 6 | namespace: kube-system 7 | labels: 8 | app: prometheus 9 | spec: 10 | replicas: 1 11 | selector: 12 | matchLabels: 13 | app: prometheus 14 | template: 15 | metadata: 16 | labels: 17 | app: prometheus 18 | spec: 19 | nodeSelector: 20 | monitoring: dedicated 21 | tolerations: 22 | - key: monitoring 23 | value: dedicated 24 | effect: NoSchedule 25 | containers: 26 | - name: prometheus 27 | image: prom/prometheus:v2.54.1 28 | args: 29 | - --storage.tsdb.retention.time=1h 30 | - --config.file=/etc/prometheus/prometheus.yml 31 | - --storage.tsdb.path=/prometheus/ 32 | ports: 33 | - containerPort: 9090 34 | resources: 35 | requests: 36 | cpu: 500m 37 | memory: 500M 38 | limits: 39 | cpu: '1' 40 | memory: 1Gi 41 | volumeMounts: 42 | - name: prometheus-config-volume 43 | mountPath: /etc/prometheus/ 44 | - name: prometheus-storage-volume 45 | mountPath: /prometheus/ 46 | volumes: 47 | - name: prometheus-config-volume 48 | configMap: 49 | defaultMode: 420 50 | name: prometheus-server-conf 51 | - name: prometheus-storage-volume 52 | emptyDir: {} 53 | --- 54 | apiVersion: v1 55 | kind: Service 56 | metadata: 57 | name: prometheus 58 | namespace: kube-system 59 | spec: 60 | type: NodePort 61 | selector: 62 | app: prometheus 63 | ports: 64 | - port: 9090 65 | targetPort: 9090 66 | nodePort: 30009 67 | --- 68 | apiVersion: v1 69 | kind: ConfigMap 70 | metadata: 71 | name: prometheus-server-conf 72 | labels: 73 | name: prometheus-server-conf 74 | namespace: kube-system 75 | data: 76 | prometheus.yml: |- 77 | global: 78 | scrape_interval: 5s 79 | evaluation_interval: 5s 80 | scrape_configs: 81 | - job_name: 'k8s-shredder' 82 | static_configs: 83 | - targets: ['k8s-shredder.kube-system.svc.cluster.local:8080'] 84 | -------------------------------------------------------------------------------- /charts/k8s-shredder/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* vim: set filetype=mustache: */}} 2 | {{/* 3 | Expand the name of the chart. 4 | */}} 5 | {{- define "k8s-shredder.name" -}} 6 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} 7 | {{- end -}} 8 | 9 | {{/* 10 | Create a default fully qualified app name. 11 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 12 | If release name contains chart name it will be used as a full name. 
13 | */}} 14 | {{- define "k8s-shredder.fullname" -}} 15 | {{- if .Values.fullnameOverride -}} 16 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} 17 | {{- else -}} 18 | {{- $name := default .Chart.Name .Values.nameOverride -}} 19 | {{- if contains $name .Release.Name -}} 20 | {{- .Release.Name | trunc 63 | trimSuffix "-" -}} 21 | {{- else -}} 22 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} 23 | {{- end -}} 24 | {{- end -}} 25 | {{- end -}} 26 | 27 | {{/* 28 | Create chart name and version as used by the chart label. 29 | */}} 30 | {{- define "k8s-shredder.chart" -}} 31 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} 32 | {{- end -}} 33 | 34 | {{/* 35 | Common labels 36 | */}} 37 | {{- define "k8s-shredder.labels" -}} 38 | app.kubernetes.io/name: {{ include "k8s-shredder.name" . }} 39 | helm.sh/chart: {{ include "k8s-shredder.chart" . }} 40 | app.kubernetes.io/instance: {{ .Release.Name }} 41 | app.kubernetes.io/managed-by: {{ .Release.Service }} 42 | {{- if .Values.podLabels }} 43 | {{ toYaml .Values.podLabels }} 44 | {{- end }} 45 | {{- end -}} 46 | 47 | {{/* 48 | matchLabels 49 | */}} 50 | {{- define "k8s-shredder.matchLabels" -}} 51 | app.kubernetes.io/name: {{ include "k8s-shredder.name" . }} 52 | app.kubernetes.io/instance: {{ .Release.Name }} 53 | {{- end -}} 54 | 55 | {{/* 56 | Additional pod annotations 57 | */}} 58 | {{- define "k8s-shredder.annotations" -}} 59 | {{- if .Values.podAnnotations }} 60 | {{- toYaml .Values.podAnnotations }} 61 | {{- end }} 62 | {{- end -}} 63 | 64 | 65 | {{/* 66 | Create the name of the service account to use. 67 | */}} 68 | {{- define "k8s-shredder.serviceAccountName" -}} 69 | {{- if .Values.serviceAccount.create -}} 70 | {{ default (include "k8s-shredder.fullname" .) 
.Values.serviceAccount.name }} 71 | {{- else -}} 72 | {{ default "default" .Values.serviceAccount.name }} 73 | {{- end -}} 74 | {{- end -}} 75 | 76 | -------------------------------------------------------------------------------- /internal/testing/k8s-shredder.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: apps/v1 3 | kind: Deployment 4 | metadata: 5 | name: k8s-shredder 6 | namespace: kube-system 7 | labels: 8 | app: k8s-shredder 9 | spec: 10 | replicas: 1 11 | selector: 12 | matchLabels: 13 | app: k8s-shredder 14 | template: 15 | metadata: 16 | labels: 17 | app: k8s-shredder 18 | spec: 19 | affinity: 20 | nodeAffinity: 21 | preferredDuringSchedulingIgnoredDuringExecution: 22 | - weight: 1 23 | preference: 24 | matchExpressions: 25 | - key: node.kubernetes.io/role 26 | operator: In 27 | values: [master] 28 | tolerations: 29 | - key: node-role.kubernetes.io/control-plane 30 | operator: Exists 31 | effect: NoSchedule 32 | serviceAccountName: k8s-shredder 33 | containers: 34 | - name: k8s-shredder 35 | image: adobe/k8s-shredder:dev # replace it with a stable version 36 | args: 37 | - --config=/k8s-shredder-config/config.yaml 38 | - --metrics-port=8080 39 | - --log-level=info 40 | # For running it in dry run, without taking any real eviction actions 41 | # - "--dry-run" 42 | ports: 43 | - containerPort: 8080 44 | resources: 45 | requests: 46 | cpu: 250m 47 | memory: 250M 48 | limits: 49 | cpu: '1' 50 | memory: 1Gi 51 | volumeMounts: 52 | - name: k8s-shredder-config-volume 53 | mountPath: /k8s-shredder-config 54 | volumes: 55 | - name: k8s-shredder-config-volume 56 | configMap: 57 | defaultMode: 420 58 | name: k8s-shredder-config 59 | --- 60 | apiVersion: v1 61 | kind: ConfigMap 62 | metadata: 63 | name: k8s-shredder-config 64 | namespace: kube-system 65 | data: 66 | config.yaml: |- 67 | EvictionLoopInterval: 10s 68 | ParkedNodeTTL: 30s 69 | RollingRestartThreshold: 0.5 70 | UpgradeStatusLabel: "shredder.ethos.adobe.net/upgrade-status" 71 | ExpiresOnLabel: "shredder.ethos.adobe.net/parked-node-expires-on" 72 | NamespacePrefixSkipInitialEviction: "ns-ethos-" 73 | RestartedAtAnnotation: "shredder.ethos.adobe.net/restartedAt" 74 | AllowEvictionLabel: "shredder.ethos.adobe.net/allow-eviction" 75 | ToBeDeletedTaint: "ToBeDeletedByClusterAutoscaler" 76 | ArgoRolloutsAPIVersion: "v1alpha1" 77 | EnableKarpenterDriftDetection: false 78 | ParkedByLabel: "shredder.ethos.adobe.net/parked-by" 79 | ParkedByValue: "k8s-shredder" 80 | ParkedNodeTaint: "shredder.ethos.adobe.net/upgrade-status=parked:NoSchedule" 81 | EnableNodeLabelDetection: false 82 | NodeLabelsToDetect: [] 83 | MaxParkedNodes: "0" 84 | --- 85 | apiVersion: v1 86 | kind: Service 87 | metadata: 88 | name: k8s-shredder 89 | namespace: kube-system 90 | spec: 91 | selector: 92 | app: k8s-shredder 93 | ports: 94 | - port: 8080 95 | targetPort: 8080 96 | -------------------------------------------------------------------------------- /internal/testing/k8s-shredder-karpenter.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: apps/v1 3 | kind: Deployment 4 | metadata: 5 | name: k8s-shredder 6 | namespace: kube-system 7 | labels: 8 | app: k8s-shredder 9 | spec: 10 | replicas: 1 11 | selector: 12 | matchLabels: 13 | app: k8s-shredder 14 | template: 15 | metadata: 16 | labels: 17 | app: k8s-shredder 18 | spec: 19 | affinity: 20 | nodeAffinity: 21 | preferredDuringSchedulingIgnoredDuringExecution: 22 | - weight: 1 23 | preference: 
24 | matchExpressions: 25 | - key: node.kubernetes.io/role 26 | operator: In 27 | values: [master] 28 | tolerations: 29 | - key: node-role.kubernetes.io/control-plane 30 | operator: Exists 31 | effect: NoSchedule 32 | serviceAccountName: k8s-shredder 33 | containers: 34 | - name: k8s-shredder 35 | image: adobe/k8s-shredder:dev # replace it with a stable version 36 | args: 37 | - --config=/k8s-shredder-config/config.yaml 38 | - --metrics-port=8080 39 | - --log-level=debug 40 | # For running it in dry run, without taking any real eviction actions 41 | # - "--dry-run" 42 | ports: 43 | - containerPort: 8080 44 | resources: 45 | requests: 46 | cpu: 250m 47 | memory: 250M 48 | limits: 49 | cpu: '1' 50 | memory: 1Gi 51 | volumeMounts: 52 | - name: k8s-shredder-config-volume 53 | mountPath: /k8s-shredder-config 54 | volumes: 55 | - name: k8s-shredder-config-volume 56 | configMap: 57 | defaultMode: 420 58 | name: k8s-shredder-config 59 | --- 60 | apiVersion: v1 61 | kind: ConfigMap 62 | metadata: 63 | name: k8s-shredder-config 64 | namespace: kube-system 65 | data: 66 | config.yaml: |- 67 | EvictionLoopInterval: 30s 68 | ParkedNodeTTL: 2m 69 | RollingRestartThreshold: 0.5 70 | UpgradeStatusLabel: "shredder.ethos.adobe.net/upgrade-status" 71 | ExpiresOnLabel: "shredder.ethos.adobe.net/parked-node-expires-on" 72 | NamespacePrefixSkipInitialEviction: "ns-ethos-" 73 | RestartedAtAnnotation: "shredder.ethos.adobe.net/restartedAt" 74 | AllowEvictionLabel: "shredder.ethos.adobe.net/allow-eviction" 75 | ToBeDeletedTaint: "ToBeDeletedByClusterAutoscaler" 76 | ArgoRolloutsAPIVersion: "v1alpha1" 77 | EnableKarpenterDriftDetection: true 78 | ParkedByLabel: "shredder.ethos.adobe.net/parked-by" 79 | ParkedByValue: "k8s-shredder" 80 | ParkedNodeTaint: "shredder.ethos.adobe.net/upgrade-status=parked:NoSchedule" 81 | EnableNodeLabelDetection: false 82 | NodeLabelsToDetect: [] 83 | MaxParkedNodes: "0" 84 | --- 85 | apiVersion: v1 86 | kind: Service 87 | metadata: 88 | name: k8s-shredder 89 | namespace: kube-system 90 | spec: 91 | selector: 92 | app: k8s-shredder 93 | ports: 94 | - port: 8080 95 | targetPort: 8080 96 | -------------------------------------------------------------------------------- /pkg/utils/k8s.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 Adobe. All rights reserved. 3 | This file is licensed to you under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. You may obtain a copy 5 | of the License at http://www.apache.org/licenses/LICENSE-2.0 6 | Unless required by applicable law or agreed to in writing, software distributed under 7 | the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS 8 | OF ANY KIND, either express or implied. See the License for the specific language 9 | governing permissions and limitations under the License. 
10 | */ 11 | 12 | package utils 13 | 14 | import ( 15 | "strconv" 16 | "time" 17 | 18 | "github.com/pkg/errors" 19 | v1 "k8s.io/api/core/v1" 20 | "k8s.io/client-go/dynamic" 21 | "k8s.io/client-go/kubernetes" 22 | "sigs.k8s.io/controller-runtime/pkg/client/config" 23 | ) 24 | 25 | func getK8SClient() (*kubernetes.Clientset, error) { 26 | cfg, err := config.GetConfig() 27 | if err != nil { 28 | return nil, err 29 | } 30 | 31 | client, err := kubernetes.NewForConfig(cfg) 32 | if err != nil { 33 | return nil, err 34 | } 35 | 36 | return client, nil 37 | } 38 | 39 | func getDynamicK8SClient() (*dynamic.DynamicClient, error) { 40 | cfg, err := config.GetConfig() 41 | if err != nil { 42 | return nil, err 43 | } 44 | 45 | // Create a dynamic client 46 | dynamicClient, err := dynamic.NewForConfig(cfg) 47 | if err != nil { 48 | return nil, errors.Errorf("Error creating dynamic client: %v", err) 49 | } 50 | 51 | return dynamicClient, nil 52 | } 53 | 54 | // NodeHasTaint check if a node has a taint set 55 | func NodeHasTaint(node v1.Node, key string) bool { 56 | for _, taint := range node.Spec.Taints { 57 | if taint.Key == key { 58 | return true 59 | } 60 | } 61 | return false 62 | } 63 | 64 | // NodeHasLabel check if a node has a specific label set 65 | func NodeHasLabel(node v1.Node, key string) bool { 66 | for k := range node.Labels { 67 | if k == key { 68 | return true 69 | } 70 | } 71 | return false 72 | } 73 | 74 | // PodEvictionAllowed check if a pod has the `skipEvictionLabel`=false label set 75 | func PodEvictionAllowed(pod v1.Pod, skipEvictionLabel string) bool { 76 | if PodHasLabel(pod, skipEvictionLabel) { 77 | if pod.Labels[skipEvictionLabel] == "false" { 78 | return false 79 | } 80 | } 81 | return true 82 | } 83 | 84 | // PodHasLabel check if a pod has a specific label set 85 | func PodHasLabel(pod v1.Pod, key string) bool { 86 | for k := range pod.Labels { 87 | if k == key { 88 | return true 89 | } 90 | } 91 | return false 92 | } 93 | 94 | // GetParkedNodeExpiryTime get the time a parked node TTL expires 95 | func GetParkedNodeExpiryTime(node v1.Node, expiresOnLabel string) (time.Time, error) { 96 | i, err := strconv.ParseFloat(node.Labels[expiresOnLabel], 64) 97 | if err != nil { 98 | return time.Now().UTC(), errors.Errorf("Failed to parse label %s with value %s", expiresOnLabel, node.Labels[expiresOnLabel]) 99 | } 100 | return time.Unix(int64(i), 0).UTC(), nil 101 | } 102 | -------------------------------------------------------------------------------- /internal/testing/k8s-shredder-node-labels.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: apps/v1 3 | kind: Deployment 4 | metadata: 5 | name: k8s-shredder 6 | namespace: kube-system 7 | labels: 8 | app: k8s-shredder 9 | spec: 10 | replicas: 1 11 | selector: 12 | matchLabels: 13 | app: k8s-shredder 14 | template: 15 | metadata: 16 | labels: 17 | app: k8s-shredder 18 | spec: 19 | affinity: 20 | nodeAffinity: 21 | preferredDuringSchedulingIgnoredDuringExecution: 22 | - weight: 1 23 | preference: 24 | matchExpressions: 25 | - key: node.kubernetes.io/role 26 | operator: In 27 | values: [master] 28 | tolerations: 29 | - key: node-role.kubernetes.io/control-plane 30 | operator: Exists 31 | effect: NoSchedule 32 | serviceAccountName: k8s-shredder 33 | containers: 34 | - name: k8s-shredder 35 | image: adobe/k8s-shredder:dev # replace it with a stable version 36 | args: 37 | - --config=/k8s-shredder-config/config.yaml 38 | - --metrics-port=8080 39 | - --log-level=info 40 | # For 
running it in dry run, without taking any real eviction actions 41 | # - "--dry-run" 42 | ports: 43 | - containerPort: 8080 44 | resources: 45 | requests: 46 | cpu: 250m 47 | memory: 250M 48 | limits: 49 | cpu: '1' 50 | memory: 1Gi 51 | volumeMounts: 52 | - name: k8s-shredder-config-volume 53 | mountPath: /k8s-shredder-config 54 | volumes: 55 | - name: k8s-shredder-config-volume 56 | configMap: 57 | defaultMode: 420 58 | name: k8s-shredder-config 59 | --- 60 | apiVersion: v1 61 | kind: ConfigMap 62 | metadata: 63 | name: k8s-shredder-config 64 | namespace: kube-system 65 | data: 66 | config.yaml: |- 67 | EvictionLoopInterval: 30s 68 | ParkedNodeTTL: 2m 69 | RollingRestartThreshold: 0.5 70 | UpgradeStatusLabel: "shredder.ethos.adobe.net/upgrade-status" 71 | ExpiresOnLabel: "shredder.ethos.adobe.net/parked-node-expires-on" 72 | NamespacePrefixSkipInitialEviction: "ns-ethos-" 73 | RestartedAtAnnotation: "shredder.ethos.adobe.net/restartedAt" 74 | AllowEvictionLabel: "shredder.ethos.adobe.net/allow-eviction" 75 | ToBeDeletedTaint: "ToBeDeletedByClusterAutoscaler" 76 | ArgoRolloutsAPIVersion: "v1alpha1" 77 | EnableKarpenterDriftDetection: false 78 | ParkedByLabel: "shredder.ethos.adobe.net/parked-by" 79 | ParkedByValue: "k8s-shredder" 80 | ParkedNodeTaint: "shredder.ethos.adobe.net/upgrade-status=parked:NoSchedule" 81 | EnableNodeLabelDetection: true 82 | NodeLabelsToDetect: 83 | - "test-label" 84 | - "maintenance=scheduled" 85 | - "node.test.io/park" 86 | MaxParkedNodes: "0" 87 | --- 88 | apiVersion: v1 89 | kind: Service 90 | metadata: 91 | name: k8s-shredder 92 | namespace: kube-system 93 | spec: 94 | selector: 95 | app: k8s-shredder 96 | ports: 97 | - port: 8080 98 | targetPort: 8080 99 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | name: CI tests 3 | on: pull_request 4 | jobs: 5 | ci: 6 | runs-on: ubuntu-latest 7 | name: ci 8 | steps: 9 | - name: Checkout 10 | uses: actions/checkout@v6 11 | - name: Setup Go 12 | uses: actions/setup-go@v6 13 | with: 14 | go-version: '1.25' 15 | - name: Run Gosec Security Scanner 16 | uses: securego/gosec@master 17 | with: 18 | args: -quiet -exclude=G107 ./... 19 | - name: Run golangci-lint 20 | uses: golangci/golangci-lint-action@v9 21 | with: 22 | # Optional: version of golangci-lint to use in form of v1.2 or v1.2.3 or `latest` to use the latest version 23 | # version: v1.46 24 | args: -v --timeout 5m --no-config ./... 25 | - name: Install k8s Kind Cluster 26 | uses: helm/kind-action@v1.13.0 27 | with: 28 | install_only: true 29 | version: v0.29.0 30 | - name: Prepare test environment 31 | run: make local-test 32 | - name: Run e2e tests 33 | run: make e2e-tests 34 | ci-karpenter: 35 | runs-on: ubuntu-latest 36 | name: ci-karpenter 37 | steps: 38 | - name: Checkout 39 | uses: actions/checkout@v6 40 | - name: Setup Go 41 | uses: actions/setup-go@v6 42 | with: 43 | go-version: '1.25' 44 | - name: Run Gosec Security Scanner 45 | uses: securego/gosec@master 46 | with: 47 | args: -quiet -exclude=G107 ./... 48 | - name: Run golangci-lint 49 | uses: golangci/golangci-lint-action@v9 50 | with: 51 | # Optional: version of golangci-lint to use in form of v1.2 or v1.2.3 or `latest` to use the latest version 52 | # version: v1.46 53 | args: -v --timeout 5m --no-config ./... 
54 | - name: Install k8s Kind Cluster 55 | uses: helm/kind-action@v1.13.0 56 | with: 57 | install_only: true 58 | version: v0.29.0 59 | - name: Prepare test environment 60 | run: make local-test-karpenter 61 | - name: Run e2e tests 62 | run: make e2e-tests 63 | ci-node-labels: 64 | runs-on: ubuntu-latest 65 | name: ci-node-labels 66 | steps: 67 | - name: Checkout 68 | uses: actions/checkout@v6 69 | - name: Setup Go 70 | uses: actions/setup-go@v6 71 | with: 72 | go-version: '1.25' 73 | - name: Run Gosec Security Scanner 74 | uses: securego/gosec@master 75 | with: 76 | args: -quiet -exclude=G107 ./... 77 | - name: Run golangci-lint 78 | uses: golangci/golangci-lint-action@v9 79 | with: 80 | # Optional: version of golangci-lint to use in form of v1.2 or v1.2.3 or `latest` to use the latest version 81 | # version: v1.46 82 | args: -v --timeout 5m --no-config ./... 83 | - name: Install k8s Kind Cluster 84 | uses: helm/kind-action@v1.13.0 85 | with: 86 | install_only: true 87 | version: v0.29.0 88 | - name: Prepare test environment 89 | run: make local-test-node-labels 90 | - name: Run e2e tests 91 | run: make e2e-tests 92 | -------------------------------------------------------------------------------- /pkg/metrics/types.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 Adobe. All rights reserved. 3 | This file is licensed to you under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. You may obtain a copy 5 | of the License at http://www.apache.org/licenses/LICENSE-2.0 6 | Unless required by applicable law or agreed to in writing, software distributed under 7 | the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS 8 | OF ANY KIND, either express or implied. See the License for the specific language 9 | governing permissions and limitations under the License. 10 | */ 11 | 12 | package metrics 13 | 14 | import ( 15 | "fmt" 16 | "net/http" 17 | "time" 18 | 19 | log "github.com/sirupsen/logrus" 20 | 21 | "github.com/prometheus/client_golang/prometheus" 22 | "github.com/prometheus/client_golang/prometheus/promhttp" 23 | ) 24 | 25 | // Init .. 
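// Init registers all of the k8s-shredder Prometheus collectors and starts the HTTP server that exposes them on /metrics (plus a /healthz endpoint) on the given port.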
26 | func Init(port int) error { 27 | if err := registerMetrics(); err != nil { 28 | return err 29 | } 30 | if err := serve(port); err != nil { 31 | return err 32 | } 33 | return nil 34 | } 35 | 36 | func registerMetrics() error { 37 | prometheus.MustRegister(ShredderAPIServerRequestsTotal) 38 | prometheus.MustRegister(ShredderAPIServerRequestsDurationSeconds) 39 | prometheus.MustRegister(ShredderLoopsTotal) 40 | prometheus.MustRegister(ShredderLoopsDurationSeconds) 41 | prometheus.MustRegister(ShredderProcessedNodesTotal) 42 | prometheus.MustRegister(ShredderProcessedPodsTotal) 43 | prometheus.MustRegister(ShredderErrorsTotal) 44 | prometheus.MustRegister(ShredderPodErrorsTotal) 45 | prometheus.MustRegister(ShredderNodeForceToEvictTime) 46 | prometheus.MustRegister(ShredderPodForceToEvictTime) 47 | prometheus.MustRegister(ShredderKarpenterDriftedNodesTotal) 48 | prometheus.MustRegister(ShredderKarpenterDisruptedNodesTotal) 49 | prometheus.MustRegister(ShredderKarpenterNodesParkedTotal) 50 | prometheus.MustRegister(ShredderKarpenterNodesParkingFailedTotal) 51 | prometheus.MustRegister(ShredderKarpenterProcessingDurationSeconds) 52 | prometheus.MustRegister(ShredderNodeLabelNodesParkedTotal) 53 | prometheus.MustRegister(ShredderNodeLabelNodesParkingFailedTotal) 54 | prometheus.MustRegister(ShredderNodeLabelProcessingDurationSeconds) 55 | prometheus.MustRegister(ShredderNodeLabelMatchingNodesTotal) 56 | prometheus.MustRegister(ShredderNodesParkedTotal) 57 | prometheus.MustRegister(ShredderNodesParkingFailedTotal) 58 | prometheus.MustRegister(ShredderProcessingDurationSeconds) 59 | return nil 60 | } 61 | 62 | func serve(port int) error { 63 | http.Handle("/metrics", promhttp.HandlerFor( 64 | prometheus.DefaultGatherer, 65 | promhttp.HandlerOpts{ 66 | EnableOpenMetrics: true, 67 | }, 68 | )) 69 | 70 | http.HandleFunc("/healthz", func(res http.ResponseWriter, req *http.Request) { 71 | res.WriteHeader(200) 72 | _, err := res.Write([]byte("OK")) 73 | if err != nil { 74 | log.Errorln("Error while replying to /healthz request:", err) 75 | } 76 | }) 77 | 78 | server := &http.Server{ 79 | Addr: fmt.Sprintf(":%d", port), 80 | ReadHeaderTimeout: 3 * time.Second, 81 | } 82 | 83 | go func() { 84 | log.Fatal(server.ListenAndServe(), nil) 85 | }() 86 | return nil 87 | } 88 | -------------------------------------------------------------------------------- /internal/testing/karpenter-manifests.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: karpenter.sh/v1 3 | kind: NodePool 4 | metadata: 5 | name: default-nodepool 6 | spec: 7 | template: 8 | spec: 9 | requirements: 10 | - key: kubernetes.io/arch 11 | operator: In 12 | values: [amd64] 13 | - key: kubernetes.io/os 14 | operator: In 15 | values: [linux] 16 | - key: karpenter.sh/capacity-type 17 | operator: In 18 | values: [spot, on-demand] 19 | - key: node.kubernetes.io/instance-type 20 | operator: In 21 | values: [m5.large, m5.xlarge] 22 | nodeClassRef: 23 | group: karpenter.k8s.aws 24 | kind: EC2NodeClass 25 | name: default-nodeclass 26 | taints: 27 | - key: example.com/special-taint 28 | value: special-value 29 | effect: NoSchedule 30 | limits: 31 | cpu: 1000 32 | disruption: 33 | consolidationPolicy: WhenEmpty 34 | consolidateAfter: 30s 35 | --- 36 | apiVersion: karpenter.sh/v1 37 | kind: NodePool 38 | metadata: 39 | name: test-nodepool 40 | spec: 41 | template: 42 | spec: 43 | requirements: 44 | - key: kubernetes.io/arch 45 | operator: In 46 | values: [amd64] 47 | - key: kubernetes.io/os 48 | 
operator: In 49 | values: [linux] 50 | - key: karpenter.sh/capacity-type 51 | operator: In 52 | values: [spot] 53 | - key: node.kubernetes.io/instance-type 54 | operator: In 55 | values: [m5.large] 56 | nodeClassRef: 57 | group: karpenter.k8s.aws 58 | kind: EC2NodeClass 59 | name: test-nodeclass 60 | taints: 61 | - key: example.com/test-taint 62 | value: test-value 63 | effect: NoSchedule 64 | limits: 65 | cpu: 500 66 | disruption: 67 | consolidationPolicy: WhenEmpty 68 | consolidateAfter: 30s 69 | --- 70 | apiVersion: karpenter.k8s.aws/v1 71 | kind: EC2NodeClass 72 | metadata: 73 | name: default-nodeclass 74 | spec: 75 | role: KarpenterNodeRole-k8s-shredder-test-cluster 76 | amiFamily: AL2 77 | subnetSelectorTerms: 78 | - tags: 79 | karpenter.sh/discovery: k8s-shredder-test-cluster 80 | securityGroupSelectorTerms: 81 | - tags: 82 | karpenter.sh/discovery: k8s-shredder-test-cluster 83 | instanceStorePolicy: RAID0 84 | userData: | 85 | #!/bin/bash 86 | /etc/eks/bootstrap.sh k8s-shredder-test-cluster 87 | echo "NodeClass: default-nodeclass" >> /etc/kubernetes/kubelet/kubelet-config.json 88 | --- 89 | apiVersion: karpenter.k8s.aws/v1 90 | kind: EC2NodeClass 91 | metadata: 92 | name: test-nodeclass 93 | spec: 94 | role: KarpenterNodeRole-k8s-shredder-test-cluster 95 | amiFamily: AL2 96 | subnetSelectorTerms: 97 | - tags: 98 | karpenter.sh/discovery: k8s-shredder-test-cluster 99 | securityGroupSelectorTerms: 100 | - tags: 101 | karpenter.sh/discovery: k8s-shredder-test-cluster 102 | instanceStorePolicy: RAID0 103 | userData: |- 104 | #!/bin/bash 105 | /etc/eks/bootstrap.sh k8s-shredder-test-cluster 106 | echo "NodeClass: test-nodeclass" >> /etc/kubernetes/kubelet/kubelet-config.json 107 | -------------------------------------------------------------------------------- /.goreleaser.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: 2 3 | project_name: k8s-shredder 4 | release: 5 | github: 6 | owner: adobe 7 | name: k8s-shredder 8 | builds: 9 | - id: k8s-shredder 10 | goos: [linux, windows, darwin] 11 | goarch: [amd64, '386', arm64] 12 | env: [CGO_ENABLED=0] 13 | main: . 
14 | ldflags: 15 | - -s -w -X github.com/adobe/k8s-shredder/cmd.buildVersion=v{{.Version}} -X github.com/adobe/k8s-shredder/cmd.gitSHA={{.Commit}} 16 | -X github.com/adobe/k8s-shredder/cmd.buildTime={{.Date}} 17 | flags: [-trimpath] 18 | binary: k8s-shredder 19 | # signs the checksum file 20 | # all files (including the sboms) are included in the checksum, so we don't need to sign each one if we don't want to 21 | # https://goreleaser.com/customization/sign 22 | signs: 23 | - cmd: cosign 24 | env: [COSIGN_EXPERIMENTAL=1] 25 | signature: ${artifact}.bundle 26 | args: 27 | - sign-blob 28 | - --bundle=${signature} 29 | - ${artifact} 30 | - --yes # needed on cosign 2.0.0+ 31 | artifacts: checksum 32 | output: true 33 | dockers: 34 | - image_templates: ['ghcr.io/adobe/{{ .ProjectName }}:v{{ .Version }}-amd64'] 35 | use: buildx 36 | dockerfile: Dockerfile 37 | build_flag_templates: 38 | - --platform=linux/amd64 39 | - --label=org.opencontainers.image.title={{ .ProjectName }} 40 | - --label=org.opencontainers.image.description={{ .ProjectName }} 41 | - --label=org.opencontainers.image.url=https://github.com/adobe/{{ .ProjectName }} 42 | - --label=org.opencontainers.image.source=https://github.com/adobe/{{ .ProjectName }} 43 | - --label=org.opencontainers.image.version=v{{ .Version }} 44 | - --label=org.opencontainers.image.created={{ .Date }} 45 | - --label=org.opencontainers.image.revision={{ .FullCommit }} 46 | - --label=org.opencontainers.image.licenses=Apache-2.0 47 | - image_templates: 48 | - ghcr.io/adobe/{{ .ProjectName }}:v{{ .Version }}-arm64v8 49 | use: buildx 50 | goarch: arm64 51 | dockerfile: Dockerfile 52 | build_flag_templates: 53 | - --platform=linux/arm64/v8 54 | - --label=org.opencontainers.image.title={{ .ProjectName }} 55 | - --label=org.opencontainers.image.description={{ .ProjectName }} 56 | - --label=org.opencontainers.image.url=https://github.com/adobe/{{ .ProjectName }} 57 | - --label=org.opencontainers.image.source=https://github.com/adobe/{{ .ProjectName }} 58 | - --label=org.opencontainers.image.version=v{{ .Version }} 59 | - --label=org.opencontainers.image.created={{ .Date }} 60 | - --label=org.opencontainers.image.revision={{ .FullCommit }} 61 | - --label=org.opencontainers.image.licenses=Apache-2.0 62 | docker_manifests: 63 | - name_template: ghcr.io/adobe/{{.ProjectName}}:v{{.Version}} 64 | image_templates: 65 | - ghcr.io/adobe/{{.ProjectName}}:v{{.Version}}-amd64 66 | - ghcr.io/adobe/{{.ProjectName}}:v{{.Version}}-arm64v8 67 | - name_template: ghcr.io/adobe/{{.ProjectName}}:latest 68 | image_templates: 69 | - ghcr.io/adobe/{{.ProjectName}}:v{{.Version}}-amd64 70 | - ghcr.io/adobe/{{.ProjectName}}:v{{.Version}}-arm64v8 71 | # signs our docker image 72 | # https://goreleaser.com/customization/docker_sign 73 | docker_signs: 74 | - cmd: cosign 75 | env: [COSIGN_EXPERIMENTAL=1] 76 | artifacts: images 77 | output: true 78 | args: 79 | - sign 80 | - ${artifact} 81 | - --yes # needed on cosign 2.0.0+ 82 | -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # K8s-shredder configuration file with default values 3 | # See README.md for detailed description of each option 4 | 5 | # Core eviction settings 6 | EvictionLoopInterval: 60s # How often to run the eviction loop process 7 | ParkedNodeTTL: 60m # Time a node can be parked before starting force eviction process 8 | RollingRestartThreshold: 0.5 # How much time(percentage) should pass 
from ParkedNodeTTL before starting the rollout restart process 9 | # Node and pod labeling 10 | UpgradeStatusLabel: shredder.ethos.adobe.net/upgrade-status # Label used for identifying parked nodes 11 | ExpiresOnLabel: shredder.ethos.adobe.net/parked-node-expires-on # Label used for identifying the TTL for parked nodes 12 | ParkedByLabel: shredder.ethos.adobe.net/parked-by # Label used to identify which component parked the node 13 | ParkedByValue: k8s-shredder # Value to set for the ParkedByLabel 14 | # Eviction behavior 15 | NamespacePrefixSkipInitialEviction: '' # For pods in namespaces having this prefix proceed directly with a rollout restart without waiting for the RollingRestartThreshold 16 | RestartedAtAnnotation: shredder.ethos.adobe.net/restartedAt # Annotation name used to mark a controller object for rollout restart 17 | AllowEvictionLabel: shredder.ethos.adobe.net/allow-eviction # Label used for skipping evicting pods that have explicitly set this label on false 18 | # Node management 19 | ToBeDeletedTaint: ToBeDeletedByClusterAutoscaler # Node taint used for skipping a subset of parked nodes that are already handled by cluster-autoscaler 20 | ParkedNodeTaint: shredder.ethos.adobe.net/upgrade-status=parked:NoSchedule # Taint to apply to parked nodes in format key=value:effect 21 | # Argo Rollouts integration 22 | ArgoRolloutsAPIVersion: v1alpha1 # API version from argoproj.io API group to be used while handling Argo Rollouts objects 23 | # Karpenter integration 24 | EnableKarpenterDriftDetection: false # Controls whether to scan for drifted Karpenter NodeClaims and automatically label their nodes 25 | EnableKarpenterDisruptionDetection: false # Controls whether to scan for disrupted Karpenter NodeClaims and automatically label their nodes 26 | # Node label detection 27 | EnableNodeLabelDetection: false # Controls whether to scan for nodes with specific labels and automatically park them 28 | NodeLabelsToDetect: [] # List of node labels to detect. Supports both key-only and key=value formats 29 | # Examples: 30 | # - "maintenance" # Matches any node with the "maintenance" label (any value) 31 | # - "upgrade=required" # Matches nodes with label "upgrade" set to "required" 32 | # - "node.example.com/park" # Matches any node with the "node.example.com/park" label 33 | 34 | # Parking limits 35 | MaxParkedNodes: '0' # Maximum number of nodes that can be parked simultaneously. Can be an integer (e.g., "5") or percentage (e.g., "20%"). Set to "0" (default) for no limit. 36 | 37 | # Extra labels to apply to parked nodes and pods 38 | # ExtraParkingLabels: # (optional) Additional labels to apply to nodes and pods during parking 39 | # example.com/owner: "infrastructure" 40 | # example.com/maintenance: "true" 41 | 42 | # Safety settings 43 | EvictionSafetyCheck: true # Controls whether to perform safety checks before force eviction. If true, nodes will be unparked if pods don't have required parking labels. 
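# Optional eviction window scheduling (illustrative, commented-out sketch based on the
# EvictionLoopSchedule/EvictionLoopDuration options documented in pkg/config/config.go;
# neither option is set by default). When configured, parking and shredding only happen
# while the scheduled window is active, and EvictionLoopDuration must be set as well.
# EvictionLoopSchedule: "0 2 * * *" # cron syntax or macros such as @daily (here: 2 AM UTC daily)
# EvictionLoopDuration: "10h" # how long the window stays active after the schedule triggers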
44 | 45 | # Parking reason tracking 46 | ParkingReasonLabel: shredder.ethos.adobe.net/parked-reason # Label used to track why a node or pod was parked 47 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Adobe Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, gender identity and expression, level of experience, 9 | nationality, personal appearance, race, religion, or sexual identity and 10 | orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language. 18 | * Being respectful of differing viewpoints and experiences. 19 | * Gracefully accepting constructive criticism. 20 | * Focusing on what is best for the community. 21 | * Showing empathy towards other community members. 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances. 27 | * Trolling, insulting/derogatory comments, and personal or political attacks. 28 | * Public or private harassment. 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission. 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting. 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at Grp-opensourceoffice@adobe.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 
63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at [https://contributor-covenant.org/version/1/4][version]. 72 | 73 | [homepage]: https://contributor-covenant.org 74 | [version]: https://contributor-covenant.org/version/1/4/ -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/adobe/k8s-shredder 2 | 3 | go 1.25.0 4 | 5 | toolchain go1.25.5 6 | 7 | require ( 8 | github.com/fsnotify/fsnotify v1.9.0 9 | github.com/go-co-op/gocron/v2 v2.19.0 10 | github.com/google/uuid v1.6.0 11 | github.com/pkg/errors v0.9.1 12 | github.com/prometheus/client_golang v1.23.2 13 | github.com/prometheus/common v0.67.4 14 | github.com/robfig/cron/v3 v3.0.1 15 | github.com/sirupsen/logrus v1.9.3 16 | github.com/spf13/cobra v1.10.2 17 | github.com/spf13/viper v1.21.0 18 | github.com/stretchr/testify v1.11.1 19 | golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93 20 | k8s.io/api v0.35.0 21 | k8s.io/apimachinery v0.35.0 22 | k8s.io/client-go v0.35.0 23 | k8s.io/kubectl v0.35.0 24 | k8s.io/utils v0.0.0-20251220205832-9d40a56c1308 25 | sigs.k8s.io/controller-runtime v0.22.4 26 | ) 27 | 28 | require ( 29 | github.com/Masterminds/semver/v3 v3.4.0 // indirect 30 | github.com/beorn7/perks v1.0.1 // indirect 31 | github.com/blang/semver/v4 v4.0.0 // indirect 32 | github.com/cespare/xxhash/v2 v2.3.0 // indirect 33 | github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect 34 | github.com/emicklei/go-restful/v3 v3.12.2 // indirect 35 | github.com/fxamacker/cbor/v2 v2.9.0 // indirect 36 | github.com/go-errors/errors v1.4.2 // indirect 37 | github.com/go-logr/logr v1.4.3 // indirect 38 | github.com/go-openapi/jsonpointer v0.21.0 // indirect 39 | github.com/go-openapi/jsonreference v0.20.2 // indirect 40 | github.com/go-openapi/swag v0.23.0 // indirect 41 | github.com/go-viper/mapstructure/v2 v2.4.0 // indirect 42 | github.com/gogo/protobuf v1.3.2 // indirect 43 | github.com/google/gnostic-models v0.7.0 // indirect 44 | github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 // indirect 45 | github.com/inconshreveable/mousetrap v1.1.0 // indirect 46 | github.com/jonboulle/clockwork v0.5.0 // indirect 47 | github.com/josharian/intern v1.0.0 // indirect 48 | github.com/json-iterator/go v1.1.12 // indirect 49 | github.com/mailru/easyjson v0.7.7 // indirect 50 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect 51 | github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect 52 | github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 // indirect 53 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect 54 | github.com/pelletier/go-toml/v2 v2.2.4 // indirect 55 | github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect 56 | github.com/prometheus/client_model v0.6.2 // indirect 57 | github.com/prometheus/procfs v0.16.1 // indirect 58 | github.com/sagikazarmark/locafero v0.11.0 // indirect 59 | github.com/sourcegraph/conc v0.3.1-0.20240121214520-5f936abd7ae8 // indirect 60 | github.com/spf13/afero v1.15.0 // indirect 61 | github.com/spf13/cast v1.10.0 // indirect 62 | 
github.com/spf13/pflag v1.0.10 // indirect 63 | github.com/subosito/gotenv v1.6.0 // indirect 64 | github.com/x448/float16 v0.8.4 // indirect 65 | github.com/xlab/treeprint v1.2.0 // indirect 66 | go.yaml.in/yaml/v2 v2.4.3 // indirect 67 | go.yaml.in/yaml/v3 v3.0.4 // indirect 68 | golang.org/x/net v0.47.0 // indirect 69 | golang.org/x/oauth2 v0.32.0 // indirect 70 | golang.org/x/sync v0.19.0 // indirect 71 | golang.org/x/sys v0.38.0 // indirect 72 | golang.org/x/term v0.37.0 // indirect 73 | golang.org/x/text v0.31.0 // indirect 74 | golang.org/x/time v0.11.0 // indirect 75 | google.golang.org/protobuf v1.36.10 // indirect 76 | gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect 77 | gopkg.in/inf.v0 v0.9.1 // indirect 78 | gopkg.in/yaml.v3 v3.0.1 // indirect 79 | k8s.io/cli-runtime v0.35.0 // indirect 80 | k8s.io/klog/v2 v2.130.1 // indirect 81 | k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect 82 | sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect 83 | sigs.k8s.io/kustomize/api v0.20.1 // indirect 84 | sigs.k8s.io/kustomize/kyaml v0.20.1 // indirect 85 | sigs.k8s.io/randfill v1.0.0 // indirect 86 | sigs.k8s.io/structured-merge-diff/v6 v6.3.0 // indirect 87 | sigs.k8s.io/yaml v1.6.0 // indirect 88 | ) 89 | -------------------------------------------------------------------------------- /charts/k8s-shredder/templates/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: {{ include "k8s-shredder.fullname" . }} 5 | namespace: {{ .Release.Namespace }} 6 | labels: 7 | {{ include "k8s-shredder.labels" . | indent 4 }} 8 | {{- if .Values.podAnnotations }} 9 | annotations: 10 | {{ include "k8s-shredder.annotations" . | indent 4 }} 11 | {{- end }} 12 | spec: 13 | replicas: {{ .Values.replicaCount }} 14 | {{- with .Values.deploymentStrategy }} 15 | strategy: 16 | {{- toYaml . | nindent 4 }} 17 | {{- end }} 18 | selector: 19 | matchLabels: 20 | {{ include "k8s-shredder.matchLabels" . | indent 6 }} 21 | template: 22 | metadata: 23 | labels: 24 | {{ include "k8s-shredder.labels" . | indent 8 }} 25 | {{- if .Values.podAnnotations }} 26 | annotations: 27 | {{ include "k8s-shredder.annotations" . | indent 8 }} 28 | {{- end }} 29 | spec: 30 | {{- with .Values.imagePullSecrets }} 31 | imagePullSecrets: 32 | {{- toYaml . | nindent 8 }} 33 | {{- end }} 34 | serviceAccountName: {{ include "k8s-shredder.serviceAccountName" . }} 35 | securityContext: 36 | {{- toYaml .Values.podSecurityContext | nindent 8 }} 37 | initContainers: 38 | {{- with .Values.initContainers }} 39 | {{- toYaml . | nindent 8 }} 40 | {{- end }} 41 | containers: 42 | - name: {{ .Chart.Name }} 43 | securityContext: 44 | {{- toYaml .Values.securityContext | nindent 12 }} 45 | image: "{{ .Values.image.registry }}:{{ .Values.image.tag | default .Chart.AppVersion }}" 46 | imagePullPolicy: {{ .Values.image.pullPolicy }} 47 | args: 48 | - "--config=/k8s-shredder-config/config.yaml" 49 | - "--metrics-port=8080" 50 | - "--log-level={{ .Values.logLevel }}" 51 | {{- if .Values.dryRun }} 52 | - "--dry-run" 53 | {{- end }} 54 | env: 55 | {{- with .Values.environmentVars }} 56 | {{- toYaml . 
| nindent 12 }} 57 | {{- end }} 58 | ports: 59 | - name: metrics 60 | containerPort: 8080 61 | protocol: TCP 62 | volumeMounts: 63 | - name: k8s-shredder-config-volume 64 | mountPath: /k8s-shredder-config 65 | livenessProbe: 66 | httpGet: 67 | path: /healthz 68 | port: metrics 69 | initialDelaySeconds: 10 70 | timeoutSeconds: 3 71 | periodSeconds: 10 72 | failureThreshold: 5 73 | resources: 74 | {{- toYaml .Values.resources | nindent 12 }} 75 | {{- with .Values.additionalContainers }} 76 | {{- toYaml . | nindent 8 }} 77 | {{- end }} 78 | volumes: 79 | - name: k8s-shredder-config-volume 80 | configMap: 81 | defaultMode: 420 82 | name: {{ include "k8s-shredder.fullname" . }}-config 83 | {{- with .Values.volumes }} 84 | {{- toYaml . | nindent 8 }} 85 | {{- end }} 86 | {{- with .Values.nodeSelector }} 87 | nodeSelector: 88 | {{- toYaml . | nindent 8 }} 89 | {{- end }} 90 | {{- if .Values.affinity }} 91 | affinity: 92 | {{- toYaml .Values.affinity | nindent 8 }} 93 | {{- else }} 94 | affinity: 95 | nodeAffinity: 96 | preferredDuringSchedulingIgnoredDuringExecution: 97 | - weight: 1 98 | preference: 99 | matchExpressions: 100 | - key: node.kubernetes.io/role 101 | operator: In 102 | values: 103 | - master 104 | {{- end }} 105 | {{- if .Values.tolerations }} 106 | tolerations: 107 | {{- toYaml .Values.tolerations | nindent 8 }} 108 | {{- else }} 109 | tolerations: 110 | - key: "node-role.kubernetes.io/control-plane" 111 | operator: "Exists" 112 | effect: "NoSchedule" 113 | {{- end }} 114 | {{- with .Values.topologySpreadConstraints }} 115 | topologySpreadConstraints: 116 | {{- toYaml . | nindent 8 }} 117 | {{- end }} 118 | {{- if .Values.priorityClassName }} 119 | priorityClassName: {{ .Values.priorityClassName }} 120 | {{- end }} 121 | -------------------------------------------------------------------------------- /docs/node-parking.md: -------------------------------------------------------------------------------- 1 | # Node Parking 2 | 3 | "Node Parking" is a process by which nodes that need replacement, but cannot immediately be drained, are handled: the nodes and the pods scheduled on them are labeled and subsequently targeted for safe eviction over a period of (commonly) several days, after which the pods are forcibly removed by k8s-shredder and the node deleted by the cluster's autoscaler. This process gives tenants the opportunity to reschedule sensitive workloads in a manner that fits their application's SLO while ultimately allowing for the eventual replacement of nodes. 4 | 5 | ## Parking Basics 6 | 7 | When a cluster operator upgrades a node in a cluster (e.g. upgrades the version of Kubernetes or the underlying operating system, or applies a configuration change), it first needs to reschedule all pods on that node. This is done using the [Kubernetes Eviction API](https://kubernetes.io/docs/concepts/scheduling-eviction/api-eviction/), which is to say evictions that respect the application's [PodDisruptionBudgets](https://kubernetes.io/docs/tasks/run-application/configure-pdb/) (PDBs) and [terminationGracePeriodSeconds](https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#pod-termination) settings on its pods. Once the node is emptied of application workloads, it is deleted by either the cluster-autoscaler or Karpenter (or some other node autoscaler). 8 | 9 | In some cases, it may not be possible to evict a pod without violating the PDB.
This is normally due to how the application owner has configured the PDB, but can be caused by other scenarios such as a lack of nodes to schedule new pods or an inability to scale up due to cloud provider issues. As stewards of the cluster's security and stability, cluster operators cannot let the node run forever. However, they also want to make every effort to make sure application owners have a chance to own and manage their application stability. Enter "Node Parking". 10 | 11 | When a cluster operator encounters a node with a pod that cannot be evicted, it will cordon and taint the node. Then, it will label the node and the (non-daemonset) pods contained within it with the following labels: 12 | 13 | ```bash 14 | shredder.ethos.adobe.net/parked-by=k8s-shredder 15 | shredder.ethos.adobe.net/parked-node-expires-on=1750164865.36373 16 | shredder.ethos.adobe.net/upgrade-status=parked 17 | ``` 18 | 19 | The first label denotes who/what parked the node. The second contains a Unix timestamp of when that node/pod may be forcibly removed. The third label denotes that the node/pod is parked. 20 | 21 | These labels are used by a deployment called [k8s-shredder](../README.md) that will scan the cluster periodically for pods with these labels, and will try to evict them using the Eviction API. After a portion of the expiry period has passed (default 10%), it will shift to using [rollout restarts](https://kubernetes.io/docs/reference/kubectl/generated/kubectl_rollout/kubectl_rollout_restart/) to help reschedule the pod. If the pod is still present when the expiration date is reached, it is forcibly evicted (i.e. it is deleted); this is the only action k8s-shredder will take that violates the pod's PDB. 22 | 23 | If you want a pod to be exempted from the eviction loop until the parked node's TTL expires, you can label the pod with 24 | 25 | ```bash 26 | "shredder.ethos.adobe.net/allow-eviction=false" 27 | ``` 28 | 29 | so that k8s-shredder will skip it. It will be incumbent on application owners to gracefully reschedule these pods to avoid deletion once the TTL expires. 30 | 31 | More information about k8s-shredder and how it functions can be found [here](../README.md). 32 | 33 | ## How can I tell if my pods are parked? 34 | 35 | As mentioned above, we don't want to forcibly evict our tenants' workloads, and we would much rather give them the power to manage the eviction process in a way that makes sense for their workload, SLOs, and customers. Given that, we have exposed metrics and labels that allow customers to track and alert when they have parked workloads so that they may take action. 36 | 37 | ### Metrics (Recommended) 38 | 39 | If you are writing an alert or PromQL query, the recommended approach is to incorporate the metric `kube_ethos_upgrade:parked_pod` after exposing it in Prometheus. Given that the expiry time for a pod is measured in days, you may want to delay any alerting on pod parking for the first hour or so to allow for normal rescheduling to occur. 40 | 41 | ### Pod Labels 42 | 43 | Another way to find out whether your pod is parked is to monitor the labels on the pods in your namespace.
You can find parked pods using this kubectl command: 44 | 45 | ``` 46 | kubectl get pods -l shredder.ethos.adobe.net/upgrade-status=parked 47 | ``` 48 | 49 | You can also query and alert on pods labels (although, again, we recommend using the metric exposed above): 50 | 51 | ``` 52 | kube_pod_labels{label_shredder_ethos_adobe_net_upgrade_status="parked"} 53 | ``` 54 | -------------------------------------------------------------------------------- /pkg/config/config.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 Adobe. All rights reserved. 3 | This file is licensed to you under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. You may obtain a copy 5 | of the License at http://www.apache.org/licenses/LICENSE-2.0 6 | Unless required by applicable law or agreed to in writing, software distributed under 7 | the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS 8 | OF ANY KIND, either express or implied. See the License for the specific language 9 | governing permissions and limitations under the License. 10 | */ 11 | 12 | package config 13 | 14 | import ( 15 | "time" 16 | 17 | "github.com/adobe/k8s-shredder/pkg/schedule" 18 | "github.com/pkg/errors" 19 | ) 20 | 21 | // Config struct defines application configuration options 22 | type Config struct { 23 | // EvictionLoopInterval defines how often to run the eviction loop process 24 | EvictionLoopInterval time.Duration 25 | // EvictionLoopSchedule is an optional cron schedule for when eviction operations are allowed 26 | // If set, parking and shredding operations will only occur during the scheduled time window 27 | // Supports standard cron syntax and macros (@yearly, @monthly, @weekly, @daily, @hourly) 28 | // Example: "@daily" (runs at midnight UTC), "0 2 * * *" (runs at 2 AM UTC daily) 29 | EvictionLoopSchedule string 30 | // EvictionLoopDuration defines how long the scheduled window stays active after the schedule triggers 31 | // Only used when EvictionLoopSchedule is set 32 | // Supports compound durations with hours and minutes (e.g., "10h5m", "30m", "160h") 33 | // Example: "10h" (window stays active for 10 hours), "30m" (window stays active for 30 minutes) 34 | EvictionLoopDuration string 35 | // ParkedNodeTTL is used for defining the time a node can stay parked before starting force eviction process 36 | ParkedNodeTTL time.Duration 37 | // RollingRestartThreshold specifies how much time(percentage) should pass from ParkedNodeTTL before starting the rollout restart process 38 | RollingRestartThreshold float64 39 | // UpgradeStatusLabel is used for identifying parked nodes 40 | UpgradeStatusLabel string 41 | // ExpiresOnLabel is used for identifying the TTL for parked nodes 42 | ExpiresOnLabel string 43 | // NamespacePrefixSkipInitialEviction is used for proceeding directly with a rollout restart without waiting for the RollingRestartThreshold 44 | NamespacePrefixSkipInitialEviction string 45 | // RestartedAtAnnotation is used to mark a controller object for rollout restart 46 | RestartedAtAnnotation string 47 | // AllowEvictionLabel is used for skipping evicting pods that have explicitly set this label on false 48 | AllowEvictionLabel string 49 | // ToBeDeletedTaint is used for skipping a subset of parked nodes 50 | ToBeDeletedTaint string 51 | // ArgoRolloutsAPIVersion is used for specifying the API version from `argoproj.io` apigroup to be used while handling Argo 
Rollouts objects 52 | ArgoRolloutsAPIVersion string 53 | // EnableKarpenterDriftDetection controls whether to scan for drifted Karpenter NodeClaims and automatically label their nodes 54 | EnableKarpenterDriftDetection bool 55 | // EnableKarpenterDisruptionDetection controls whether to scan for disrupted Karpenter NodeClaims and automatically label their nodes 56 | EnableKarpenterDisruptionDetection bool 57 | // ParkedByLabel is used for identifying which component parked the node 58 | ParkedByLabel string 59 | // ParkedByValue is the value to set for the ParkedByLabel 60 | ParkedByValue string 61 | // ParkedNodeTaint is the taint to apply to parked nodes in the format key=value:effect 62 | ParkedNodeTaint string 63 | // EnableNodeLabelDetection controls whether to scan for nodes with specific labels and automatically park them 64 | EnableNodeLabelDetection bool 65 | // NodeLabelsToDetect is a list of node labels to look for. Can be just keys or key=value pairs 66 | NodeLabelsToDetect []string 67 | // MaxParkedNodes is the maximum number of nodes that can be parked simultaneously. 68 | // Can be either an integer (e.g. "5") or a percentage (e.g. "20%"). 69 | // If set to "0" or empty (default), no limit is applied. 70 | // When a percentage is specified, the limit is calculated as (percentage/100) * (total nodes in cluster). 71 | MaxParkedNodes string 72 | // ExtraParkingLabels is a map of additional labels to apply to nodes and pods during the parking process. If not set, no extra labels are applied. 73 | ExtraParkingLabels map[string]string 74 | // EvictionSafetyCheck controls whether to perform safety checks before force eviction. If true, nodes will be unparked if pods don't have required parking labels. 75 | EvictionSafetyCheck bool 76 | // ParkingReasonLabel is the label used to track why a node or pod was parked 77 | ParkingReasonLabel string 78 | } 79 | 80 | // GetEvictionLoopSchedule returns a parsed Schedule object if EvictionLoopSchedule is configured 81 | // Returns nil if schedule is not configured or if there's an error parsing it 82 | func (c *Config) GetEvictionLoopSchedule() (*schedule.Schedule, error) { 83 | if c.EvictionLoopSchedule == "" { 84 | return nil, nil 85 | } 86 | 87 | if c.EvictionLoopDuration == "" { 88 | return nil, errors.New("EvictionLoopDuration must be set when EvictionLoopSchedule is configured") 89 | } 90 | 91 | return schedule.NewSchedule(c.EvictionLoopSchedule, c.EvictionLoopDuration) 92 | } 93 | 94 | // HasEvictionLoopSchedule returns true if EvictionLoopSchedule is configured 95 | func (c *Config) HasEvictionLoopSchedule() bool { 96 | return c.EvictionLoopSchedule != "" 97 | } 98 | -------------------------------------------------------------------------------- /internal/testing/local_env_prep_helm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | K8S_SHREDDER_VERSION=$1 5 | KINDNODE_VERSION=$2 6 | K8S_CLUSTER_NAME=$3 7 | KUBECONFIG_FILE=${4:-kubeconfig} 8 | 9 | test_dir=$(dirname "${BASH_SOURCE[0]}") 10 | 11 | if kind get clusters | grep "${K8S_CLUSTER_NAME}" ; then 12 | echo "Local environment should be already set up. If that is not the case run 'make clean' first"; 13 | [[ -z "${KUBECONFIG}" ]] && export KUBECONFIG=${KUBECONFIG_FILE} 14 | else 15 | # create a k8s cluster 16 | echo "KIND: creating cluster ${K8S_CLUSTER_NAME} with version ${KINDNODE_VERSION}..." 
17 | kind create cluster --name "${K8S_CLUSTER_NAME}" --kubeconfig=${KUBECONFIG_FILE} --image "kindest/node:${KINDNODE_VERSION}" \ 18 | --config "${test_dir}/kind.yaml" 19 | export KUBECONFIG=${KUBECONFIG_FILE} 20 | fi 21 | 22 | # upload k8s-shredder image inside kind cluster 23 | kind load docker-image adobe/k8s-shredder:"${K8S_SHREDDER_VERSION}" --name "${K8S_CLUSTER_NAME}" 24 | 25 | namespace_status=$(kubectl get ns ns-k8s-shredder-test -o json | jq .status.phase -r) 26 | 27 | if [[ $namespace_status == "Active" ]] 28 | then 29 | echo "KIND: Namespace ns-k8s-shredder-test and ns-team-k8s-shredder-test already present" 30 | else 31 | echo "KIND: creating ns-team-k8s-shredder-test and ns-k8s-shredder-test namespaces..." 32 | kubectl create namespace ns-k8s-shredder-test 33 | kubectl create namespace ns-team-k8s-shredder-test 34 | fi 35 | 36 | if [[ ${ENABLE_APISERVER_DEBUG} == "true" ]] 37 | then 38 | echo -e "K8S_SHREDDER: Enable debug logging on apiserver" 39 | TOKEN=$(kubectl create token default) 40 | 41 | APISERVER=$(kubectl config view -o jsonpath="{.clusters[?(@.name==\"kind-${K8S_CLUSTER_NAME}\")].cluster.server}") 42 | curl -s -X PUT -d '5' "$APISERVER"/debug/flags/v --header "Authorization: Bearer $TOKEN" -k 43 | fi 44 | 45 | echo "KIND: deploying k8s-shredder using Helm chart..." 46 | # Use Helm to deploy k8s-shredder with test-specific configuration 47 | helm install k8s-shredder "${test_dir}/../../charts/k8s-shredder" \ 48 | --namespace kube-system \ 49 | --set image.registry=adobe/k8s-shredder \ 50 | --set image.tag="${K8S_SHREDDER_VERSION}" \ 51 | --set image.pullPolicy=Never \ 52 | --set shredder.EvictionLoopInterval=10s \ 53 | --set shredder.ParkedNodeTTL=30s \ 54 | --set shredder.RollingRestartThreshold=0.5 \ 55 | --set shredder.EnableKarpenterDriftDetection=false \ 56 | --set shredder.EnableNodeLabelDetection=false \ 57 | --set logLevel=debug \ 58 | --set logFormat=text \ 59 | --set dryRun=false \ 60 | --set service.create=true \ 61 | --set service.type=ClusterIP \ 62 | --set service.port=8080 \ 63 | --set service.targetPort=metrics 64 | 65 | echo "KIND: deploying prometheus..." 66 | kubectl apply -f "${test_dir}/prometheus_stuffs.yaml" 67 | 68 | echo "KIND: deploying Argo Rollouts CRD..." 69 | kubectl apply -f https://raw.githubusercontent.com/argoproj/argo-rollouts/v1.7.2/manifests/crds/rollout-crd.yaml 70 | 71 | echo "KIND: deploying test applications..." 72 | kubectl apply -f "${test_dir}/test_apps.yaml" 73 | 74 | # Adjust the correct UID for the test-app-argo-rollout ownerReference 75 | rollout_uid=$(kubectl -n ns-team-k8s-shredder-test get rollout test-app-argo-rollout -o jsonpath='{.metadata.uid}') 76 | sed "s/REPLACE_WITH_ROLLOUT_UID/${rollout_uid}/" < "${test_dir}/test_apps.yaml" | kubectl apply -f - 77 | 78 | echo "K8S_SHREDDER: waiting for k8s-shredder deployment to become ready!" 79 | retry_count=0 80 | i=1 81 | sp="/-\|" 82 | while [[ ${status} == *"False"* || -z ${status} ]]; do 83 | # set 5 minute timeout 84 | if [[ ${retry_count} == 600 ]]; then echo "Timeout exceeded!" && exit 1; fi 85 | # shellcheck disable=SC2059 86 | printf "\b${sp:i++%${#sp}:1}" && sleep 0.5; 87 | status=$(kubectl get pods -n kube-system -l app.kubernetes.io/name=k8s-shredder -o json | \ 88 | jq '.items[].status.conditions[] | select(.type=="Ready")| .status' 2> /dev/null) 89 | retry_count=$((retry_count+1)) 90 | done 91 | echo "" 92 | 93 | echo "K8S_SHREDDER: waiting for rollout object PDB to become ready!" 
94 | retry_count=0 95 | while [[ $(kubectl get pdb -n ns-team-k8s-shredder-test test-app-argo-rollout \ 96 | -o jsonpath="{.status.currentHealthy}" 2> /dev/null) != "2" ]]; do 97 | # set 5 minute timeout 98 | if [[ ${retry_count} == 600 ]]; then echo "Timeout exceeded!" && exit 1; fi 99 | # shellcheck disable=SC2059 100 | printf "\b${sp:i++%${#sp}:1}" && sleep 0.5; 101 | retry_count=$((retry_count+1)) 102 | done 103 | 104 | echo "" 105 | kubectl logs -l app.kubernetes.io/name=k8s-shredder -n kube-system 106 | 107 | echo "K8S_SHREDDER: waiting for prometheus deployment to become ready!" 108 | retry_count=0 109 | while [[ $(kubectl get pods -n kube-system -l app=prometheus \ 110 | -o jsonpath="{.items[0].status.conditions[?(@.type=='Ready')].status}" 2> /dev/null) != "True" ]]; do 111 | # set 5 minute timeout 112 | if [[ ${retry_count} == 600 ]]; then echo "Timeout exceeded!" && exit 1; fi 113 | # shellcheck disable=SC2059 114 | printf "\b${sp:i++%${#sp}:1}" && sleep 0.5; 115 | retry_count=$((retry_count+1)) 116 | done 117 | 118 | echo "" 119 | 120 | echo -e "K8S_SHREDDER: You can access k8s-shredder metrics at http://localhost:1234/metrics after running 121 | kubectl port-forward -n kube-system svc/k8s-shredder --kubeconfig=${KUBECONFIG_FILE} 1234:8080\n 122 | It can take few minutes before seeing k8s-shredder metrics..." 123 | 124 | echo -e "K8S_SHREDDER: You can access k8s-shredder logs by running 125 | kubectl logs -n kube-system -l app.kubernetes.io/name=k8s-shredder --kubeconfig=${KUBECONFIG_FILE} \n" 126 | 127 | echo -e "K8S_SHREDDER: You can access prometheus metrics at http://localhost:1234 after running 128 | kubectl port-forward -n kube-system svc/prometheus --kubeconfig=${KUBECONFIG_FILE} 1234:9090\n" 129 | -------------------------------------------------------------------------------- /internal/testing/local_env_prep_node_labels_helm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | K8S_SHREDDER_VERSION=$1 5 | KINDNODE_VERSION=$2 6 | K8S_CLUSTER_NAME=$3 7 | KUBECONFIG_FILE=${4:-kubeconfig} 8 | 9 | test_dir=$(dirname "${BASH_SOURCE[0]}") 10 | 11 | if kind get clusters | grep "${K8S_CLUSTER_NAME}" ; then 12 | echo "Local environment should be already set up. If that is not the case run 'make clean' first"; 13 | [[ -z "${KUBECONFIG}" ]] && export KUBECONFIG=${KUBECONFIG_FILE} 14 | else 15 | # create a k8s cluster 16 | echo "KIND: creating cluster ${K8S_CLUSTER_NAME} with version ${KINDNODE_VERSION}..." 17 | kind create cluster --name "${K8S_CLUSTER_NAME}" --kubeconfig=${KUBECONFIG_FILE} --image "kindest/node:${KINDNODE_VERSION}" \ 18 | --config "${test_dir}/kind-node-labels.yaml" 19 | export KUBECONFIG=${KUBECONFIG_FILE} 20 | fi 21 | 22 | # upload k8s-shredder image inside kind cluster 23 | kind load docker-image adobe/k8s-shredder:"${K8S_SHREDDER_VERSION}" --name "${K8S_CLUSTER_NAME}" 24 | 25 | namespace_status=$(kubectl get ns ns-k8s-shredder-test -o json | jq .status.phase -r) 26 | 27 | if [[ $namespace_status == "Active" ]] 28 | then 29 | echo "KIND: Namespace ns-k8s-shredder-test and ns-team-k8s-shredder-test already present" 30 | else 31 | echo "KIND: creating ns-team-k8s-shredder-test and ns-k8s-shredder-test namespaces..." 
32 | kubectl create namespace ns-k8s-shredder-test 33 | kubectl create namespace ns-team-k8s-shredder-test 34 | fi 35 | 36 | if [[ ${ENABLE_APISERVER_DEBUG} == "true" ]] 37 | then 38 | echo -e "K8S_SHREDDER: Enable debug logging on apiserver" 39 | TOKEN=$(kubectl create token default) 40 | 41 | APISERVER=$(kubectl config view -o jsonpath="{.clusters[?(@.name==\"kind-${K8S_CLUSTER_NAME}\")].cluster.server}") 42 | curl -s -X PUT -d '5' "$APISERVER"/debug/flags/v --header "Authorization: Bearer $TOKEN" -k 43 | fi 44 | 45 | echo "NODE_LABELS: This test environment will demonstrate node label detection functionality" 46 | echo "NODE_LABELS: k8s-shredder will detect nodes with specific labels and park them" 47 | 48 | echo "KIND: deploying k8s-shredder using Helm chart with node label detection enabled..." 49 | # Use Helm to deploy k8s-shredder with node label detection enabled 50 | helm install k8s-shredder "${test_dir}/../../charts/k8s-shredder" \ 51 | --namespace kube-system \ 52 | --set image.registry=adobe/k8s-shredder \ 53 | --set image.tag="${K8S_SHREDDER_VERSION}" \ 54 | --set image.pullPolicy=Never \ 55 | --set shredder.EvictionLoopInterval=30s \ 56 | --set shredder.ParkedNodeTTL=2m \ 57 | --set shredder.RollingRestartThreshold=0.5 \ 58 | --set shredder.EnableKarpenterDriftDetection=false \ 59 | --set shredder.EnableNodeLabelDetection=true \ 60 | --set shredder.NodeLabelsToDetect[0]="test-label" \ 61 | --set shredder.NodeLabelsToDetect[1]="maintenance=scheduled" \ 62 | --set shredder.NodeLabelsToDetect[2]="node.test.io/park" \ 63 | --set logLevel=debug \ 64 | --set logFormat=text \ 65 | --set dryRun=false \ 66 | --set service.create=true \ 67 | --set service.type=ClusterIP \ 68 | --set service.port=8080 \ 69 | --set service.targetPort=metrics 70 | 71 | echo "KIND: deploying prometheus..." 72 | kubectl apply -f "${test_dir}/prometheus_stuffs_node_labels.yaml" 73 | 74 | echo "KIND: deploying Argo Rollouts CRD..." 75 | kubectl apply -f https://raw.githubusercontent.com/argoproj/argo-rollouts/v1.7.2/manifests/crds/rollout-crd.yaml 76 | 77 | echo "KIND: deploying test applications..." 78 | kubectl apply -f "${test_dir}/test_apps.yaml" 79 | 80 | # Adjust the correct UID for the test-app-argo-rollout ownerReference 81 | rollout_uid=$(kubectl -n ns-team-k8s-shredder-test get rollout test-app-argo-rollout -o jsonpath='{.metadata.uid}') 82 | sed "s/REPLACE_WITH_ROLLOUT_UID/${rollout_uid}/" < "${test_dir}/test_apps.yaml" | kubectl apply -f - 83 | 84 | echo "NODE_LABELS: Node label detection test environment ready!" 85 | 86 | echo "K8S_SHREDDER: waiting for k8s-shredder deployment to become ready!" 87 | retry_count=0 88 | i=1 89 | sp="/-\|" 90 | while [[ ${status} == *"False"* || -z ${status} ]]; do 91 | # set 5 minute timeout 92 | if [[ ${retry_count} == 600 ]]; then echo "Timeout exceeded!" && exit 1; fi 93 | # shellcheck disable=SC2059 94 | printf "\b${sp:i++%${#sp}:1}" && sleep 0.5; 95 | status=$(kubectl get pods -n kube-system -l app.kubernetes.io/name=k8s-shredder -o json | \ 96 | jq '.items[].status.conditions[] | select(.type=="Ready")| .status' 2> /dev/null) 97 | retry_count=$((retry_count+1)) 98 | done 99 | echo "" 100 | 101 | echo "K8S_SHREDDER: waiting for rollout object PDB to become ready!" 102 | retry_count=0 103 | while [[ $(kubectl get pdb -n ns-team-k8s-shredder-test test-app-argo-rollout \ 104 | -o jsonpath="{.status.currentHealthy}" 2> /dev/null) != "2" ]]; do 105 | # set 5 minute timeout 106 | if [[ ${retry_count} == 600 ]]; then echo "Timeout exceeded!" 
&& exit 1; fi 107 | # shellcheck disable=SC2059 108 | printf "\b${sp:i++%${#sp}:1}" && sleep 0.5; 109 | retry_count=$((retry_count+1)) 110 | done 111 | 112 | echo "" 113 | kubectl logs -l app.kubernetes.io/name=k8s-shredder -n kube-system 114 | 115 | echo "K8S_SHREDDER: waiting for prometheus deployment to become ready!" 116 | retry_count=0 117 | while [[ $(kubectl get pods -n kube-system -l app=prometheus \ 118 | -o jsonpath="{.items[0].status.conditions[?(@.type=='Ready')].status}" 2> /dev/null) != "True" ]]; do 119 | # set 5 minute timeout 120 | if [[ ${retry_count} == 600 ]]; then echo "Timeout exceeded!" && exit 1; fi 121 | # shellcheck disable=SC2059 122 | printf "\b${sp:i++%${#sp}:1}" && sleep 0.5; 123 | retry_count=$((retry_count+1)) 124 | done 125 | 126 | echo "" 127 | 128 | echo -e "K8S_SHREDDER: You can access k8s-shredder metrics at http://localhost:1234/metrics after running 129 | kubectl port-forward -n kube-system svc/k8s-shredder --kubeconfig=${KUBECONFIG_FILE} 1234:8080\n 130 | It can take few minutes before seeing k8s-shredder metrics..." 131 | 132 | echo -e "K8S_SHREDDER: You can access k8s-shredder logs by running 133 | kubectl logs -n kube-system -l app.kubernetes.io/name=k8s-shredder --kubeconfig=${KUBECONFIG_FILE} \n" 134 | 135 | echo -e "K8S_SHREDDER: You can access prometheus metrics at http://localhost:1234 after running 136 | kubectl port-forward -n kube-system svc/prometheus --kubeconfig=${KUBECONFIG_FILE} 1234:9090\n" 137 | 138 | echo "NODE_LABELS: Environment setup complete!" 139 | echo "NODE_LABELS: Configured to detect nodes with these labels:" 140 | echo " - test-label (key only)" 141 | echo " - maintenance=scheduled (key=value)" 142 | echo " - node.test.io/park (key only)" 143 | echo "" 144 | 145 | echo "NODE_LABELS: Now applying test labels to trigger node label detection..." 146 | 147 | # Apply test labels to trigger k8s-shredder's node label detection 148 | WORKER_NODES=($(kubectl get nodes --no-headers -o custom-columns=NAME:.metadata.name | grep -v control-plane)) 149 | WORKER_NODE1=${WORKER_NODES[0]} 150 | WORKER_NODE2=${WORKER_NODES[1]} 151 | 152 | echo "NODE_LABELS: Adding 'test-label=test-value' to node ${WORKER_NODE1}" 153 | kubectl label node "${WORKER_NODE1}" test-label=test-value 154 | 155 | echo "NODE_LABELS: Adding 'maintenance=scheduled' to node ${WORKER_NODE2}" 156 | kubectl label node "${WORKER_NODE2}" maintenance=scheduled 157 | 158 | echo "NODE_LABELS: Labels applied! k8s-shredder should detect and park these nodes shortly..." 
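# NOTE (illustrative): once parking occurs, the parked nodes can be listed with
#   kubectl get nodes -l shredder.ethos.adobe.net/upgrade-status=parked --kubeconfig="${KUBECONFIG_FILE}"
# The label key and the "parked" value are the chart defaults; adjust if UpgradeStatusLabel was overridden.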
159 | echo "NODE_LABELS: You can monitor the process with:" 160 | echo " kubectl logs -n kube-system -l app.kubernetes.io/name=k8s-shredder --kubeconfig=${KUBECONFIG_FILE} -f" 161 | -------------------------------------------------------------------------------- /internal/testing/cluster_upgrade_node_labels.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | K8S_CLUSTER_NAME=$1 5 | KUBECONFIG_FILE=${2:-kubeconfig} 6 | test_dir=$(dirname "${BASH_SOURCE[0]}") 7 | 8 | export KUBECONFIG=${KUBECONFIG_FILE} 9 | 10 | echo "===============================================================================" 11 | echo "NODE_LABELS: Starting node label detection test" 12 | echo "===============================================================================" 13 | 14 | echo "NODE_LABELS: This test will demonstrate k8s-shredder's node label detection functionality" 15 | echo "NODE_LABELS: We'll add specific labels to nodes and verify they get parked automatically" 16 | echo "" 17 | 18 | echo "NODE_LABELS: Getting available nodes for testing..." 19 | node_count=$(kubectl get nodes --no-headers | wc -l) 20 | if [[ ${node_count} -eq 0 ]]; then 21 | echo "ERROR: No nodes available for testing" 22 | exit 1 23 | fi 24 | 25 | # Get a worker node for testing (prefer worker nodes over control-plane) 26 | test_node=$(kubectl get nodes --no-headers -o custom-columns=":metadata.name" | grep -v control-plane | head -1 || kubectl get nodes --no-headers -o custom-columns=":metadata.name" | head -1) 27 | echo "NODE_LABELS: Using node '${test_node}' for testing" 28 | 29 | echo "NODE_LABELS: Current node labels before adding test labels:" 30 | kubectl get node ${test_node} --show-labels 31 | echo "" 32 | 33 | echo "NODE_LABELS: Checking for any existing parking labels..." 34 | parking_status=$(kubectl get node ${test_node} -o jsonpath='{.metadata.labels.shredder\.ethos\.adobe\.net/upgrade-status}' 2>/dev/null || echo "") 35 | echo "NODE_LABELS: Current parking status: ${parking_status:-"Not parked"}" 36 | 37 | echo "" 38 | echo "===============================================================================" 39 | echo "NODE_LABELS: Adding test label to trigger node label detection..." 40 | echo "===============================================================================" 41 | 42 | # We'll test with the "test-label" key-only selector 43 | echo "NODE_LABELS: Adding label 'test-label=test-value' to node ${test_node}" 44 | kubectl label node ${test_node} test-label=test-value 45 | 46 | echo "NODE_LABELS: Node labeled successfully!" 47 | 48 | echo "NODE_LABELS: Current node labels after adding test label:" 49 | kubectl get node ${test_node} --show-labels 50 | echo "" 51 | 52 | echo "" 53 | echo "===============================================================================" 54 | echo "NODE_LABELS: Waiting for k8s-shredder to detect and park the labeled node..." 55 | echo "===============================================================================" 56 | 57 | echo "NODE_LABELS: Current k8s-shredder logs:" 58 | kubectl logs -l app=k8s-shredder -n kube-system --tail=20 59 | echo "" 60 | 61 | echo "NODE_LABELS: Monitoring k8s-shredder activity for next 3 minutes..." 62 | start_time=$(date +%s) 63 | end_time=$((start_time + 180)) 64 | 65 | while [[ $(date +%s) -lt ${end_time} ]]; do 66 | current_time=$(date +%s) 67 | remaining=$((end_time - current_time)) 68 | 69 | echo "NODE_LABELS: Checking node parking status... 
(${remaining}s remaining)" 70 | 71 | # Check if node is parked 72 | parking_status=$(kubectl get node ${test_node} -o jsonpath='{.metadata.labels.shredder\.ethos\.adobe\.net/upgrade-status}' 2>/dev/null || echo "") 73 | parked_by=$(kubectl get node ${test_node} -o jsonpath='{.metadata.labels.shredder\.ethos\.adobe\.net/parked-by}' 2>/dev/null || echo "") 74 | expires_on=$(kubectl get node ${test_node} -o jsonpath='{.metadata.labels.shredder\.ethos\.adobe\.net/parked-node-expires-on}' 2>/dev/null || echo "") 75 | 76 | if [[ "${parking_status}" == "parked" ]]; then 77 | echo "" 78 | echo "===============================================================================" 79 | echo "NODE_LABELS: SUCCESS! Node ${test_node} has been parked by k8s-shredder!" 80 | echo "===============================================================================" 81 | echo "NODE_LABELS: Parking details:" 82 | echo " - Status: ${parking_status}" 83 | echo " - Parked by: ${parked_by}" 84 | echo " - Expires on: ${expires_on}" 85 | echo "" 86 | 87 | echo "NODE_LABELS: Checking if node is also cordoned and tainted..." 88 | node_unschedulable=$(kubectl get node ${test_node} -o jsonpath='{.spec.unschedulable}' 2>/dev/null || echo "") 89 | echo " - Unschedulable (cordoned): ${node_unschedulable}" 90 | 91 | echo " - Taints:" 92 | kubectl get node ${test_node} -o jsonpath='{.spec.taints}' | jq -r '.[] | " \(.key)=\(.value):\(.effect)"' 2>/dev/null || echo " No taints found" 93 | 94 | echo "" 95 | echo "NODE_LABELS: Checking pods on the node..." 96 | kubectl get pods --all-namespaces --field-selector spec.nodeName=${test_node} -o wide 97 | 98 | echo "" 99 | echo "NODE_LABELS: Final k8s-shredder logs:" 100 | kubectl logs -l app=k8s-shredder -n kube-system --tail=30 101 | 102 | echo "" 103 | echo "===============================================================================" 104 | echo "NODE_LABELS: Test completed successfully!" 105 | echo "===============================================================================" 106 | echo "NODE_LABELS: Summary:" 107 | echo " 1. ✅ Test label was added to node" 108 | echo " 2. ✅ k8s-shredder detected the labeled node" 109 | echo " 3. ✅ k8s-shredder parked the node with labels" 110 | echo " 4. ✅ Node was cordoned and tainted" 111 | echo " 5. ✅ Pods on the node were also labeled" 112 | 113 | echo "" 114 | echo "NODE_LABELS: Testing additional label formats..." 115 | 116 | # Test another node with a different label format 117 | available_nodes=$(kubectl get nodes --no-headers -o custom-columns=":metadata.name" | grep -v "${test_node}") 118 | if [[ -n "${available_nodes}" ]]; then 119 | second_test_node=$(echo "${available_nodes}" | head -1) 120 | echo "NODE_LABELS: Testing key=value format on node '${second_test_node}'" 121 | kubectl label node ${second_test_node} maintenance=scheduled 122 | 123 | # Wait a bit to see if this gets detected too 124 | sleep 45 125 | 126 | second_parking_status=$(kubectl get node ${second_test_node} -o jsonpath='{.metadata.labels.shredder\.ethos\.adobe\.net/upgrade-status}' 2>/dev/null || echo "") 127 | if [[ "${second_parking_status}" == "parked" ]]; then 128 | echo " ✅ Second node with key=value label also parked successfully!" 
129 | else 130 | echo " ⏳ Second node not yet parked (may need more time)" 131 | fi 132 | fi 133 | 134 | echo "" 135 | exit 0 136 | fi 137 | 138 | echo "NODE_LABELS: Node parking status: ${parking_status:-"Not parked yet"}" 139 | sleep 10 140 | done 141 | 142 | echo "" 143 | echo "===============================================================================" 144 | echo "NODE_LABELS: Test completed but node was not parked within timeout" 145 | echo "===============================================================================" 146 | echo "NODE_LABELS: Final status check:" 147 | 148 | echo "NODE_LABELS: Node labels:" 149 | kubectl get node ${test_node} --show-labels 150 | echo "" 151 | 152 | echo "NODE_LABELS: Final k8s-shredder logs:" 153 | kubectl logs -l app=k8s-shredder -n kube-system --tail=50 154 | echo "" 155 | 156 | echo "NODE_LABELS: All Nodes:" 157 | kubectl get nodes -o wide 158 | echo "" 159 | 160 | echo "NODE_LABELS: k8s-shredder may need more time or there might be an issue." 161 | echo "NODE_LABELS: Check the logs above for any errors or continue monitoring manually." 162 | echo "NODE_LABELS: This could be expected behavior if node label detection is disabled or" 163 | echo "NODE_LABELS: if k8s-shredder hasn't run its eviction loop yet." 164 | 165 | exit 1 -------------------------------------------------------------------------------- /internal/testing/test_apps.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: apps/v1 3 | kind: Deployment 4 | metadata: 5 | name: test-app-disallow-eviction 6 | namespace: ns-team-k8s-shredder-test 7 | spec: 8 | replicas: 4 9 | selector: 10 | matchLabels: 11 | app: test-app-disallow-eviction 12 | strategy: 13 | rollingUpdate: 14 | maxSurge: 25% 15 | maxUnavailable: 25% 16 | type: RollingUpdate 17 | template: 18 | metadata: 19 | labels: 20 | app: test-app-disallow-eviction 21 | shredder.ethos.adobe.net/allow-eviction: 'false' 22 | spec: 23 | containers: 24 | - name: canary 25 | image: busybox:1.35 26 | command: [sh, -c, while true; do sleep 30; done] 27 | --- 28 | apiVersion: policy/v1 29 | kind: PodDisruptionBudget 30 | metadata: 31 | name: test-app-disallow-eviction 32 | namespace: ns-team-k8s-shredder-test 33 | spec: 34 | minAvailable: 1 35 | selector: 36 | matchLabels: 37 | app: test-app-disallow-eviction 38 | # 2. Good citizen 39 | --- 40 | apiVersion: apps/v1 41 | kind: Deployment 42 | metadata: 43 | name: test-app-allow-eviction 44 | namespace: ns-team-k8s-shredder-test 45 | spec: 46 | replicas: 4 47 | selector: 48 | matchLabels: 49 | app: test-app-allow-eviction 50 | strategy: 51 | rollingUpdate: 52 | maxSurge: 25% 53 | maxUnavailable: 25% 54 | type: RollingUpdate 55 | template: 56 | metadata: 57 | labels: 58 | app: test-app-allow-eviction 59 | spec: 60 | containers: 61 | - name: canary 62 | image: busybox:1.35 63 | command: [sh, -c, while true; do sleep 30; done] 64 | --- 65 | apiVersion: policy/v1 66 | kind: PodDisruptionBudget 67 | metadata: 68 | name: test-app-allow-eviction 69 | namespace: ns-team-k8s-shredder-test 70 | spec: 71 | minAvailable: 1 72 | selector: 73 | matchLabels: 74 | app: test-app-allow-eviction 75 | # 3. 
Bad citizen with wrongly configured PDB 76 | --- 77 | apiVersion: apps/v1 78 | kind: Deployment 79 | metadata: 80 | name: test-app-with-bad-pdb 81 | namespace: ns-team-k8s-shredder-test 82 | spec: 83 | replicas: 4 84 | selector: 85 | matchLabels: 86 | app: test-app-with-bad-pdb 87 | strategy: 88 | rollingUpdate: 89 | maxSurge: 0 90 | maxUnavailable: 25% 91 | template: 92 | metadata: 93 | labels: 94 | app: test-app-with-bad-pdb 95 | spec: 96 | affinity: 97 | nodeAffinity: 98 | preferredDuringSchedulingIgnoredDuringExecution: 99 | - weight: 100 100 | preference: 101 | matchExpressions: 102 | - key: will-be-parked 103 | operator: In 104 | values: ['true'] 105 | containers: 106 | - name: canary 107 | image: busybox:1.35 108 | command: [sh, -c, while true; do sleep 30; done] 109 | --- 110 | apiVersion: policy/v1 111 | kind: PodDisruptionBudget 112 | metadata: 113 | name: test-app-with-bad-pdb 114 | namespace: ns-team-k8s-shredder-test 115 | spec: 116 | minAvailable: 10 117 | selector: 118 | matchLabels: 119 | app: test-app-with-bad-pdb 120 | # 4. Good citizen with recreate update strategy 121 | --- 122 | apiVersion: apps/v1 123 | kind: Deployment 124 | metadata: 125 | name: test-app-with-recreate 126 | namespace: ns-team-k8s-shredder-test 127 | spec: 128 | replicas: 4 129 | selector: 130 | matchLabels: 131 | app: test-app-recreate 132 | strategy: 133 | type: Recreate 134 | template: 135 | metadata: 136 | labels: 137 | app: test-app-recreate 138 | spec: 139 | containers: 140 | - name: canary 141 | image: busybox:1.35 142 | command: [sh, -c, while true; do sleep 30; done] 143 | --- 144 | apiVersion: policy/v1 145 | kind: PodDisruptionBudget 146 | metadata: 147 | name: test-app-recreate 148 | namespace: ns-team-k8s-shredder-test 149 | spec: 150 | minAvailable: 1 151 | selector: 152 | matchLabels: 153 | app: test-app-recreate 154 | ##### CAAS ##### 155 | # 1. 
Good citizen in CaaS world 156 | --- 157 | apiVersion: apps/v1 158 | kind: Deployment 159 | metadata: 160 | name: test-app-caas 161 | namespace: ns-k8s-shredder-test 162 | spec: 163 | replicas: 4 164 | selector: 165 | matchLabels: 166 | app: test-app-caas 167 | strategy: 168 | rollingUpdate: 169 | maxSurge: 25% 170 | maxUnavailable: 25% 171 | type: RollingUpdate 172 | template: 173 | metadata: 174 | labels: 175 | app: test-app-caas 176 | spec: 177 | containers: 178 | - name: canary 179 | image: busybox:1.35 180 | command: [sh, -c, while true; do sleep 30; done] 181 | --- 182 | apiVersion: policy/v1 183 | kind: PodDisruptionBudget 184 | metadata: 185 | name: test-app-caas 186 | namespace: ns-k8s-shredder-test 187 | spec: 188 | minAvailable: 1 189 | selector: 190 | matchLabels: 191 | app: test-app-caas 192 | --- 193 | apiVersion: v1 194 | kind: Service 195 | metadata: 196 | name: test-app-statefulset 197 | namespace: ns-team-k8s-shredder-test 198 | spec: 199 | ports: 200 | - port: 80 201 | targetPort: 8080 202 | name: web 203 | clusterIP: None 204 | selector: 205 | app: test-app-statefulset 206 | --- 207 | apiVersion: apps/v1 208 | kind: StatefulSet 209 | metadata: 210 | name: test-app-statefulset 211 | namespace: ns-team-k8s-shredder-test 212 | spec: 213 | selector: 214 | matchLabels: 215 | app: test-app-statefulset 216 | serviceName: test-app-statefulset 217 | replicas: 3 218 | template: 219 | metadata: 220 | labels: 221 | app: test-app-statefulset 222 | spec: 223 | terminationGracePeriodSeconds: 10 224 | containers: 225 | - name: test-app-statefulset 226 | image: busybox:1.35 227 | command: [sh, -c, while true; do sleep 30; done] 228 | ports: 229 | - containerPort: 8080 230 | name: web 231 | --- 232 | apiVersion: policy/v1 233 | kind: PodDisruptionBudget 234 | metadata: 235 | name: test-app-statefulset 236 | namespace: ns-team-k8s-shredder-test 237 | spec: 238 | minAvailable: 1 239 | selector: 240 | matchLabels: 241 | app: test-app-statefulset 242 | #### FLEX #### 243 | # 1. 
Good citizen Argo Rollout in Flex world 244 | --- 245 | apiVersion: apps/v1 246 | kind: ReplicaSet 247 | metadata: 248 | name: test-app-argo-rollout 249 | namespace: ns-team-k8s-shredder-test 250 | ownerReferences: 251 | - apiVersion: argoproj.io/v1alpha1 252 | kind: Rollout 253 | blockOwnerDeletion: true 254 | name: test-app-argo-rollout 255 | uid: REPLACE_WITH_ROLLOUT_UID 256 | spec: 257 | replicas: 2 258 | selector: 259 | matchLabels: 260 | app: test-app-argo-rollout 261 | template: 262 | metadata: 263 | labels: 264 | app: test-app-argo-rollout 265 | spec: 266 | affinity: 267 | podAntiAffinity: 268 | requiredDuringSchedulingIgnoredDuringExecution: 269 | - labelSelector: 270 | matchExpressions: 271 | - key: app 272 | operator: In 273 | values: [test-app-argo-rollout] 274 | topologyKey: kubernetes.io/hostname 275 | containers: 276 | - name: test-app-argo-rollout 277 | image: busybox:1.35 278 | command: [sh, -c, while true; do sleep 30; done] 279 | ports: 280 | - containerPort: 8080 281 | name: web 282 | --- 283 | apiVersion: argoproj.io/v1alpha1 284 | kind: Rollout 285 | metadata: 286 | name: test-app-argo-rollout 287 | namespace: ns-team-k8s-shredder-test 288 | spec: 289 | replicas: 2 290 | workloadRef: 291 | apiVersion: apps/v1 292 | kind: ReplicaSet 293 | name: test-app-argo-rollout 294 | --- 295 | apiVersion: policy/v1 296 | kind: PodDisruptionBudget 297 | metadata: 298 | name: test-app-argo-rollout 299 | namespace: ns-team-k8s-shredder-test 300 | spec: 301 | minAvailable: 10 302 | selector: 303 | matchLabels: 304 | app: test-app-argo-rollout 305 | -------------------------------------------------------------------------------- /charts/k8s-shredder/values.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # -- Container image configuration 3 | image: 4 | # -- Container registry where the k8s-shredder image is hosted 5 | registry: ghcr.io/adobe/k8s-shredder 6 | # -- Image pull policy - IfNotPresent, Always, or Never 7 | pullPolicy: IfNotPresent 8 | # -- Image tag to use 9 | tag: latest 10 | # -- Number of k8s-shredder pods to run 11 | replicaCount: 1 12 | # -- Deployment strategy for rolling updates (e.g., RollingUpdate, Recreate) 13 | deploymentStrategy: {} 14 | # -- Secrets for pulling images from private registries 15 | imagePullSecrets: [] 16 | # -- Override the name of the chart 17 | nameOverride: '' 18 | # -- Override the full name used for resources 19 | fullnameOverride: '' 20 | # -- Additional environment variables to set in the container 21 | environmentVars: [] 22 | # -- Enable dry-run mode - when true, k8s-shredder will log actions but not execute them 23 | dryRun: false 24 | # -- Logging configuration 25 | # -- Available log levels: panic, fatal, error, warn, warning, info, debug, trace 26 | logLevel: debug 27 | # -- Log output format: text (human-readable) or json (structured logging) 28 | logFormat: text 29 | # -- Core k8s-shredder configuration 30 | shredder: 31 | # -- How often to run the main eviction loop 32 | EvictionLoopInterval: 1h 33 | # -- Optional cron schedule for when eviction operations are allowed. If set, parking and shredding operations will only occur during the scheduled time window. Supports standard cron syntax and macros (@yearly, @monthly, @weekly, @daily, @hourly). Example: "@daily" (runs at midnight UTC), "0 2 * * *" (runs at 2 AM UTC daily). When omitted, operations run continuously. 
34 | EvictionLoopSchedule: '' 35 | # -- Duration for how long the scheduled window stays active after the schedule triggers. Only used when EvictionLoopSchedule is set. Supports compound durations with hours and minutes (e.g., "10h5m", "30m", "160h"). Example: "10h" (window stays active for 10 hours), "30m" (window stays active for 30 minutes). 36 | EvictionLoopDuration: '' 37 | # -- How long parked nodes should remain before being eligible for deletion (7 days default) 38 | ParkedNodeTTL: 168h 39 | # -- Maximum percentage of nodes that can be restarted simultaneously during rolling restarts 40 | RollingRestartThreshold: 0.1 41 | # -- Label used to track node upgrade status 42 | UpgradeStatusLabel: shredder.ethos.adobe.net/upgrade-status 43 | # -- Label used to track when a parked node expires 44 | ExpiresOnLabel: shredder.ethos.adobe.net/parked-node-expires-on 45 | # -- Namespace prefix to skip during initial eviction (useful for system namespaces) 46 | NamespacePrefixSkipInitialEviction: ns-ethos- 47 | # -- Annotation to track when a workload was last restarted 48 | RestartedAtAnnotation: shredder.ethos.adobe.net/restartedAt 49 | # -- Label to explicitly allow eviction on specific resources 50 | AllowEvictionLabel: shredder.ethos.adobe.net/allow-eviction 51 | # -- Taint indicating nodes scheduled for deletion by cluster autoscaler 52 | ToBeDeletedTaint: ToBeDeletedByClusterAutoscaler 53 | # -- API version for Argo Rollouts integration 54 | ArgoRolloutsAPIVersion: v1alpha1 55 | # -- Enable Karpenter drift detection for node lifecycle management 56 | EnableKarpenterDriftDetection: false 57 | # -- Enable Karpenter disruption detection for node lifecycle management 58 | EnableKarpenterDisruptionDetection: false 59 | # -- Label to track which component parked a node 60 | ParkedByLabel: shredder.ethos.adobe.net/parked-by 61 | # -- Value set in the ParkedByLabel to identify k8s-shredder as the parking agent 62 | ParkedByValue: k8s-shredder 63 | # -- Taint applied to parked nodes to prevent new pod scheduling 64 | ParkedNodeTaint: shredder.ethos.adobe.net/upgrade-status=parked:NoSchedule 65 | # -- Enable detection of nodes based on specific labels 66 | EnableNodeLabelDetection: false 67 | # -- List of node labels to monitor for triggering shredder actions 68 | NodeLabelsToDetect: [] 69 | # -- Maximum number of nodes that can be parked simultaneously. Can be an integer (e.g., "5") or percentage (e.g., "20%"). Set to "0" for no limit 70 | MaxParkedNodes: '0' 71 | # -- Controls whether to perform safety checks before force eviction 72 | EvictionSafetyCheck: true 73 | # -- Label used to track why a node or pod was parked 74 | ParkingReasonLabel: shredder.ethos.adobe.net/parked-reason 75 | # -- Additional labels to apply to nodes and pods during parking 76 | ExtraParkingLabels: {} 77 | # Example configuration: 78 | # example.com/owner: "infrastructure" 79 | # example.com/maintenance: "true" 80 | # -- RBAC (Role-Based Access Control) configuration 81 | rbac: 82 | # -- Create RBAC resources (ClusterRole, ClusterRoleBinding) 83 | create: true 84 | # -- Kubernetes service account configuration 85 | serviceAccount: 86 | # -- Create a service account for k8s-shredder 87 | create: true 88 | # -- Name of the service account 89 | name: k8s-shredder 90 | # -- Additional annotations for the service account (useful for IAM roles, etc.) 
91 | annotations: {} 92 | # -- Kubernetes service configuration 93 | service: 94 | # -- Create a service for k8s-shredder metrics endpoint 95 | create: false 96 | # -- Service type (ClusterIP, NodePort, LoadBalancer) 97 | type: ClusterIP 98 | # -- Service port for metrics endpoint 99 | port: 8080 100 | # -- Target port for metrics endpoint 101 | targetPort: metrics 102 | # -- Additional annotations for the service 103 | annotations: {} 104 | # -- Additional labels for the service 105 | labels: {} 106 | # -- Annotations to add to k8s-shredder pod(s) 107 | podAnnotations: {} 108 | # -- Additional labels to add to k8s-shredder pod(s) 109 | podLabels: {} 110 | # -- Security context applied to the entire pod 111 | podSecurityContext: {} 112 | # -- Security context applied to the k8s-shredder container 113 | securityContext: {} 114 | # -- Init containers to run before the main k8s-shredder container starts 115 | initContainers: [] 116 | # -- Additional containers to run alongside k8s-shredder in the same pod 117 | additionalContainers: [] 118 | # -- Resource requests and limits for the k8s-shredder container 119 | resources: 120 | limits: 121 | # -- Maximum CPU cores the container can use 122 | cpu: '1' 123 | # -- Maximum memory the container can use 124 | memory: 1Gi 125 | requests: 126 | # -- CPU cores requested for the container (guaranteed allocation) 127 | cpu: 250m 128 | # -- Memory requested for the container (guaranteed allocation) 129 | memory: 250Mi 130 | # -- Additional volumes to mount in the pod 131 | volumes: [] 132 | # Example volume configuration: 133 | # - name: ca 134 | # secret: 135 | # secretName: k8s-shredder-ca 136 | # items: 137 | # - key: ca.pem 138 | # path: ca.pem 139 | 140 | # -- Node selector to constrain pod scheduling to specific nodes 141 | nodeSelector: {} 142 | # -- Tolerations to allow scheduling on nodes with specific taints 143 | tolerations: [] 144 | # -- Affinity rules for advanced pod scheduling (node affinity, pod affinity/anti-affinity) 145 | affinity: {} 146 | # -- Prometheus monitoring configuration 147 | podMonitor: 148 | # -- Enable creation of a PodMonitor resource for Prometheus scraping 149 | enabled: false 150 | # -- Labels to apply to the PodMonitor resource 151 | labels: {} 152 | # app: k8s-shredder 153 | # subsystem: k8s-a 154 | # -- How often Prometheus should scrape metrics 155 | interval: 60s 156 | # -- Timeout for each scrape attempt 157 | scrapeTimeout: 10s 158 | # -- Whether to honor labels from the target 159 | honorLabels: true 160 | # -- Metric relabeling configuration 161 | relabelings: [] 162 | # -- Priority class for pod scheduling - system-cluster-critical ensures high priority 163 | priorityClassName: system-cluster-critical 164 | # -- Topology spread constraints to control pod distribution across failure domains 165 | # -- Helps ensure high availability by spreading pods across zones/nodes 166 | topologySpreadConstraints: [] 167 | # Example configuration: 168 | # - maxSkew: 1 169 | # topologyKey: topology.kubernetes.io/zone 170 | # whenUnsatisfiable: DoNotSchedule 171 | # labelSelector: 172 | # matchLabels: 173 | # app.kubernetes.io/name=k8s-shredder 174 | -------------------------------------------------------------------------------- /pkg/metrics/metrics.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 Adobe. All rights reserved. 
3 | This file is licensed to you under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. You may obtain a copy 5 | of the License at http://www.apache.org/licenses/LICENSE-2.0 6 | Unless required by applicable law or agreed to in writing, software distributed under 7 | the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS 8 | OF ANY KIND, either express or implied. See the License for the specific language 9 | governing permissions and limitations under the License. 10 | */ 11 | 12 | package metrics 13 | 14 | import ( 15 | "github.com/prometheus/client_golang/prometheus" 16 | ) 17 | 18 | var ( 19 | 20 | // ShredderAPIServerRequestsTotal = Total requests for Kubernetes API 21 | ShredderAPIServerRequestsTotal = prometheus.NewCounterVec( 22 | prometheus.CounterOpts{ 23 | Name: "shredder_apiserver_requests_total", 24 | Help: "Total requests for Kubernetes API", 25 | }, 26 | []string{"verb", "resource", "status"}, 27 | ) 28 | 29 | // ShredderAPIServerRequestsDurationSeconds = Requests duration seconds for calling Kubernetes API 30 | ShredderAPIServerRequestsDurationSeconds = prometheus.NewSummaryVec( 31 | prometheus.SummaryOpts{ 32 | Name: "shredder_apiserver_requests_duration_seconds", 33 | Help: "Requests duration when calling Kubernetes API", 34 | Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, 35 | }, 36 | []string{"verb", "resource", "status"}, 37 | ) 38 | 39 | // ShredderLoopsTotal = Total loops 40 | ShredderLoopsTotal = prometheus.NewCounter( 41 | prometheus.CounterOpts{ 42 | Name: "shredder_loops_total", 43 | Help: "Total loops", 44 | }, 45 | ) 46 | 47 | // ShredderLoopsDurationSeconds = Loops duration in seconds 48 | ShredderLoopsDurationSeconds = prometheus.NewSummary( 49 | prometheus.SummaryOpts{ 50 | Name: "shredder_loops_duration_seconds", 51 | Help: "Loops duration in seconds", 52 | Objectives: map[float64]float64{0.5: 1200, 0.9: 900, 0.99: 600}, 53 | }, 54 | ) 55 | 56 | // ShredderProcessedNodesTotal = Total processed nodes 57 | ShredderProcessedNodesTotal = prometheus.NewCounter( 58 | prometheus.CounterOpts{ 59 | Name: "shredder_processed_nodes_total", 60 | Help: "Total processed nodes", 61 | }, 62 | ) 63 | 64 | // ShredderProcessedPodsTotal = Total processed pods 65 | ShredderProcessedPodsTotal = prometheus.NewCounter( 66 | prometheus.CounterOpts{ 67 | Name: "shredder_processed_pods_total", 68 | Help: "Total processed pods", 69 | }, 70 | ) 71 | 72 | // ShredderErrorsTotal = Total errors 73 | ShredderErrorsTotal = prometheus.NewCounter( 74 | prometheus.CounterOpts{ 75 | Name: "shredder_errors_total", 76 | Help: "Total errors", 77 | }, 78 | ) 79 | 80 | // ShredderPodErrorsTotal = Total pod errors 81 | ShredderPodErrorsTotal = prometheus.NewGaugeVec( 82 | prometheus.GaugeOpts{ 83 | Name: "shredder_pod_errors_total", 84 | Help: "Total pod errors per eviction loop", 85 | }, 86 | []string{"pod_name", "namespace", "reason", "action"}, 87 | ) 88 | 89 | // ShredderNodeForceToEvictTime = Time when the node will be forcibly evicted 90 | ShredderNodeForceToEvictTime = prometheus.NewGaugeVec( 91 | prometheus.GaugeOpts{ 92 | Name: "shredder_node_force_to_evict_time", 93 | Help: "Time when the node will be forcibly evicted", 94 | }, 95 | []string{"node_name"}, 96 | ) 97 | 98 | // ShredderPodForceToEvictTime = Time when the pod will be forcibly evicted 99 | ShredderPodForceToEvictTime = prometheus.NewGaugeVec( 100 | prometheus.GaugeOpts{ 101 | Name: 
"shredder_pod_force_to_evict_time", 102 | Help: "Time when the pod will be forcibly evicted", 103 | }, 104 | []string{"pod_name", "namespace"}, 105 | ) 106 | 107 | // ShredderKarpenterDriftedNodesTotal = Total number of drifted Karpenter nodes detected 108 | ShredderKarpenterDriftedNodesTotal = prometheus.NewCounter( 109 | prometheus.CounterOpts{ 110 | Name: "shredder_karpenter_drifted_nodes_total", 111 | Help: "Total number of drifted Karpenter nodes detected", 112 | }, 113 | ) 114 | 115 | // ShredderKarpenterDisruptedNodesTotal = Total number of disrupted Karpenter nodes detected 116 | ShredderKarpenterDisruptedNodesTotal = prometheus.NewCounter( 117 | prometheus.CounterOpts{ 118 | Name: "shredder_karpenter_disrupted_nodes_total", 119 | Help: "Total number of disrupted Karpenter nodes detected", 120 | }, 121 | ) 122 | 123 | // ShredderKarpenterNodesParkedTotal = Total number of Karpenter nodes successfully parked 124 | ShredderKarpenterNodesParkedTotal = prometheus.NewCounter( 125 | prometheus.CounterOpts{ 126 | Name: "shredder_karpenter_nodes_parked_total", 127 | Help: "Total number of Karpenter nodes successfully parked", 128 | }, 129 | ) 130 | 131 | // ShredderKarpenterNodesParkingFailedTotal = Total number of Karpenter nodes that failed to be parked 132 | ShredderKarpenterNodesParkingFailedTotal = prometheus.NewCounter( 133 | prometheus.CounterOpts{ 134 | Name: "shredder_karpenter_nodes_parking_failed_total", 135 | Help: "Total number of Karpenter nodes that failed to be parked", 136 | }, 137 | ) 138 | 139 | // ShredderKarpenterProcessingDurationSeconds = Duration of Karpenter node processing in seconds 140 | ShredderKarpenterProcessingDurationSeconds = prometheus.NewSummary( 141 | prometheus.SummaryOpts{ 142 | Name: "shredder_karpenter_processing_duration_seconds", 143 | Help: "Duration of Karpenter node processing in seconds", 144 | Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, 145 | }, 146 | ) 147 | 148 | // ShredderNodeLabelNodesParkedTotal = Total number of nodes successfully parked via node label detection 149 | ShredderNodeLabelNodesParkedTotal = prometheus.NewCounter( 150 | prometheus.CounterOpts{ 151 | Name: "shredder_node_label_nodes_parked_total", 152 | Help: "Total number of nodes successfully parked via node label detection", 153 | }, 154 | ) 155 | 156 | // ShredderNodeLabelNodesParkingFailedTotal = Total number of nodes that failed to be parked via node label detection 157 | ShredderNodeLabelNodesParkingFailedTotal = prometheus.NewCounter( 158 | prometheus.CounterOpts{ 159 | Name: "shredder_node_label_nodes_parking_failed_total", 160 | Help: "Total number of nodes that failed to be parked via node label detection", 161 | }, 162 | ) 163 | 164 | // ShredderNodeLabelProcessingDurationSeconds = Duration of node label detection and parking process in seconds 165 | ShredderNodeLabelProcessingDurationSeconds = prometheus.NewSummary( 166 | prometheus.SummaryOpts{ 167 | Name: "shredder_node_label_processing_duration_seconds", 168 | Help: "Duration of node label detection and parking process in seconds", 169 | Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, 170 | }, 171 | ) 172 | 173 | // ShredderNodeLabelMatchingNodesTotal = Total number of nodes matching the label criteria 174 | ShredderNodeLabelMatchingNodesTotal = prometheus.NewGauge( 175 | prometheus.GaugeOpts{ 176 | Name: "shredder_node_label_matching_nodes_total", 177 | Help: "Total number of nodes matching the label criteria", 178 | }, 179 | ) 180 | 181 | // 
ShredderNodesParkedTotal = Total number of nodes successfully parked (shared across all detection methods) 182 | ShredderNodesParkedTotal = prometheus.NewCounter( 183 | prometheus.CounterOpts{ 184 | Name: "shredder_nodes_parked_total", 185 | Help: "Total number of nodes successfully parked (shared across all detection methods)", 186 | }, 187 | ) 188 | 189 | // ShredderNodesParkingFailedTotal = Total number of nodes that failed to be parked (shared across all detection methods) 190 | ShredderNodesParkingFailedTotal = prometheus.NewCounter( 191 | prometheus.CounterOpts{ 192 | Name: "shredder_nodes_parking_failed_total", 193 | Help: "Total number of nodes that failed to be parked (shared across all detection methods)", 194 | }, 195 | ) 196 | 197 | // ShredderProcessingDurationSeconds = Duration of node processing in seconds (shared across all detection methods) 198 | ShredderProcessingDurationSeconds = prometheus.NewSummary( 199 | prometheus.SummaryOpts{ 200 | Name: "shredder_processing_duration_seconds", 201 | Help: "Duration of node processing in seconds (shared across all detection methods)", 202 | Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, 203 | }, 204 | ) 205 | ) 206 | -------------------------------------------------------------------------------- /internal/testing/local_env_prep_karpenter_helm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | K8S_SHREDDER_VERSION=$1 5 | KINDNODE_VERSION=$2 6 | K8S_CLUSTER_NAME=$3 7 | KUBECONFIG_FILE=${4:-kubeconfig} 8 | 9 | test_dir=$(dirname "${BASH_SOURCE[0]}") 10 | 11 | if kind get clusters | grep "${K8S_CLUSTER_NAME}" ; then 12 | echo "Local environment should be already set up. If that is not the case run 'make clean' first"; 13 | [[ -z "${KUBECONFIG}" ]] && export KUBECONFIG=${KUBECONFIG_FILE} 14 | else 15 | # create a k8s cluster 16 | echo "KIND: creating cluster ${K8S_CLUSTER_NAME} with version ${KINDNODE_VERSION}..." 17 | kind create cluster --name "${K8S_CLUSTER_NAME}" --kubeconfig=${KUBECONFIG_FILE} --image "kindest/node:${KINDNODE_VERSION}" \ 18 | --config "${test_dir}/kind-karpenter.yaml" 19 | export KUBECONFIG=${KUBECONFIG_FILE} 20 | fi 21 | 22 | # upload k8s-shredder image inside kind cluster 23 | kind load docker-image adobe/k8s-shredder:"${K8S_SHREDDER_VERSION}" --name "${K8S_CLUSTER_NAME}" 24 | 25 | namespace_status=$(kubectl get ns ns-k8s-shredder-test -o json | jq .status.phase -r) 26 | 27 | if [[ $namespace_status == "Active" ]] 28 | then 29 | echo "KIND: Namespace ns-k8s-shredder-test and ns-team-k8s-shredder-test already present" 30 | else 31 | echo "KIND: creating ns-team-k8s-shredder-test and ns-k8s-shredder-test namespaces..." 
32 | kubectl create namespace ns-k8s-shredder-test 33 | kubectl create namespace ns-team-k8s-shredder-test 34 | fi 35 | 36 | if [[ ${ENABLE_APISERVER_DEBUG} == "true" ]] 37 | then 38 | echo -e "K8S_SHREDDER: Enable debug logging on apiserver" 39 | TOKEN=$(kubectl create token default) 40 | 41 | APISERVER=$(kubectl config view -o jsonpath="{.clusters[?(@.name==\"kind-${K8S_CLUSTER_NAME}\")].cluster.server}") 42 | curl -s -X PUT -d '5' "$APISERVER"/debug/flags/v --header "Authorization: Bearer $TOKEN" -k 43 | fi 44 | 45 | echo "KARPENTER: Note - this is a simplified test setup that simulates Karpenter without installing it" 46 | echo "KARPENTER: In this test environment, we'll simulate drifted NodeClaims using mock objects" 47 | echo "KARPENTER: The k8s-shredder Karpenter drift detection will be tested against these objects" 48 | 49 | # Create karpenter namespace for testing 50 | kubectl create namespace karpenter || true 51 | 52 | # Create mock Karpenter CRDs for testing (simplified versions) 53 | echo "KARPENTER: Creating mock Karpenter CRDs for testing..." 54 | cat < /dev/null) 166 | retry_count=$((retry_count+1)) 167 | done 168 | echo "" 169 | 170 | echo "K8S_SHREDDER: waiting for rollout object PDB to become ready!" 171 | retry_count=0 172 | while [[ $(kubectl get pdb -n ns-team-k8s-shredder-test test-app-argo-rollout \ 173 | -o jsonpath="{.status.currentHealthy}" 2> /dev/null) != "2" ]]; do 174 | # set 5 minute timeout 175 | if [[ ${retry_count} == 600 ]]; then echo "Timeout exceeded!" && exit 1; fi 176 | # shellcheck disable=SC2059 177 | printf "\b${sp:i++%${#sp}:1}" && sleep 0.5; 178 | retry_count=$((retry_count+1)) 179 | done 180 | 181 | echo "" 182 | kubectl logs -l app.kubernetes.io/name=k8s-shredder -n kube-system 183 | 184 | echo "K8S_SHREDDER: waiting for prometheus deployment to become ready!" 185 | retry_count=0 186 | while [[ $(kubectl get pods -n kube-system -l app=prometheus \ 187 | -o jsonpath="{.items[0].status.conditions[?(@.type=='Ready')].status}" 2> /dev/null) != "True" ]]; do 188 | # set 5 minute timeout 189 | if [[ ${retry_count} == 600 ]]; then echo "Timeout exceeded!" && exit 1; fi 190 | # shellcheck disable=SC2059 191 | printf "\b${sp:i++%${#sp}:1}" && sleep 0.5; 192 | retry_count=$((retry_count+1)) 193 | done 194 | 195 | echo "" 196 | 197 | echo -e "K8S_SHREDDER: You can access k8s-shredder metrics at http://localhost:1234/metrics after running 198 | kubectl port-forward -n kube-system svc/k8s-shredder --kubeconfig=${KUBECONFIG_FILE} 1234:8080\n 199 | It can take few minutes before seeing k8s-shredder metrics..." 200 | 201 | echo -e "K8S_SHREDDER: You can access k8s-shredder logs by running 202 | kubectl logs -n kube-system -l app.kubernetes.io/name=k8s-shredder --kubeconfig=${KUBECONFIG_FILE} \n" 203 | 204 | echo -e "K8S_SHREDDER: You can access prometheus metrics at http://localhost:1234 after running 205 | kubectl port-forward -n kube-system svc/prometheus --kubeconfig=${KUBECONFIG_FILE} 1234:9090\n" 206 | 207 | echo "KARPENTER: Environment setup complete!" 208 | echo "KARPENTER: Mock Karpenter CRDs are ready for testing" 209 | echo "" 210 | echo "KARPENTER: To test drift detection, the upgrade script will create mock drifted NodeClaims..." 211 | -------------------------------------------------------------------------------- /pkg/utils/node_label_detection.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 Adobe. All rights reserved. 
3 | This file is licensed to you under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. You may obtain a copy 5 | of the License at http://www.apache.org/licenses/LICENSE-2.0 6 | Unless required by applicable law or agreed to in writing, software distributed under 7 | the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS 8 | OF ANY KIND, either express or implied. See the License for the specific language 9 | governing permissions and limitations under the License. 10 | */ 11 | 12 | package utils 13 | 14 | import ( 15 | "context" 16 | "strings" 17 | "time" 18 | 19 | "github.com/adobe/k8s-shredder/pkg/config" 20 | "github.com/adobe/k8s-shredder/pkg/metrics" 21 | "github.com/pkg/errors" 22 | log "github.com/sirupsen/logrus" 23 | v1 "k8s.io/api/core/v1" 24 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 25 | "k8s.io/client-go/kubernetes" 26 | ) 27 | 28 | // NodeLabelInfo holds information about a node that matches the label criteria 29 | type NodeLabelInfo struct { 30 | Name string 31 | Labels map[string]string 32 | } 33 | 34 | // parseLabelSelector parses a label selector string that can be either "key" or "key=value" 35 | func parseLabelSelector(selector string, logger *log.Entry) (string, string, bool) { 36 | logger.WithField("selector", selector).Debug("Parsing label selector") 37 | 38 | if strings.Contains(selector, "=") { 39 | parts := strings.SplitN(selector, "=", 2) 40 | logger.WithFields(log.Fields{ 41 | "key": parts[0], 42 | "value": parts[1], 43 | }).Debug("Parsed key=value selector") 44 | return parts[0], parts[1], true 45 | } 46 | 47 | logger.WithField("key", selector).Debug("Parsed key-only selector") 48 | return selector, "", false 49 | } 50 | 51 | // nodeMatchesLabelSelectors checks if a node matches any (rather than all) of the label selectors 52 | // and excludes nodes that are already parked 53 | func nodeMatchesLabelSelectors(node *v1.Node, labelSelectors []string, upgradeStatusLabel string, logger *log.Entry) bool { 54 | nodeLogger := logger.WithField("nodeName", node.Name) 55 | nodeLogger.Debug("Checking if node matches label selectors") 56 | 57 | nodeLabels := node.Labels 58 | if nodeLabels == nil { 59 | nodeLogger.Debug("Node has no labels") 60 | return false 61 | } 62 | 63 | // First check if the node is already parked - if so, exclude it 64 | if upgradeStatusLabel != "" { 65 | if upgradeStatus, exists := nodeLabels[upgradeStatusLabel]; exists && upgradeStatus == "parked" { 66 | nodeLogger.Debug("Node is already parked, excluding from selection") 67 | return false 68 | } 69 | } 70 | 71 | for _, selector := range labelSelectors { 72 | selectorLogger := nodeLogger.WithField("selector", selector) 73 | key, value, hasValue := parseLabelSelector(selector, selectorLogger) 74 | 75 | if nodeValue, exists := nodeLabels[key]; exists { 76 | if !hasValue { 77 | // If the selector is just a key, match if the key exists 78 | selectorLogger.WithField("nodeValue", nodeValue).Info("Node matches key-only selector") 79 | return true 80 | } else if nodeValue == value { 81 | // If the selector has a value, match if key=value 82 | selectorLogger.WithFields(log.Fields{ 83 | "expectedValue": value, 84 | "nodeValue": nodeValue, 85 | }).Info("Node matches key=value selector") 86 | return true 87 | } else { 88 | selectorLogger.WithFields(log.Fields{ 89 | "expectedValue": value, 90 | "nodeValue": nodeValue, 91 | }).Debug("Node value doesn't match selector value") 92 | } 93 | } else { 94 | 
selectorLogger.Debug("Node doesn't have the selector key") 95 | } 96 | } 97 | 98 | nodeLogger.Debug("Node doesn't match any label selectors") 99 | return false 100 | } 101 | 102 | // FindNodesWithLabels scans the kubernetes cluster for nodes that match the specified label selectors 103 | // and excludes nodes that are already labeled as parked 104 | func FindNodesWithLabels(ctx context.Context, k8sClient kubernetes.Interface, cfg config.Config, logger *log.Entry) ([]NodeLabelInfo, error) { 105 | logger = logger.WithField("function", "FindNodesWithLabels") 106 | 107 | if len(cfg.NodeLabelsToDetect) == 0 { 108 | logger.Debug("No node labels configured for detection") 109 | return []NodeLabelInfo{}, nil 110 | } 111 | 112 | logger.WithField("labelSelectors", cfg.NodeLabelsToDetect).Debug("Listing nodes with specified labels") 113 | 114 | // List all nodes (we'll filter them using an OR condition in nodeMatchesLabelSelectors) 115 | listOptions := metav1.ListOptions{} 116 | nodeList, err := k8sClient.CoreV1().Nodes().List(ctx, listOptions) 117 | if err != nil { 118 | logger.WithError(err).Error("Failed to list nodes") 119 | return nil, errors.Wrap(err, "failed to list nodes") 120 | } 121 | 122 | logger.WithField("totalNodes", len(nodeList.Items)).Debug("Retrieved nodes list") 123 | 124 | var matchingNodes []NodeLabelInfo 125 | 126 | for _, node := range nodeList.Items { 127 | // Check if the node matches any of the label selectors (this now also excludes already parked nodes) 128 | if nodeMatchesLabelSelectors(&node, cfg.NodeLabelsToDetect, cfg.UpgradeStatusLabel, logger) { 129 | logger.WithField("nodeName", node.Name).Info("Found node matching label criteria") 130 | 131 | matchingNodes = append(matchingNodes, NodeLabelInfo{ 132 | Name: node.Name, 133 | Labels: node.Labels, 134 | }) 135 | } 136 | } 137 | 138 | logger.WithField("matchingCount", len(matchingNodes)).Info("Found nodes matching label criteria") 139 | 140 | return matchingNodes, nil 141 | } 142 | 143 | // ParkNodesWithLabels labels nodes that match the configured label selectors with the standard parking labels 144 | func ParkNodesWithLabels(ctx context.Context, k8sClient kubernetes.Interface, matchingNodes []NodeLabelInfo, cfg config.Config, dryRun bool, logger *log.Entry) error { 145 | logger = logger.WithField("function", "ParkNodesWithLabels") 146 | 147 | logger.WithField("matchingNodesCount", len(matchingNodes)).Info("Starting to park nodes with labels") 148 | 149 | // Convert NodeLabelInfo to NodeInfo for the common parking function 150 | var nodesToPark []NodeInfo 151 | for _, nodeInfo := range matchingNodes { 152 | logger.WithField("nodeName", nodeInfo.Name).Debug("Adding node to parking list") 153 | nodesToPark = append(nodesToPark, NodeInfo(nodeInfo)) 154 | } 155 | 156 | logger.WithField("nodesToPark", len(nodesToPark)).Info("Converted labeled nodes to parking list") 157 | 158 | // Apply MaxParkedNodes limit if configured 159 | limitedNodes, err := LimitNodesToPark(ctx, k8sClient, nodesToPark, cfg.MaxParkedNodes, cfg.UpgradeStatusLabel, logger) 160 | if err != nil { 161 | logger.WithError(err).Error("Failed to apply MaxParkedNodes limit") 162 | return errors.Wrap(err, "failed to apply MaxParkedNodes limit") 163 | } 164 | 165 | if len(limitedNodes) == 0 { 166 | logger.Info("No nodes to park after applying MaxParkedNodes limit") 167 | return nil 168 | } 169 | 170 | // Use the common parking function 171 | return ParkNodes(ctx, k8sClient, limitedNodes, cfg, dryRun, "node-labels", logger) 172 | } 173 | 174 | // 
ProcessNodesWithLabels is the main function that combines finding nodes with specific labels and parking them 175 | func ProcessNodesWithLabels(ctx context.Context, appContext *AppContext, logger *log.Entry) error { 176 | logger = logger.WithField("function", "ProcessNodesWithLabels") 177 | 178 | logger.Info("Starting node label detection and parking process") 179 | 180 | // Start timing the processing duration 181 | startTime := time.Now() 182 | 183 | // Find nodes with specified labels 184 | matchingNodes, err := FindNodesWithLabels(ctx, appContext.K8sClient, appContext.Config, logger) 185 | if err != nil { 186 | logger.WithError(err).Error("Failed to find nodes with specified labels") 187 | return errors.Wrap(err, "failed to find nodes with specified labels") 188 | } 189 | 190 | // Update the matching nodes gauge 191 | metrics.ShredderNodeLabelMatchingNodesTotal.Set(float64(len(matchingNodes))) 192 | 193 | if len(matchingNodes) == 0 { 194 | logger.Info("No nodes found matching the specified label criteria") 195 | return nil 196 | } 197 | 198 | // Park the nodes that match the criteria 199 | err = ParkNodesWithLabels(ctx, appContext.K8sClient, matchingNodes, appContext.Config, appContext.IsDryRun(), logger) 200 | if err != nil { 201 | logger.WithError(err).Error("Failed to label nodes matching criteria") 202 | metrics.ShredderNodeLabelNodesParkingFailedTotal.Add(float64(len(matchingNodes))) 203 | metrics.ShredderNodesParkingFailedTotal.Add(float64(len(matchingNodes))) 204 | return errors.Wrap(err, "failed to label nodes matching criteria") 205 | } 206 | 207 | // Increment the successfully parked nodes counter 208 | metrics.ShredderNodeLabelNodesParkedTotal.Add(float64(len(matchingNodes))) 209 | metrics.ShredderNodesParkedTotal.Add(float64(len(matchingNodes))) 210 | 211 | // Record the processing duration 212 | metrics.ShredderNodeLabelProcessingDurationSeconds.Observe(time.Since(startTime).Seconds()) 213 | metrics.ShredderProcessingDurationSeconds.Observe(time.Since(startTime).Seconds()) 214 | 215 | logger.WithField("processedNodes", len(matchingNodes)).Info("Completed node label detection and parking process") 216 | 217 | return nil 218 | } 219 | -------------------------------------------------------------------------------- /docs/metrics.md: -------------------------------------------------------------------------------- 1 | # k8s-shredder Metrics 2 | 3 | This document describes all the metrics exposed by k8s-shredder. These metrics are available at the `/metrics` endpoint and can be scraped by Prometheus or other monitoring systems. 4 | 5 | ## Overview 6 | 7 | k8s-shredder exposes metrics in Prometheus format to help operators monitor the health and performance of the node parking and eviction processes. 
The metrics are organized into several categories: 8 | 9 | - **Core Operation Metrics**: General operation counters and timing 10 | - **API Server Metrics**: Kubernetes API interaction metrics 11 | - **Node Processing Metrics**: Node parking and processing statistics 12 | - **Pod Processing Metrics**: Pod eviction and processing statistics 13 | - **Karpenter Integration Metrics**: Karpenter drift detection metrics 14 | - **Node Label Detection Metrics**: Node label-based detection metrics 15 | - **Shared Metrics**: Aggregated metrics across all detection methods 16 | 17 | ## Core Operation Metrics 18 | 19 | ### `shredder_loops_total` 20 | - **Type**: Counter 21 | - **Description**: Total number of eviction loops completed 22 | - **Use Case**: Monitor the frequency of eviction loop execution and overall system activity 23 | 24 | ### `shredder_loops_duration_seconds` 25 | - **Type**: Summary 26 | - **Description**: Duration of eviction loops in seconds 27 | - **Objectives**: 0.5: 1200, 0.9: 900, 0.99: 600 28 | - **Use Case**: Monitor the performance of eviction loops and identify slow operations 29 | 30 | ### `shredder_errors_total` 31 | - **Type**: Counter 32 | - **Description**: Total number of errors encountered during operation 33 | - **Use Case**: Monitor system health and identify operational issues 34 | 35 | ## API Server Metrics 36 | 37 | ### `shredder_apiserver_requests_total` 38 | - **Type**: Counter Vector 39 | - **Labels**: `verb`, `resource`, `status` 40 | - **Description**: Total requests made to the Kubernetes API 41 | - **Use Case**: Monitor API usage patterns and identify potential rate limiting issues 42 | 43 | ### `shredder_apiserver_requests_duration_seconds` 44 | - **Type**: Summary Vector 45 | - **Labels**: `verb`, `resource`, `status` 46 | - **Description**: Duration of Kubernetes API requests in seconds 47 | - **Objectives**: 0.5: 0.05, 0.9: 0.01, 0.99: 0.001 48 | - **Use Case**: Monitor API performance and identify slow API calls 49 | 50 | ## Node Processing Metrics 51 | 52 | ### `shredder_processed_nodes_total` 53 | - **Type**: Counter 54 | - **Description**: Total number of nodes processed during eviction loops 55 | - **Use Case**: Monitor the volume of node processing activity 56 | 57 | ### `shredder_node_force_to_evict_time` 58 | - **Type**: Gauge Vector 59 | - **Labels**: `node_name` 60 | - **Description**: Unix timestamp when a node will be forcibly evicted 61 | - **Use Case**: Monitor when nodes are scheduled for forced eviction 62 | 63 | ## Pod Processing Metrics 64 | 65 | ### `shredder_processed_pods_total` 66 | - **Type**: Counter 67 | - **Description**: Total number of pods processed during eviction loops 68 | - **Use Case**: Monitor the volume of pod processing activity 69 | 70 | ### `shredder_pod_errors_total` 71 | - **Type**: Gauge Vector 72 | - **Labels**: `pod_name`, `namespace`, `reason`, `action` 73 | - **Description**: Total pod errors per eviction loop 74 | - **Use Case**: Monitor pod eviction failures and their reasons 75 | 76 | ### `shredder_pod_force_to_evict_time` 77 | - **Type**: Gauge Vector 78 | - **Labels**: `pod_name`, `namespace` 79 | - **Description**: Unix timestamp when a pod will be forcibly evicted 80 | - **Use Case**: Monitor when pods are scheduled for forced eviction 81 | 82 | ## Karpenter Integration Metrics 83 | 84 | ### `shredder_karpenter_drifted_nodes_total` 85 | - **Type**: Counter 86 | - **Description**: Total number of drifted Karpenter nodes detected 87 | - **Use Case**: Monitor the volume of Karpenter drift detection 
activity 88 | 89 | ### `shredder_karpenter_disrupted_nodes_total` 90 | - **Type**: Counter 91 | - **Description**: Total number of disrupted Karpenter nodes detected 92 | - **Use Case**: Monitor the volume of Karpenter disruption detection activity 93 | 94 | ### `shredder_karpenter_nodes_parked_total` 95 | - **Type**: Counter 96 | - **Description**: Total number of Karpenter nodes successfully parked 97 | - **Use Case**: Monitor successful Karpenter node parking operations 98 | 99 | ### `shredder_karpenter_nodes_parking_failed_total` 100 | - **Type**: Counter 101 | - **Description**: Total number of Karpenter nodes that failed to be parked 102 | - **Use Case**: Monitor Karpenter node parking failures 103 | 104 | ### `shredder_karpenter_processing_duration_seconds` 105 | - **Type**: Summary 106 | - **Description**: Duration of Karpenter node processing in seconds 107 | - **Objectives**: 0.5: 0.05, 0.9: 0.01, 0.99: 0.001 108 | - **Use Case**: Monitor the performance of Karpenter drift detection and parking operations 109 | 110 | ## Node Label Detection Metrics 111 | 112 | ### `shredder_node_label_nodes_parked_total` 113 | - **Type**: Counter 114 | - **Description**: Total number of nodes successfully parked via node label detection 115 | - **Use Case**: Monitor successful node label-based parking operations 116 | 117 | ### `shredder_node_label_nodes_parking_failed_total` 118 | - **Type**: Counter 119 | - **Description**: Total number of nodes that failed to be parked via node label detection 120 | - **Use Case**: Monitor node label-based parking failures 121 | 122 | ### `shredder_node_label_processing_duration_seconds` 123 | - **Type**: Summary 124 | - **Description**: Duration of node label detection and parking process in seconds 125 | - **Objectives**: 0.5: 0.05, 0.9: 0.01, 0.99: 0.001 126 | - **Use Case**: Monitor the performance of node label detection and parking operations 127 | 128 | ### `shredder_node_label_matching_nodes_total` 129 | - **Type**: Gauge 130 | - **Description**: Total number of nodes matching the label criteria 131 | - **Use Case**: Monitor the current number of nodes that match the configured label selectors 132 | 133 | ## Shared Metrics 134 | 135 | These metrics aggregate data across all detection methods (Karpenter and node label detection) to provide a unified view of node parking activity. 
136 | 
137 | ### `shredder_nodes_parked_total` 
138 | - **Type**: Counter 
139 | - **Description**: Total number of nodes successfully parked (shared across all detection methods) 
140 | - **Use Case**: Monitor total node parking activity regardless of detection method 
141 | 
142 | ### `shredder_nodes_parking_failed_total` 
143 | - **Type**: Counter 
144 | - **Description**: Total number of nodes that failed to be parked (shared across all detection methods) 
145 | - **Use Case**: Monitor total node parking failures regardless of detection method 
146 | 
147 | ### `shredder_processing_duration_seconds` 
148 | - **Type**: Summary 
149 | - **Description**: Duration of node processing in seconds (shared across all detection methods) 
150 | - **Objectives**: 0.5: 0.05, 0.9: 0.01, 0.99: 0.001 
151 | - **Use Case**: Monitor total node processing performance regardless of detection method 
152 | 
153 | ## Metric Relationships 
154 | 
155 | ### Detection Method Metrics 
156 | - **Karpenter metrics** are incremented when `EnableKarpenterDriftDetection=true` 
157 | - **Node label metrics** are incremented when `EnableNodeLabelDetection=true` 
158 | - **Shared metrics** are incremented whenever either detection method processes nodes 
159 | 
160 | ### Processing Flow 
161 | 1. **Detection**: Nodes are identified via Karpenter drift or label matching 
162 | 2. **Parking**: Nodes are labeled, cordoned, and tainted 
163 | 3. **Eviction**: Pods are evicted from parked nodes over time 
164 | 4. **Cleanup**: Nodes are eventually removed when all pods are evicted 
165 | 
166 | ## Alerting Recommendations 
167 | The duration metrics above are exposed as Prometheus Summaries, so the examples below read their precomputed `quantile` series (0.5, 0.9, 0.99) rather than using `histogram_quantile` over buckets. 
168 | ### High Error Rates 
169 | ```promql 
170 | rate(shredder_errors_total[5m]) > 0.1 
171 | ``` 
172 | 
173 | ### Slow Processing 
174 | ```promql 
175 | shredder_processing_duration_seconds{quantile="0.99"} > 30 
176 | ``` 
177 | 
178 | ### Failed Node Parking 
179 | ```promql 
180 | rate(shredder_nodes_parking_failed_total[5m]) > 0 
181 | ``` 
182 | 
183 | ### High API Latency 
184 | ```promql 
185 | shredder_apiserver_requests_duration_seconds{quantile="0.99"} > 5 
186 | ``` 
187 | 
188 | ### Parked Pods Alert 
189 | ```promql 
190 | # Alert when pods are running on parked nodes 
191 | kube_ethos_upgrade:parked_pod > 0 
192 | ``` 
193 | 
194 | ## Example Queries 
195 | 
196 | ### Node Parking Success Rate 
197 | ```promql 
198 | rate(shredder_nodes_parked_total[5m]) / (rate(shredder_nodes_parked_total[5m]) + rate(shredder_nodes_parking_failed_total[5m])) 
199 | ``` 
200 | 
201 | ### Average Processing Duration 
202 | ```promql 
203 | rate(shredder_processing_duration_seconds_sum[5m]) / rate(shredder_processing_duration_seconds_count[5m]) 
204 | ``` 
205 | 
206 | ### Nodes Parked by Detection Method 
207 | ```promql 
208 | # Karpenter nodes 
209 | rate(shredder_karpenter_nodes_parked_total[5m]) 
210 | 
211 | # Label-based nodes 
212 | rate(shredder_node_label_nodes_parked_total[5m]) 
213 | ``` 
214 | 
215 | ### Current Matching Nodes 
216 | ```promql 
217 | shredder_node_label_matching_nodes_total 
218 | ``` 
219 | 
220 | ## Configuration 
221 | 
222 | Metrics are exposed on the configured port (default: 8080) at the `/metrics` endpoint. The metrics server can be configured using the following options: 
223 | 
224 | - **Metrics Port**: Configure the port for metrics exposure 
225 | - **Health Endpoint**: Available at `/healthz` for health checks 
226 | - **OpenMetrics Format**: Enabled by default for better compatibility 
227 | 
228 | For more information about configuring k8s-shredder, see the [main README](../README.md).
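
### Example: Scraping with the Helm Chart's PodMonitor (illustrative)

If you deploy k8s-shredder with the bundled Helm chart and run the Prometheus Operator, the chart's `podMonitor` and `service` values can be used to scrape the `/metrics` endpoint. The snippet below is a minimal sketch of a values override, not the project's required configuration; in particular, the `release: kube-prometheus-stack` label is an assumption about how your Prometheus instance selects PodMonitors and may need to be adjusted for your environment.

```yaml
# Minimal sketch of a Helm values override for metrics scraping.
# Assumption: a Prometheus Operator installation that selects PodMonitors
# carrying the `release: kube-prometheus-stack` label.
podMonitor:
  enabled: true
  labels:
    release: kube-prometheus-stack
  interval: 60s
  scrapeTimeout: 10s
  honorLabels: true

# Optional: expose a ClusterIP service so the metrics endpoint can also be
# reached with `kubectl port-forward` for ad-hoc inspection.
service:
  create: true
  type: ClusterIP
  port: 8080
  targetPort: metrics
```

Apply the override with, for example, `helm upgrade --install k8s-shredder charts/k8s-shredder -n kube-system -f <your-values-file>.yaml`.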
229 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: default help format lint vet security build build-prereq push unit-test local-test local-test-karpenter local-test-node-labels ci clean e2e-tests check-license helm-docs 2 | 3 | NAME ?= adobe/k8s-shredder 4 | K8S_SHREDDER_VERSION ?= "dev" 5 | KINDNODE_VERSION ?= "v1.34.0" 6 | COMMIT ?= $(shell git rev-parse --short HEAD) 7 | TEST_CLUSTERNAME ?= "k8s-shredder-test-cluster" 8 | TEST_CLUSTERNAME_KARPENTER ?= "k8s-shredder-test-cluster-karpenter" 9 | TEST_CLUSTERNAME_NODE_LABELS ?= "k8s-shredder-test-cluster-node-labels" 10 | KUBECONFIG_LOCALTEST ?= "kubeconfig-localtest" 11 | KUBECONFIG_KARPENTER ?= "kubeconfig-local-test-karpenter" 12 | KUBECONFIG_NODE_LABELS ?= "kubeconfig-local-test-node-labels" 13 | 14 | GOSEC=gosec -quiet -exclude=G107 15 | 16 | default: help 17 | 18 | help: ## Print this help text 19 | @printf "\n" 20 | @awk 'BEGIN {FS = ":.*?## "}; ($$2 && !/@awk/){printf "${CYAN}%-30s${NC} %s\n", $$1, $$2}' $(lastword ${MAKEFILE_LIST}) | sort 21 | @printf "\n" 22 | 23 | # CI 24 | # ----------- 25 | format: helm-docs ## Format go code and YAML files 26 | @echo "Format go code..." 27 | @go fmt ./... 28 | @hash golangci-lint 2>/dev/null && { golangci-lint run --fix ./... ; } || { \ 29 | echo >&2 "[WARN] I require golangci-lint but it's not installed (see https://github.com/golangci/golangci-lint). Skipping golangci-lint format."; \ 30 | } 31 | @hash yamlfix 2>/dev/null && { \ 32 | echo "Format YAML files..."; \ 33 | find . -name "*.yaml" -o -name "*.yml" | grep -v "/templates/" | xargs yamlfix 2>/dev/null || true ; \ 34 | echo "YAML files formatted!" ; \ 35 | } || { \ 36 | echo >&2 "[WARN] I require yamlfix but it's not installed (see https://github.com/lyz-code/yamlfix). Skipping YAML format."; \ 37 | } 38 | 39 | lint: ## Lint go code and YAML files 40 | @hash golangci-lint 2>/dev/null && { \ 41 | echo "Checking go code style..."; \ 42 | echo "Run "make format" in case of failures!"; \ 43 | golangci-lint run -v --timeout 5m --no-config ./... ; \ 44 | echo "Go code style OK!" ; \ 45 | } || { \ 46 | echo >&2 "[WARN] I require golangci-lint but it's not installed (see https://github.com/golangci/golangci-lint). Skipping lint."; \ 47 | } 48 | @hash yamlfix 2>/dev/null && { \ 49 | echo "Checking YAML files..."; \ 50 | find . -name "*.yaml" -o -name "*.yml" | grep -v "/templates/" | xargs yamlfix --check 2>/dev/null || { \ 51 | echo "YAML files have formatting issues. Run 'make format' to fix them."; \ 52 | exit 1; \ 53 | } ; \ 54 | echo "YAML files OK!" ; \ 55 | } || { \ 56 | echo >&2 "[WARN] I require yamlfix but it's not installed (see https://github.com/lyz-code/yamlfix). Skipping YAML lint."; \ 57 | } 58 | @hash kubeconform 2>/dev/null && { \ 59 | echo "Validating Kubernetes manifests with kubeconform..."; \ 60 | find internal/testing -name "*.yaml" -o -name "*.yml" | xargs kubeconform -strict -skip CustomResourceDefinition,EC2NodeClass,NodePool,Rollout,Cluster || { \ 61 | echo "Kubeconform found schema errors. Please fix them."; \ 62 | exit 1; \ 63 | } ; \ 64 | echo "Kubeconform validation OK!" ; \ 65 | } || { \ 66 | echo >&2 "[WARN] I require kubeconform but it's not installed (see https://github.com/yannh/kubeconform). 
Skipping kubeconform lint."; \
67 | }
68 | @hash helm-docs 2>/dev/null && { \
69 | echo "Checking Helm documentation..."; \
70 | helm-docs --chart-search-root=charts --template-files=README.md.gotmpl --dry-run >/dev/null 2>&1 || { \
71 | echo "Helm documentation is out of date. Run 'make format' to update it."; \
72 | exit 1; \
73 | } ; \
74 | echo "Helm documentation OK!" ; \
75 | } || { \
76 | echo >&2 "[WARN] I require helm-docs but it's not installed (see https://github.com/norwoodj/helm-docs). Skipping Helm documentation lint."; \
77 | }
78 | 
79 | vet: ## Vetting go code
80 | @echo 'Vetting go code to identify subtle source code issues...'
81 | @go vet ./...
82 | @echo 'No issues found in go codebase!'
83 | 
84 | security: ## Inspects go source code for security problems
85 | @hash gosec 2>/dev/null && { \
86 | echo "Checking go source code for security problems..."; \
87 | $(GOSEC) ./... ; \
88 | echo "No security problems found in the go codebase!" ; \
89 | } || { \
90 | echo >&2 "[WARN] I require gosec but it's not installed (see https://github.com/securego/gosec). Skipping security inspections."; \
91 | }
92 | check-license: ## Check if all go files have the license header set
93 | @echo "Checking files for license header"
94 | @./internal/check_license.sh
95 | 
96 | helm-docs: ## Generate Helm chart documentation
97 | @hash helm-docs 2>/dev/null && { \
98 | echo "Generating Helm chart documentation..."; \
99 | helm-docs --chart-search-root=charts --template-files=README.md.gotmpl ; \
100 | echo "Helm documentation generated!" ; \
101 | } || { \
102 | echo >&2 "[WARN] I require helm-docs but it's not installed (see https://github.com/norwoodj/helm-docs). Skipping documentation generation."; \
103 | }
104 | 
105 | build: check-license lint vet security unit-test ## Builds the local Docker container for development
106 | @CGO_ENABLED=0 GOOS=linux go build \
107 | -ldflags="-s -w -X github.com/adobe/k8s-shredder/cmd.buildVersion=${K8S_SHREDDER_VERSION}-${COMMIT} -X github.com/adobe/k8s-shredder/cmd.gitSHA=${COMMIT} -X github.com/adobe/k8s-shredder/cmd.buildTime=$$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
108 | -o k8s-shredder
109 | @CGO_ENABLED=0 go build \
110 | -ldflags="-s -w" \
111 | -o park-node \
112 | ./cmd/park-node
113 | @DOCKER_BUILDKIT=1 docker build -t ${NAME}:${K8S_SHREDDER_VERSION} .
114 | 
115 | # TEST
116 | # -----------
117 | local-test: build ## Test docker image in a kind cluster (with Karpenter drift and node label detection disabled)
118 | @hash kind 2>/dev/null && { \
119 | echo "Test docker image in a kind cluster..."; \
120 | ./internal/testing/local_env_prep_helm.sh "${K8S_SHREDDER_VERSION}" "${KINDNODE_VERSION}" "${TEST_CLUSTERNAME}" "${KUBECONFIG_LOCALTEST}" && \
121 | ./internal/testing/cluster_upgrade.sh "${TEST_CLUSTERNAME}" "${KUBECONFIG_LOCALTEST}" || \
122 | exit 1; \
123 | } || { \
124 | echo >&2 "[WARN] I require kind but it's not installed (see https://kind.sigs.k8s.io). 
Assuming a cluster is already accessible."; \
125 | }
126 | 
127 | local-test-karpenter: build ## Test docker image in a kind cluster with Karpenter drift and disruption detection enabled
128 | @hash kind 2>/dev/null && { \
129 | echo "Test docker image in a kind cluster with Karpenter drift and disruption detection..."; \
130 | ./internal/testing/local_env_prep_karpenter_helm.sh "${K8S_SHREDDER_VERSION}" "${KINDNODE_VERSION}" "${TEST_CLUSTERNAME_KARPENTER}" "${KUBECONFIG_KARPENTER}" && \
131 | ./internal/testing/cluster_upgrade_karpenter.sh "${TEST_CLUSTERNAME_KARPENTER}" "${KUBECONFIG_KARPENTER}" || \
132 | exit 1; \
133 | } || { \
134 | echo >&2 "[WARN] I require kind but it's not installed (see https://kind.sigs.k8s.io). Assuming a cluster is already accessible."; \
135 | }
136 | 
137 | 
138 | local-test-node-labels: build ## Test docker image in a kind cluster with node label detection enabled
139 | @hash kind 2>/dev/null && { \
140 | echo "Test docker image in a kind cluster with node label detection..."; \
141 | ./internal/testing/local_env_prep_node_labels_helm.sh "${K8S_SHREDDER_VERSION}" "${KINDNODE_VERSION}" "${TEST_CLUSTERNAME_NODE_LABELS}" "${KUBECONFIG_NODE_LABELS}" && \
142 | ./internal/testing/cluster_upgrade_node_labels.sh "${TEST_CLUSTERNAME_NODE_LABELS}" "${KUBECONFIG_NODE_LABELS}" || \
143 | exit 1; \
144 | } || { \
145 | echo >&2 "[WARN] I require kind but it's not installed (see https://kind.sigs.k8s.io). Assuming a cluster is already accessible."; \
146 | }
147 | 
148 | unit-test: ## Run unit tests
149 | @echo "Run unit tests for k8s-shredder..."
150 | @go test ./pkg/... -coverprofile=
151 | 
152 | 
153 | e2e-tests: ## Run e2e tests for k8s-shredder deployed in a local kind cluster
154 | @echo "Run e2e tests for k8s-shredder..."
155 | @if [ -f "${PWD}/${KUBECONFIG_KARPENTER}" ]; then \
156 | echo "Using Karpenter test cluster configuration..."; \
157 | PROJECT_ROOT=${PWD} KUBECONFIG=${PWD}/${KUBECONFIG_KARPENTER} go test internal/testing/e2e_test.go -v; \
158 | elif [ -f "${PWD}/${KUBECONFIG_NODE_LABELS}" ]; then \
159 | echo "Using node labels test cluster configuration..."; \
160 | PROJECT_ROOT=${PWD} KUBECONFIG=${PWD}/${KUBECONFIG_NODE_LABELS} go test internal/testing/e2e_test.go -v; \
161 | else \
162 | echo "Using default test cluster configuration..."; \
163 | PROJECT_ROOT=${PWD} KUBECONFIG=${PWD}/${KUBECONFIG_LOCALTEST} go test internal/testing/e2e_test.go -v; \
164 | fi
165 | 
166 | # DEMO targets
167 | # -----------
168 | .PHONY: demo.prep demo.run demo.rollback
169 | demo.prep: build ## Setup demo cluster
170 | echo "Setup demo cluster..."
171 | ./internal/testing/local_env_prep_helm.sh "${K8S_SHREDDER_VERSION}" "${KINDNODE_VERSION}" "${TEST_CLUSTERNAME}"
172 | 
173 | demo.run: ## Run demo
174 | ./internal/testing/cluster_upgrade.sh "${TEST_CLUSTERNAME}"
175 | 
176 | demo.rollback: ## Rollback demo
177 | ./internal/testing/rollback_cluster_upgrade.sh "${TEST_CLUSTERNAME}"
178 | 
179 | 
180 | ci: local-test e2e-tests clean ## Run CI
181 | 
182 | # PUBLISH
183 | # -----------
184 | publish: ## Release a new version
185 | @goreleaser release --clean
186 | 
187 | # CLEANUP
188 | # -----------
189 | clean: ## Clean up local testing environment
190 | @echo "Cleaning up your local testing environment..."
191 | @kind delete cluster --name="${TEST_CLUSTERNAME}" ## > /dev/null 2>&1 || true 192 | @kind delete cluster --name="${TEST_CLUSTERNAME_KARPENTER}" ## > /dev/null 2>&1 || true 193 | @kind delete cluster --name="${TEST_CLUSTERNAME_NODE_LABELS}" ## > /dev/null 2>&1 || true 194 | @echo "Removing all generated files and directories" 195 | @rm -rf dist/ k8s-shredder park-node kubeconfig ${KUBECONFIG_LOCALTEST} ${KUBECONFIG_KARPENTER} ${KUBECONFIG_NODE_LABELS} 196 | @echo "Done!" 197 | -------------------------------------------------------------------------------- /pkg/schedule/schedule.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 Adobe. All rights reserved. 3 | This file is licensed to you under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. You may obtain a copy 5 | of the License at http://www.apache.org/licenses/LICENSE/2.0 6 | Unless required by applicable law or agreed to in writing, software distributed under 7 | the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS 8 | OF ANY KIND, either express or implied. See the License for the specific language 9 | governing permissions and limitations under the License. 10 | */ 11 | 12 | package schedule 13 | 14 | import ( 15 | "fmt" 16 | "strings" 17 | "time" 18 | 19 | "github.com/pkg/errors" 20 | "github.com/robfig/cron/v3" 21 | ) 22 | 23 | // Schedule represents a time window defined by a cron schedule and duration 24 | type Schedule struct { 25 | // CronSchedule is the cron expression (supports macros like @daily, @hourly, etc.) 26 | CronSchedule string 27 | // Duration is how long the window stays active after the schedule triggers 28 | Duration time.Duration 29 | // parser is the cron parser instance 30 | parser cron.Parser 31 | // schedule is the parsed cron schedule 32 | schedule cron.Schedule 33 | } 34 | 35 | // NewSchedule creates a new Schedule instance from a cron expression and duration string 36 | // The cron expression supports standard cron syntax and macros (@yearly, @monthly, @weekly, @daily, @hourly) 37 | // The duration string supports compound durations with minutes and hours (e.g., "10h5m", "30m", "160h") 38 | func NewSchedule(cronExpr string, durationStr string) (*Schedule, error) { 39 | if cronExpr == "" { 40 | return nil, errors.New("cron schedule cannot be empty") 41 | } 42 | 43 | if durationStr == "" { 44 | return nil, errors.New("duration cannot be empty") 45 | } 46 | 47 | // Parse duration - supports compound durations like "10h5m", "30m", "160h" 48 | duration, err := parseDuration(durationStr) 49 | if err != nil { 50 | return nil, errors.Wrapf(err, "failed to parse duration: %s", durationStr) 51 | } 52 | 53 | if duration <= 0 { 54 | return nil, errors.New("duration must be greater than zero") 55 | } 56 | 57 | // Create parser with support for standard cron format and macros 58 | // Try parsing with seconds first (6 fields), then without seconds (5 fields - Kubernetes format) 59 | var schedule cron.Schedule 60 | var parser cron.Parser 61 | 62 | // First try with seconds (6 fields: second minute hour dom month dow) 63 | parser6 := cron.NewParser(cron.Second | cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow | cron.Descriptor) 64 | schedule, err = parser6.Parse(cronExpr) 65 | if err != nil { 66 | // If that fails, try without seconds (5 fields: minute hour dom month dow - Kubernetes format) 67 | parser5 := cron.NewParser(cron.Minute | 
cron.Hour | cron.Dom | cron.Month | cron.Dow | cron.Descriptor) 68 | schedule, err = parser5.Parse(cronExpr) 69 | if err != nil { 70 | return nil, errors.Wrapf(err, "failed to parse cron schedule: %s", cronExpr) 71 | } 72 | // Use the 5-field parser for future operations 73 | parser = parser5 74 | } else { 75 | parser = parser6 76 | } 77 | 78 | return &Schedule{ 79 | CronSchedule: cronExpr, 80 | Duration: duration, 81 | parser: parser, 82 | schedule: schedule, 83 | }, nil 84 | } 85 | 86 | // IsActive checks if the current time (or provided time) falls within the active window 87 | // The window is active from when the schedule triggers until Duration time has passed 88 | func (s *Schedule) IsActive(now time.Time) bool { 89 | if s.schedule == nil { 90 | return false 91 | } 92 | 93 | // Get the most recent time the schedule triggered (before or at now) 94 | // We need to find the last trigger time that is <= now 95 | lastTrigger := s.getLastTriggerTime(now) 96 | 97 | if lastTrigger.IsZero() { 98 | return false 99 | } 100 | 101 | // Check if we're still within the duration window 102 | windowEnd := lastTrigger.Add(s.Duration) 103 | return now.Before(windowEnd) || now.Equal(windowEnd) 104 | } 105 | 106 | // getLastTriggerTime finds the most recent time the schedule triggered before or at the given time 107 | func (s *Schedule) getLastTriggerTime(now time.Time) time.Time { 108 | // For macros, we can calculate directly for efficiency 109 | cronLower := strings.ToLower(s.CronSchedule) 110 | switch cronLower { 111 | case "@yearly", "@annually": 112 | // Triggers at 00:00:00 UTC on January 1st 113 | lastYear := time.Date(now.Year(), 1, 1, 0, 0, 0, 0, time.UTC) 114 | if lastYear.After(now) { 115 | lastYear = time.Date(now.Year()-1, 1, 1, 0, 0, 0, 0, time.UTC) 116 | } 117 | return lastYear 118 | case "@monthly": 119 | // Triggers at 00:00:00 UTC on the 1st of each month 120 | lastMonth := time.Date(now.Year(), now.Month(), 1, 0, 0, 0, 0, time.UTC) 121 | if lastMonth.After(now) { 122 | if now.Month() == 1 { 123 | lastMonth = time.Date(now.Year()-1, 12, 1, 0, 0, 0, 0, time.UTC) 124 | } else { 125 | lastMonth = time.Date(now.Year(), now.Month()-1, 1, 0, 0, 0, 0, time.UTC) 126 | } 127 | } 128 | return lastMonth 129 | case "@weekly": 130 | // Triggers at 00:00:00 UTC on Sunday 131 | lastWeek := now 132 | // Go back to the most recent Sunday 133 | for lastWeek.Weekday() != time.Sunday { 134 | lastWeek = lastWeek.AddDate(0, 0, -1) 135 | } 136 | lastWeek = time.Date(lastWeek.Year(), lastWeek.Month(), lastWeek.Day(), 0, 0, 0, 0, time.UTC) 137 | if lastWeek.After(now) { 138 | lastWeek = lastWeek.AddDate(0, 0, -7) 139 | } 140 | return lastWeek 141 | case "@daily", "@midnight": 142 | // Triggers at 00:00:00 UTC each day 143 | lastDay := time.Date(now.Year(), now.Month(), now.Day(), 0, 0, 0, 0, time.UTC) 144 | if lastDay.After(now) { 145 | lastDay = lastDay.AddDate(0, 0, -1) 146 | } 147 | return lastDay 148 | case "@hourly": 149 | // Triggers at the top of each hour 150 | lastHour := time.Date(now.Year(), now.Month(), now.Day(), now.Hour(), 0, 0, 0, time.UTC) 151 | if lastHour.After(now) { 152 | lastHour = lastHour.Add(-time.Hour) 153 | } 154 | return lastHour 155 | } 156 | 157 | // For standard cron expressions, iterate backwards to find the last trigger 158 | // We use a binary-search-like approach: start from now and go back 159 | checkWindow := s.getCheckWindow() 160 | checkTime := now 161 | maxIterations := 1000 // Safety limit 162 | 163 | for i := 0; i < maxIterations; i++ { 164 | // Get what the next 
trigger would be from (checkTime - 1 second) 165 | nextTrigger := s.schedule.Next(checkTime.Add(-time.Second)) 166 | 167 | // If the next trigger is at or before now, we found our last trigger 168 | if !nextTrigger.After(now) { 169 | return nextTrigger 170 | } 171 | 172 | // Move back in time - use a smart increment based on the schedule 173 | // For most schedules, going back by the minimum interval works 174 | // We'll go back by minutes, but could optimize further 175 | checkTime = checkTime.Add(-time.Minute) 176 | 177 | // Safety check: don't go back too far 178 | if now.Sub(checkTime) > checkWindow { 179 | break 180 | } 181 | } 182 | 183 | return time.Time{} 184 | } 185 | 186 | // getCheckWindow returns the maximum time window to check backwards 187 | // This is optimized based on the schedule type 188 | func (s *Schedule) getCheckWindow() time.Duration { 189 | cronLower := strings.ToLower(s.CronSchedule) 190 | 191 | // Handle macros 192 | switch cronLower { 193 | case "@yearly", "@annually": 194 | return 2 * 365 * 24 * time.Hour 195 | case "@monthly": 196 | return 2 * 30 * 24 * time.Hour 197 | case "@weekly": 198 | return 2 * 7 * 24 * time.Hour 199 | case "@daily", "@midnight": 200 | return 2 * 24 * time.Hour 201 | case "@hourly": 202 | return 2 * time.Hour 203 | default: 204 | // For standard cron, check up to 7 days back 205 | // This should cover most common schedules 206 | return 7 * 24 * time.Hour 207 | } 208 | } 209 | 210 | // parseDuration parses a duration string supporting compound durations 211 | // Supports formats like "10h5m", "30m", "160h", "1h30m", etc. 212 | // Only supports hours and minutes as per Karpenter's duration format 213 | func parseDuration(durationStr string) (time.Duration, error) { 214 | durationStr = strings.TrimSpace(durationStr) 215 | if durationStr == "" { 216 | return 0, errors.New("duration string cannot be empty") 217 | } 218 | 219 | var totalDuration time.Duration 220 | 221 | // Parse hours 222 | if strings.Contains(durationStr, "h") { 223 | parts := strings.Split(durationStr, "h") 224 | if len(parts) > 0 && parts[0] != "" { 225 | var hours int64 226 | _, err := fmt.Sscanf(parts[0], "%d", &hours) 227 | if err != nil { 228 | return 0, errors.Wrapf(err, "invalid hours in duration: %s", durationStr) 229 | } 230 | totalDuration += time.Duration(hours) * time.Hour 231 | } 232 | // Remaining part might contain minutes 233 | if len(parts) > 1 && parts[1] != "" { 234 | durationStr = parts[1] 235 | } else { 236 | durationStr = "" 237 | } 238 | } 239 | 240 | // Parse minutes 241 | if strings.Contains(durationStr, "m") { 242 | parts := strings.Split(durationStr, "m") 243 | if len(parts) > 0 && parts[0] != "" { 244 | var minutes int64 245 | _, err := fmt.Sscanf(parts[0], "%d", &minutes) 246 | if err != nil { 247 | return 0, errors.Wrapf(err, "invalid minutes in duration: %s", durationStr) 248 | } 249 | totalDuration += time.Duration(minutes) * time.Minute 250 | } 251 | } else if durationStr != "" { 252 | // If there's remaining string that's not "m", it's invalid 253 | return 0, errors.Errorf("invalid duration format: %s (only hours 'h' and minutes 'm' are supported)", durationStr) 254 | } 255 | 256 | if totalDuration == 0 { 257 | return 0, errors.New("duration must be greater than zero") 258 | } 259 | 260 | return totalDuration, nil 261 | } 262 | -------------------------------------------------------------------------------- /charts/k8s-shredder/README.md: -------------------------------------------------------------------------------- 1 | # k8s-shredder 2 
| 3 | ![Version: 0.2.8](https://img.shields.io/badge/Version-0.2.8-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: v0.3.8](https://img.shields.io/badge/AppVersion-v0.3.8-informational?style=flat-square) 4 | 5 | a novel way of dealing with kubernetes nodes blocked from draining 6 | 7 | **Homepage:** 8 | 9 | ## Maintainers 10 | 11 | | Name | Email | Url | 12 | | ---- | ------ | --- | 13 | | adriananeci | | | 14 | | sfotony | | | 15 | 16 | ## Values 17 | 18 | | Key | Type | Default | Description | 19 | |-----|------|---------|-------------| 20 | | additionalContainers | list | `[]` | Additional containers to run alongside k8s-shredder in the same pod | 21 | | affinity | object | `{}` | Affinity rules for advanced pod scheduling (node affinity, pod affinity/anti-affinity) | 22 | | deploymentStrategy | object | `{}` | Deployment strategy for rolling updates (e.g., RollingUpdate, Recreate) | 23 | | dryRun | bool | `false` | Enable dry-run mode - when true, k8s-shredder will log actions but not execute them | 24 | | environmentVars | list | `[]` | Additional environment variables to set in the container | 25 | | fullnameOverride | string | `""` | Override the full name used for resources | 26 | | image | object | `{"pullPolicy":"IfNotPresent","registry":"ghcr.io/adobe/k8s-shredder","tag":"latest"}` | Container image configuration | 27 | | image.pullPolicy | string | `"IfNotPresent"` | Image pull policy - IfNotPresent, Always, or Never | 28 | | image.registry | string | `"ghcr.io/adobe/k8s-shredder"` | Container registry where the k8s-shredder image is hosted | 29 | | image.tag | string | `"latest"` | Image tag to use | 30 | | imagePullSecrets | list | `[]` | Secrets for pulling images from private registries | 31 | | initContainers | list | `[]` | Init containers to run before the main k8s-shredder container starts | 32 | | logFormat | string | `"text"` | Log output format: text (human-readable) or json (structured logging) | 33 | | logLevel | string | `"debug"` | Available log levels: panic, fatal, error, warn, warning, info, debug, trace | 34 | | nameOverride | string | `""` | Override the name of the chart | 35 | | nodeSelector | object | `{}` | Node selector to constrain pod scheduling to specific nodes | 36 | | podAnnotations | object | `{}` | Annotations to add to k8s-shredder pod(s) | 37 | | podLabels | object | `{}` | Additional labels to add to k8s-shredder pod(s) | 38 | | podMonitor | object | `{"enabled":false,"honorLabels":true,"interval":"60s","labels":{},"relabelings":[],"scrapeTimeout":"10s"}` | Prometheus monitoring configuration | 39 | | podMonitor.enabled | bool | `false` | Enable creation of a PodMonitor resource for Prometheus scraping | 40 | | podMonitor.honorLabels | bool | `true` | Whether to honor labels from the target | 41 | | podMonitor.interval | string | `"60s"` | How often Prometheus should scrape metrics | 42 | | podMonitor.labels | object | `{}` | Labels to apply to the PodMonitor resource | 43 | | podMonitor.relabelings | list | `[]` | Metric relabeling configuration | 44 | | podMonitor.scrapeTimeout | string | `"10s"` | Timeout for each scrape attempt | 45 | | podSecurityContext | object | `{}` | Security context applied to the entire pod | 46 | | priorityClassName | string | `"system-cluster-critical"` | Priority class for pod scheduling - system-cluster-critical ensures high priority | 47 | | rbac | object | `{"create":true}` | RBAC (Role-Based Access Control) 
configuration | 48 | | rbac.create | bool | `true` | Create RBAC resources (ClusterRole, ClusterRoleBinding) | 49 | | replicaCount | int | `1` | Number of k8s-shredder pods to run | 50 | | resources | object | `{"limits":{"cpu":"1","memory":"1Gi"},"requests":{"cpu":"250m","memory":"250Mi"}}` | Resource requests and limits for the k8s-shredder container | 51 | | resources.limits.cpu | string | `"1"` | Maximum CPU cores the container can use | 52 | | resources.limits.memory | string | `"1Gi"` | Maximum memory the container can use | 53 | | resources.requests.cpu | string | `"250m"` | CPU cores requested for the container (guaranteed allocation) | 54 | | resources.requests.memory | string | `"250Mi"` | Memory requested for the container (guaranteed allocation) | 55 | | securityContext | object | `{}` | Security context applied to the k8s-shredder container | 56 | | service | object | `{"annotations":{},"create":false,"labels":{},"port":8080,"targetPort":"metrics","type":"ClusterIP"}` | Kubernetes service configuration | 57 | | service.annotations | object | `{}` | Additional annotations for the service | 58 | | service.create | bool | `false` | Create a service for k8s-shredder metrics endpoint | 59 | | service.labels | object | `{}` | Additional labels for the service | 60 | | service.port | int | `8080` | Service port for metrics endpoint | 61 | | service.targetPort | string | `"metrics"` | Target port for metrics endpoint | 62 | | service.type | string | `"ClusterIP"` | Service type (ClusterIP, NodePort, LoadBalancer) | 63 | | serviceAccount | object | `{"annotations":{},"create":true,"name":"k8s-shredder"}` | Kubernetes service account configuration | 64 | | serviceAccount.annotations | object | `{}` | Additional annotations for the service account (useful for IAM roles, etc.) 
| 65 | | serviceAccount.create | bool | `true` | Create a service account for k8s-shredder | 66 | | serviceAccount.name | string | `"k8s-shredder"` | Name of the service account | 67 | | shredder | object | `{"AllowEvictionLabel":"shredder.ethos.adobe.net/allow-eviction","ArgoRolloutsAPIVersion":"v1alpha1","EnableKarpenterDisruptionDetection":false,"EnableKarpenterDriftDetection":false,"EnableNodeLabelDetection":false,"EvictionLoopDuration":"","EvictionLoopInterval":"1h","EvictionLoopSchedule":"","EvictionSafetyCheck":true,"ExpiresOnLabel":"shredder.ethos.adobe.net/parked-node-expires-on","ExtraParkingLabels":{},"MaxParkedNodes":"0","NamespacePrefixSkipInitialEviction":"ns-ethos-","NodeLabelsToDetect":[],"ParkedByLabel":"shredder.ethos.adobe.net/parked-by","ParkedByValue":"k8s-shredder","ParkedNodeTTL":"168h","ParkedNodeTaint":"shredder.ethos.adobe.net/upgrade-status=parked:NoSchedule","ParkingReasonLabel":"shredder.ethos.adobe.net/parked-reason","RestartedAtAnnotation":"shredder.ethos.adobe.net/restartedAt","RollingRestartThreshold":0.1,"ToBeDeletedTaint":"ToBeDeletedByClusterAutoscaler","UpgradeStatusLabel":"shredder.ethos.adobe.net/upgrade-status"}` | Core k8s-shredder configuration | 68 | | shredder.AllowEvictionLabel | string | `"shredder.ethos.adobe.net/allow-eviction"` | Label to explicitly allow eviction on specific resources | 69 | | shredder.ArgoRolloutsAPIVersion | string | `"v1alpha1"` | API version for Argo Rollouts integration | 70 | | shredder.EnableKarpenterDisruptionDetection | bool | `false` | Enable Karpenter disruption detection for node lifecycle management | 71 | | shredder.EnableKarpenterDriftDetection | bool | `false` | Enable Karpenter drift detection for node lifecycle management | 72 | | shredder.EnableNodeLabelDetection | bool | `false` | Enable detection of nodes based on specific labels | 73 | | shredder.EvictionLoopDuration | string | `""` | Duration for how long the scheduled window stays active after the schedule triggers. Only used when EvictionLoopSchedule is set. Supports compound durations with hours and minutes (e.g., "10h5m", "30m", "160h"). Example: "10h" (window stays active for 10 hours), "30m" (window stays active for 30 minutes). | 74 | | shredder.EvictionLoopInterval | string | `"1h"` | How often to run the main eviction loop | 75 | | shredder.EvictionLoopSchedule | string | `""` | Optional cron schedule for when eviction operations are allowed. If set, parking and shredding operations will only occur during the scheduled time window. Supports standard cron syntax and macros (@yearly, @monthly, @weekly, @daily, @hourly). Example: "@daily" (runs at midnight UTC), "0 2 * * *" (runs at 2 AM UTC daily). When omitted, operations run continuously. | 76 | | shredder.EvictionSafetyCheck | bool | `true` | Controls whether to perform safety checks before force eviction | 77 | | shredder.ExpiresOnLabel | string | `"shredder.ethos.adobe.net/parked-node-expires-on"` | Label used to track when a parked node expires | 78 | | shredder.ExtraParkingLabels | object | `{}` | Additional labels to apply to nodes and pods during parking | 79 | | shredder.MaxParkedNodes | string | `"0"` | Maximum number of nodes that can be parked simultaneously. Can be an integer (e.g., "5") or percentage (e.g., "20%"). 
Set to "0" for no limit | 80 | | shredder.NamespacePrefixSkipInitialEviction | string | `"ns-ethos-"` | Namespace prefix to skip during initial eviction (useful for system namespaces) | 81 | | shredder.NodeLabelsToDetect | list | `[]` | List of node labels to monitor for triggering shredder actions | 82 | | shredder.ParkedByLabel | string | `"shredder.ethos.adobe.net/parked-by"` | Label to track which component parked a node | 83 | | shredder.ParkedByValue | string | `"k8s-shredder"` | Value set in the ParkedByLabel to identify k8s-shredder as the parking agent | 84 | | shredder.ParkedNodeTTL | string | `"168h"` | How long parked nodes should remain before being eligible for deletion (7 days default) | 85 | | shredder.ParkedNodeTaint | string | `"shredder.ethos.adobe.net/upgrade-status=parked:NoSchedule"` | Taint applied to parked nodes to prevent new pod scheduling | 86 | | shredder.ParkingReasonLabel | string | `"shredder.ethos.adobe.net/parked-reason"` | Label used to track why a node or pod was parked | 87 | | shredder.RestartedAtAnnotation | string | `"shredder.ethos.adobe.net/restartedAt"` | Annotation to track when a workload was last restarted | 88 | | shredder.RollingRestartThreshold | float | `0.1` | Maximum percentage of nodes that can be restarted simultaneously during rolling restarts | 89 | | shredder.ToBeDeletedTaint | string | `"ToBeDeletedByClusterAutoscaler"` | Taint indicating nodes scheduled for deletion by cluster autoscaler | 90 | | shredder.UpgradeStatusLabel | string | `"shredder.ethos.adobe.net/upgrade-status"` | Label used to track node upgrade status | 91 | | tolerations | list | `[]` | Tolerations to allow scheduling on nodes with specific taints | 92 | | topologySpreadConstraints | list | `[]` | Helps ensure high availability by spreading pods across zones/nodes | 93 | | volumes | list | `[]` | Additional volumes to mount in the pod | 94 | 95 | ---------------------------------------------- 96 | Autogenerated from chart metadata using [helm-docs v1.14.2](https://github.com/norwoodj/helm-docs/releases/v1.14.2) 97 | -------------------------------------------------------------------------------- /docs/e2e_tests.md: -------------------------------------------------------------------------------- 1 | # k8s-shredder End-to-End Tests 2 | 3 | This document describes all the end-to-end tests for k8s-shredder, including their functionality, skip conditions, and execution environments. 4 | 5 | ## Test Overview 6 | 7 | The e2e tests verify various aspects of k8s-shredder functionality including node parking, pod eviction, metrics collection, and safety checks. Tests are designed to run in different environments (standard, Karpenter, node-labels) and have specific skip conditions. 
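Which environment a particular run exercises is determined by the kind cluster and kubeconfig file created by the corresponding `make` target. As a rough sketch (the file names below are the Makefile defaults and should be treated as assumptions if you have overridden the corresponding variables), you can check what is currently available like this:

```bash
# List the kind clusters created by the local-test* make targets
kind get clusters

# `make e2e-tests` points the suite at whichever of these kubeconfig files
# exists in the project root, checked in this order
ls kubeconfig-local-test-karpenter \
   kubeconfig-local-test-node-labels \
   kubeconfig-localtest 2>/dev/null
```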
8 | 9 | ## Test Environments 10 | 11 | ### Standard Environment (`local-test`) 12 | - **Purpose**: Basic k8s-shredder functionality testing 13 | - **Cluster**: 4 nodes (control-plane, worker, worker2, worker3-monitoring) 14 | - **Features**: Standard node parking and pod eviction 15 | 16 | ### Karpenter Environment (`local-test-karpenter`) 17 | - **Purpose**: Karpenter drift detection testing 18 | - **Cluster**: 4 nodes (control-plane, worker, worker2, worker3-monitoring) 19 | - **Features**: Mock Karpenter CRDs, drift detection simulation 20 | 21 | ### Node Labels Environment (`local-test-node-labels`) 22 | - **Purpose**: Node label detection testing 23 | - **Cluster**: 5 nodes (control-plane, worker, worker2, worker3, worker4-monitoring) 24 | - **Features**: Node label detection, automatic parking based on labels 25 | 26 | ## Test Cases 27 | 28 | ### TestNodeIsCleanedUp 29 | **Always Run**: ✅ Yes (in all environments) 30 | 31 | **Purpose**: Verifies that k8s-shredder properly cleans up parked nodes after their TTL expires. 32 | 33 | **Steps**: 34 | 1. Parks a worker node with a 1-minute TTL 35 | 2. Waits for the TTL to expire 36 | 3. Verifies that all user pods are evicted from the node 37 | 4. Collects metrics to verify operation 38 | 39 | **Expected Result**: The node should be parked, then after TTL expiration, all user pods should be evicted. 40 | 41 | **Skip Conditions**: None - runs in all environments 42 | 43 | --- 44 | 45 | ### TestShredderMetrics 46 | **Always Run**: ✅ Yes (in all environments) 47 | 48 | **Purpose**: Verifies that k8s-shredder metrics are properly collected and exposed via Prometheus. 49 | 50 | **Steps**: 51 | 1. Collects metrics from Prometheus 52 | 2. Verifies that expected metrics are present 53 | 3. Logs metric values for verification 54 | 55 | **Expected Result**: Should find metrics like `shredder_processed_pods_total`, `shredder_errors_total`, etc. 56 | 57 | **Skip Conditions**: None - runs in all environments 58 | 59 | **Note**: Requires Prometheus to be running on the dedicated monitoring node (worker3/worker4) 60 | 61 | --- 62 | 63 | ### TestArgoRolloutRestartAt 64 | **Always Run**: ✅ Yes (in all environments) 65 | 66 | **Purpose**: Verifies that k8s-shredder properly sets the `restartAt` field on Argo Rollouts. 67 | 68 | **Steps**: 69 | 1. Waits for the Argo Rollout to have its `restartAt` field set 70 | 2. Verifies the field is properly configured 71 | 72 | **Expected Result**: The Argo Rollout should have a `restartAt` field set to a future timestamp. 73 | 74 | **Skip Conditions**: None - runs in all environments 75 | 76 | --- 77 | 78 | ### TestKarpenterMetrics 79 | **Conditional Run**: Only in Karpenter environment 80 | 81 | **Purpose**: Verifies Karpenter-specific metrics when drift detection is enabled. 82 | 83 | **Steps**: 84 | 1. Collects Karpenter-specific metrics from Prometheus 85 | 2. Verifies expected Karpenter metrics are present 86 | 3. Logs metric values for verification 87 | 88 | **Expected Result**: Should find metrics like `shredder_karpenter_drifted_nodes_total`, `shredder_karpenter_nodes_parked_total`, etc. 89 | 90 | **Skip Conditions**: 91 | - ❌ Not running in Karpenter test environment 92 | - ❌ Prometheus not accessible 93 | 94 | --- 95 | 96 | ### TestNodeLabelMetrics 97 | **Conditional Run**: Only in node-labels environment 98 | 99 | **Purpose**: Verifies node label detection metrics when node label detection is enabled. 100 | 101 | **Steps**: 102 | 1. Collects node label detection metrics from Prometheus 103 | 2. 
Verifies expected node label metrics are present 104 | 3. Logs metric values for verification 105 | 106 | **Expected Result**: Should find metrics like `shredder_node_label_nodes_parked_total`, `shredder_node_label_matching_nodes_total`, etc. 107 | 108 | **Skip Conditions**: 109 | - ❌ Not running in node-labels test environment 110 | - ❌ Prometheus not accessible 111 | 112 | --- 113 | 114 | ### TestEvictionSafetyCheck 115 | **Conditional Run**: Only when EvictionSafetyCheck is enabled 116 | 117 | **Purpose**: Tests the EvictionSafetyCheck failure case - verifies that nodes are unparked when pods lack proper parking labels. 118 | 119 | **Steps**: 120 | 1. Scale k8s-shredder replicas to zero to disable actions 121 | 2. Park the worker2 node and all pods on it (properly labels all existing pods) 122 | 3. Create a new pod without proper parking labels on worker2 123 | 4. Create a PodDisruptionBudget to prevent soft eviction of the unlabeled pod 124 | 5. Scale k8s-shredder replicas to 1 to start the test 125 | 6. Monitor worker2 parking status - it should be unparked due to safety check failure 126 | 127 | **Expected Result**: The node should be unparked because the EvictionSafetyCheck detects that not all pods have proper parking labels. 128 | 129 | **Skip Conditions**: 130 | - ❌ EvictionSafetyCheck is disabled in k8s-shredder configuration 131 | - ❌ Running in Karpenter or node-labels test environments (different node structures) 132 | - ❌ Cannot access k8s-shredder-config configmap 133 | 134 | --- 135 | 136 | ### TestEvictionSafetyCheckPasses 137 | **Conditional Run**: Only when EvictionSafetyCheck is enabled 138 | 139 | **Purpose**: Tests the EvictionSafetyCheck success case - verifies that force eviction proceeds when all pods are properly labeled. 140 | 141 | **Steps**: 142 | 1. Scale k8s-shredder replicas to zero to disable actions 143 | 2. Park the worker2 node and all pods on it (this properly labels all pods) 144 | 3. Scale k8s-shredder replicas to 1 to start the test 145 | 4. Monitor worker2 parking status - it should remain parked until TTL expires, then get force evicted 146 | 147 | **Expected Result**: The node should remain parked and eventually be force evicted because all pods have proper parking labels. 148 | 149 | **Skip Conditions**: 150 | - ❌ EvictionSafetyCheck is disabled in k8s-shredder configuration 151 | - ❌ Running in Karpenter or node-labels test environments (different node structures) 152 | - ❌ Cannot access k8s-shredder-config configmap 153 | 154 | ## Running the Tests 155 | 156 | ### Prerequisites 157 | 158 | 1. A running kind cluster with k8s-shredder deployed 159 | 2. The `park-node` binary built and available 160 | 3. 
Prometheus running on the dedicated monitoring node 161 | 162 | ### Running All Tests 163 | 164 | ```bash 165 | # Build the park-node binary 166 | make build 167 | 168 | # Run all e2e tests 169 | make e2e-tests 170 | ``` 171 | 172 | ### Running Specific Test Environments 173 | 174 | ```bash 175 | # Standard environment 176 | make local-test 177 | 178 | # Karpenter environment 179 | make local-test-karpenter 180 | 181 | # Node labels environment 182 | make local-test-node-labels 183 | ``` 184 | 185 | ### Running Individual Tests 186 | 187 | ```bash 188 | # Run specific test 189 | PROJECT_ROOT=${PWD} KUBECONFIG=${PWD}/kubeconfig-localtest go test internal/testing/e2e_test.go -v -run TestShredderMetrics 190 | 191 | # Run all EvictionSafetyCheck tests 192 | PROJECT_ROOT=${PWD} KUBECONFIG=${PWD}/kubeconfig-localtest go test internal/testing/e2e_test.go -v -run 'TestEvictionSafetyCheck.*' 193 | ``` 194 | 195 | ## Test Configuration 196 | 197 | ### EvictionSafetyCheck Configuration 198 | 199 | The EvictionSafetyCheck tests check the `k8s-shredder-config` ConfigMap in the `kube-system` namespace for the `EvictionSafetyCheck: true` setting. If this setting is not found or is set to `false`, the tests will be skipped. 200 | 201 | ### Prometheus Configuration 202 | 203 | All metrics tests require Prometheus to be running on the dedicated monitoring node. The monitoring node is configured with: 204 | - **Node Label**: `monitoring=dedicated` 205 | - **Node Taint**: `monitoring=dedicated:NoSchedule` 206 | - **Prometheus Node Selector**: `monitoring: dedicated` 207 | - **Prometheus Toleration**: For the `monitoring=dedicated:NoSchedule` taint 208 | 209 | This ensures Prometheus is never affected by k8s-shredder node parking operations. 210 | 211 | ## PodDisruptionBudget Usage 212 | 213 | In the `TestEvictionSafetyCheck` failure test case, a PodDisruptionBudget is created to prevent the unlabeled pod from being evicted by normal "soft" eviction mechanisms before the EvictionSafetyCheck runs. This ensures that: 214 | 215 | 1. The pod remains on the node when k8s-shredder performs the safety check 216 | 2. The safety check can properly detect the missing parking labels 217 | 3. The node gets unparked as expected 218 | 219 | The PDB uses `minAvailable: 1` and targets the specific test pod using the `test-pod: "true"` label selector. 220 | 221 | ## Test Results Interpretation 222 | 223 | ### Successful Test Results 224 | - **PASS**: Test completed successfully with expected behavior 225 | - **SKIP**: Test was skipped due to environment or configuration conditions 226 | 227 | ### Failed Test Results 228 | - **FAIL**: Test failed due to unexpected behavior or errors 229 | - **TIMEOUT**: Test exceeded maximum execution time 230 | 231 | ### Common Skip Reasons 232 | - `EvictionSafetyCheck is disabled in k8s-shredder configuration` 233 | - `not running in a Karpenter test environment` 234 | - `not running in a node labels test environment` 235 | - `Prometheus is not accessible after 30 retries` 236 | 237 | ## Troubleshooting 238 | 239 | ### Prometheus Issues 240 | If metrics tests are failing with "Prometheus port not set" errors: 241 | 1. Check that Prometheus is running: `kubectl get pods -n kube-system | grep prometheus` 242 | 2. Verify Prometheus is on the monitoring node: `kubectl get pods -n kube-system -o wide | grep prometheus` 243 | 3. Check node labels and taints: `kubectl describe node ` 244 | 245 | ### EvictionSafetyCheck Issues 246 | If EvictionSafetyCheck tests are being skipped: 247 | 1. 
Check the configmap: `kubectl get configmap k8s-shredder-config -n kube-system -o yaml` 248 | 2. Verify `EvictionSafetyCheck: true` is set in the configuration 249 | 3. Ensure you're running in the standard test environment (not Karpenter or node-labels) 250 | 251 | ### Node Parking Issues 252 | If node parking tests are failing: 253 | 1. Check k8s-shredder logs: `kubectl logs -n kube-system -l app.kubernetes.io/name=k8s-shredder` 254 | 2. Verify the park-node binary exists: `ls -la park-node` 255 | 3. Check node status: `kubectl get nodes` 256 | --------------------------------------------------------------------------------