├── .github
│   ├── CODEOWNERS
│   ├── workflows
│   │   ├── release-chart.yaml
│   │   ├── release.yaml
│   │   ├── ci-chart.yaml
│   │   └── ci.yaml
│   ├── ISSUE_TEMPLATE.md
│   ├── dependabot.yml
│   └── PULL_REQUEST_TEMPLATE.md
├── docs
│   ├── k8s-shredder.gif
│   ├── loop_diagram.png
│   ├── k8s-shredder-logo.png
│   ├── shredder_firefly.png
│   ├── architecture.md
│   ├── loop-diagram.md
│   ├── node-parking.md
│   ├── metrics.md
│   └── e2e_tests.md
├── renovate.json
├── pyproject.toml
├── .gitignore
├── .yamlfix
├── charts
│   └── k8s-shredder
│       ├── templates
│       │   ├── service-account.yaml
│       │   ├── NOTES.txt
│       │   ├── cluster-role-binding.yaml
│       │   ├── service.yaml
│       │   ├── cluster-role.yaml
│       │   ├── podmonitor.yaml
│       │   ├── configmap.yaml
│       │   ├── _helpers.tpl
│       │   └── deployment.yaml
│       ├── .helmignore
│       ├── Chart.yaml
│       ├── values.yaml
│       └── README.md
├── Dockerfile
├── main.go
├── internal
│   ├── testing
│   │   ├── rollback_cluster_upgrade.sh
│   │   ├── cluster_upgrade.sh
│   │   ├── kind.yaml
│   │   ├── kind-karpenter.yaml
│   │   ├── kind-node-labels.yaml
│   │   ├── rbac.yaml
│   │   ├── test_eviction_safety_check.sh
│   │   ├── park_node.go
│   │   ├── prometheus_stuffs.yaml
│   │   ├── prometheus_stuffs_karpenter.yaml
│   │   ├── prometheus_stuffs_node_labels.yaml
│   │   ├── k8s-shredder.yaml
│   │   ├── k8s-shredder-karpenter.yaml
│   │   ├── k8s-shredder-node-labels.yaml
│   │   ├── karpenter-manifests.yaml
│   │   ├── local_env_prep_helm.sh
│   │   ├── local_env_prep_node_labels_helm.sh
│   │   ├── cluster_upgrade_node_labels.sh
│   │   ├── test_apps.yaml
│   │   └── local_env_prep_karpenter_helm.sh
│   └── check_license.sh
├── cmd
│   └── park-node
│       └── main.go
├── pkg
│   ├── utils
│   │   ├── signal.go
│   │   ├── context.go
│   │   ├── k8s.go
│   │   └── node_label_detection.go
│   ├── metrics
│   │   ├── types.go
│   │   └── metrics.go
│   ├── config
│   │   └── config.go
│   └── schedule
│       └── schedule.go
├── RELEASE.md
├── CONTRIBUTING.md
├── .goreleaser.yml
├── config.yaml
├── CODE_OF_CONDUCT.md
├── go.mod
└── Makefile
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | # Maintainers
2 | * @adobe/ethos
--------------------------------------------------------------------------------
/docs/k8s-shredder.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/adobe/k8s-shredder/HEAD/docs/k8s-shredder.gif
--------------------------------------------------------------------------------
/docs/loop_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/adobe/k8s-shredder/HEAD/docs/loop_diagram.png
--------------------------------------------------------------------------------
/docs/k8s-shredder-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/adobe/k8s-shredder/HEAD/docs/k8s-shredder-logo.png
--------------------------------------------------------------------------------
/docs/shredder_firefly.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/adobe/k8s-shredder/HEAD/docs/shredder_firefly.png
--------------------------------------------------------------------------------
/renovate.json:
--------------------------------------------------------------------------------
1 | {
2 | "$schema": "https://docs.renovatebot.com/renovate-schema.json",
3 | "extends": [
4 | "config:recommended"
5 | ]
6 | }
7 |
--------------------------------------------------------------------------------
/docs/architecture.md:
--------------------------------------------------------------------------------
1 | # Architecture
2 |
3 |
4 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.yamlfix]
2 | # Preserve block scalar format and prevent conversion to quoted strings
3 | # This configuration helps maintain readable YAML block scalars
4 |
5 | # Don't convert multiline strings to quoted format
6 | # This preserves the |- and | block scalar indicators
7 | preserve_block_scalars = true
8 |
9 | # Maintain original formatting where possible
10 | preserve_formatting = true
11 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # OSX os files
2 | .DS_Store
3 | .DS_Store?
4 |
5 | # build artifacts
6 | kubeconfig*
7 | dist
8 | /k8s-shredder
9 | my-k8s-shredder-values.yaml
10 | /park-node
11 |
12 | # Test binary, build with `go test -c`
13 | *.test
14 |
15 | # Output of the go coverage tool, specifically when used with LiteIDE
16 | *.out
17 |
18 | # editor and IDE paraphernalia
19 | .idea
20 | *.swp
21 | *.swo
22 | *~
23 | .vscode
24 |
--------------------------------------------------------------------------------
/.yamlfix:
--------------------------------------------------------------------------------
1 | # yamlfix configuration
2 | # Preserve block scalar format and prevent conversion to quoted strings
3 |
4 | # Configuration options for yamlfix
5 | # This file should be in the root directory of the project
6 |
7 | # Preserve block scalar format (|- and |)
8 | # This prevents conversion to quoted strings with escaped newlines
9 | preserve_block_scalars: true
10 |
11 | # Maintain original formatting
12 | preserve_formatting: true
13 |
--------------------------------------------------------------------------------
/charts/k8s-shredder/templates/service-account.yaml:
--------------------------------------------------------------------------------
1 | {{ if .Values.serviceAccount.create }}
2 | apiVersion: v1
3 | kind: ServiceAccount
4 | metadata:
5 | name: {{ include "k8s-shredder.serviceAccountName" . }}
6 | namespace: {{ .Release.Namespace }}
7 | labels:
8 | {{ include "k8s-shredder.labels" . | indent 4 }}
9 | {{- with .Values.serviceAccount.annotations }}
10 | annotations:
11 | {{- toYaml . | nindent 4 }}
12 | {{- end }}
13 | {{ end }}
--------------------------------------------------------------------------------
/charts/k8s-shredder/.helmignore:
--------------------------------------------------------------------------------
1 | # Patterns to ignore when building packages.
2 | # This supports shell glob matching, relative path matching, and
3 | # negation (prefixed with !). Only one pattern per line.
4 | .DS_Store
5 | # Common VCS dirs
6 | .git/
7 | .gitignore
8 | .bzr/
9 | .bzrignore
10 | .hg/
11 | .hgignore
12 | .svn/
13 | # Common backup files
14 | *.swp
15 | *.bak
16 | *.tmp
17 | *~
18 | # Various IDEs
19 | .project
20 | .idea/
21 | *.tmproj
22 | .vscode/
--------------------------------------------------------------------------------
/charts/k8s-shredder/templates/NOTES.txt:
--------------------------------------------------------------------------------
1 | 1. Get the application metrics URL by running these commands:
2 |
3 | export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "k8s-shredder.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}")
4 | echo "Visit http://127.0.0.1:8080/metrics to get shredder metrics"
5 | kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080
6 |
--------------------------------------------------------------------------------
/charts/k8s-shredder/Chart.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | apiVersion: v2
3 | name: k8s-shredder
4 | description: a novel way of dealing with kubernetes nodes blocked from draining
5 | type: application
6 | home: https://github.com/adobe/k8s-shredder
7 | icon: https://raw.githubusercontent.com/adobe/k8s-shredder/main/docs/k8s-shredder_logo.jpg
8 | maintainers:
9 | - name: adriananeci
10 | email: aneci@adobe.com
11 | url: https://adobe.com
12 | - name: sfotony
13 | email: gosselin@adobe.com
14 | url: https://adobe.com
15 | version: 0.2.8
16 | appVersion: v0.3.8
17 |
--------------------------------------------------------------------------------
/charts/k8s-shredder/templates/cluster-role-binding.yaml:
--------------------------------------------------------------------------------
1 | {{ if .Values.rbac.create}}
2 | apiVersion: rbac.authorization.k8s.io/v1
3 | kind: ClusterRoleBinding
4 | metadata:
5 | name: {{ include "k8s-shredder.fullname" . }}
6 | labels:
7 | {{ include "k8s-shredder.labels" . | indent 4 }}
8 | roleRef:
9 | apiGroup: rbac.authorization.k8s.io
10 | kind: ClusterRole
11 | name: {{ include "k8s-shredder.fullname" . }}
12 | subjects:
13 | - kind: ServiceAccount
14 | name: {{ include "k8s-shredder.serviceAccountName" . }}
15 | namespace: {{ .Release.Namespace }}
16 | {{ end }}
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM alpine AS show_me_your_security
2 | # Install the Certificate-Authority certificates for the app to be able to make
3 | # calls to HTTPS endpoints.
4 | RUN apk add --no-cache ca-certificates
5 |
6 | # The second stage, create a small final image
7 | FROM scratch
9 | # Copy the /etc/passwd file from the builder stage so user definitions are available
10 | # for running as a non-root user, as a security best practice.
10 | COPY --from=show_me_your_security /etc/passwd /etc/passwd
11 | # Copy the certs from the builder stage
12 | COPY --from=show_me_your_security /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/
13 | # copy our binary
14 | COPY k8s-shredder /k8s-shredder
15 | ENTRYPOINT ["/k8s-shredder"]
16 |
--------------------------------------------------------------------------------
/.github/workflows/release-chart.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | name: Release Charts
3 | on:
4 | push:
5 | branches: [main]
6 | jobs:
7 | release:
8 | runs-on: ubuntu-latest
9 | steps:
10 | - name: Checkout
11 | uses: actions/checkout@v6
12 | with:
13 | fetch-depth: 0
14 | - name: Configure Git
15 | run: |
16 | git config user.name "$GITHUB_ACTOR"
17 | git config user.email "$GITHUB_ACTOR@users.noreply.github.com"
18 | - name: Run chart-releaser
19 | uses: helm/chart-releaser-action@v1.7.0
20 | env:
21 | CR_TOKEN: ${{ secrets.GITHUB_TOKEN }}
22 | CR_SKIP_EXISTING: true
23 | CR_RELEASE_NAME_TEMPLATE: Helm-Chart-v{{ .Version }}
24 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | ### Expected Behaviour
5 |
6 | ### Actual Behaviour
7 |
8 | ### Reproduce Scenario (including but not limited to)
9 |
10 | #### Steps to Reproduce
11 |
12 | #### Platform and Version
13 |
14 | #### Sample Code that illustrates the problem
15 |
16 | #### Logs taken while reproducing problem
17 |
--------------------------------------------------------------------------------
/main.go:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright 2025 Adobe. All rights reserved.
3 | This file is licensed to you under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License. You may obtain a copy
5 | of the License at http://www.apache.org/licenses/LICENSE-2.0
6 | Unless required by applicable law or agreed to in writing, software distributed under
7 | the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
8 | OF ANY KIND, either express or implied. See the License for the specific language
9 | governing permissions and limitations under the License.
10 | */
11 |
12 | package main
13 |
14 | import "github.com/adobe/k8s-shredder/cmd"
15 |
16 | func main() {
17 | cmd.Execute()
18 | }
19 |
--------------------------------------------------------------------------------
/charts/k8s-shredder/templates/service.yaml:
--------------------------------------------------------------------------------
1 | {{- if .Values.service.create }}
2 | apiVersion: v1
3 | kind: Service
4 | metadata:
5 | name: {{ include "k8s-shredder.fullname" . }}
6 | namespace: {{ .Release.Namespace }}
7 | labels:
8 | {{ include "k8s-shredder.labels" . | indent 4 }}
9 | {{- if .Values.service.labels }}
10 | {{ toYaml .Values.service.labels | indent 4 }}
11 | {{- end }}
12 | {{- if .Values.service.annotations }}
13 | annotations:
14 | {{ toYaml .Values.service.annotations | indent 4 }}
15 | {{- end }}
16 | spec:
17 | type: {{ .Values.service.type }}
18 | selector:
19 | {{ include "k8s-shredder.matchLabels" . | indent 4 }}
20 | ports:
21 | - name: metrics
22 | port: {{ .Values.service.port }}
23 | targetPort: {{ .Values.service.targetPort }}
24 | protocol: TCP
25 | {{- end }}
26 |
--------------------------------------------------------------------------------
/charts/k8s-shredder/templates/cluster-role.yaml:
--------------------------------------------------------------------------------
1 | {{ if .Values.rbac.create }}
2 | apiVersion: rbac.authorization.k8s.io/v1
3 | kind: ClusterRole
4 | metadata:
5 | name: {{ include "k8s-shredder.fullname" . }}
6 | labels:
7 | {{ include "k8s-shredder.labels" . | indent 4 }}
8 | rules:
9 | - apiGroups: ["*"]
10 | resources: [nodes]
11 | verbs: [get, list, watch, update, patch]
12 | - apiGroups: ["*"]
13 | resources: [pods, pods/eviction]
14 | verbs: ["*"]
15 | - apiGroups: [apps, extensions]
16 | resources: [statefulsets, deployments, replicasets]
17 | verbs: [get, list, watch, update, patch]
18 | - apiGroups: [ "argoproj.io" ]
19 | resources: [ rollouts ]
20 | verbs: [ get, list, watch, update, patch ]
21 | - apiGroups: [ "karpenter.sh" ]
22 | resources: [ nodeclaims ]
23 | verbs: [ get, list, watch ]
24 | {{ end }}
25 |
--------------------------------------------------------------------------------
/internal/testing/rollback_cluster_upgrade.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -e
3 |
4 | K8S_CLUSTER_NAME=$1
5 |
6 | # For moving node back as active, useful during debug process
7 | export K8S_CLUSTER_NAME=k8s-shredder-test-cluster
8 | kubectl uncordon "${K8S_CLUSTER_NAME}-worker" --kubeconfig=kubeconfig
9 | kubectl label node "${K8S_CLUSTER_NAME}-worker" --kubeconfig=kubeconfig shredder.ethos.adobe.net/upgrade-status-
10 | kubectl label node "${K8S_CLUSTER_NAME}-worker" --kubeconfig=kubeconfig --overwrite shredder.ethos.adobe.net/parked-node-expires-on-
11 | kubectl delete -n ns-k8s-shredder-test $(kubectl get pods -n ns-k8s-shredder-test -oname) --force --wait=0 --timeout=0
12 | kubectl delete -n ns-team-k8s-shredder-test $(kubectl get pods -n ns-team-k8s-shredder-test -oname) --force --wait=0 --timeout=0
13 | kubectl get po -A --field-selector=spec.nodeName=k8s-shredder-test-cluster-worker
--------------------------------------------------------------------------------
/docs/loop-diagram.md:
--------------------------------------------------------------------------------
1 | # Loop diagram
2 |
3 | 1. Get all nodes with `labels=shredder.ethos.adobe.net/upgrade-status=parked` and `taints!=ToBeDeletedByClusterAutoscaler`.
4 | 2. Loop through all parked nodes.
5 | 3. Check if the node has reached `shredder.ethos.adobe.net/parked-node-expires-on` time.
6 | 4. Force delete all pods from the node.
7 | 5. Get all pods from the parked node.
8 | 6. Loop through each pod.
9 | 7. Check if the pod is part of a skipped eviction namespace.
10 | 8. Check if the controller object that owns the pod has a rollout restart already in progress.
11 | 9. Check if the elapsed time is greater than the time node needs to be parked.
12 | 10. Perform a rollout restart of the controller object which owns the pod.
13 | 11. Check if the pod has the label `shredder.ethos.adobe.net/allow-eviction=false` attached.
14 | 12. Evict the pod.
15 |
16 | 
17 |
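18 | Below is a minimal, hypothetical Go sketch of the loop above. The types and helper
19 | functions are stand-ins for the real client-go calls k8s-shredder makes, and the way the
20 | rolling-restart threshold is applied in step 9 is an assumption for illustration only.
21 |
22 | ```go
23 | // Hypothetical, simplified sketch of the eviction loop; the types and helpers below
24 | // stand in for the real Kubernetes API calls made by k8s-shredder.
25 | package main
26 |
27 | import (
28 | 	"fmt"
29 | 	"time"
30 | )
31 |
32 | type Pod struct {
33 | 	Name      string
34 | 	Namespace string
35 | 	Labels    map[string]string
36 | }
37 |
38 | type Node struct {
39 | 	Name      string
40 | 	ExpiresOn time.Time // parsed from shredder.ethos.adobe.net/parked-node-expires-on
41 | 	Pods      []Pod
42 | }
43 |
44 | // listParkedNodes stands in for listing nodes labeled
45 | // shredder.ethos.adobe.net/upgrade-status=parked that lack the ToBeDeletedByClusterAutoscaler taint.
46 | func listParkedNodes() []Node { return nil }
47 |
48 | func forceDeleteAllPods(n Node)           { fmt.Println("force deleting pods on", n.Name) }
49 | func inSkippedNamespace(p Pod) bool       { return false }
50 | func rolloutRestartInProgress(p Pod) bool { return false }
51 | func rolloutRestartOwner(p Pod)           { fmt.Println("rollout restart for owner of", p.Name) }
52 | func evict(p Pod)                         { fmt.Println("evicting", p.Name) }
53 |
54 | func evictionLoop(parkedNodeTTL time.Duration, rollingRestartThreshold float64) {
55 | 	for _, node := range listParkedNodes() { // steps 1-2
56 | 		if time.Now().After(node.ExpiresOn) { // step 3: parking TTL has expired
57 | 			forceDeleteAllPods(node) // step 4
58 | 			continue
59 | 		}
60 | 		for _, pod := range node.Pods { // steps 5-6
61 | 			if inSkippedNamespace(pod) || rolloutRestartInProgress(pod) { // steps 7-8
62 | 				continue
63 | 			}
64 | 			elapsed := parkedNodeTTL - time.Until(node.ExpiresOn)
65 | 			// step 9: parked longer than the rolling-restart threshold?
66 | 			if elapsed.Seconds() > rollingRestartThreshold*parkedNodeTTL.Seconds() {
67 | 				rolloutRestartOwner(pod) // step 10
68 | 			}
69 | 			if pod.Labels["shredder.ethos.adobe.net/allow-eviction"] == "false" { // step 11
70 | 				continue
71 | 			}
72 | 			evict(pod) // step 12
73 | 		}
74 | 	}
75 | }
76 |
77 | func main() {
78 | 	// Illustrative values only (ParkedNodeTTL and RollingRestartThreshold).
79 | 	evictionLoop(30*time.Second, 0.5)
80 | }
81 | ```
82 |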
--------------------------------------------------------------------------------
/charts/k8s-shredder/templates/podmonitor.yaml:
--------------------------------------------------------------------------------
1 | {{ if .Values.podMonitor.enabled }}
2 | apiVersion: monitoring.coreos.com/v1
3 | kind: PodMonitor
4 | metadata:
5 | name: {{ include "k8s-shredder.fullname" . }}
6 | namespace: {{ .Release.Namespace }}
7 | labels:
8 | {{ include "k8s-shredder.labels" . | indent 4 }}
9 | {{- with .Values.podMonitor.labels }}
10 | {{- toYaml . | nindent 4 }}
11 | {{- end }}
12 | spec:
13 | jobLabel: "k8s-shredder"
14 | namespaceSelector:
15 | matchNames:
16 | - {{ .Release.Namespace }}
17 | podMetricsEndpoints:
18 | - interval: {{ .Values.podMonitor.interval }}
19 | path: /metrics
20 | port: metrics
21 | scheme: http
22 | honorLabels: {{ .Values.podMonitor.honorLabels }}
23 | {{- if .Values.podMonitor.relabelings }}
24 | relabelings:
25 | {{- toYaml .Values.podMonitor.relabelings | nindent 8 }}
26 | {{- end }}
27 | selector:
28 | matchLabels:
29 | {{ include "k8s-shredder.matchLabels" . | indent 6 }}
30 | {{- end }}
--------------------------------------------------------------------------------
/cmd/park-node/main.go:
--------------------------------------------------------------------------------
1 | // Copyright 2025 Adobe. All rights reserved.
2 | package main
3 |
4 | import (
5 | "flag"
6 | "log"
7 | "os"
8 |
9 | e2e "github.com/adobe/k8s-shredder/internal/testing"
10 | )
11 |
12 | func main() {
13 | var nodeName, kubeconfigPath string
14 |
15 | // Use a custom flag set to avoid conflicts with client-go flags
16 | fs := flag.NewFlagSet("park-node", flag.ExitOnError)
17 | fs.StringVar(&nodeName, "node", "", "Name of the node to park")
18 | fs.StringVar(&kubeconfigPath, "park-kubeconfig", "", "Path to kubeconfig file")
19 | if err := fs.Parse(os.Args[1:]); err != nil {
20 | log.Fatal(err)
21 | }
22 |
23 | if nodeName == "" {
24 | log.Fatal("Node name is required. Use -node flag")
25 | }
26 | if kubeconfigPath == "" {
27 | log.Fatal("Kubeconfig path is required. Use -park-kubeconfig flag")
28 | }
29 |
30 | if err := e2e.ParkNodeForTesting(nodeName, kubeconfigPath); err != nil {
31 | log.Fatalf("Failed to park node: %v", err)
32 | }
33 |
34 | log.Printf("Successfully parked node %s", nodeName)
35 | os.Exit(0)
36 | }
37 |
--------------------------------------------------------------------------------
/pkg/utils/signal.go:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright 2025 Adobe. All rights reserved.
3 | This file is licensed to you under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License. You may obtain a copy
5 | of the License at http://www.apache.org/licenses/LICENSE-2.0
6 | Unless required by applicable law or agreed to in writing, software distributed under
7 | the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
8 | OF ANY KIND, either express or implied. See the License for the specific language
9 | governing permissions and limitations under the License.
10 | */
11 |
12 | package utils
13 |
14 | import (
15 | "context"
16 | "os"
17 | "os/signal"
18 | "syscall"
19 |
20 | log "github.com/sirupsen/logrus"
21 | )
22 |
23 | // HandleOsSignals gracefully handles OS signals
24 | func HandleOsSignals(cancel context.CancelFunc) {
25 | c := make(chan os.Signal, 1)
26 | signal.Notify(c,
27 | syscall.SIGHUP,
28 | syscall.SIGINT,
29 | syscall.SIGTERM,
30 | syscall.SIGQUIT,
31 | )
32 |
33 | sig := <-c
34 | log.Debugf("Got signal %s, terminating gracefully", sig.String())
35 | cancel()
36 | os.Exit(0)
37 | }
38 |
--------------------------------------------------------------------------------
/internal/testing/cluster_upgrade.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | K8S_CLUSTER_NAME=$1
5 | KUBECONFIG_FILE=${2:-kubeconfig}
6 |
7 | echo "K8S_SHREDDER: Simulating cluster upgrade..."
8 | echo "K8S_SHREDDER: Parking k8s-shredder-worker with proper pod labeling and a TTL of 1 minute!"
9 |
10 | # Use the park-node binary to properly park the node (labels both node and pods)
11 | ./park-node -node "${K8S_CLUSTER_NAME}-worker" -park-kubeconfig "${KUBECONFIG_FILE}"
12 |
13 | if [[ ${WAIT_FOR_PODS:-false} == "true" ]]
14 | then
15 | while [[ $pod_status != "No resources found" ]]
16 | do
17 | echo "Info: Waiting for all pods to be evicted from the node..."
18 | sleep 10
19 | pod_status=$(kubectl get pods -A --field-selector metadata.namespace!=kube-system,metadata.namespace!=local-path-storage,spec.nodeName=k8s-shredder-test-cluster-worker 2>&1 >/dev/null)
20 | done
21 |
22 | # Simulate completion of the upgrade process by removing the parking labels from the node.
23 | kubectl label node "${K8S_CLUSTER_NAME}-worker" --kubeconfig=${KUBECONFIG_FILE} shredder.ethos.adobe.net/upgrade-status-
24 | kubectl label node "${K8S_CLUSTER_NAME}-worker" --kubeconfig=${KUBECONFIG_FILE} --overwrite shredder.ethos.adobe.net/parked-node-expires-on-
25 | fi
26 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | ---
2 | # Please see the documentation for all configuration options: https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
3 | version: 2
4 | updates:
5 | # github-actions
6 | - directory: /
7 | package-ecosystem: github-actions
8 | schedule:
9 | interval: weekly
10 | time: 09:00
11 | # Use Europe/Bucharest Standard Time (UTC +02:00)
12 | timezone: Europe/Bucharest
13 | commit-message:
14 | prefix: dependabot
15 | include: scope
16 | labels:
17 | - kind/cleanup
18 | - dependabot
19 | # Go
20 | - directory: /
21 | package-ecosystem: gomod
22 | schedule:
23 | interval: weekly
24 | time: 09:00
25 | # Use Europe/Bucharest Standard Time (UTC +02:00)
26 | timezone: Europe/Bucharest
27 | commit-message:
28 | prefix: dependabot
29 | include: scope
30 | # TODO decide if we should enable ignore
31 | # ignore:
32 | # # Ignore controller-runtime as its upgraded manually.
33 | # - dependency-name: "sigs.k8s.io/controller-runtime"
34 | # # Ignore k8s and its transitives modules as they are upgraded manually together with controller-runtime.
35 | # - dependency-name: "k8s.io/*"
36 | labels: [kind/cleanup, dependabot]
37 |
--------------------------------------------------------------------------------
/internal/testing/kind.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | kind: Cluster
3 | apiVersion: kind.x-k8s.io/v1alpha4
4 | networking:
5 | apiServerPort: 6443
6 | apiServerAddress: 0.0.0.0
7 | nodes:
8 | - role: control-plane
9 | extraPortMappings:
10 | - containerPort: 30007
11 | hostPort: 30007
12 | kubeadmConfigPatches:
13 | - |
14 | kind: InitConfiguration
15 | nodeRegistration:
16 | kubeletExtraArgs:
17 | node-labels: "node.kubernetes.io/role=etcd,node.kubernetes.io/role=master"
18 | - role: worker
19 | kubeadmConfigPatches:
20 | - |
21 | kind: JoinConfiguration
22 | nodeRegistration:
23 | kubeletExtraArgs:
24 | node-labels: "node.kubernetes.io/role=worker,will-be-parked=yes"
25 | - role: worker
26 | kubeadmConfigPatches:
27 | - |-
28 | kind: JoinConfiguration
29 | nodeRegistration:
30 | kubeletExtraArgs:
31 | node-labels: "node.kubernetes.io/role=worker"
32 | - role: worker
33 | kubeadmConfigPatches:
34 | - |-
35 | kind: JoinConfiguration
36 | nodeRegistration:
37 | kubeletExtraArgs:
38 | node-labels: "node.kubernetes.io/role=worker,monitoring=dedicated"
39 | taints:
40 | - key: "monitoring"
41 | value: "dedicated"
42 | effect: "NoSchedule"
43 |
--------------------------------------------------------------------------------
/internal/testing/kind-karpenter.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | kind: Cluster
3 | apiVersion: kind.x-k8s.io/v1alpha4
4 | networking:
5 | apiServerPort: 6444
6 | apiServerAddress: 0.0.0.0
7 | nodes:
8 | - role: control-plane
9 | extraPortMappings:
10 | - containerPort: 30007
11 | hostPort: 30008
12 | kubeadmConfigPatches:
13 | - |
14 | kind: InitConfiguration
15 | nodeRegistration:
16 | kubeletExtraArgs:
17 | node-labels: "node.kubernetes.io/role=etcd,node.kubernetes.io/role=master"
18 | - role: worker
19 | kubeadmConfigPatches:
20 | - |
21 | kind: JoinConfiguration
22 | nodeRegistration:
23 | kubeletExtraArgs:
24 | node-labels: "node.kubernetes.io/role=worker,will-be-parked=yes"
25 | - role: worker
26 | kubeadmConfigPatches:
27 | - |-
28 | kind: JoinConfiguration
29 | nodeRegistration:
30 | kubeletExtraArgs:
31 | node-labels: "node.kubernetes.io/role=worker"
32 | - role: worker
33 | kubeadmConfigPatches:
34 | - |-
35 | kind: JoinConfiguration
36 | nodeRegistration:
37 | kubeletExtraArgs:
38 | node-labels: "node.kubernetes.io/role=worker,monitoring=dedicated"
39 | taints:
40 | - key: "monitoring"
41 | value: "dedicated"
42 | effect: "NoSchedule"
43 |
--------------------------------------------------------------------------------
/.github/workflows/release.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | name: release
3 | on:
4 | push:
5 | tags: [v*]
6 | permissions:
7 | contents: write # needed to write releases
8 | id-token: write # needed for keyless signing
9 | packages: write # needed for ghcr access
10 | jobs:
11 | release:
12 | runs-on: ubuntu-latest
13 | steps:
14 | - uses: actions/checkout@v6
15 | with:
16 | fetch-depth: 0 # this is important, otherwise it won't checkout the full tree (i.e. no previous tags)
17 | # Add support for more platforms with QEMU (optional)
18 | # https://github.com/docker/setup-qemu-action
19 | - name: Set up QEMU
20 | uses: docker/setup-qemu-action@v3
21 | - name: Set up Docker Buildx
22 | uses: docker/setup-buildx-action@v3
23 | - uses: actions/setup-go@v6
24 | with:
25 | go-version: '1.25'
26 | cache: true
27 | - uses: sigstore/cosign-installer@v4.0.0 # installs cosign
28 | # - uses: anchore/sbom-action/download-syft@v0.14.1 # installs syft
29 | - uses: docker/login-action@v3 # login to ghcr
30 | with:
31 | registry: ghcr.io
32 | username: ${{ github.repository_owner }}
33 | password: ${{ secrets.GITHUB_TOKEN }}
34 | - uses: goreleaser/goreleaser-action@v6 # run goreleaser
35 | with:
36 | version: latest
37 | args: release --clean
38 | env:
39 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
40 |
--------------------------------------------------------------------------------
/internal/testing/kind-node-labels.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | kind: Cluster
3 | apiVersion: kind.x-k8s.io/v1alpha4
4 | networking:
5 | apiServerPort: 6445
6 | apiServerAddress: 0.0.0.0
7 | nodes:
8 | - role: control-plane
9 | extraPortMappings:
10 | - containerPort: 30007
11 | hostPort: 30009
12 | kubeadmConfigPatches:
13 | - |
14 | kind: InitConfiguration
15 | nodeRegistration:
16 | kubeletExtraArgs:
17 | node-labels: "node.kubernetes.io/role=etcd,node.kubernetes.io/role=master"
18 | - role: worker
19 | kubeadmConfigPatches:
20 | - |
21 | kind: JoinConfiguration
22 | nodeRegistration:
23 | kubeletExtraArgs:
24 | node-labels: "node.kubernetes.io/role=worker"
25 | - role: worker
26 | kubeadmConfigPatches:
27 | - |-
28 | kind: JoinConfiguration
29 | nodeRegistration:
30 | kubeletExtraArgs:
31 | node-labels: "node.kubernetes.io/role=worker"
32 | - role: worker
33 | kubeadmConfigPatches:
34 | - |-
35 | kind: JoinConfiguration
36 | nodeRegistration:
37 | kubeletExtraArgs:
38 | node-labels: "node.kubernetes.io/role=worker"
39 | - role: worker
40 | kubeadmConfigPatches:
41 | - |-
42 | kind: JoinConfiguration
43 | nodeRegistration:
44 | kubeletExtraArgs:
45 | node-labels: "node.kubernetes.io/role=worker,monitoring=dedicated"
46 | taints:
47 | - key: "monitoring"
48 | value: "dedicated"
49 | effect: "NoSchedule"
50 |
--------------------------------------------------------------------------------
/.github/workflows/ci-chart.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | name: Lint and Test Helm Chart
3 | on: pull_request
4 | jobs:
5 | lint-test:
6 | runs-on: ubuntu-latest
7 | steps:
8 | - name: Checkout
9 | uses: actions/checkout@v6
10 | with:
11 | fetch-depth: 0
12 | - name: Set up Helm
13 | uses: azure/setup-helm@v4
14 | with:
15 | version: v3.12.1
16 | # Python is required because `ct lint` runs Yamale (https://github.com/23andMe/Yamale) and
17 | # yamllint (https://github.com/adrienverge/yamllint) which require Python
18 | - name: Set up Python
19 | uses: actions/setup-python@v6
20 | with:
21 | python-version: '3.12'
22 | check-latest: true
23 | - name: Set up chart-testing
24 | uses: helm/chart-testing-action@v2.8.0
25 | - name: Run chart-testing (list-changed)
26 | id: list-changed
27 | run: |
28 | changed=$(ct list-changed --target-branch ${{ github.event.repository.default_branch }})
29 | if [[ -n "$changed" ]]; then
30 | echo "changed=true" >> "$GITHUB_OUTPUT"
31 | fi
32 | - name: Run chart-testing (lint)
33 | if: steps.list-changed.outputs.changed == 'true'
34 | run: ct lint --target-branch ${{ github.event.repository.default_branch }}
35 | - name: Create kind cluster
36 | if: steps.list-changed.outputs.changed == 'true'
37 | uses: helm/kind-action@v1.13.0
38 | with:
39 | version: v0.29.0
40 | - name: Run chart-testing (install)
41 | if: steps.list-changed.outputs.changed == 'true'
42 | run: ct install --target-branch ${{ github.event.repository.default_branch }}
43 |
--------------------------------------------------------------------------------
/internal/testing/rbac.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | apiVersion: v1
3 | kind: ServiceAccount
4 | metadata:
5 | name: k8s-shredder
6 | namespace: kube-system
7 | ---
8 | apiVersion: rbac.authorization.k8s.io/v1
9 | kind: ClusterRoleBinding
10 | metadata:
11 | name: k8s-shredder
12 | roleRef:
13 | apiGroup: rbac.authorization.k8s.io
14 | kind: ClusterRole
15 | name: k8s-shredder
16 | subjects:
17 | - kind: ServiceAccount
18 | name: k8s-shredder
19 | namespace: kube-system
20 | ---
21 | apiVersion: rbac.authorization.k8s.io/v1
22 | kind: ClusterRole
23 | metadata:
24 | name: k8s-shredder
25 | rules:
26 | - apiGroups: ['*']
27 | resources: [nodes]
28 | verbs: [get, list, watch, update, patch]
29 | - apiGroups: ['*']
30 | resources: [pods, pods/eviction]
31 | verbs: ['*']
32 | - apiGroups: [apps, extensions]
33 | resources: [statefulsets, deployments, replicasets]
34 | verbs: [get, list, watch, update, patch]
35 | - apiGroups: [argoproj.io]
36 | resources: [rollouts]
37 | verbs: [get, list, watch, update, patch]
38 | - apiGroups: [karpenter.sh]
39 | resources: [nodeclaims]
40 | verbs: [get, list, watch]
41 | ---
42 | apiVersion: rbac.authorization.k8s.io/v1
43 | kind: ClusterRole
44 | metadata:
45 | name: edit-debug-flags-v
46 | rules:
47 | - apiGroups: ['']
48 | resources: [nodes/proxy]
49 | verbs: [update]
50 | - nonResourceURLs: [/debug/flags/v]
51 | verbs: [put]
52 | ---
53 | apiVersion: rbac.authorization.k8s.io/v1
54 | kind: ClusterRoleBinding
55 | metadata:
56 | name: edit-debug-flags-v
57 | roleRef:
58 | apiGroup: rbac.authorization.k8s.io
59 | kind: ClusterRole
60 | name: edit-debug-flags-v
61 | subjects:
62 | - kind: ServiceAccount
63 | name: default
64 | namespace: default
65 |
--------------------------------------------------------------------------------
/RELEASE.md:
--------------------------------------------------------------------------------
1 | ## Creating a new release of `k8s-shredder`
2 |
3 | All k8s-shredder binaries, container images and the Helm chart are released using GitHub Actions workflows.
4 | See [release workflow](.github/workflows/release.yaml) and [release chart workflow](.github/workflows/release-chart.yaml) for more details.
5 |
6 | To publish a new release, follow the steps below:
7 |
8 | ```
9 | export NEW_VERSION=vX.Y.Z
10 | git tag -a ${NEW_VERSION} -m "Release ${NEW_VERSION}"
11 | git push origin ${NEW_VERSION}
12 | ```
13 |
14 | ## Manually releasing a new `k8s-shredder` version
15 |
16 | The release process uses [`goreleaser`](https://goreleaser.com/). You must install it before you can
17 | release a new version.
18 | The `goreleaser` configuration can be found in the [goreleaser file](.goreleaser.yml).
19 |
20 | GoReleaser requires an API token with the `repo` scope selected to deploy the artifacts to GitHub.
21 | You can generate a new token from the [tokens section](https://github.com/settings/tokens/new). For more details see
22 | [creating-a-personal-access-token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token).
23 |
24 | To publish a new release, follow the steps below:
25 |
26 | ```
27 | export NEW_VERSION=vX.Y.Z
28 | git tag -a ${NEW_VERSION} -m "Release ${NEW_VERSION}"
29 | git push origin ${NEW_VERSION}
30 |
31 | export GITHUB_TOKEN=
32 |
33 | docker login ghcr.io
34 | Username:
35 | Password:
36 |
37 | make publish
38 | ```
39 |
40 | You can verify that the new release and its associated artifacts were properly pushed to GitHub by visiting
41 | [k8s-shredder releases](https://github.com/adobe/k8s-shredder/releases).
--------------------------------------------------------------------------------
/internal/testing/test_eviction_safety_check.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # EvictionSafetyCheck E2E Test Script
4 | #
5 | # This script runs the EvictionSafetyCheck e2e tests for k8s-shredder.
6 | # The tests verify that k8s-shredder properly validates that all pods on a parked node
7 | # have the required parking labels before proceeding with force eviction.
8 | #
9 | # Test Cases:
10 | # 1. TestEvictionSafetyCheck (Failure Case): Tests that nodes are unparked when pods lack proper labels
11 | # 2. TestEvictionSafetyCheckPasses (Success Case): Tests that force eviction proceeds when all pods are properly labeled
12 | #
13 | # The failure test includes a PodDisruptionBudget step to prevent soft eviction of the unlabeled pod,
14 | # ensuring the pod remains on the node when the safety check runs.
15 | #
16 | # Prerequisites:
17 | # - A running kind cluster with k8s-shredder deployed
18 | # - EvictionSafetyCheck enabled in the k8s-shredder configuration
19 | # - The park-node binary built and available
20 | #
21 | # Usage:
22 | # ./test_eviction_safety_check.sh
23 | #
24 | # The tests will automatically skip if:
25 | # - EvictionSafetyCheck is disabled in the k8s-shredder configuration
26 | # - Running in Karpenter or node-labels test environments (different node structures)
27 |
28 | set -e
29 |
30 | echo "Running EvictionSafetyCheck E2E Tests..."
31 |
32 | # Check if we're in the right directory
33 | if [ ! -f "internal/testing/e2e_test.go" ]; then
34 | echo "Error: This script must be run from the k8s-shredder project root"
35 | exit 1
36 | fi
37 |
38 | # Build the park-node binary if it doesn't exist
39 | if [ ! -f "park-node" ]; then
40 | echo "Building park-node binary..."
41 | make build
42 | fi
43 |
44 | # Run the e2e tests
45 | echo "Running e2e tests..."
46 | make e2e-tests
47 |
48 | echo "EvictionSafetyCheck E2E Tests completed!"
49 |
--------------------------------------------------------------------------------
/pkg/utils/context.go:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright 2025 Adobe. All rights reserved.
3 | This file is licensed to you under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License. You may obtain a copy
5 | of the License at http://www.apache.org/licenses/LICENSE-2.0
6 | Unless required by applicable law or agreed to in writing, software distributed under
7 | the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
8 | OF ANY KIND, either express or implied. See the License for the specific language
9 | governing permissions and limitations under the License.
10 | */
11 |
12 | package utils
13 |
14 | import (
15 | "context"
16 |
17 | "github.com/adobe/k8s-shredder/pkg/config"
18 | "k8s.io/client-go/dynamic"
19 |
20 | "k8s.io/client-go/kubernetes"
21 | )
22 |
23 | // AppContext struct stores a context and a k8s client
24 | type AppContext struct {
25 | Context context.Context
26 | K8sClient kubernetes.Interface
27 | DynamicK8SClient dynamic.Interface
28 | Config config.Config
29 | dryRun bool
30 | }
31 |
32 | // NewAppContext creates a new AppContext object
33 | func NewAppContext(cfg config.Config, dryRun bool) (*AppContext, error) {
34 | client, err := getK8SClient()
35 | if err != nil {
36 | return nil, err
37 | }
38 |
39 | dynamicClient, err := getDynamicK8SClient()
40 | if err != nil {
41 | return nil, err
42 | }
43 |
44 | ctx, cancel := context.WithCancel(context.Background())
45 |
46 | go HandleOsSignals(cancel)
47 |
48 | return &AppContext{
49 | Context: ctx,
50 | K8sClient: client,
51 | DynamicK8SClient: dynamicClient,
52 | Config: cfg,
53 | dryRun: dryRun,
54 | }, nil
55 | }
56 |
57 | // IsDryRun returns true if the "--dry-run" flag was provided
58 | func (ac *AppContext) IsDryRun() bool {
59 | return ac.dryRun
60 | }
61 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## Description
4 |
5 |
6 |
7 | ## Related Issue
8 |
9 |
10 |
11 |
12 |
13 |
14 | ## Motivation and Context
15 |
16 |
17 |
18 | ## How Has This Been Tested?
19 |
20 |
21 |
22 |
23 |
24 | ## Screenshots (if appropriate):
25 |
26 | ## Types of changes
27 |
28 |
29 |
30 | - [ ] Bug fix (non-breaking change which fixes an issue)
31 | - [ ] New feature (non-breaking change which adds functionality)
32 | - [ ] Breaking change (fix or feature that would cause existing functionality to change)
33 |
34 | ## Checklist:
35 |
36 |
37 |
38 |
39 | - [ ] I have signed the [Adobe Open Source CLA](https://opensource.adobe.com/cla.html).
40 | - [ ] My code follows the code style of this project.
41 | - [ ] My change requires a change to the documentation.
42 | - [ ] I have updated the documentation accordingly.
43 | - [ ] I have read the **CONTRIBUTING** document.
44 | - [ ] I have added tests to cover my changes.
45 | - [ ] All new and existing tests passed.
46 |
--------------------------------------------------------------------------------
/charts/k8s-shredder/templates/configmap.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ConfigMap
3 | metadata:
4 | name: {{ include "k8s-shredder.fullname" . }}-config
5 | namespace: {{ .Release.Namespace }}
6 | labels:
7 | {{ include "k8s-shredder.labels" . | indent 4 }}
8 | data:
9 | config.yaml: |-
10 | EvictionLoopInterval: "{{.Values.shredder.EvictionLoopInterval}}"
11 | {{- if .Values.shredder.EvictionLoopSchedule }}
12 | EvictionLoopSchedule: "{{.Values.shredder.EvictionLoopSchedule}}"
13 | {{- end }}
14 | {{- if .Values.shredder.EvictionLoopDuration }}
15 | EvictionLoopDuration: "{{.Values.shredder.EvictionLoopDuration}}"
16 | {{- end }}
17 | ParkedNodeTTL: "{{.Values.shredder.ParkedNodeTTL}}"
18 | RollingRestartThreshold: "{{.Values.shredder.RollingRestartThreshold}}"
19 | UpgradeStatusLabel: "{{.Values.shredder.UpgradeStatusLabel}}"
20 | ExpiresOnLabel: "{{.Values.shredder.ExpiresOnLabel}}"
21 | NamespacePrefixSkipInitialEviction: "{{.Values.shredder.NamespacePrefixSkipInitialEviction}}"
22 | RestartedAtAnnotation: "{{.Values.shredder.RestartedAtAnnotation}}"
23 | AllowEvictionLabel: "{{.Values.shredder.AllowEvictionLabel}}"
24 | ToBeDeletedTaint: "{{.Values.shredder.ToBeDeletedTaint}}"
25 | ArgoRolloutsAPIVersion: "{{.Values.shredder.ArgoRolloutsAPIVersion}}"
26 | EnableKarpenterDriftDetection: {{.Values.shredder.EnableKarpenterDriftDetection}}
27 | EnableKarpenterDisruptionDetection: {{.Values.shredder.EnableKarpenterDisruptionDetection}}
28 | ParkedByLabel: "{{.Values.shredder.ParkedByLabel}}"
29 | ParkedByValue: "{{.Values.shredder.ParkedByValue}}"
30 | ParkedNodeTaint: "{{.Values.shredder.ParkedNodeTaint}}"
31 | EnableNodeLabelDetection: {{.Values.shredder.EnableNodeLabelDetection}}
32 | NodeLabelsToDetect: {{.Values.shredder.NodeLabelsToDetect | toJson}}
33 | MaxParkedNodes: {{.Values.shredder.MaxParkedNodes}}
34 | EvictionSafetyCheck: {{.Values.shredder.EvictionSafetyCheck}}
35 | ParkingReasonLabel: "{{.Values.shredder.ParkingReasonLabel}}"
36 | ExtraParkingLabels: {{.Values.shredder.ExtraParkingLabels | toJson}}
37 |
--------------------------------------------------------------------------------
/internal/check_license.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -ueo pipefail
3 |
4 | CURRENT_YEAR=$(date +%Y)
5 | export CURRENT_YEAR
6 |
7 | # Function to update copyright year in Go files
8 | update_go_copyright_year() {
9 | local file=$1
10 | local temp_file=$(mktemp)
11 |
12 | # Check if file has a copyright header
13 | if head -n3 "$file" | grep -q "Copyright.*20[0-9]\{2\}"; then
14 | # Update the year to current year
15 | echo "Processing file: $file"
16 | # Now do the replacement
17 | sed "s/202[0-9]/$CURRENT_YEAR/g" "$file" > "$temp_file"
18 | else
19 | # Add copyright header if missing
20 | echo "// Copyright $CURRENT_YEAR Adobe. All rights reserved." > "$temp_file"
21 | cat "$file" >> "$temp_file"
22 | fi
23 |
24 | # Replace original file with modified content
25 | mv "$temp_file" "$file"
26 | }
27 |
28 | # Function to update copyright year in LICENSE file
29 | update_license_copyright_year() {
30 | local file=$1
31 | local temp_file=$(mktemp)
32 |
33 | echo "Processing LICENSE file"
34 |
35 | # Update only the line containing "Copyright 2022 Adobe"
36 | sed "s/Copyright 202[0-9] Adobe/Copyright $CURRENT_YEAR Adobe/g" "$file" > "$temp_file"
37 |
38 | # Replace original file with modified content
39 | mv "$temp_file" "$file"
40 | }
41 |
42 | export -f update_go_copyright_year
43 | export -f update_license_copyright_year
44 |
45 | # Update LICENSE file if it exists
46 | if [ -f "LICENSE" ]; then
47 | update_license_copyright_year "LICENSE"
48 | fi
49 |
50 | # Find all Go files and update their copyright headers
51 | find . -type f -iname '*.go' ! -path '*/vendor/*' -exec bash -c 'update_go_copyright_year "$1"' _ {} \;
52 |
53 | # Check if any files are missing the license header
54 | licRes=$(
55 | find . -type f -iname '*.go' ! -path '*/vendor/*' -exec \
56 | sh -c 'head -n3 $1 | grep -Eq "(Copyright|generated|GENERATED)" || echo "$1"' {} {} \;
57 | )
58 |
59 | if [ -n "${licRes}" ]; then
60 | echo -e "License header is missing in:\\n${licRes}"
61 | exit 255
62 | fi
63 |
--------------------------------------------------------------------------------
/internal/testing/park_node.go:
--------------------------------------------------------------------------------
1 | // Copyright 2025 Adobe. All rights reserved.
2 | package e2e
3 |
4 | import (
5 | "context"
6 | "time"
7 |
8 | "github.com/adobe/k8s-shredder/pkg/config"
9 | "github.com/adobe/k8s-shredder/pkg/utils"
10 | log "github.com/sirupsen/logrus"
11 | "k8s.io/client-go/kubernetes"
12 | "k8s.io/client-go/tools/clientcmd"
13 | )
14 |
15 | // ParkNodeForTesting properly parks a node using the ParkNodes function
16 | func ParkNodeForTesting(nodeName string, kubeconfigPath string) error {
17 | // Load kubeconfig from file without registering flags
18 | kubeconfig, err := clientcmd.LoadFromFile(kubeconfigPath)
19 | if err != nil {
20 | return err
21 | }
22 |
23 | k8sConfig, err := clientcmd.NewDefaultClientConfig(*kubeconfig, &clientcmd.ConfigOverrides{}).ClientConfig()
24 | if err != nil {
25 | return err
26 | }
27 |
28 | // Create Kubernetes client
29 | clientset, err := kubernetes.NewForConfig(k8sConfig)
30 | if err != nil {
31 | return err
32 | }
33 |
34 | // Create test configuration
35 | cfg := config.Config{
36 | ParkedNodeTTL: 1 * time.Minute, // 1 minute TTL for testing
37 | UpgradeStatusLabel: "shredder.ethos.adobe.net/upgrade-status",
38 | ExpiresOnLabel: "shredder.ethos.adobe.net/parked-node-expires-on",
39 | ParkedByLabel: "shredder.ethos.adobe.net/parked-by",
40 | ParkedByValue: "k8s-shredder",
41 | ParkedNodeTaint: "shredder.ethos.adobe.net/upgrade-status=parked:NoSchedule",
42 | EvictionSafetyCheck: true, // Keep safety check enabled
43 | ExtraParkingLabels: map[string]string{},
44 | ParkingReasonLabel: "shredder.ethos.adobe.net/parked-reason",
45 | }
46 |
47 | // Create logger
48 | logEntry := log.NewEntry(log.New())
49 |
50 | // Create node info for parking
51 | nodesToPark := []utils.NodeInfo{
52 | {
53 | Name: nodeName,
54 | Labels: map[string]string{},
55 | },
56 | }
57 |
58 | // Park the node (this will label both node and pods)
59 | ctx := context.Background()
60 | err = utils.ParkNodes(ctx, clientset, nodesToPark, cfg, false, "e2e-test", logEntry)
61 | if err != nil {
62 | return err
63 | }
64 |
65 | logEntry.Infof("Successfully parked node %s with proper pod labeling", nodeName)
66 | return nil
67 | }
68 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | Thanks for choosing to contribute!
4 |
5 | The following are a set of guidelines to follow when contributing to this project.
6 |
7 | ## Code Of Conduct
8 |
9 | This project adheres to the Adobe [code of conduct](../CODE_OF_CONDUCT.md). By participating,
10 | you are expected to uphold this code. Please report unacceptable behavior to
11 | [Grp-opensourceoffice@adobe.com](mailto:Grp-opensourceoffice@adobe.com).
12 |
13 | ## Have A Question?
14 |
15 | Start by filing an issue. The existing committers on this project work to reach
16 | consensus around project direction and issue solutions within issue threads
17 | (when appropriate).
18 |
19 | ## Contributor License Agreement
20 |
21 | All third-party contributions to this project must be accompanied by a signed contributor
22 | license agreement. This gives Adobe permission to redistribute your contributions
23 | as part of the project. [Sign our CLA](https://opensource.adobe.com/cla.html). You
24 | only need to submit an Adobe CLA one time, so if you have submitted one previously,
25 | you are good to go!
26 |
27 | ## Code Reviews
28 |
29 | All submissions should come in the form of pull requests and need to be reviewed
30 | by project committers. Read [GitHub's pull request documentation](https://help.github.com/articles/about-pull-requests/)
31 | for more information on sending pull requests.
32 |
33 | Lastly, please follow the [pull request template](.github/PULL_REQUEST_TEMPLATE.md) when
34 | submitting a pull request!
35 |
36 | ## From Contributor To Committer
37 |
38 | We love contributions from our community! If you'd like to go a step beyond contributor
39 | and become a committer with full write access and a say in the project, you must
40 | be invited to the project. The existing committers employ an internal nomination
41 | process that must reach lazy consensus (silence is approval) before invitations
42 | are issued. If you feel you are qualified and want to get more deeply involved,
43 | feel free to reach out to existing committers to have a conversation about that.
44 |
45 | ## Security Issues
46 |
47 | Security issues shouldn't be reported on this issue tracker. Instead, [file an issue with our security experts](https://helpx.adobe.com/security/alertus.html).
--------------------------------------------------------------------------------
/internal/testing/prometheus_stuffs.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | apiVersion: apps/v1
3 | kind: Deployment
4 | metadata:
5 | name: prometheus
6 | namespace: kube-system
7 | labels:
8 | app: prometheus
9 | spec:
10 | replicas: 1
11 | selector:
12 | matchLabels:
13 | app: prometheus
14 | template:
15 | metadata:
16 | labels:
17 | app: prometheus
18 | spec:
19 | nodeSelector:
20 | monitoring: dedicated
21 | tolerations:
22 | - key: monitoring
23 | value: dedicated
24 | effect: NoSchedule
25 | containers:
26 | - name: prometheus
27 | image: prom/prometheus:v2.54.1
28 | args:
29 | - --storage.tsdb.retention.time=1h
30 | - --config.file=/etc/prometheus/prometheus.yml
31 | - --storage.tsdb.path=/prometheus/
32 | ports:
33 | - containerPort: 9090
34 | resources:
35 | requests:
36 | cpu: 500m
37 | memory: 500M
38 | limits:
39 | cpu: '1'
40 | memory: 1Gi
41 | volumeMounts:
42 | - name: prometheus-config-volume
43 | mountPath: /etc/prometheus/
44 | - name: prometheus-storage-volume
45 | mountPath: /prometheus/
46 | volumes:
47 | - name: prometheus-config-volume
48 | configMap:
49 | defaultMode: 420
50 | name: prometheus-server-conf
51 | - name: prometheus-storage-volume
52 | emptyDir: {}
53 | ---
54 | apiVersion: v1
55 | kind: Service
56 | metadata:
57 | name: prometheus
58 | namespace: kube-system
59 | spec:
60 | type: NodePort
61 | selector:
62 | app: prometheus
63 | ports:
64 | - port: 9090
65 | targetPort: 9090
66 | nodePort: 30007
67 | ---
68 | apiVersion: v1
69 | kind: ConfigMap
70 | metadata:
71 | name: prometheus-server-conf
72 | labels:
73 | name: prometheus-server-conf
74 | namespace: kube-system
75 | data:
76 | prometheus.yml: |-
77 | global:
78 | scrape_interval: 5s
79 | evaluation_interval: 5s
80 | scrape_configs:
81 | - job_name: 'k8s-shredder'
82 | static_configs:
83 | - targets: ['k8s-shredder.kube-system.svc.cluster.local:8080']
84 |
--------------------------------------------------------------------------------
/internal/testing/prometheus_stuffs_karpenter.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | apiVersion: apps/v1
3 | kind: Deployment
4 | metadata:
5 | name: prometheus
6 | namespace: kube-system
7 | labels:
8 | app: prometheus
9 | spec:
10 | replicas: 1
11 | selector:
12 | matchLabels:
13 | app: prometheus
14 | template:
15 | metadata:
16 | labels:
17 | app: prometheus
18 | spec:
19 | nodeSelector:
20 | monitoring: dedicated
21 | tolerations:
22 | - key: monitoring
23 | value: dedicated
24 | effect: NoSchedule
25 | containers:
26 | - name: prometheus
27 | image: prom/prometheus:v2.54.1
28 | args:
29 | - --storage.tsdb.retention.time=1h
30 | - --config.file=/etc/prometheus/prometheus.yml
31 | - --storage.tsdb.path=/prometheus/
32 | ports:
33 | - containerPort: 9090
34 | resources:
35 | requests:
36 | cpu: 500m
37 | memory: 500M
38 | limits:
39 | cpu: '1'
40 | memory: 1Gi
41 | volumeMounts:
42 | - name: prometheus-config-volume
43 | mountPath: /etc/prometheus/
44 | - name: prometheus-storage-volume
45 | mountPath: /prometheus/
46 | volumes:
47 | - name: prometheus-config-volume
48 | configMap:
49 | defaultMode: 420
50 | name: prometheus-server-conf
51 | - name: prometheus-storage-volume
52 | emptyDir: {}
53 | ---
54 | apiVersion: v1
55 | kind: Service
56 | metadata:
57 | name: prometheus
58 | namespace: kube-system
59 | spec:
60 | type: NodePort
61 | selector:
62 | app: prometheus
63 | ports:
64 | - port: 9090
65 | targetPort: 9090
66 | nodePort: 30008
67 | ---
68 | apiVersion: v1
69 | kind: ConfigMap
70 | metadata:
71 | name: prometheus-server-conf
72 | labels:
73 | name: prometheus-server-conf
74 | namespace: kube-system
75 | data:
76 | prometheus.yml: |-
77 | global:
78 | scrape_interval: 5s
79 | evaluation_interval: 5s
80 | scrape_configs:
81 | - job_name: 'k8s-shredder'
82 | static_configs:
83 | - targets: ['k8s-shredder.kube-system.svc.cluster.local:8080']
84 |
--------------------------------------------------------------------------------
/internal/testing/prometheus_stuffs_node_labels.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | apiVersion: apps/v1
3 | kind: Deployment
4 | metadata:
5 | name: prometheus
6 | namespace: kube-system
7 | labels:
8 | app: prometheus
9 | spec:
10 | replicas: 1
11 | selector:
12 | matchLabels:
13 | app: prometheus
14 | template:
15 | metadata:
16 | labels:
17 | app: prometheus
18 | spec:
19 | nodeSelector:
20 | monitoring: dedicated
21 | tolerations:
22 | - key: monitoring
23 | value: dedicated
24 | effect: NoSchedule
25 | containers:
26 | - name: prometheus
27 | image: prom/prometheus:v2.54.1
28 | args:
29 | - --storage.tsdb.retention.time=1h
30 | - --config.file=/etc/prometheus/prometheus.yml
31 | - --storage.tsdb.path=/prometheus/
32 | ports:
33 | - containerPort: 9090
34 | resources:
35 | requests:
36 | cpu: 500m
37 | memory: 500M
38 | limits:
39 | cpu: '1'
40 | memory: 1Gi
41 | volumeMounts:
42 | - name: prometheus-config-volume
43 | mountPath: /etc/prometheus/
44 | - name: prometheus-storage-volume
45 | mountPath: /prometheus/
46 | volumes:
47 | - name: prometheus-config-volume
48 | configMap:
49 | defaultMode: 420
50 | name: prometheus-server-conf
51 | - name: prometheus-storage-volume
52 | emptyDir: {}
53 | ---
54 | apiVersion: v1
55 | kind: Service
56 | metadata:
57 | name: prometheus
58 | namespace: kube-system
59 | spec:
60 | type: NodePort
61 | selector:
62 | app: prometheus
63 | ports:
64 | - port: 9090
65 | targetPort: 9090
66 | nodePort: 30009
67 | ---
68 | apiVersion: v1
69 | kind: ConfigMap
70 | metadata:
71 | name: prometheus-server-conf
72 | labels:
73 | name: prometheus-server-conf
74 | namespace: kube-system
75 | data:
76 | prometheus.yml: |-
77 | global:
78 | scrape_interval: 5s
79 | evaluation_interval: 5s
80 | scrape_configs:
81 | - job_name: 'k8s-shredder'
82 | static_configs:
83 | - targets: ['k8s-shredder.kube-system.svc.cluster.local:8080']
84 |
--------------------------------------------------------------------------------
/charts/k8s-shredder/templates/_helpers.tpl:
--------------------------------------------------------------------------------
1 | {{/* vim: set filetype=mustache: */}}
2 | {{/*
3 | Expand the name of the chart.
4 | */}}
5 | {{- define "k8s-shredder.name" -}}
6 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
7 | {{- end -}}
8 |
9 | {{/*
10 | Create a default fully qualified app name.
11 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
12 | If release name contains chart name it will be used as a full name.
13 | */}}
14 | {{- define "k8s-shredder.fullname" -}}
15 | {{- if .Values.fullnameOverride -}}
16 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
17 | {{- else -}}
18 | {{- $name := default .Chart.Name .Values.nameOverride -}}
19 | {{- if contains $name .Release.Name -}}
20 | {{- .Release.Name | trunc 63 | trimSuffix "-" -}}
21 | {{- else -}}
22 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
23 | {{- end -}}
24 | {{- end -}}
25 | {{- end -}}
26 |
27 | {{/*
28 | Create chart name and version as used by the chart label.
29 | */}}
30 | {{- define "k8s-shredder.chart" -}}
31 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
32 | {{- end -}}
33 |
34 | {{/*
35 | Common labels
36 | */}}
37 | {{- define "k8s-shredder.labels" -}}
38 | app.kubernetes.io/name: {{ include "k8s-shredder.name" . }}
39 | helm.sh/chart: {{ include "k8s-shredder.chart" . }}
40 | app.kubernetes.io/instance: {{ .Release.Name }}
41 | app.kubernetes.io/managed-by: {{ .Release.Service }}
42 | {{- if .Values.podLabels }}
43 | {{ toYaml .Values.podLabels }}
44 | {{- end }}
45 | {{- end -}}
46 |
47 | {{/*
48 | matchLabels
49 | */}}
50 | {{- define "k8s-shredder.matchLabels" -}}
51 | app.kubernetes.io/name: {{ include "k8s-shredder.name" . }}
52 | app.kubernetes.io/instance: {{ .Release.Name }}
53 | {{- end -}}
54 |
55 | {{/*
56 | Additional pod annotations
57 | */}}
58 | {{- define "k8s-shredder.annotations" -}}
59 | {{- if .Values.podAnnotations }}
60 | {{- toYaml .Values.podAnnotations }}
61 | {{- end }}
62 | {{- end -}}
63 |
64 |
65 | {{/*
66 | Create the name of the service account to use.
67 | */}}
68 | {{- define "k8s-shredder.serviceAccountName" -}}
69 | {{- if .Values.serviceAccount.create -}}
70 | {{ default (include "k8s-shredder.fullname" .) .Values.serviceAccount.name }}
71 | {{- else -}}
72 | {{ default "default" .Values.serviceAccount.name }}
73 | {{- end -}}
74 | {{- end -}}
75 |
76 |
--------------------------------------------------------------------------------
/internal/testing/k8s-shredder.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | apiVersion: apps/v1
3 | kind: Deployment
4 | metadata:
5 | name: k8s-shredder
6 | namespace: kube-system
7 | labels:
8 | app: k8s-shredder
9 | spec:
10 | replicas: 1
11 | selector:
12 | matchLabels:
13 | app: k8s-shredder
14 | template:
15 | metadata:
16 | labels:
17 | app: k8s-shredder
18 | spec:
19 | affinity:
20 | nodeAffinity:
21 | preferredDuringSchedulingIgnoredDuringExecution:
22 | - weight: 1
23 | preference:
24 | matchExpressions:
25 | - key: node.kubernetes.io/role
26 | operator: In
27 | values: [master]
28 | tolerations:
29 | - key: node-role.kubernetes.io/control-plane
30 | operator: Exists
31 | effect: NoSchedule
32 | serviceAccountName: k8s-shredder
33 | containers:
34 | - name: k8s-shredder
35 | image: adobe/k8s-shredder:dev # replace it with a stable version
36 | args:
37 | - --config=/k8s-shredder-config/config.yaml
38 | - --metrics-port=8080
39 | - --log-level=info
40 | # For running it in dry run, without taking any real eviction actions
41 | # - "--dry-run"
42 | ports:
43 | - containerPort: 8080
44 | resources:
45 | requests:
46 | cpu: 250m
47 | memory: 250M
48 | limits:
49 | cpu: '1'
50 | memory: 1Gi
51 | volumeMounts:
52 | - name: k8s-shredder-config-volume
53 | mountPath: /k8s-shredder-config
54 | volumes:
55 | - name: k8s-shredder-config-volume
56 | configMap:
57 | defaultMode: 420
58 | name: k8s-shredder-config
59 | ---
60 | apiVersion: v1
61 | kind: ConfigMap
62 | metadata:
63 | name: k8s-shredder-config
64 | namespace: kube-system
65 | data:
66 | config.yaml: |-
67 | EvictionLoopInterval: 10s
68 | ParkedNodeTTL: 30s
69 | RollingRestartThreshold: 0.5
70 | UpgradeStatusLabel: "shredder.ethos.adobe.net/upgrade-status"
71 | ExpiresOnLabel: "shredder.ethos.adobe.net/parked-node-expires-on"
72 | NamespacePrefixSkipInitialEviction: "ns-ethos-"
73 | RestartedAtAnnotation: "shredder.ethos.adobe.net/restartedAt"
74 | AllowEvictionLabel: "shredder.ethos.adobe.net/allow-eviction"
75 | ToBeDeletedTaint: "ToBeDeletedByClusterAutoscaler"
76 | ArgoRolloutsAPIVersion: "v1alpha1"
77 | EnableKarpenterDriftDetection: false
78 | ParkedByLabel: "shredder.ethos.adobe.net/parked-by"
79 | ParkedByValue: "k8s-shredder"
80 | ParkedNodeTaint: "shredder.ethos.adobe.net/upgrade-status=parked:NoSchedule"
81 | EnableNodeLabelDetection: false
82 | NodeLabelsToDetect: []
83 | MaxParkedNodes: "0"
84 | ---
85 | apiVersion: v1
86 | kind: Service
87 | metadata:
88 | name: k8s-shredder
89 | namespace: kube-system
90 | spec:
91 | selector:
92 | app: k8s-shredder
93 | ports:
94 | - port: 8080
95 | targetPort: 8080
96 |
--------------------------------------------------------------------------------
/internal/testing/k8s-shredder-karpenter.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | apiVersion: apps/v1
3 | kind: Deployment
4 | metadata:
5 | name: k8s-shredder
6 | namespace: kube-system
7 | labels:
8 | app: k8s-shredder
9 | spec:
10 | replicas: 1
11 | selector:
12 | matchLabels:
13 | app: k8s-shredder
14 | template:
15 | metadata:
16 | labels:
17 | app: k8s-shredder
18 | spec:
19 | affinity:
20 | nodeAffinity:
21 | preferredDuringSchedulingIgnoredDuringExecution:
22 | - weight: 1
23 | preference:
24 | matchExpressions:
25 | - key: node.kubernetes.io/role
26 | operator: In
27 | values: [master]
28 | tolerations:
29 | - key: node-role.kubernetes.io/control-plane
30 | operator: Exists
31 | effect: NoSchedule
32 | serviceAccountName: k8s-shredder
33 | containers:
34 | - name: k8s-shredder
35 | image: adobe/k8s-shredder:dev # replace it with a stable version
36 | args:
37 | - --config=/k8s-shredder-config/config.yaml
38 | - --metrics-port=8080
39 | - --log-level=debug
40 | # For running it in dry run, without taking any real eviction actions
41 | # - "--dry-run"
42 | ports:
43 | - containerPort: 8080
44 | resources:
45 | requests:
46 | cpu: 250m
47 | memory: 250M
48 | limits:
49 | cpu: '1'
50 | memory: 1Gi
51 | volumeMounts:
52 | - name: k8s-shredder-config-volume
53 | mountPath: /k8s-shredder-config
54 | volumes:
55 | - name: k8s-shredder-config-volume
56 | configMap:
57 | defaultMode: 420
58 | name: k8s-shredder-config
59 | ---
60 | apiVersion: v1
61 | kind: ConfigMap
62 | metadata:
63 | name: k8s-shredder-config
64 | namespace: kube-system
65 | data:
66 | config.yaml: |-
67 | EvictionLoopInterval: 30s
68 | ParkedNodeTTL: 2m
69 | RollingRestartThreshold: 0.5
70 | UpgradeStatusLabel: "shredder.ethos.adobe.net/upgrade-status"
71 | ExpiresOnLabel: "shredder.ethos.adobe.net/parked-node-expires-on"
72 | NamespacePrefixSkipInitialEviction: "ns-ethos-"
73 | RestartedAtAnnotation: "shredder.ethos.adobe.net/restartedAt"
74 | AllowEvictionLabel: "shredder.ethos.adobe.net/allow-eviction"
75 | ToBeDeletedTaint: "ToBeDeletedByClusterAutoscaler"
76 | ArgoRolloutsAPIVersion: "v1alpha1"
77 | EnableKarpenterDriftDetection: true
78 | ParkedByLabel: "shredder.ethos.adobe.net/parked-by"
79 | ParkedByValue: "k8s-shredder"
80 | ParkedNodeTaint: "shredder.ethos.adobe.net/upgrade-status=parked:NoSchedule"
81 | EnableNodeLabelDetection: false
82 | NodeLabelsToDetect: []
83 | MaxParkedNodes: "0"
84 | ---
85 | apiVersion: v1
86 | kind: Service
87 | metadata:
88 | name: k8s-shredder
89 | namespace: kube-system
90 | spec:
91 | selector:
92 | app: k8s-shredder
93 | ports:
94 | - port: 8080
95 | targetPort: 8080
96 |
--------------------------------------------------------------------------------
/pkg/utils/k8s.go:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright 2025 Adobe. All rights reserved.
3 | This file is licensed to you under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License. You may obtain a copy
5 | of the License at http://www.apache.org/licenses/LICENSE-2.0
6 | Unless required by applicable law or agreed to in writing, software distributed under
7 | the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
8 | OF ANY KIND, either express or implied. See the License for the specific language
9 | governing permissions and limitations under the License.
10 | */
11 |
12 | package utils
13 |
14 | import (
15 | "strconv"
16 | "time"
17 |
18 | "github.com/pkg/errors"
19 | v1 "k8s.io/api/core/v1"
20 | "k8s.io/client-go/dynamic"
21 | "k8s.io/client-go/kubernetes"
22 | "sigs.k8s.io/controller-runtime/pkg/client/config"
23 | )
24 |
25 | func getK8SClient() (*kubernetes.Clientset, error) {
26 | cfg, err := config.GetConfig()
27 | if err != nil {
28 | return nil, err
29 | }
30 |
31 | client, err := kubernetes.NewForConfig(cfg)
32 | if err != nil {
33 | return nil, err
34 | }
35 |
36 | return client, nil
37 | }
38 |
39 | func getDynamicK8SClient() (*dynamic.DynamicClient, error) {
40 | cfg, err := config.GetConfig()
41 | if err != nil {
42 | return nil, err
43 | }
44 |
45 | // Create a dynamic client
46 | dynamicClient, err := dynamic.NewForConfig(cfg)
47 | if err != nil {
48 | return nil, errors.Errorf("Error creating dynamic client: %v", err)
49 | }
50 |
51 | return dynamicClient, nil
52 | }
53 |
54 | // NodeHasTaint checks if a node has a taint with the given key set
55 | func NodeHasTaint(node v1.Node, key string) bool {
56 | for _, taint := range node.Spec.Taints {
57 | if taint.Key == key {
58 | return true
59 | }
60 | }
61 | return false
62 | }
63 |
64 | // NodeHasLabel checks if a node has a specific label set
65 | func NodeHasLabel(node v1.Node, key string) bool {
66 | for k := range node.Labels {
67 | if k == key {
68 | return true
69 | }
70 | }
71 | return false
72 | }
73 |
74 | // PodEvictionAllowed checks if a pod may be evicted; it returns false only when the pod has the `skipEvictionLabel` label explicitly set to "false"
75 | func PodEvictionAllowed(pod v1.Pod, skipEvictionLabel string) bool {
76 | if PodHasLabel(pod, skipEvictionLabel) {
77 | if pod.Labels[skipEvictionLabel] == "false" {
78 | return false
79 | }
80 | }
81 | return true
82 | }
83 |
84 | // PodHasLabel checks if a pod has a specific label set
85 | func PodHasLabel(pod v1.Pod, key string) bool {
86 | for k := range pod.Labels {
87 | if k == key {
88 | return true
89 | }
90 | }
91 | return false
92 | }
93 |
94 | // GetParkedNodeExpiryTime returns the time at which a parked node's TTL expires, parsed from the expiresOnLabel value
95 | func GetParkedNodeExpiryTime(node v1.Node, expiresOnLabel string) (time.Time, error) {
96 | i, err := strconv.ParseFloat(node.Labels[expiresOnLabel], 64)
97 | if err != nil {
98 | return time.Now().UTC(), errors.Errorf("Failed to parse label %s with value %s", expiresOnLabel, node.Labels[expiresOnLabel])
99 | }
100 | return time.Unix(int64(i), 0).UTC(), nil
101 | }
102 |
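103 | // Example (illustrative): combining GetParkedNodeExpiryTime with the ExpiresOnLabel
104 | // from config.yaml ("shredder.ethos.adobe.net/parked-node-expires-on") to decide
105 | // whether a parked node's TTL has already passed. `node` is assumed to be a v1.Node.
106 | //
107 | //	expiresOn, err := GetParkedNodeExpiryTime(node, "shredder.ethos.adobe.net/parked-node-expires-on")
108 | //	if err == nil && time.Now().UTC().After(expiresOn) {
109 | //		// the parked node's TTL has expired; force eviction may begin
110 | //	}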
--------------------------------------------------------------------------------
/internal/testing/k8s-shredder-node-labels.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | apiVersion: apps/v1
3 | kind: Deployment
4 | metadata:
5 | name: k8s-shredder
6 | namespace: kube-system
7 | labels:
8 | app: k8s-shredder
9 | spec:
10 | replicas: 1
11 | selector:
12 | matchLabels:
13 | app: k8s-shredder
14 | template:
15 | metadata:
16 | labels:
17 | app: k8s-shredder
18 | spec:
19 | affinity:
20 | nodeAffinity:
21 | preferredDuringSchedulingIgnoredDuringExecution:
22 | - weight: 1
23 | preference:
24 | matchExpressions:
25 | - key: node.kubernetes.io/role
26 | operator: In
27 | values: [master]
28 | tolerations:
29 | - key: node-role.kubernetes.io/control-plane
30 | operator: Exists
31 | effect: NoSchedule
32 | serviceAccountName: k8s-shredder
33 | containers:
34 | - name: k8s-shredder
35 | image: adobe/k8s-shredder:dev # replace it with a stable version
36 | args:
37 | - --config=/k8s-shredder-config/config.yaml
38 | - --metrics-port=8080
39 | - --log-level=info
40 | # For running it in dry run, without taking any real eviction actions
41 | # - "--dry-run"
42 | ports:
43 | - containerPort: 8080
44 | resources:
45 | requests:
46 | cpu: 250m
47 | memory: 250M
48 | limits:
49 | cpu: '1'
50 | memory: 1Gi
51 | volumeMounts:
52 | - name: k8s-shredder-config-volume
53 | mountPath: /k8s-shredder-config
54 | volumes:
55 | - name: k8s-shredder-config-volume
56 | configMap:
57 | defaultMode: 420
58 | name: k8s-shredder-config
59 | ---
60 | apiVersion: v1
61 | kind: ConfigMap
62 | metadata:
63 | name: k8s-shredder-config
64 | namespace: kube-system
65 | data:
66 | config.yaml: |-
67 | EvictionLoopInterval: 30s
68 | ParkedNodeTTL: 2m
69 | RollingRestartThreshold: 0.5
70 | UpgradeStatusLabel: "shredder.ethos.adobe.net/upgrade-status"
71 | ExpiresOnLabel: "shredder.ethos.adobe.net/parked-node-expires-on"
72 | NamespacePrefixSkipInitialEviction: "ns-ethos-"
73 | RestartedAtAnnotation: "shredder.ethos.adobe.net/restartedAt"
74 | AllowEvictionLabel: "shredder.ethos.adobe.net/allow-eviction"
75 | ToBeDeletedTaint: "ToBeDeletedByClusterAutoscaler"
76 | ArgoRolloutsAPIVersion: "v1alpha1"
77 | EnableKarpenterDriftDetection: false
78 | ParkedByLabel: "shredder.ethos.adobe.net/parked-by"
79 | ParkedByValue: "k8s-shredder"
80 | ParkedNodeTaint: "shredder.ethos.adobe.net/upgrade-status=parked:NoSchedule"
81 | EnableNodeLabelDetection: true
82 | NodeLabelsToDetect:
83 | - "test-label"
84 | - "maintenance=scheduled"
85 | - "node.test.io/park"
86 | MaxParkedNodes: "0"
87 | ---
88 | apiVersion: v1
89 | kind: Service
90 | metadata:
91 | name: k8s-shredder
92 | namespace: kube-system
93 | spec:
94 | selector:
95 | app: k8s-shredder
96 | ports:
97 | - port: 8080
98 | targetPort: 8080
99 |
--------------------------------------------------------------------------------
/.github/workflows/ci.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | name: CI tests
3 | on: pull_request
4 | jobs:
5 | ci:
6 | runs-on: ubuntu-latest
7 | name: ci
8 | steps:
9 | - name: Checkout
10 | uses: actions/checkout@v6
11 | - name: Setup Go
12 | uses: actions/setup-go@v6
13 | with:
14 | go-version: '1.25'
15 | - name: Run Gosec Security Scanner
16 | uses: securego/gosec@master
17 | with:
18 | args: -quiet -exclude=G107 ./...
19 | - name: Run golangci-lint
20 | uses: golangci/golangci-lint-action@v9
21 | with:
22 | # Optional: version of golangci-lint to use in form of v1.2 or v1.2.3 or `latest` to use the latest version
23 | # version: v1.46
24 | args: -v --timeout 5m --no-config ./...
25 | - name: Install k8s Kind Cluster
26 | uses: helm/kind-action@v1.13.0
27 | with:
28 | install_only: true
29 | version: v0.29.0
30 | - name: Prepare test environment
31 | run: make local-test
32 | - name: Run e2e tests
33 | run: make e2e-tests
34 | ci-karpenter:
35 | runs-on: ubuntu-latest
36 | name: ci-karpenter
37 | steps:
38 | - name: Checkout
39 | uses: actions/checkout@v6
40 | - name: Setup Go
41 | uses: actions/setup-go@v6
42 | with:
43 | go-version: '1.25'
44 | - name: Run Gosec Security Scanner
45 | uses: securego/gosec@master
46 | with:
47 | args: -quiet -exclude=G107 ./...
48 | - name: Run golangci-lint
49 | uses: golangci/golangci-lint-action@v9
50 | with:
51 | # Optional: version of golangci-lint to use in form of v1.2 or v1.2.3 or `latest` to use the latest version
52 | # version: v1.46
53 | args: -v --timeout 5m --no-config ./...
54 | - name: Install k8s Kind Cluster
55 | uses: helm/kind-action@v1.13.0
56 | with:
57 | install_only: true
58 | version: v0.29.0
59 | - name: Prepare test environment
60 | run: make local-test-karpenter
61 | - name: Run e2e tests
62 | run: make e2e-tests
63 | ci-node-labels:
64 | runs-on: ubuntu-latest
65 | name: ci-node-labels
66 | steps:
67 | - name: Checkout
68 | uses: actions/checkout@v6
69 | - name: Setup Go
70 | uses: actions/setup-go@v6
71 | with:
72 | go-version: '1.25'
73 | - name: Run Gosec Security Scanner
74 | uses: securego/gosec@master
75 | with:
76 | args: -quiet -exclude=G107 ./...
77 | - name: Run golangci-lint
78 | uses: golangci/golangci-lint-action@v9
79 | with:
80 | # Optional: version of golangci-lint to use in form of v1.2 or v1.2.3 or `latest` to use the latest version
81 | # version: v1.46
82 | args: -v --timeout 5m --no-config ./...
83 | - name: Install k8s Kind Cluster
84 | uses: helm/kind-action@v1.13.0
85 | with:
86 | install_only: true
87 | version: v0.29.0
88 | - name: Prepare test environment
89 | run: make local-test-node-labels
90 | - name: Run e2e tests
91 | run: make e2e-tests
92 |
--------------------------------------------------------------------------------
/pkg/metrics/types.go:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright 2025 Adobe. All rights reserved.
3 | This file is licensed to you under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License. You may obtain a copy
5 | of the License at http://www.apache.org/licenses/LICENSE-2.0
6 | Unless required by applicable law or agreed to in writing, software distributed under
7 | the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
8 | OF ANY KIND, either express or implied. See the License for the specific language
9 | governing permissions and limitations under the License.
10 | */
11 |
12 | package metrics
13 |
14 | import (
15 | "fmt"
16 | "net/http"
17 | "time"
18 |
19 | log "github.com/sirupsen/logrus"
20 |
21 | "github.com/prometheus/client_golang/prometheus"
22 | "github.com/prometheus/client_golang/prometheus/promhttp"
23 | )
24 |
25 | // Init registers the k8s-shredder Prometheus metrics and starts the metrics HTTP server on the given port
26 | func Init(port int) error {
27 | if err := registerMetrics(); err != nil {
28 | return err
29 | }
30 | if err := serve(port); err != nil {
31 | return err
32 | }
33 | return nil
34 | }
35 |
36 | func registerMetrics() error {
37 | prometheus.MustRegister(ShredderAPIServerRequestsTotal)
38 | prometheus.MustRegister(ShredderAPIServerRequestsDurationSeconds)
39 | prometheus.MustRegister(ShredderLoopsTotal)
40 | prometheus.MustRegister(ShredderLoopsDurationSeconds)
41 | prometheus.MustRegister(ShredderProcessedNodesTotal)
42 | prometheus.MustRegister(ShredderProcessedPodsTotal)
43 | prometheus.MustRegister(ShredderErrorsTotal)
44 | prometheus.MustRegister(ShredderPodErrorsTotal)
45 | prometheus.MustRegister(ShredderNodeForceToEvictTime)
46 | prometheus.MustRegister(ShredderPodForceToEvictTime)
47 | prometheus.MustRegister(ShredderKarpenterDriftedNodesTotal)
48 | prometheus.MustRegister(ShredderKarpenterDisruptedNodesTotal)
49 | prometheus.MustRegister(ShredderKarpenterNodesParkedTotal)
50 | prometheus.MustRegister(ShredderKarpenterNodesParkingFailedTotal)
51 | prometheus.MustRegister(ShredderKarpenterProcessingDurationSeconds)
52 | prometheus.MustRegister(ShredderNodeLabelNodesParkedTotal)
53 | prometheus.MustRegister(ShredderNodeLabelNodesParkingFailedTotal)
54 | prometheus.MustRegister(ShredderNodeLabelProcessingDurationSeconds)
55 | prometheus.MustRegister(ShredderNodeLabelMatchingNodesTotal)
56 | prometheus.MustRegister(ShredderNodesParkedTotal)
57 | prometheus.MustRegister(ShredderNodesParkingFailedTotal)
58 | prometheus.MustRegister(ShredderProcessingDurationSeconds)
59 | return nil
60 | }
61 |
62 | func serve(port int) error {
63 | http.Handle("/metrics", promhttp.HandlerFor(
64 | prometheus.DefaultGatherer,
65 | promhttp.HandlerOpts{
66 | EnableOpenMetrics: true,
67 | },
68 | ))
69 |
70 | http.HandleFunc("/healthz", func(res http.ResponseWriter, req *http.Request) {
71 | res.WriteHeader(200)
72 | _, err := res.Write([]byte("OK"))
73 | if err != nil {
74 | log.Errorln("Error while replying to /healthz request:", err)
75 | }
76 | })
77 |
78 | server := &http.Server{
79 | Addr: fmt.Sprintf(":%d", port),
80 | ReadHeaderTimeout: 3 * time.Second,
81 | }
82 |
83 | go func() {
84 | log.Fatal(server.ListenAndServe(), nil)
85 | }()
86 | return nil
87 | }
88 |
--------------------------------------------------------------------------------
/internal/testing/karpenter-manifests.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | apiVersion: karpenter.sh/v1
3 | kind: NodePool
4 | metadata:
5 | name: default-nodepool
6 | spec:
7 | template:
8 | spec:
9 | requirements:
10 | - key: kubernetes.io/arch
11 | operator: In
12 | values: [amd64]
13 | - key: kubernetes.io/os
14 | operator: In
15 | values: [linux]
16 | - key: karpenter.sh/capacity-type
17 | operator: In
18 | values: [spot, on-demand]
19 | - key: node.kubernetes.io/instance-type
20 | operator: In
21 | values: [m5.large, m5.xlarge]
22 | nodeClassRef:
23 | group: karpenter.k8s.aws
24 | kind: EC2NodeClass
25 | name: default-nodeclass
26 | taints:
27 | - key: example.com/special-taint
28 | value: special-value
29 | effect: NoSchedule
30 | limits:
31 | cpu: 1000
32 | disruption:
33 | consolidationPolicy: WhenEmpty
34 | consolidateAfter: 30s
35 | ---
36 | apiVersion: karpenter.sh/v1
37 | kind: NodePool
38 | metadata:
39 | name: test-nodepool
40 | spec:
41 | template:
42 | spec:
43 | requirements:
44 | - key: kubernetes.io/arch
45 | operator: In
46 | values: [amd64]
47 | - key: kubernetes.io/os
48 | operator: In
49 | values: [linux]
50 | - key: karpenter.sh/capacity-type
51 | operator: In
52 | values: [spot]
53 | - key: node.kubernetes.io/instance-type
54 | operator: In
55 | values: [m5.large]
56 | nodeClassRef:
57 | group: karpenter.k8s.aws
58 | kind: EC2NodeClass
59 | name: test-nodeclass
60 | taints:
61 | - key: example.com/test-taint
62 | value: test-value
63 | effect: NoSchedule
64 | limits:
65 | cpu: 500
66 | disruption:
67 | consolidationPolicy: WhenEmpty
68 | consolidateAfter: 30s
69 | ---
70 | apiVersion: karpenter.k8s.aws/v1
71 | kind: EC2NodeClass
72 | metadata:
73 | name: default-nodeclass
74 | spec:
75 | role: KarpenterNodeRole-k8s-shredder-test-cluster
76 | amiFamily: AL2
77 | subnetSelectorTerms:
78 | - tags:
79 | karpenter.sh/discovery: k8s-shredder-test-cluster
80 | securityGroupSelectorTerms:
81 | - tags:
82 | karpenter.sh/discovery: k8s-shredder-test-cluster
83 | instanceStorePolicy: RAID0
84 | userData: |
85 | #!/bin/bash
86 | /etc/eks/bootstrap.sh k8s-shredder-test-cluster
87 | echo "NodeClass: default-nodeclass" >> /etc/kubernetes/kubelet/kubelet-config.json
88 | ---
89 | apiVersion: karpenter.k8s.aws/v1
90 | kind: EC2NodeClass
91 | metadata:
92 | name: test-nodeclass
93 | spec:
94 | role: KarpenterNodeRole-k8s-shredder-test-cluster
95 | amiFamily: AL2
96 | subnetSelectorTerms:
97 | - tags:
98 | karpenter.sh/discovery: k8s-shredder-test-cluster
99 | securityGroupSelectorTerms:
100 | - tags:
101 | karpenter.sh/discovery: k8s-shredder-test-cluster
102 | instanceStorePolicy: RAID0
103 | userData: |-
104 | #!/bin/bash
105 | /etc/eks/bootstrap.sh k8s-shredder-test-cluster
106 | echo "NodeClass: test-nodeclass" >> /etc/kubernetes/kubelet/kubelet-config.json
107 |
--------------------------------------------------------------------------------
/.goreleaser.yml:
--------------------------------------------------------------------------------
1 | ---
2 | version: 2
3 | project_name: k8s-shredder
4 | release:
5 | github:
6 | owner: adobe
7 | name: k8s-shredder
8 | builds:
9 | - id: k8s-shredder
10 | goos: [linux, windows, darwin]
11 | goarch: [amd64, '386', arm64]
12 | env: [CGO_ENABLED=0]
13 | main: .
14 | ldflags:
15 | - -s -w -X github.com/adobe/k8s-shredder/cmd.buildVersion=v{{.Version}} -X github.com/adobe/k8s-shredder/cmd.gitSHA={{.Commit}}
16 | -X github.com/adobe/k8s-shredder/cmd.buildTime={{.Date}}
17 | flags: [-trimpath]
18 | binary: k8s-shredder
19 | # signs the checksum file
20 | # all files (including the sboms) are included in the checksum, so we don't need to sign each one if we don't want to
21 | # https://goreleaser.com/customization/sign
22 | signs:
23 | - cmd: cosign
24 | env: [COSIGN_EXPERIMENTAL=1]
25 | signature: ${artifact}.bundle
26 | args:
27 | - sign-blob
28 | - --bundle=${signature}
29 | - ${artifact}
30 | - --yes # needed on cosign 2.0.0+
31 | artifacts: checksum
32 | output: true
33 | dockers:
34 | - image_templates: ['ghcr.io/adobe/{{ .ProjectName }}:v{{ .Version }}-amd64']
35 | use: buildx
36 | dockerfile: Dockerfile
37 | build_flag_templates:
38 | - --platform=linux/amd64
39 | - --label=org.opencontainers.image.title={{ .ProjectName }}
40 | - --label=org.opencontainers.image.description={{ .ProjectName }}
41 | - --label=org.opencontainers.image.url=https://github.com/adobe/{{ .ProjectName }}
42 | - --label=org.opencontainers.image.source=https://github.com/adobe/{{ .ProjectName }}
43 | - --label=org.opencontainers.image.version=v{{ .Version }}
44 | - --label=org.opencontainers.image.created={{ .Date }}
45 | - --label=org.opencontainers.image.revision={{ .FullCommit }}
46 | - --label=org.opencontainers.image.licenses=Apache-2.0
47 | - image_templates:
48 | - ghcr.io/adobe/{{ .ProjectName }}:v{{ .Version }}-arm64v8
49 | use: buildx
50 | goarch: arm64
51 | dockerfile: Dockerfile
52 | build_flag_templates:
53 | - --platform=linux/arm64/v8
54 | - --label=org.opencontainers.image.title={{ .ProjectName }}
55 | - --label=org.opencontainers.image.description={{ .ProjectName }}
56 | - --label=org.opencontainers.image.url=https://github.com/adobe/{{ .ProjectName }}
57 | - --label=org.opencontainers.image.source=https://github.com/adobe/{{ .ProjectName }}
58 | - --label=org.opencontainers.image.version=v{{ .Version }}
59 | - --label=org.opencontainers.image.created={{ .Date }}
60 | - --label=org.opencontainers.image.revision={{ .FullCommit }}
61 | - --label=org.opencontainers.image.licenses=Apache-2.0
62 | docker_manifests:
63 | - name_template: ghcr.io/adobe/{{.ProjectName}}:v{{.Version}}
64 | image_templates:
65 | - ghcr.io/adobe/{{.ProjectName}}:v{{.Version}}-amd64
66 | - ghcr.io/adobe/{{.ProjectName}}:v{{.Version}}-arm64v8
67 | - name_template: ghcr.io/adobe/{{.ProjectName}}:latest
68 | image_templates:
69 | - ghcr.io/adobe/{{.ProjectName}}:v{{.Version}}-amd64
70 | - ghcr.io/adobe/{{.ProjectName}}:v{{.Version}}-arm64v8
71 | # signs our docker image
72 | # https://goreleaser.com/customization/docker_sign
73 | docker_signs:
74 | - cmd: cosign
75 | env: [COSIGN_EXPERIMENTAL=1]
76 | artifacts: images
77 | output: true
78 | args:
79 | - sign
80 | - ${artifact}
81 | - --yes # needed on cosign 2.0.0+
82 |
--------------------------------------------------------------------------------
/config.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | # K8s-shredder configuration file with default values
3 | # See README.md for detailed description of each option
4 |
5 | # Core eviction settings
6 | EvictionLoopInterval: 60s # How often to run the eviction loop process
7 | ParkedNodeTTL: 60m # Time a node can be parked before starting force eviction process
8 | RollingRestartThreshold: 0.5 # How much time (as a percentage of ParkedNodeTTL) should pass before starting the rollout restart process
9 | # Node and pod labeling
10 | UpgradeStatusLabel: shredder.ethos.adobe.net/upgrade-status # Label used for identifying parked nodes
11 | ExpiresOnLabel: shredder.ethos.adobe.net/parked-node-expires-on # Label used for identifying the TTL for parked nodes
12 | ParkedByLabel: shredder.ethos.adobe.net/parked-by # Label used to identify which component parked the node
13 | ParkedByValue: k8s-shredder # Value to set for the ParkedByLabel
14 | # Eviction behavior
15 | NamespacePrefixSkipInitialEviction: '' # For pods in namespaces having this prefix proceed directly with a rollout restart without waiting for the RollingRestartThreshold
16 | RestartedAtAnnotation: shredder.ethos.adobe.net/restartedAt # Annotation name used to mark a controller object for rollout restart
17 | AllowEvictionLabel: shredder.ethos.adobe.net/allow-eviction # Label used for skipping eviction of pods that have explicitly set this label to false
18 | # Node management
19 | ToBeDeletedTaint: ToBeDeletedByClusterAutoscaler # Node taint used for skipping a subset of parked nodes that are already handled by cluster-autoscaler
20 | ParkedNodeTaint: shredder.ethos.adobe.net/upgrade-status=parked:NoSchedule # Taint to apply to parked nodes in format key=value:effect
21 | # Argo Rollouts integration
22 | ArgoRolloutsAPIVersion: v1alpha1 # API version from argoproj.io API group to be used while handling Argo Rollouts objects
23 | # Karpenter integration
24 | EnableKarpenterDriftDetection: false # Controls whether to scan for drifted Karpenter NodeClaims and automatically label their nodes
25 | EnableKarpenterDisruptionDetection: false # Controls whether to scan for disrupted Karpenter NodeClaims and automatically label their nodes
26 | # Node label detection
27 | EnableNodeLabelDetection: false # Controls whether to scan for nodes with specific labels and automatically park them
28 | NodeLabelsToDetect: [] # List of node labels to detect. Supports both key-only and key=value formats
29 | # Examples:
30 | # - "maintenance" # Matches any node with the "maintenance" label (any value)
31 | # - "upgrade=required" # Matches nodes with label "upgrade" set to "required"
32 | # - "node.example.com/park" # Matches any node with the "node.example.com/park" label
33 |
34 | # Parking limits
35 | MaxParkedNodes: '0' # Maximum number of nodes that can be parked simultaneously. Can be an integer (e.g., "5") or percentage (e.g., "20%"). Set to "0" (default) for no limit.
36 |
37 | # Extra labels to apply to parked nodes and pods
38 | # ExtraParkingLabels: # (optional) Additional labels to apply to nodes and pods during parking
39 | # example.com/owner: "infrastructure"
40 | # example.com/maintenance: "true"
41 |
42 | # Safety settings
43 | EvictionSafetyCheck: true # Controls whether to perform safety checks before force eviction. If true, nodes will be unparked if pods don't have required parking labels.
44 |
45 | # Parking reason tracking
46 | ParkingReasonLabel: shredder.ethos.adobe.net/parked-reason # Label used to track why a node or pod was parked
47 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Adobe Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to making participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, gender identity and expression, level of experience,
9 | nationality, personal appearance, race, religion, or sexual identity and
10 | orientation.
11 |
12 | ## Our Standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | * Using welcoming and inclusive language.
18 | * Being respectful of differing viewpoints and experiences.
19 | * Gracefully accepting constructive criticism.
20 | * Focusing on what is best for the community.
21 | * Showing empathy towards other community members.
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances.
27 | * Trolling, insulting/derogatory comments, and personal or political attacks.
28 | * Public or private harassment.
29 | * Publishing others' private information, such as a physical or electronic
30 | address, without explicit permission.
31 | * Other conduct which could reasonably be considered inappropriate in a
32 | professional setting.
33 |
34 | ## Our Responsibilities
35 |
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 |
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 |
46 | ## Scope
47 |
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 |
55 | ## Enforcement
56 |
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at Grp-opensourceoffice@adobe.com. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 |
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 |
68 | ## Attribution
69 |
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at [https://contributor-covenant.org/version/1/4][version].
72 |
73 | [homepage]: https://contributor-covenant.org
74 | [version]: https://contributor-covenant.org/version/1/4/
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/adobe/k8s-shredder
2 |
3 | go 1.25.0
4 |
5 | toolchain go1.25.5
6 |
7 | require (
8 | github.com/fsnotify/fsnotify v1.9.0
9 | github.com/go-co-op/gocron/v2 v2.19.0
10 | github.com/google/uuid v1.6.0
11 | github.com/pkg/errors v0.9.1
12 | github.com/prometheus/client_golang v1.23.2
13 | github.com/prometheus/common v0.67.4
14 | github.com/robfig/cron/v3 v3.0.1
15 | github.com/sirupsen/logrus v1.9.3
16 | github.com/spf13/cobra v1.10.2
17 | github.com/spf13/viper v1.21.0
18 | github.com/stretchr/testify v1.11.1
19 | golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93
20 | k8s.io/api v0.35.0
21 | k8s.io/apimachinery v0.35.0
22 | k8s.io/client-go v0.35.0
23 | k8s.io/kubectl v0.35.0
24 | k8s.io/utils v0.0.0-20251220205832-9d40a56c1308
25 | sigs.k8s.io/controller-runtime v0.22.4
26 | )
27 |
28 | require (
29 | github.com/Masterminds/semver/v3 v3.4.0 // indirect
30 | github.com/beorn7/perks v1.0.1 // indirect
31 | github.com/blang/semver/v4 v4.0.0 // indirect
32 | github.com/cespare/xxhash/v2 v2.3.0 // indirect
33 | github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
34 | github.com/emicklei/go-restful/v3 v3.12.2 // indirect
35 | github.com/fxamacker/cbor/v2 v2.9.0 // indirect
36 | github.com/go-errors/errors v1.4.2 // indirect
37 | github.com/go-logr/logr v1.4.3 // indirect
38 | github.com/go-openapi/jsonpointer v0.21.0 // indirect
39 | github.com/go-openapi/jsonreference v0.20.2 // indirect
40 | github.com/go-openapi/swag v0.23.0 // indirect
41 | github.com/go-viper/mapstructure/v2 v2.4.0 // indirect
42 | github.com/gogo/protobuf v1.3.2 // indirect
43 | github.com/google/gnostic-models v0.7.0 // indirect
44 | github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 // indirect
45 | github.com/inconshreveable/mousetrap v1.1.0 // indirect
46 | github.com/jonboulle/clockwork v0.5.0 // indirect
47 | github.com/josharian/intern v1.0.0 // indirect
48 | github.com/json-iterator/go v1.1.12 // indirect
49 | github.com/mailru/easyjson v0.7.7 // indirect
50 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
51 | github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect
52 | github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 // indirect
53 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
54 | github.com/pelletier/go-toml/v2 v2.2.4 // indirect
55 | github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
56 | github.com/prometheus/client_model v0.6.2 // indirect
57 | github.com/prometheus/procfs v0.16.1 // indirect
58 | github.com/sagikazarmark/locafero v0.11.0 // indirect
59 | github.com/sourcegraph/conc v0.3.1-0.20240121214520-5f936abd7ae8 // indirect
60 | github.com/spf13/afero v1.15.0 // indirect
61 | github.com/spf13/cast v1.10.0 // indirect
62 | github.com/spf13/pflag v1.0.10 // indirect
63 | github.com/subosito/gotenv v1.6.0 // indirect
64 | github.com/x448/float16 v0.8.4 // indirect
65 | github.com/xlab/treeprint v1.2.0 // indirect
66 | go.yaml.in/yaml/v2 v2.4.3 // indirect
67 | go.yaml.in/yaml/v3 v3.0.4 // indirect
68 | golang.org/x/net v0.47.0 // indirect
69 | golang.org/x/oauth2 v0.32.0 // indirect
70 | golang.org/x/sync v0.19.0 // indirect
71 | golang.org/x/sys v0.38.0 // indirect
72 | golang.org/x/term v0.37.0 // indirect
73 | golang.org/x/text v0.31.0 // indirect
74 | golang.org/x/time v0.11.0 // indirect
75 | google.golang.org/protobuf v1.36.10 // indirect
76 | gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect
77 | gopkg.in/inf.v0 v0.9.1 // indirect
78 | gopkg.in/yaml.v3 v3.0.1 // indirect
79 | k8s.io/cli-runtime v0.35.0 // indirect
80 | k8s.io/klog/v2 v2.130.1 // indirect
81 | k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect
82 | sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect
83 | sigs.k8s.io/kustomize/api v0.20.1 // indirect
84 | sigs.k8s.io/kustomize/kyaml v0.20.1 // indirect
85 | sigs.k8s.io/randfill v1.0.0 // indirect
86 | sigs.k8s.io/structured-merge-diff/v6 v6.3.0 // indirect
87 | sigs.k8s.io/yaml v1.6.0 // indirect
88 | )
89 |
--------------------------------------------------------------------------------
/charts/k8s-shredder/templates/deployment.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: Deployment
3 | metadata:
4 | name: {{ include "k8s-shredder.fullname" . }}
5 | namespace: {{ .Release.Namespace }}
6 | labels:
7 | {{ include "k8s-shredder.labels" . | indent 4 }}
8 | {{- if .Values.podAnnotations }}
9 | annotations:
10 | {{ include "k8s-shredder.annotations" . | indent 4 }}
11 | {{- end }}
12 | spec:
13 | replicas: {{ .Values.replicaCount }}
14 | {{- with .Values.deploymentStrategy }}
15 | strategy:
16 | {{- toYaml . | nindent 4 }}
17 | {{- end }}
18 | selector:
19 | matchLabels:
20 | {{ include "k8s-shredder.matchLabels" . | indent 6 }}
21 | template:
22 | metadata:
23 | labels:
24 | {{ include "k8s-shredder.labels" . | indent 8 }}
25 | {{- if .Values.podAnnotations }}
26 | annotations:
27 | {{ include "k8s-shredder.annotations" . | indent 8 }}
28 | {{- end }}
29 | spec:
30 | {{- with .Values.imagePullSecrets }}
31 | imagePullSecrets:
32 | {{- toYaml . | nindent 8 }}
33 | {{- end }}
34 | serviceAccountName: {{ include "k8s-shredder.serviceAccountName" . }}
35 | securityContext:
36 | {{- toYaml .Values.podSecurityContext | nindent 8 }}
37 | initContainers:
38 | {{- with .Values.initContainers }}
39 | {{- toYaml . | nindent 8 }}
40 | {{- end }}
41 | containers:
42 | - name: {{ .Chart.Name }}
43 | securityContext:
44 | {{- toYaml .Values.securityContext | nindent 12 }}
45 | image: "{{ .Values.image.registry }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
46 | imagePullPolicy: {{ .Values.image.pullPolicy }}
47 | args:
48 | - "--config=/k8s-shredder-config/config.yaml"
49 | - "--metrics-port=8080"
50 | - "--log-level={{ .Values.logLevel }}"
51 | {{- if .Values.dryRun }}
52 | - "--dry-run"
53 | {{- end }}
54 | env:
55 | {{- with .Values.environmentVars }}
56 | {{- toYaml . | nindent 12 }}
57 | {{- end }}
58 | ports:
59 | - name: metrics
60 | containerPort: 8080
61 | protocol: TCP
62 | volumeMounts:
63 | - name: k8s-shredder-config-volume
64 | mountPath: /k8s-shredder-config
65 | livenessProbe:
66 | httpGet:
67 | path: /healthz
68 | port: metrics
69 | initialDelaySeconds: 10
70 | timeoutSeconds: 3
71 | periodSeconds: 10
72 | failureThreshold: 5
73 | resources:
74 | {{- toYaml .Values.resources | nindent 12 }}
75 | {{- with .Values.additionalContainers }}
76 | {{- toYaml . | nindent 8 }}
77 | {{- end }}
78 | volumes:
79 | - name: k8s-shredder-config-volume
80 | configMap:
81 | defaultMode: 420
82 | name: {{ include "k8s-shredder.fullname" . }}-config
83 | {{- with .Values.volumes }}
84 | {{- toYaml . | nindent 8 }}
85 | {{- end }}
86 | {{- with .Values.nodeSelector }}
87 | nodeSelector:
88 | {{- toYaml . | nindent 8 }}
89 | {{- end }}
90 | {{- if .Values.affinity }}
91 | affinity:
92 | {{- toYaml .Values.affinity | nindent 8 }}
93 | {{- else }}
94 | affinity:
95 | nodeAffinity:
96 | preferredDuringSchedulingIgnoredDuringExecution:
97 | - weight: 1
98 | preference:
99 | matchExpressions:
100 | - key: node.kubernetes.io/role
101 | operator: In
102 | values:
103 | - master
104 | {{- end }}
105 | {{- if .Values.tolerations }}
106 | tolerations:
107 | {{- toYaml .Values.tolerations | nindent 8 }}
108 | {{- else }}
109 | tolerations:
110 | - key: "node-role.kubernetes.io/control-plane"
111 | operator: "Exists"
112 | effect: "NoSchedule"
113 | {{- end }}
114 | {{- with .Values.topologySpreadConstraints }}
115 | topologySpreadConstraints:
116 | {{- toYaml . | nindent 8 }}
117 | {{- end }}
118 | {{- if .Values.priorityClassName }}
119 | priorityClassName: {{ .Values.priorityClassName }}
120 | {{- end }}
121 |
--------------------------------------------------------------------------------
/docs/node-parking.md:
--------------------------------------------------------------------------------
1 | # Node Parking
2 |
3 | "Node Parking" is a process by which nodes that need replacement, together with the pods scheduled on them, are labeled and subsequently targeted for safe eviction over a period of (commonly) several days, after which any remaining pods are forcibly removed by k8s-shredder and the node is deleted by the cluster's autoscaler. This process gives tenants the opportunity to reschedule sensitive workloads in a manner that fits their application's SLO while still allowing for the eventual replacement of nodes.
4 |
5 | ## Parking Basics
6 |
7 | When a cluster operator upgrades the nodes in a cluster (e.g. to a new version of Kubernetes, a new underlying operating system, a configuration change, etc.), they first need to reschedule all pods on each node. This is done using the [Kubernetes Eviction API](https://kubernetes.io/docs/concepts/scheduling-eviction/api-eviction/), which is to say evictions that respect the application's [PodDisruptionBudgets](https://kubernetes.io/docs/tasks/run-application/configure-pdb/) (PDBs) and [terminationGracePeriodSeconds](https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#pod-termination) settings on its pods. Once a node is emptied of application workloads, it is deleted by either the cluster-autoscaler or Karpenter (or some other node autoscaler).
8 |
9 | In some cases, it may not be possible to evict a pod without violating the PDB. This is normally due to how the application owner has configured the PDB, but it can also be caused by other scenarios, such as a lack of nodes on which to schedule new pods or an inability to scale up due to cloud provider issues. As stewards of the cluster's security and stability, cluster operators cannot let the node run forever. However, they also want to make every effort to ensure application owners have a chance to own and manage their application's stability. Enter "Node Parking".
10 |
11 | When a cluster operator encounters a node with a pod that cannot be evicted, it will cordon and taint the node. Then, it will label the node and the (non-daemonset) pods running on it with the following labels:
12 |
13 | ```bash
14 | shredder.ethos.adobe.net/parked-by=k8s-shredder
15 | shredder.ethos.adobe.net/parked-node-expires-on=1750164865.36373
16 | shredder.ethos.adobe.net/upgrade-status=parked
17 | ```
18 |
19 | The first label denotes who/what parked the node. The second contains a unix timestamp of when that node/pod may be forcibly removed. The third label denotes that the node/pod is parked.
20 |
21 | These labels are used by a deployment called [k8s-shredder](../README.md) that periodically scans the cluster for pods with these labels and tries to evict them using the Eviction API. After a portion of the expiry period has passed (default 10%), it shifts to using [rollout restarts](https://kubernetes.io/docs/reference/kubectl/generated/kubectl_rollout/kubectl_rollout_restart/) to help reschedule the pod. If the pod is still present when the expiration time is reached, it is forcibly evicted (i.e. it is deleted); this is the only action k8s-shredder will take that violates the pod's PDB.
22 |
23 | If you want a pod to be exempted from the eviction loop until the parked node's TTL expires, you can label the pod with
24 |
25 | ```bash
26 | "shredder.ethos.adobe.net/allow-eviction=false"
27 | ```
28 |
29 | so that k8s-shredder will skip it. It is then incumbent on application owners to gracefully reschedule these pods to avoid deletion once the TTL expires.
30 |
31 | More information about k8s-shredder and how it functions can be found [here](../README.md).
32 |
33 | ## How can I tell if my pods are parked?
34 |
35 | As mentioned above, we don't want to forcibly evict our tenants' workloads; we would much rather give them the power to manage the eviction process in a way that makes sense for their workloads, SLOs, and customers. Given that, we have exposed metrics and labels that allow customers to track, and alert on, parked workloads so that they can take action.
36 |
37 | ### Metrics (Recommended)
38 |
39 | If you are writing an alert or PromQL query, the recommended approach is to incorporate the metric `kube_ethos_upgrade:parked_pod` after exposing it in Prometheus. Given that the expiry time for a pod is measured in days, you may want to delay any alerting on pod parking for the first hour or so to allow for normal rescheduling to occur; an example alert along these lines is sketched at the end of this document.
40 |
41 | ### Pod Labels
42 |
43 | Another way to find out if your pod is parked is to monitor the labels on the pods in your namespace. You can find parked pods using this kubectl command:
44 |
45 | ```
46 | kubectl get pods -l shredder.ethos.adobe.net/upgrade-status=parked
47 | ```
48 |
49 | You can also query and alert on pod labels (although, again, we recommend using the metric exposed above):
50 |
51 | ```
52 | kube_pod_labels{label_shredder_ethos_adobe_net_upgrade_status="parked"}
53 | ```
54 |
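55 | ### Example alert
56 |
57 | Below is a minimal, illustrative Prometheus alerting rule built on the `kube_ethos_upgrade:parked_pod` metric described above. Treat it as a sketch: the alert name, severity label, and exact expression will depend on how the metric is exposed in your environment. The `for: 1h` clause implements the suggested delay so that normal rescheduling has a chance to complete before anyone is alerted. Note that the same `shredder.ethos.adobe.net/upgrade-status=parked` label is also applied to the nodes themselves, so `kubectl get nodes -l shredder.ethos.adobe.net/upgrade-status=parked` will list any parked nodes.
58 |
59 | ```yaml
60 | groups:
61 |   - name: node-parking
62 |     rules:
63 |       - alert: PodsParkedForEviction
64 |         expr: kube_ethos_upgrade:parked_pod > 0
65 |         for: 1h
66 |         labels:
67 |           severity: warning
68 |         annotations:
69 |           summary: "Pods are parked and will be force-evicted when the parked-node TTL expires"
70 |           description: "One or more pods are labeled as parked; reschedule them gracefully before the TTL is reached."
71 | ```
72 |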
--------------------------------------------------------------------------------
/pkg/config/config.go:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright 2025 Adobe. All rights reserved.
3 | This file is licensed to you under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License. You may obtain a copy
5 | of the License at http://www.apache.org/licenses/LICENSE-2.0
6 | Unless required by applicable law or agreed to in writing, software distributed under
7 | the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
8 | OF ANY KIND, either express or implied. See the License for the specific language
9 | governing permissions and limitations under the License.
10 | */
11 |
12 | package config
13 |
14 | import (
15 | "time"
16 |
17 | "github.com/adobe/k8s-shredder/pkg/schedule"
18 | "github.com/pkg/errors"
19 | )
20 |
21 | // Config struct defines application configuration options
22 | type Config struct {
23 | // EvictionLoopInterval defines how often to run the eviction loop process
24 | EvictionLoopInterval time.Duration
25 | // EvictionLoopSchedule is an optional cron schedule for when eviction operations are allowed
26 | // If set, parking and shredding operations will only occur during the scheduled time window
27 | // Supports standard cron syntax and macros (@yearly, @monthly, @weekly, @daily, @hourly)
28 | // Example: "@daily" (runs at midnight UTC), "0 2 * * *" (runs at 2 AM UTC daily)
29 | EvictionLoopSchedule string
30 | // EvictionLoopDuration defines how long the scheduled window stays active after the schedule triggers
31 | // Only used when EvictionLoopSchedule is set
32 | // Supports compound durations with hours and minutes (e.g., "10h5m", "30m", "160h")
33 | // Example: "10h" (window stays active for 10 hours), "30m" (window stays active for 30 minutes)
34 | EvictionLoopDuration string
35 | // ParkedNodeTTL is used for defining the time a node can stay parked before starting force eviction process
36 | ParkedNodeTTL time.Duration
37 | // RollingRestartThreshold specifies how much time (as a percentage of ParkedNodeTTL) should pass before starting the rollout restart process
38 | RollingRestartThreshold float64
39 | // UpgradeStatusLabel is used for identifying parked nodes
40 | UpgradeStatusLabel string
41 | // ExpiresOnLabel is used for identifying the TTL for parked nodes
42 | ExpiresOnLabel string
43 | // NamespacePrefixSkipInitialEviction is used for proceeding directly with a rollout restart without waiting for the RollingRestartThreshold
44 | NamespacePrefixSkipInitialEviction string
45 | // RestartedAtAnnotation is used to mark a controller object for rollout restart
46 | RestartedAtAnnotation string
47 | // AllowEvictionLabel is used for skipping eviction of pods that have explicitly set this label to false
48 | AllowEvictionLabel string
49 | // ToBeDeletedTaint is used for skipping a subset of parked nodes
50 | ToBeDeletedTaint string
51 | // ArgoRolloutsAPIVersion is used for specifying the API version from `argoproj.io` apigroup to be used while handling Argo Rollouts objects
52 | ArgoRolloutsAPIVersion string
53 | // EnableKarpenterDriftDetection controls whether to scan for drifted Karpenter NodeClaims and automatically label their nodes
54 | EnableKarpenterDriftDetection bool
55 | // EnableKarpenterDisruptionDetection controls whether to scan for disrupted Karpenter NodeClaims and automatically label their nodes
56 | EnableKarpenterDisruptionDetection bool
57 | // ParkedByLabel is used for identifying which component parked the node
58 | ParkedByLabel string
59 | // ParkedByValue is the value to set for the ParkedByLabel
60 | ParkedByValue string
61 | // ParkedNodeTaint is the taint to apply to parked nodes in the format key=value:effect
62 | ParkedNodeTaint string
63 | // EnableNodeLabelDetection controls whether to scan for nodes with specific labels and automatically park them
64 | EnableNodeLabelDetection bool
65 | // NodeLabelsToDetect is a list of node labels to look for. Can be just keys or key=value pairs
66 | NodeLabelsToDetect []string
67 | // MaxParkedNodes is the maximum number of nodes that can be parked simultaneously.
68 | // Can be either an integer (e.g. "5") or a percentage (e.g. "20%").
69 | // If set to "0" or empty (default), no limit is applied.
70 | // When a percentage is specified, the limit is calculated as (percentage/100) * (total nodes in cluster).
71 | MaxParkedNodes string
72 | // ExtraParkingLabels is a map of additional labels to apply to nodes and pods during the parking process. If not set, no extra labels are applied.
73 | ExtraParkingLabels map[string]string
74 | // EvictionSafetyCheck controls whether to perform safety checks before force eviction. If true, nodes will be unparked if pods don't have required parking labels.
75 | EvictionSafetyCheck bool
76 | // ParkingReasonLabel is the label used to track why a node or pod was parked
77 | ParkingReasonLabel string
78 | }
79 |
80 | // GetEvictionLoopSchedule returns a parsed Schedule object if EvictionLoopSchedule is configured
81 | // Returns nil if schedule is not configured or if there's an error parsing it
82 | func (c *Config) GetEvictionLoopSchedule() (*schedule.Schedule, error) {
83 | if c.EvictionLoopSchedule == "" {
84 | return nil, nil
85 | }
86 |
87 | if c.EvictionLoopDuration == "" {
88 | return nil, errors.New("EvictionLoopDuration must be set when EvictionLoopSchedule is configured")
89 | }
90 |
91 | return schedule.NewSchedule(c.EvictionLoopSchedule, c.EvictionLoopDuration)
92 | }
93 |
94 | // HasEvictionLoopSchedule returns true if EvictionLoopSchedule is configured
95 | func (c *Config) HasEvictionLoopSchedule() bool {
96 | return c.EvictionLoopSchedule != ""
97 | }
98 |
--------------------------------------------------------------------------------
/internal/testing/local_env_prep_helm.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | K8S_SHREDDER_VERSION=$1
5 | KINDNODE_VERSION=$2
6 | K8S_CLUSTER_NAME=$3
7 | KUBECONFIG_FILE=${4:-kubeconfig}
8 |
9 | test_dir=$(dirname "${BASH_SOURCE[0]}")
10 |
11 | if kind get clusters | grep "${K8S_CLUSTER_NAME}" ; then
12 | echo "Local environment should be already set up. If that is not the case run 'make clean' first";
13 | [[ -z "${KUBECONFIG}" ]] && export KUBECONFIG=${KUBECONFIG_FILE}
14 | else
15 | # create a k8s cluster
16 | echo "KIND: creating cluster ${K8S_CLUSTER_NAME} with version ${KINDNODE_VERSION}..."
17 | kind create cluster --name "${K8S_CLUSTER_NAME}" --kubeconfig=${KUBECONFIG_FILE} --image "kindest/node:${KINDNODE_VERSION}" \
18 | --config "${test_dir}/kind.yaml"
19 | export KUBECONFIG=${KUBECONFIG_FILE}
20 | fi
21 |
22 | # upload k8s-shredder image inside kind cluster
23 | kind load docker-image adobe/k8s-shredder:"${K8S_SHREDDER_VERSION}" --name "${K8S_CLUSTER_NAME}"
24 |
25 | namespace_status=$(kubectl get ns ns-k8s-shredder-test -o json | jq .status.phase -r)
26 |
27 | if [[ $namespace_status == "Active" ]]
28 | then
29 | echo "KIND: Namespace ns-k8s-shredder-test and ns-team-k8s-shredder-test already present"
30 | else
31 | echo "KIND: creating ns-team-k8s-shredder-test and ns-k8s-shredder-test namespaces..."
32 | kubectl create namespace ns-k8s-shredder-test
33 | kubectl create namespace ns-team-k8s-shredder-test
34 | fi
35 |
36 | if [[ ${ENABLE_APISERVER_DEBUG} == "true" ]]
37 | then
38 | echo -e "K8S_SHREDDER: Enable debug logging on apiserver"
39 | TOKEN=$(kubectl create token default)
40 |
41 | APISERVER=$(kubectl config view -o jsonpath="{.clusters[?(@.name==\"kind-${K8S_CLUSTER_NAME}\")].cluster.server}")
42 | curl -s -X PUT -d '5' "$APISERVER"/debug/flags/v --header "Authorization: Bearer $TOKEN" -k
43 | fi
44 |
45 | echo "KIND: deploying k8s-shredder using Helm chart..."
46 | # Use Helm to deploy k8s-shredder with test-specific configuration
47 | helm install k8s-shredder "${test_dir}/../../charts/k8s-shredder" \
48 | --namespace kube-system \
49 | --set image.registry=adobe/k8s-shredder \
50 | --set image.tag="${K8S_SHREDDER_VERSION}" \
51 | --set image.pullPolicy=Never \
52 | --set shredder.EvictionLoopInterval=10s \
53 | --set shredder.ParkedNodeTTL=30s \
54 | --set shredder.RollingRestartThreshold=0.5 \
55 | --set shredder.EnableKarpenterDriftDetection=false \
56 | --set shredder.EnableNodeLabelDetection=false \
57 | --set logLevel=debug \
58 | --set logFormat=text \
59 | --set dryRun=false \
60 | --set service.create=true \
61 | --set service.type=ClusterIP \
62 | --set service.port=8080 \
63 | --set service.targetPort=metrics
64 |
65 | echo "KIND: deploying prometheus..."
66 | kubectl apply -f "${test_dir}/prometheus_stuffs.yaml"
67 |
68 | echo "KIND: deploying Argo Rollouts CRD..."
69 | kubectl apply -f https://raw.githubusercontent.com/argoproj/argo-rollouts/v1.7.2/manifests/crds/rollout-crd.yaml
70 |
71 | echo "KIND: deploying test applications..."
72 | kubectl apply -f "${test_dir}/test_apps.yaml"
73 |
74 | # Adjust the correct UID for the test-app-argo-rollout ownerReference
75 | rollout_uid=$(kubectl -n ns-team-k8s-shredder-test get rollout test-app-argo-rollout -o jsonpath='{.metadata.uid}')
76 | sed "s/REPLACE_WITH_ROLLOUT_UID/${rollout_uid}/" < "${test_dir}/test_apps.yaml" | kubectl apply -f -
77 |
78 | echo "K8S_SHREDDER: waiting for k8s-shredder deployment to become ready!"
79 | retry_count=0
80 | i=1
81 | sp="/-\|"
82 | while [[ ${status} == *"False"* || -z ${status} ]]; do
83 | # set 5 minute timeout
84 | if [[ ${retry_count} == 600 ]]; then echo "Timeout exceeded!" && exit 1; fi
85 | # shellcheck disable=SC2059
86 | printf "\b${sp:i++%${#sp}:1}" && sleep 0.5;
87 | status=$(kubectl get pods -n kube-system -l app.kubernetes.io/name=k8s-shredder -o json | \
88 | jq '.items[].status.conditions[] | select(.type=="Ready")| .status' 2> /dev/null)
89 | retry_count=$((retry_count+1))
90 | done
91 | echo ""
92 |
93 | echo "K8S_SHREDDER: waiting for rollout object PDB to become ready!"
94 | retry_count=0
95 | while [[ $(kubectl get pdb -n ns-team-k8s-shredder-test test-app-argo-rollout \
96 | -o jsonpath="{.status.currentHealthy}" 2> /dev/null) != "2" ]]; do
97 | # set 5 minute timeout
98 | if [[ ${retry_count} == 600 ]]; then echo "Timeout exceeded!" && exit 1; fi
99 | # shellcheck disable=SC2059
100 | printf "\b${sp:i++%${#sp}:1}" && sleep 0.5;
101 | retry_count=$((retry_count+1))
102 | done
103 |
104 | echo ""
105 | kubectl logs -l app.kubernetes.io/name=k8s-shredder -n kube-system
106 |
107 | echo "K8S_SHREDDER: waiting for prometheus deployment to become ready!"
108 | retry_count=0
109 | while [[ $(kubectl get pods -n kube-system -l app=prometheus \
110 | -o jsonpath="{.items[0].status.conditions[?(@.type=='Ready')].status}" 2> /dev/null) != "True" ]]; do
111 | # set 5 minute timeout
112 | if [[ ${retry_count} == 600 ]]; then echo "Timeout exceeded!" && exit 1; fi
113 | # shellcheck disable=SC2059
114 | printf "\b${sp:i++%${#sp}:1}" && sleep 0.5;
115 | retry_count=$((retry_count+1))
116 | done
117 |
118 | echo ""
119 |
120 | echo -e "K8S_SHREDDER: You can access k8s-shredder metrics at http://localhost:1234/metrics after running
121 | kubectl port-forward -n kube-system svc/k8s-shredder --kubeconfig=${KUBECONFIG_FILE} 1234:8080\n
122 | It can take a few minutes before seeing k8s-shredder metrics..."
123 |
124 | echo -e "K8S_SHREDDER: You can access k8s-shredder logs by running
125 | kubectl logs -n kube-system -l app.kubernetes.io/name=k8s-shredder --kubeconfig=${KUBECONFIG_FILE} \n"
126 |
127 | echo -e "K8S_SHREDDER: You can access prometheus metrics at http://localhost:1234 after running
128 | kubectl port-forward -n kube-system svc/prometheus --kubeconfig=${KUBECONFIG_FILE} 1234:9090\n"
129 |
--------------------------------------------------------------------------------
/internal/testing/local_env_prep_node_labels_helm.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | K8S_SHREDDER_VERSION=$1
5 | KINDNODE_VERSION=$2
6 | K8S_CLUSTER_NAME=$3
7 | KUBECONFIG_FILE=${4:-kubeconfig}
8 |
9 | test_dir=$(dirname "${BASH_SOURCE[0]}")
10 |
11 | if kind get clusters | grep "${K8S_CLUSTER_NAME}" ; then
12 | echo "Local environment should be already set up. If that is not the case run 'make clean' first";
13 | [[ -z "${KUBECONFIG}" ]] && export KUBECONFIG=${KUBECONFIG_FILE}
14 | else
15 | # create a k8s cluster
16 | echo "KIND: creating cluster ${K8S_CLUSTER_NAME} with version ${KINDNODE_VERSION}..."
17 | kind create cluster --name "${K8S_CLUSTER_NAME}" --kubeconfig=${KUBECONFIG_FILE} --image "kindest/node:${KINDNODE_VERSION}" \
18 | --config "${test_dir}/kind-node-labels.yaml"
19 | export KUBECONFIG=${KUBECONFIG_FILE}
20 | fi
21 |
22 | # upload k8s-shredder image inside kind cluster
23 | kind load docker-image adobe/k8s-shredder:"${K8S_SHREDDER_VERSION}" --name "${K8S_CLUSTER_NAME}"
24 |
25 | namespace_status=$(kubectl get ns ns-k8s-shredder-test -o json | jq .status.phase -r)
26 |
27 | if [[ $namespace_status == "Active" ]]
28 | then
29 | echo "KIND: Namespace ns-k8s-shredder-test and ns-team-k8s-shredder-test already present"
30 | else
31 | echo "KIND: creating ns-team-k8s-shredder-test and ns-k8s-shredder-test namespaces..."
32 | kubectl create namespace ns-k8s-shredder-test
33 | kubectl create namespace ns-team-k8s-shredder-test
34 | fi
35 |
36 | if [[ ${ENABLE_APISERVER_DEBUG} == "true" ]]
37 | then
38 | echo -e "K8S_SHREDDER: Enable debug logging on apiserver"
39 | TOKEN=$(kubectl create token default)
40 |
41 | APISERVER=$(kubectl config view -o jsonpath="{.clusters[?(@.name==\"kind-${K8S_CLUSTER_NAME}\")].cluster.server}")
42 | curl -s -X PUT -d '5' "$APISERVER"/debug/flags/v --header "Authorization: Bearer $TOKEN" -k
43 | fi
44 |
45 | echo "NODE_LABELS: This test environment will demonstrate node label detection functionality"
46 | echo "NODE_LABELS: k8s-shredder will detect nodes with specific labels and park them"
47 |
48 | echo "KIND: deploying k8s-shredder using Helm chart with node label detection enabled..."
49 | # Use Helm to deploy k8s-shredder with node label detection enabled
50 | helm install k8s-shredder "${test_dir}/../../charts/k8s-shredder" \
51 | --namespace kube-system \
52 | --set image.registry=adobe/k8s-shredder \
53 | --set image.tag="${K8S_SHREDDER_VERSION}" \
54 | --set image.pullPolicy=Never \
55 | --set shredder.EvictionLoopInterval=30s \
56 | --set shredder.ParkedNodeTTL=2m \
57 | --set shredder.RollingRestartThreshold=0.5 \
58 | --set shredder.EnableKarpenterDriftDetection=false \
59 | --set shredder.EnableNodeLabelDetection=true \
60 | --set shredder.NodeLabelsToDetect[0]="test-label" \
61 | --set shredder.NodeLabelsToDetect[1]="maintenance=scheduled" \
62 | --set shredder.NodeLabelsToDetect[2]="node.test.io/park" \
63 | --set logLevel=debug \
64 | --set logFormat=text \
65 | --set dryRun=false \
66 | --set service.create=true \
67 | --set service.type=ClusterIP \
68 | --set service.port=8080 \
69 | --set service.targetPort=metrics
70 |
71 | echo "KIND: deploying prometheus..."
72 | kubectl apply -f "${test_dir}/prometheus_stuffs_node_labels.yaml"
73 |
74 | echo "KIND: deploying Argo Rollouts CRD..."
75 | kubectl apply -f https://raw.githubusercontent.com/argoproj/argo-rollouts/v1.7.2/manifests/crds/rollout-crd.yaml
76 |
77 | echo "KIND: deploying test applications..."
78 | kubectl apply -f "${test_dir}/test_apps.yaml"
79 |
80 | # Adjust the correct UID for the test-app-argo-rollout ownerReference
81 | rollout_uid=$(kubectl -n ns-team-k8s-shredder-test get rollout test-app-argo-rollout -o jsonpath='{.metadata.uid}')
82 | sed "s/REPLACE_WITH_ROLLOUT_UID/${rollout_uid}/" < "${test_dir}/test_apps.yaml" | kubectl apply -f -
83 |
84 | echo "NODE_LABELS: Node label detection test environment ready!"
85 |
86 | echo "K8S_SHREDDER: waiting for k8s-shredder deployment to become ready!"
87 | retry_count=0
88 | i=1
89 | sp="/-\|"
90 | while [[ ${status} == *"False"* || -z ${status} ]]; do
91 | # set 5 minute timeout
92 | if [[ ${retry_count} == 600 ]]; then echo "Timeout exceeded!" && exit 1; fi
93 | # shellcheck disable=SC2059
94 | printf "\b${sp:i++%${#sp}:1}" && sleep 0.5;
95 | status=$(kubectl get pods -n kube-system -l app.kubernetes.io/name=k8s-shredder -o json | \
96 | jq '.items[].status.conditions[] | select(.type=="Ready")| .status' 2> /dev/null)
97 | retry_count=$((retry_count+1))
98 | done
99 | echo ""
100 |
101 | echo "K8S_SHREDDER: waiting for rollout object PDB to become ready!"
102 | retry_count=0
103 | while [[ $(kubectl get pdb -n ns-team-k8s-shredder-test test-app-argo-rollout \
104 | -o jsonpath="{.status.currentHealthy}" 2> /dev/null) != "2" ]]; do
105 | # set 5 minute timeout
106 | if [[ ${retry_count} == 600 ]]; then echo "Timeout exceeded!" && exit 1; fi
107 | # shellcheck disable=SC2059
108 | printf "\b${sp:i++%${#sp}:1}" && sleep 0.5;
109 | retry_count=$((retry_count+1))
110 | done
111 |
112 | echo ""
113 | kubectl logs -l app.kubernetes.io/name=k8s-shredder -n kube-system
114 |
115 | echo "K8S_SHREDDER: waiting for prometheus deployment to become ready!"
116 | retry_count=0
117 | while [[ $(kubectl get pods -n kube-system -l app=prometheus \
118 | -o jsonpath="{.items[0].status.conditions[?(@.type=='Ready')].status}" 2> /dev/null) != "True" ]]; do
119 | # set 5 minute timeout
120 | if [[ ${retry_count} == 600 ]]; then echo "Timeout exceeded!" && exit 1; fi
121 | # shellcheck disable=SC2059
122 | printf "\b${sp:i++%${#sp}:1}" && sleep 0.5;
123 | retry_count=$((retry_count+1))
124 | done
125 |
126 | echo ""
127 |
128 | echo -e "K8S_SHREDDER: You can access k8s-shredder metrics at http://localhost:1234/metrics after running
129 | kubectl port-forward -n kube-system svc/k8s-shredder --kubeconfig=${KUBECONFIG_FILE} 1234:8080\n
130 | It can take a few minutes before k8s-shredder metrics appear..."
131 |
132 | echo -e "K8S_SHREDDER: You can access k8s-shredder logs by running
133 | kubectl logs -n kube-system -l app.kubernetes.io/name=k8s-shredder --kubeconfig=${KUBECONFIG_FILE} \n"
134 |
135 | echo -e "K8S_SHREDDER: You can access prometheus metrics at http://localhost:1234 after running
136 | kubectl port-forward -n kube-system svc/prometheus --kubeconfig=${KUBECONFIG_FILE} 1234:9090\n"
137 |
138 | echo "NODE_LABELS: Environment setup complete!"
139 | echo "NODE_LABELS: Configured to detect nodes with these labels:"
140 | echo " - test-label (key only)"
141 | echo " - maintenance=scheduled (key=value)"
142 | echo " - node.test.io/park (key only)"
143 | echo ""
144 |
145 | echo "NODE_LABELS: Now applying test labels to trigger node label detection..."
146 |
147 | # Apply test labels to trigger k8s-shredder's node label detection
148 | WORKER_NODES=($(kubectl get nodes --no-headers -o custom-columns=NAME:.metadata.name | grep -v control-plane))
149 | WORKER_NODE1=${WORKER_NODES[0]}
150 | WORKER_NODE2=${WORKER_NODES[1]}
151 |
152 | echo "NODE_LABELS: Adding 'test-label=test-value' to node ${WORKER_NODE1}"
153 | kubectl label node "${WORKER_NODE1}" test-label=test-value
154 |
155 | echo "NODE_LABELS: Adding 'maintenance=scheduled' to node ${WORKER_NODE2}"
156 | kubectl label node "${WORKER_NODE2}" maintenance=scheduled
157 |
158 | echo "NODE_LABELS: Labels applied! k8s-shredder should detect and park these nodes shortly..."
159 | echo "NODE_LABELS: You can monitor the process with:"
160 | echo " kubectl logs -n kube-system -l app.kubernetes.io/name=k8s-shredder --kubeconfig=${KUBECONFIG_FILE} -f"
161 |
--------------------------------------------------------------------------------
/internal/testing/cluster_upgrade_node_labels.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | K8S_CLUSTER_NAME=$1
5 | KUBECONFIG_FILE=${2:-kubeconfig}
6 | test_dir=$(dirname "${BASH_SOURCE[0]}")
7 |
8 | export KUBECONFIG=${KUBECONFIG_FILE}
9 |
10 | echo "==============================================================================="
11 | echo "NODE_LABELS: Starting node label detection test"
12 | echo "==============================================================================="
13 |
14 | echo "NODE_LABELS: This test will demonstrate k8s-shredder's node label detection functionality"
15 | echo "NODE_LABELS: We'll add specific labels to nodes and verify they get parked automatically"
16 | echo ""
17 |
18 | echo "NODE_LABELS: Getting available nodes for testing..."
19 | node_count=$(kubectl get nodes --no-headers | wc -l)
20 | if [[ ${node_count} -eq 0 ]]; then
21 | echo "ERROR: No nodes available for testing"
22 | exit 1
23 | fi
24 |
25 | # Get a worker node for testing (prefer worker nodes over control-plane)
26 | test_node=$(kubectl get nodes --no-headers -o custom-columns=":metadata.name" | grep -v control-plane | head -1); [[ -n "${test_node}" ]] || test_node=$(kubectl get nodes --no-headers -o custom-columns=":metadata.name" | head -1)
27 | echo "NODE_LABELS: Using node '${test_node}' for testing"
28 |
29 | echo "NODE_LABELS: Current node labels before adding test labels:"
30 | kubectl get node ${test_node} --show-labels
31 | echo ""
32 |
33 | echo "NODE_LABELS: Checking for any existing parking labels..."
34 | parking_status=$(kubectl get node ${test_node} -o jsonpath='{.metadata.labels.shredder\.ethos\.adobe\.net/upgrade-status}' 2>/dev/null || echo "")
35 | echo "NODE_LABELS: Current parking status: ${parking_status:-"Not parked"}"
36 |
37 | echo ""
38 | echo "==============================================================================="
39 | echo "NODE_LABELS: Adding test label to trigger node label detection..."
40 | echo "==============================================================================="
41 |
42 | # We'll test with the "test-label" key-only selector
43 | echo "NODE_LABELS: Adding label 'test-label=test-value' to node ${test_node}"
44 | kubectl label node ${test_node} test-label=test-value
45 |
46 | echo "NODE_LABELS: Node labeled successfully!"
47 |
48 | echo "NODE_LABELS: Current node labels after adding test label:"
49 | kubectl get node ${test_node} --show-labels
50 | echo ""
51 |
52 | echo ""
53 | echo "==============================================================================="
54 | echo "NODE_LABELS: Waiting for k8s-shredder to detect and park the labeled node..."
55 | echo "==============================================================================="
56 |
57 | echo "NODE_LABELS: Current k8s-shredder logs:"
58 | kubectl logs -l app.kubernetes.io/name=k8s-shredder -n kube-system --tail=20
59 | echo ""
60 |
61 | echo "NODE_LABELS: Monitoring k8s-shredder activity for next 3 minutes..."
62 | start_time=$(date +%s)
63 | end_time=$((start_time + 180))
64 |
65 | while [[ $(date +%s) -lt ${end_time} ]]; do
66 | current_time=$(date +%s)
67 | remaining=$((end_time - current_time))
68 |
69 | echo "NODE_LABELS: Checking node parking status... (${remaining}s remaining)"
70 |
71 | # Check if node is parked
72 | parking_status=$(kubectl get node ${test_node} -o jsonpath='{.metadata.labels.shredder\.ethos\.adobe\.net/upgrade-status}' 2>/dev/null || echo "")
73 | parked_by=$(kubectl get node ${test_node} -o jsonpath='{.metadata.labels.shredder\.ethos\.adobe\.net/parked-by}' 2>/dev/null || echo "")
74 | expires_on=$(kubectl get node ${test_node} -o jsonpath='{.metadata.labels.shredder\.ethos\.adobe\.net/parked-node-expires-on}' 2>/dev/null || echo "")
75 |
76 | if [[ "${parking_status}" == "parked" ]]; then
77 | echo ""
78 | echo "==============================================================================="
79 | echo "NODE_LABELS: SUCCESS! Node ${test_node} has been parked by k8s-shredder!"
80 | echo "==============================================================================="
81 | echo "NODE_LABELS: Parking details:"
82 | echo " - Status: ${parking_status}"
83 | echo " - Parked by: ${parked_by}"
84 | echo " - Expires on: ${expires_on}"
85 | echo ""
86 |
87 | echo "NODE_LABELS: Checking if node is also cordoned and tainted..."
88 | node_unschedulable=$(kubectl get node ${test_node} -o jsonpath='{.spec.unschedulable}' 2>/dev/null || echo "")
89 | echo " - Unschedulable (cordoned): ${node_unschedulable}"
90 |
91 | echo " - Taints:"
92 | kubectl get node ${test_node} -o jsonpath='{.spec.taints}' | jq -r '.[] | " \(.key)=\(.value):\(.effect)"' 2>/dev/null || echo " No taints found"
93 |
94 | echo ""
95 | echo "NODE_LABELS: Checking pods on the node..."
96 | kubectl get pods --all-namespaces --field-selector spec.nodeName=${test_node} -o wide
97 |
98 | echo ""
99 | echo "NODE_LABELS: Final k8s-shredder logs:"
100 | kubectl logs -l app.kubernetes.io/name=k8s-shredder -n kube-system --tail=30
101 |
102 | echo ""
103 | echo "==============================================================================="
104 | echo "NODE_LABELS: Test completed successfully!"
105 | echo "==============================================================================="
106 | echo "NODE_LABELS: Summary:"
107 | echo " 1. ✅ Test label was added to node"
108 | echo " 2. ✅ k8s-shredder detected the labeled node"
109 | echo " 3. ✅ k8s-shredder parked the node with labels"
110 | echo " 4. ✅ Node was cordoned and tainted"
111 | echo " 5. ✅ Pods on the node were also labeled"
112 |
113 | echo ""
114 | echo "NODE_LABELS: Testing additional label formats..."
115 |
116 | # Test another node with a different label format
117 | available_nodes=$(kubectl get nodes --no-headers -o custom-columns=":metadata.name" | grep -v "${test_node}")
118 | if [[ -n "${available_nodes}" ]]; then
119 | second_test_node=$(echo "${available_nodes}" | head -1)
120 | echo "NODE_LABELS: Testing key=value format on node '${second_test_node}'"
121 | kubectl label node ${second_test_node} maintenance=scheduled
122 |
123 | # Wait a bit to see if this gets detected too
124 | sleep 45
125 |
126 | second_parking_status=$(kubectl get node ${second_test_node} -o jsonpath='{.metadata.labels.shredder\.ethos\.adobe\.net/upgrade-status}' 2>/dev/null || echo "")
127 | if [[ "${second_parking_status}" == "parked" ]]; then
128 | echo " ✅ Second node with key=value label also parked successfully!"
129 | else
130 | echo " ⏳ Second node not yet parked (may need more time)"
131 | fi
132 | fi
133 |
134 | echo ""
135 | exit 0
136 | fi
137 |
138 | echo "NODE_LABELS: Node parking status: ${parking_status:-"Not parked yet"}"
139 | sleep 10
140 | done
141 |
142 | echo ""
143 | echo "==============================================================================="
144 | echo "NODE_LABELS: Test completed but node was not parked within timeout"
145 | echo "==============================================================================="
146 | echo "NODE_LABELS: Final status check:"
147 |
148 | echo "NODE_LABELS: Node labels:"
149 | kubectl get node ${test_node} --show-labels
150 | echo ""
151 |
152 | echo "NODE_LABELS: Final k8s-shredder logs:"
153 | kubectl logs -l app.kubernetes.io/name=k8s-shredder -n kube-system --tail=50
154 | echo ""
155 |
156 | echo "NODE_LABELS: All Nodes:"
157 | kubectl get nodes -o wide
158 | echo ""
159 |
160 | echo "NODE_LABELS: k8s-shredder may need more time or there might be an issue."
161 | echo "NODE_LABELS: Check the logs above for any errors or continue monitoring manually."
162 | echo "NODE_LABELS: This could be expected behavior if node label detection is disabled or"
163 | echo "NODE_LABELS: if k8s-shredder hasn't run its eviction loop yet."
164 |
165 | exit 1
--------------------------------------------------------------------------------
/internal/testing/test_apps.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | apiVersion: apps/v1
3 | kind: Deployment
4 | metadata:
5 | name: test-app-disallow-eviction
6 | namespace: ns-team-k8s-shredder-test
7 | spec:
8 | replicas: 4
9 | selector:
10 | matchLabels:
11 | app: test-app-disallow-eviction
12 | strategy:
13 | rollingUpdate:
14 | maxSurge: 25%
15 | maxUnavailable: 25%
16 | type: RollingUpdate
17 | template:
18 | metadata:
19 | labels:
20 | app: test-app-disallow-eviction
21 | shredder.ethos.adobe.net/allow-eviction: 'false'
22 | spec:
23 | containers:
24 | - name: canary
25 | image: busybox:1.35
26 | command: [sh, -c, while true; do sleep 30; done]
27 | ---
28 | apiVersion: policy/v1
29 | kind: PodDisruptionBudget
30 | metadata:
31 | name: test-app-disallow-eviction
32 | namespace: ns-team-k8s-shredder-test
33 | spec:
34 | minAvailable: 1
35 | selector:
36 | matchLabels:
37 | app: test-app-disallow-eviction
38 | # 2. Good citizen
39 | ---
40 | apiVersion: apps/v1
41 | kind: Deployment
42 | metadata:
43 | name: test-app-allow-eviction
44 | namespace: ns-team-k8s-shredder-test
45 | spec:
46 | replicas: 4
47 | selector:
48 | matchLabels:
49 | app: test-app-allow-eviction
50 | strategy:
51 | rollingUpdate:
52 | maxSurge: 25%
53 | maxUnavailable: 25%
54 | type: RollingUpdate
55 | template:
56 | metadata:
57 | labels:
58 | app: test-app-allow-eviction
59 | spec:
60 | containers:
61 | - name: canary
62 | image: busybox:1.35
63 | command: [sh, -c, while true; do sleep 30; done]
64 | ---
65 | apiVersion: policy/v1
66 | kind: PodDisruptionBudget
67 | metadata:
68 | name: test-app-allow-eviction
69 | namespace: ns-team-k8s-shredder-test
70 | spec:
71 | minAvailable: 1
72 | selector:
73 | matchLabels:
74 | app: test-app-allow-eviction
75 | # 3. Bad citizen with wrongly configured PDB
76 | ---
77 | apiVersion: apps/v1
78 | kind: Deployment
79 | metadata:
80 | name: test-app-with-bad-pdb
81 | namespace: ns-team-k8s-shredder-test
82 | spec:
83 | replicas: 4
84 | selector:
85 | matchLabels:
86 | app: test-app-with-bad-pdb
87 | strategy:
88 | rollingUpdate:
89 | maxSurge: 0
90 | maxUnavailable: 25%
91 | template:
92 | metadata:
93 | labels:
94 | app: test-app-with-bad-pdb
95 | spec:
96 | affinity:
97 | nodeAffinity:
98 | preferredDuringSchedulingIgnoredDuringExecution:
99 | - weight: 100
100 | preference:
101 | matchExpressions:
102 | - key: will-be-parked
103 | operator: In
104 | values: ['true']
105 | containers:
106 | - name: canary
107 | image: busybox:1.35
108 | command: [sh, -c, while true; do sleep 30; done]
109 | ---
110 | apiVersion: policy/v1
111 | kind: PodDisruptionBudget
112 | metadata:
113 | name: test-app-with-bad-pdb
114 | namespace: ns-team-k8s-shredder-test
115 | spec:
116 | minAvailable: 10
117 | selector:
118 | matchLabels:
119 | app: test-app-with-bad-pdb
120 | # 4. Good citizen with recreate update strategy
121 | ---
122 | apiVersion: apps/v1
123 | kind: Deployment
124 | metadata:
125 | name: test-app-with-recreate
126 | namespace: ns-team-k8s-shredder-test
127 | spec:
128 | replicas: 4
129 | selector:
130 | matchLabels:
131 | app: test-app-recreate
132 | strategy:
133 | type: Recreate
134 | template:
135 | metadata:
136 | labels:
137 | app: test-app-recreate
138 | spec:
139 | containers:
140 | - name: canary
141 | image: busybox:1.35
142 | command: [sh, -c, while true; do sleep 30; done]
143 | ---
144 | apiVersion: policy/v1
145 | kind: PodDisruptionBudget
146 | metadata:
147 | name: test-app-recreate
148 | namespace: ns-team-k8s-shredder-test
149 | spec:
150 | minAvailable: 1
151 | selector:
152 | matchLabels:
153 | app: test-app-recreate
154 | ##### CAAS #####
155 | # 1. Good citizen in CaaS world
156 | ---
157 | apiVersion: apps/v1
158 | kind: Deployment
159 | metadata:
160 | name: test-app-caas
161 | namespace: ns-k8s-shredder-test
162 | spec:
163 | replicas: 4
164 | selector:
165 | matchLabels:
166 | app: test-app-caas
167 | strategy:
168 | rollingUpdate:
169 | maxSurge: 25%
170 | maxUnavailable: 25%
171 | type: RollingUpdate
172 | template:
173 | metadata:
174 | labels:
175 | app: test-app-caas
176 | spec:
177 | containers:
178 | - name: canary
179 | image: busybox:1.35
180 | command: [sh, -c, while true; do sleep 30; done]
181 | ---
182 | apiVersion: policy/v1
183 | kind: PodDisruptionBudget
184 | metadata:
185 | name: test-app-caas
186 | namespace: ns-k8s-shredder-test
187 | spec:
188 | minAvailable: 1
189 | selector:
190 | matchLabels:
191 | app: test-app-caas
192 | ---
193 | apiVersion: v1
194 | kind: Service
195 | metadata:
196 | name: test-app-statefulset
197 | namespace: ns-team-k8s-shredder-test
198 | spec:
199 | ports:
200 | - port: 80
201 | targetPort: 8080
202 | name: web
203 | clusterIP: None
204 | selector:
205 | app: test-app-statefulset
206 | ---
207 | apiVersion: apps/v1
208 | kind: StatefulSet
209 | metadata:
210 | name: test-app-statefulset
211 | namespace: ns-team-k8s-shredder-test
212 | spec:
213 | selector:
214 | matchLabels:
215 | app: test-app-statefulset
216 | serviceName: test-app-statefulset
217 | replicas: 3
218 | template:
219 | metadata:
220 | labels:
221 | app: test-app-statefulset
222 | spec:
223 | terminationGracePeriodSeconds: 10
224 | containers:
225 | - name: test-app-statefulset
226 | image: busybox:1.35
227 | command: [sh, -c, while true; do sleep 30; done]
228 | ports:
229 | - containerPort: 8080
230 | name: web
231 | ---
232 | apiVersion: policy/v1
233 | kind: PodDisruptionBudget
234 | metadata:
235 | name: test-app-statefulset
236 | namespace: ns-team-k8s-shredder-test
237 | spec:
238 | minAvailable: 1
239 | selector:
240 | matchLabels:
241 | app: test-app-statefulset
242 | #### FLEX ####
243 | # 1. Good citizen Argo Rollout in Flex world
244 | ---
245 | apiVersion: apps/v1
246 | kind: ReplicaSet
247 | metadata:
248 | name: test-app-argo-rollout
249 | namespace: ns-team-k8s-shredder-test
250 | ownerReferences:
251 | - apiVersion: argoproj.io/v1alpha1
252 | kind: Rollout
253 | blockOwnerDeletion: true
254 | name: test-app-argo-rollout
255 | uid: REPLACE_WITH_ROLLOUT_UID
256 | spec:
257 | replicas: 2
258 | selector:
259 | matchLabels:
260 | app: test-app-argo-rollout
261 | template:
262 | metadata:
263 | labels:
264 | app: test-app-argo-rollout
265 | spec:
266 | affinity:
267 | podAntiAffinity:
268 | requiredDuringSchedulingIgnoredDuringExecution:
269 | - labelSelector:
270 | matchExpressions:
271 | - key: app
272 | operator: In
273 | values: [test-app-argo-rollout]
274 | topologyKey: kubernetes.io/hostname
275 | containers:
276 | - name: test-app-argo-rollout
277 | image: busybox:1.35
278 | command: [sh, -c, while true; do sleep 30; done]
279 | ports:
280 | - containerPort: 8080
281 | name: web
282 | ---
283 | apiVersion: argoproj.io/v1alpha1
284 | kind: Rollout
285 | metadata:
286 | name: test-app-argo-rollout
287 | namespace: ns-team-k8s-shredder-test
288 | spec:
289 | replicas: 2
290 | workloadRef:
291 | apiVersion: apps/v1
292 | kind: ReplicaSet
293 | name: test-app-argo-rollout
294 | ---
295 | apiVersion: policy/v1
296 | kind: PodDisruptionBudget
297 | metadata:
298 | name: test-app-argo-rollout
299 | namespace: ns-team-k8s-shredder-test
300 | spec:
301 | minAvailable: 10
302 | selector:
303 | matchLabels:
304 | app: test-app-argo-rollout
305 |
--------------------------------------------------------------------------------
/charts/k8s-shredder/values.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | # -- Container image configuration
3 | image:
4 | # -- Container registry where the k8s-shredder image is hosted
5 | registry: ghcr.io/adobe/k8s-shredder
6 | # -- Image pull policy - IfNotPresent, Always, or Never
7 | pullPolicy: IfNotPresent
8 | # -- Image tag to use
9 | tag: latest
10 | # -- Number of k8s-shredder pods to run
11 | replicaCount: 1
12 | # -- Deployment strategy for rolling updates (e.g., RollingUpdate, Recreate)
13 | deploymentStrategy: {}
14 | # -- Secrets for pulling images from private registries
15 | imagePullSecrets: []
16 | # -- Override the name of the chart
17 | nameOverride: ''
18 | # -- Override the full name used for resources
19 | fullnameOverride: ''
20 | # -- Additional environment variables to set in the container
21 | environmentVars: []
22 | # -- Enable dry-run mode - when true, k8s-shredder will log actions but not execute them
23 | dryRun: false
24 | # -- Logging configuration
25 | # -- Available log levels: panic, fatal, error, warn, warning, info, debug, trace
26 | logLevel: debug
27 | # -- Log output format: text (human-readable) or json (structured logging)
28 | logFormat: text
29 | # -- Core k8s-shredder configuration
30 | shredder:
31 | # -- How often to run the main eviction loop
32 | EvictionLoopInterval: 1h
33 | # -- Optional cron schedule for when eviction operations are allowed. If set, parking and shredding operations will only occur during the scheduled time window. Supports standard cron syntax and macros (@yearly, @monthly, @weekly, @daily, @hourly). Example: "@daily" (runs at midnight UTC), "0 2 * * *" (runs at 2 AM UTC daily). When omitted, operations run continuously.
34 | EvictionLoopSchedule: ''
35 | # -- Duration for how long the scheduled window stays active after the schedule triggers. Only used when EvictionLoopSchedule is set. Supports compound durations with hours and minutes (e.g., "10h5m", "30m", "160h"). Example: "10h" (window stays active for 10 hours), "30m" (window stays active for 30 minutes).
36 | EvictionLoopDuration: ''
37 | # -- How long parked nodes should remain before being eligible for deletion (7 days default)
38 | ParkedNodeTTL: 168h
39 | # -- Maximum percentage of nodes that can be restarted simultaneously during rolling restarts
40 | RollingRestartThreshold: 0.1
41 | # -- Label used to track node upgrade status
42 | UpgradeStatusLabel: shredder.ethos.adobe.net/upgrade-status
43 | # -- Label used to track when a parked node expires
44 | ExpiresOnLabel: shredder.ethos.adobe.net/parked-node-expires-on
45 | # -- Namespace prefix to skip during initial eviction (useful for system namespaces)
46 | NamespacePrefixSkipInitialEviction: ns-ethos-
47 | # -- Annotation to track when a workload was last restarted
48 | RestartedAtAnnotation: shredder.ethos.adobe.net/restartedAt
49 | # -- Label to explicitly allow eviction on specific resources
50 | AllowEvictionLabel: shredder.ethos.adobe.net/allow-eviction
51 | # -- Taint indicating nodes scheduled for deletion by cluster autoscaler
52 | ToBeDeletedTaint: ToBeDeletedByClusterAutoscaler
53 | # -- API version for Argo Rollouts integration
54 | ArgoRolloutsAPIVersion: v1alpha1
55 | # -- Enable Karpenter drift detection for node lifecycle management
56 | EnableKarpenterDriftDetection: false
57 | # -- Enable Karpenter disruption detection for node lifecycle management
58 | EnableKarpenterDisruptionDetection: false
59 | # -- Label to track which component parked a node
60 | ParkedByLabel: shredder.ethos.adobe.net/parked-by
61 | # -- Value set in the ParkedByLabel to identify k8s-shredder as the parking agent
62 | ParkedByValue: k8s-shredder
63 | # -- Taint applied to parked nodes to prevent new pod scheduling
64 | ParkedNodeTaint: shredder.ethos.adobe.net/upgrade-status=parked:NoSchedule
65 | # -- Enable detection of nodes based on specific labels
66 | EnableNodeLabelDetection: false
67 | # -- List of node labels to monitor for triggering shredder actions
68 | NodeLabelsToDetect: []
69 | # -- Maximum number of nodes that can be parked simultaneously. Can be an integer (e.g., "5") or percentage (e.g., "20%"). Set to "0" for no limit
70 | MaxParkedNodes: '0'
71 | # -- Controls whether to perform safety checks before force eviction
72 | EvictionSafetyCheck: true
73 | # -- Label used to track why a node or pod was parked
74 | ParkingReasonLabel: shredder.ethos.adobe.net/parked-reason
75 | # -- Additional labels to apply to nodes and pods during parking
76 | ExtraParkingLabels: {}
77 | # Example configuration:
78 | # example.com/owner: "infrastructure"
79 | # example.com/maintenance: "true"
80 | # -- RBAC (Role-Based Access Control) configuration
81 | rbac:
82 | # -- Create RBAC resources (ClusterRole, ClusterRoleBinding)
83 | create: true
84 | # -- Kubernetes service account configuration
85 | serviceAccount:
86 | # -- Create a service account for k8s-shredder
87 | create: true
88 | # -- Name of the service account
89 | name: k8s-shredder
90 | # -- Additional annotations for the service account (useful for IAM roles, etc.)
91 | annotations: {}
92 | # -- Kubernetes service configuration
93 | service:
94 | # -- Create a service for k8s-shredder metrics endpoint
95 | create: false
96 | # -- Service type (ClusterIP, NodePort, LoadBalancer)
97 | type: ClusterIP
98 | # -- Service port for metrics endpoint
99 | port: 8080
100 | # -- Target port for metrics endpoint
101 | targetPort: metrics
102 | # -- Additional annotations for the service
103 | annotations: {}
104 | # -- Additional labels for the service
105 | labels: {}
106 | # -- Annotations to add to k8s-shredder pod(s)
107 | podAnnotations: {}
108 | # -- Additional labels to add to k8s-shredder pod(s)
109 | podLabels: {}
110 | # -- Security context applied to the entire pod
111 | podSecurityContext: {}
112 | # -- Security context applied to the k8s-shredder container
113 | securityContext: {}
114 | # -- Init containers to run before the main k8s-shredder container starts
115 | initContainers: []
116 | # -- Additional containers to run alongside k8s-shredder in the same pod
117 | additionalContainers: []
118 | # -- Resource requests and limits for the k8s-shredder container
119 | resources:
120 | limits:
121 | # -- Maximum CPU cores the container can use
122 | cpu: '1'
123 | # -- Maximum memory the container can use
124 | memory: 1Gi
125 | requests:
126 | # -- CPU cores requested for the container (guaranteed allocation)
127 | cpu: 250m
128 | # -- Memory requested for the container (guaranteed allocation)
129 | memory: 250Mi
130 | # -- Additional volumes to mount in the pod
131 | volumes: []
132 | # Example volume configuration:
133 | # - name: ca
134 | # secret:
135 | # secretName: k8s-shredder-ca
136 | # items:
137 | # - key: ca.pem
138 | # path: ca.pem
139 |
140 | # -- Node selector to constrain pod scheduling to specific nodes
141 | nodeSelector: {}
142 | # -- Tolerations to allow scheduling on nodes with specific taints
143 | tolerations: []
144 | # -- Affinity rules for advanced pod scheduling (node affinity, pod affinity/anti-affinity)
145 | affinity: {}
146 | # -- Prometheus monitoring configuration
147 | podMonitor:
148 | # -- Enable creation of a PodMonitor resource for Prometheus scraping
149 | enabled: false
150 | # -- Labels to apply to the PodMonitor resource
151 | labels: {}
152 | # app: k8s-shredder
153 | # subsystem: k8s-a
154 | # -- How often Prometheus should scrape metrics
155 | interval: 60s
156 | # -- Timeout for each scrape attempt
157 | scrapeTimeout: 10s
158 | # -- Whether to honor labels from the target
159 | honorLabels: true
160 | # -- Metric relabeling configuration
161 | relabelings: []
162 | # -- Priority class for pod scheduling - system-cluster-critical ensures high priority
163 | priorityClassName: system-cluster-critical
164 | # -- Topology spread constraints to control pod distribution across failure domains
165 | # -- Helps ensure high availability by spreading pods across zones/nodes
166 | topologySpreadConstraints: []
167 | # Example configuration:
168 | # - maxSkew: 1
169 | # topologyKey: topology.kubernetes.io/zone
170 | # whenUnsatisfiable: DoNotSchedule
171 | # labelSelector:
172 | # matchLabels:
173 | #       app.kubernetes.io/name: k8s-shredder
174 |
--------------------------------------------------------------------------------
/pkg/metrics/metrics.go:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright 2025 Adobe. All rights reserved.
3 | This file is licensed to you under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License. You may obtain a copy
5 | of the License at http://www.apache.org/licenses/LICENSE-2.0
6 | Unless required by applicable law or agreed to in writing, software distributed under
7 | the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
8 | OF ANY KIND, either express or implied. See the License for the specific language
9 | governing permissions and limitations under the License.
10 | */
11 |
12 | package metrics
13 |
14 | import (
15 | "github.com/prometheus/client_golang/prometheus"
16 | )
17 |
18 | var (
19 |
20 | // ShredderAPIServerRequestsTotal = Total requests for Kubernetes API
21 | ShredderAPIServerRequestsTotal = prometheus.NewCounterVec(
22 | prometheus.CounterOpts{
23 | Name: "shredder_apiserver_requests_total",
24 | Help: "Total requests for Kubernetes API",
25 | },
26 | []string{"verb", "resource", "status"},
27 | )
28 |
29 | // ShredderAPIServerRequestsDurationSeconds = Requests duration seconds for calling Kubernetes API
30 | ShredderAPIServerRequestsDurationSeconds = prometheus.NewSummaryVec(
31 | prometheus.SummaryOpts{
32 | Name: "shredder_apiserver_requests_duration_seconds",
33 | Help: "Requests duration when calling Kubernetes API",
34 | Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
35 | },
36 | []string{"verb", "resource", "status"},
37 | )
38 |
39 | // ShredderLoopsTotal = Total loops
40 | ShredderLoopsTotal = prometheus.NewCounter(
41 | prometheus.CounterOpts{
42 | Name: "shredder_loops_total",
43 | Help: "Total loops",
44 | },
45 | )
46 |
47 | // ShredderLoopsDurationSeconds = Loops duration in seconds
48 | ShredderLoopsDurationSeconds = prometheus.NewSummary(
49 | prometheus.SummaryOpts{
50 | Name: "shredder_loops_duration_seconds",
51 | Help: "Loops duration in seconds",
52 | Objectives: map[float64]float64{0.5: 1200, 0.9: 900, 0.99: 600},
53 | },
54 | )
55 |
56 | // ShredderProcessedNodesTotal = Total processed nodes
57 | ShredderProcessedNodesTotal = prometheus.NewCounter(
58 | prometheus.CounterOpts{
59 | Name: "shredder_processed_nodes_total",
60 | Help: "Total processed nodes",
61 | },
62 | )
63 |
64 | // ShredderProcessedPodsTotal = Total processed pods
65 | ShredderProcessedPodsTotal = prometheus.NewCounter(
66 | prometheus.CounterOpts{
67 | Name: "shredder_processed_pods_total",
68 | Help: "Total processed pods",
69 | },
70 | )
71 |
72 | // ShredderErrorsTotal = Total errors
73 | ShredderErrorsTotal = prometheus.NewCounter(
74 | prometheus.CounterOpts{
75 | Name: "shredder_errors_total",
76 | Help: "Total errors",
77 | },
78 | )
79 |
80 | // ShredderPodErrorsTotal = Total pod errors
81 | ShredderPodErrorsTotal = prometheus.NewGaugeVec(
82 | prometheus.GaugeOpts{
83 | Name: "shredder_pod_errors_total",
84 | Help: "Total pod errors per eviction loop",
85 | },
86 | []string{"pod_name", "namespace", "reason", "action"},
87 | )
88 |
89 | // ShredderNodeForceToEvictTime = Time when the node will be forcibly evicted
90 | ShredderNodeForceToEvictTime = prometheus.NewGaugeVec(
91 | prometheus.GaugeOpts{
92 | Name: "shredder_node_force_to_evict_time",
93 | Help: "Time when the node will be forcibly evicted",
94 | },
95 | []string{"node_name"},
96 | )
97 |
98 | // ShredderPodForceToEvictTime = Time when the pod will be forcibly evicted
99 | ShredderPodForceToEvictTime = prometheus.NewGaugeVec(
100 | prometheus.GaugeOpts{
101 | Name: "shredder_pod_force_to_evict_time",
102 | Help: "Time when the pod will be forcibly evicted",
103 | },
104 | []string{"pod_name", "namespace"},
105 | )
106 |
107 | // ShredderKarpenterDriftedNodesTotal = Total number of drifted Karpenter nodes detected
108 | ShredderKarpenterDriftedNodesTotal = prometheus.NewCounter(
109 | prometheus.CounterOpts{
110 | Name: "shredder_karpenter_drifted_nodes_total",
111 | Help: "Total number of drifted Karpenter nodes detected",
112 | },
113 | )
114 |
115 | // ShredderKarpenterDisruptedNodesTotal = Total number of disrupted Karpenter nodes detected
116 | ShredderKarpenterDisruptedNodesTotal = prometheus.NewCounter(
117 | prometheus.CounterOpts{
118 | Name: "shredder_karpenter_disrupted_nodes_total",
119 | Help: "Total number of disrupted Karpenter nodes detected",
120 | },
121 | )
122 |
123 | // ShredderKarpenterNodesParkedTotal = Total number of Karpenter nodes successfully parked
124 | ShredderKarpenterNodesParkedTotal = prometheus.NewCounter(
125 | prometheus.CounterOpts{
126 | Name: "shredder_karpenter_nodes_parked_total",
127 | Help: "Total number of Karpenter nodes successfully parked",
128 | },
129 | )
130 |
131 | // ShredderKarpenterNodesParkingFailedTotal = Total number of Karpenter nodes that failed to be parked
132 | ShredderKarpenterNodesParkingFailedTotal = prometheus.NewCounter(
133 | prometheus.CounterOpts{
134 | Name: "shredder_karpenter_nodes_parking_failed_total",
135 | Help: "Total number of Karpenter nodes that failed to be parked",
136 | },
137 | )
138 |
139 | // ShredderKarpenterProcessingDurationSeconds = Duration of Karpenter node processing in seconds
140 | ShredderKarpenterProcessingDurationSeconds = prometheus.NewSummary(
141 | prometheus.SummaryOpts{
142 | Name: "shredder_karpenter_processing_duration_seconds",
143 | Help: "Duration of Karpenter node processing in seconds",
144 | Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
145 | },
146 | )
147 |
148 | // ShredderNodeLabelNodesParkedTotal = Total number of nodes successfully parked via node label detection
149 | ShredderNodeLabelNodesParkedTotal = prometheus.NewCounter(
150 | prometheus.CounterOpts{
151 | Name: "shredder_node_label_nodes_parked_total",
152 | Help: "Total number of nodes successfully parked via node label detection",
153 | },
154 | )
155 |
156 | // ShredderNodeLabelNodesParkingFailedTotal = Total number of nodes that failed to be parked via node label detection
157 | ShredderNodeLabelNodesParkingFailedTotal = prometheus.NewCounter(
158 | prometheus.CounterOpts{
159 | Name: "shredder_node_label_nodes_parking_failed_total",
160 | Help: "Total number of nodes that failed to be parked via node label detection",
161 | },
162 | )
163 |
164 | // ShredderNodeLabelProcessingDurationSeconds = Duration of node label detection and parking process in seconds
165 | ShredderNodeLabelProcessingDurationSeconds = prometheus.NewSummary(
166 | prometheus.SummaryOpts{
167 | Name: "shredder_node_label_processing_duration_seconds",
168 | Help: "Duration of node label detection and parking process in seconds",
169 | Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
170 | },
171 | )
172 |
173 | // ShredderNodeLabelMatchingNodesTotal = Total number of nodes matching the label criteria
174 | ShredderNodeLabelMatchingNodesTotal = prometheus.NewGauge(
175 | prometheus.GaugeOpts{
176 | Name: "shredder_node_label_matching_nodes_total",
177 | Help: "Total number of nodes matching the label criteria",
178 | },
179 | )
180 |
181 | // ShredderNodesParkedTotal = Total number of nodes successfully parked (shared across all detection methods)
182 | ShredderNodesParkedTotal = prometheus.NewCounter(
183 | prometheus.CounterOpts{
184 | Name: "shredder_nodes_parked_total",
185 | Help: "Total number of nodes successfully parked (shared across all detection methods)",
186 | },
187 | )
188 |
189 | // ShredderNodesParkingFailedTotal = Total number of nodes that failed to be parked (shared across all detection methods)
190 | ShredderNodesParkingFailedTotal = prometheus.NewCounter(
191 | prometheus.CounterOpts{
192 | Name: "shredder_nodes_parking_failed_total",
193 | Help: "Total number of nodes that failed to be parked (shared across all detection methods)",
194 | },
195 | )
196 |
197 | // ShredderProcessingDurationSeconds = Duration of node processing in seconds (shared across all detection methods)
198 | ShredderProcessingDurationSeconds = prometheus.NewSummary(
199 | prometheus.SummaryOpts{
200 | Name: "shredder_processing_duration_seconds",
201 | Help: "Duration of node processing in seconds (shared across all detection methods)",
202 | Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
203 | },
204 | )
205 | )
206 |
--------------------------------------------------------------------------------
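The collectors above are exported package variables but are not registered within this file; the actual wiring presumably lives elsewhere in the project (for example in `pkg/metrics/types.go` or the server setup, which are not shown here). A minimal sketch, assuming the default Prometheus registry is used, of registering a few of them and serving the exposition format on the `/metrics` path referenced throughout the docs (the `:8080` port mirrors the chart's service port and is purely illustrative):

```go
package main

import (
	"log"
	"net/http"

	"github.com/adobe/k8s-shredder/pkg/metrics"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
	// Register a few of the exported collectors with the default registry.
	// The real wiring lives elsewhere in the project; this is only a sketch.
	prometheus.MustRegister(
		metrics.ShredderLoopsTotal,
		metrics.ShredderLoopsDurationSeconds,
		metrics.ShredderNodesParkedTotal,
	)

	// Serve the Prometheus exposition format on /metrics, as documented in docs/metrics.md.
	http.Handle("/metrics", promhttp.Handler())
	log.Fatal(http.ListenAndServe(":8080", nil))
}
```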
/internal/testing/local_env_prep_karpenter_helm.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | K8S_SHREDDER_VERSION=$1
5 | KINDNODE_VERSION=$2
6 | K8S_CLUSTER_NAME=$3
7 | KUBECONFIG_FILE=${4:-kubeconfig}
8 |
9 | test_dir=$(dirname "${BASH_SOURCE[0]}")
10 |
11 | if kind get clusters | grep "${K8S_CLUSTER_NAME}" ; then
12 | echo "Local environment should be already set up. If that is not the case run 'make clean' first";
13 | [[ -z "${KUBECONFIG}" ]] && export KUBECONFIG=${KUBECONFIG_FILE}
14 | else
15 | # create a k8s cluster
16 | echo "KIND: creating cluster ${K8S_CLUSTER_NAME} with version ${KINDNODE_VERSION}..."
17 | kind create cluster --name "${K8S_CLUSTER_NAME}" --kubeconfig=${KUBECONFIG_FILE} --image "kindest/node:${KINDNODE_VERSION}" \
18 | --config "${test_dir}/kind-karpenter.yaml"
19 | export KUBECONFIG=${KUBECONFIG_FILE}
20 | fi
21 |
22 | # upload k8s-shredder image inside kind cluster
23 | kind load docker-image adobe/k8s-shredder:"${K8S_SHREDDER_VERSION}" --name "${K8S_CLUSTER_NAME}"
24 |
25 | namespace_status=$(kubectl get ns ns-k8s-shredder-test -o json | jq .status.phase -r)
26 |
27 | if [[ $namespace_status == "Active" ]]
28 | then
29 | echo "KIND: Namespace ns-k8s-shredder-test and ns-team-k8s-shredder-test already present"
30 | else
31 | echo "KIND: creating ns-team-k8s-shredder-test and ns-k8s-shredder-test namespaces..."
32 | kubectl create namespace ns-k8s-shredder-test
33 | kubectl create namespace ns-team-k8s-shredder-test
34 | fi
35 |
36 | if [[ ${ENABLE_APISERVER_DEBUG} == "true" ]]
37 | then
38 | echo -e "K8S_SHREDDER: Enable debug logging on apiserver"
39 | TOKEN=$(kubectl create token default)
40 |
41 | APISERVER=$(kubectl config view -o jsonpath="{.clusters[?(@.name==\"kind-${K8S_CLUSTER_NAME}\")].cluster.server}")
42 | curl -s -X PUT -d '5' "$APISERVER"/debug/flags/v --header "Authorization: Bearer $TOKEN" -k
43 | fi
44 |
45 | echo "KARPENTER: Note - this is a simplified test setup that simulates Karpenter without installing it"
46 | echo "KARPENTER: In this test environment, we'll simulate drifted NodeClaims using mock objects"
47 | echo "KARPENTER: The k8s-shredder Karpenter drift detection will be tested against these objects"
48 |
49 | # Create karpenter namespace for testing
50 | kubectl create namespace karpenter || true
51 |
52 | # Create mock Karpenter CRDs for testing (simplified versions)
53 | echo "KARPENTER: Creating mock Karpenter CRDs for testing..."
54 | cat < /dev/null)
166 | retry_count=$((retry_count+1))
167 | done
168 | echo ""
169 |
170 | echo "K8S_SHREDDER: waiting for rollout object PDB to become ready!"
171 | retry_count=0
172 | while [[ $(kubectl get pdb -n ns-team-k8s-shredder-test test-app-argo-rollout \
173 | -o jsonpath="{.status.currentHealthy}" 2> /dev/null) != "2" ]]; do
174 | # set 5 minute timeout
175 | if [[ ${retry_count} == 600 ]]; then echo "Timeout exceeded!" && exit 1; fi
176 | # shellcheck disable=SC2059
177 | printf "\b${sp:i++%${#sp}:1}" && sleep 0.5;
178 | retry_count=$((retry_count+1))
179 | done
180 |
181 | echo ""
182 | kubectl logs -l app.kubernetes.io/name=k8s-shredder -n kube-system
183 |
184 | echo "K8S_SHREDDER: waiting for prometheus deployment to become ready!"
185 | retry_count=0
186 | while [[ $(kubectl get pods -n kube-system -l app=prometheus \
187 | -o jsonpath="{.items[0].status.conditions[?(@.type=='Ready')].status}" 2> /dev/null) != "True" ]]; do
188 | # set 5 minute timeout
189 | if [[ ${retry_count} == 600 ]]; then echo "Timeout exceeded!" && exit 1; fi
190 | # shellcheck disable=SC2059
191 | printf "\b${sp:i++%${#sp}:1}" && sleep 0.5;
192 | retry_count=$((retry_count+1))
193 | done
194 |
195 | echo ""
196 |
197 | echo -e "K8S_SHREDDER: You can access k8s-shredder metrics at http://localhost:1234/metrics after running
198 | kubectl port-forward -n kube-system svc/k8s-shredder --kubeconfig=${KUBECONFIG_FILE} 1234:8080\n
199 | It can take a few minutes before k8s-shredder metrics appear..."
200 |
201 | echo -e "K8S_SHREDDER: You can access k8s-shredder logs by running
202 | kubectl logs -n kube-system -l app.kubernetes.io/name=k8s-shredder --kubeconfig=${KUBECONFIG_FILE} \n"
203 |
204 | echo -e "K8S_SHREDDER: You can access prometheus metrics at http://localhost:1234 after running
205 | kubectl port-forward -n kube-system svc/prometheus --kubeconfig=${KUBECONFIG_FILE} 1234:9090\n"
206 |
207 | echo "KARPENTER: Environment setup complete!"
208 | echo "KARPENTER: Mock Karpenter CRDs are ready for testing"
209 | echo ""
210 | echo "KARPENTER: To test drift detection, the upgrade script will create mock drifted NodeClaims..."
211 |
--------------------------------------------------------------------------------
/pkg/utils/node_label_detection.go:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright 2025 Adobe. All rights reserved.
3 | This file is licensed to you under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License. You may obtain a copy
5 | of the License at http://www.apache.org/licenses/LICENSE-2.0
6 | Unless required by applicable law or agreed to in writing, software distributed under
7 | the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
8 | OF ANY KIND, either express or implied. See the License for the specific language
9 | governing permissions and limitations under the License.
10 | */
11 |
12 | package utils
13 |
14 | import (
15 | "context"
16 | "strings"
17 | "time"
18 |
19 | "github.com/adobe/k8s-shredder/pkg/config"
20 | "github.com/adobe/k8s-shredder/pkg/metrics"
21 | "github.com/pkg/errors"
22 | log "github.com/sirupsen/logrus"
23 | v1 "k8s.io/api/core/v1"
24 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
25 | "k8s.io/client-go/kubernetes"
26 | )
27 |
28 | // NodeLabelInfo holds information about a node that matches the label criteria
29 | type NodeLabelInfo struct {
30 | Name string
31 | Labels map[string]string
32 | }
33 |
34 | // parseLabelSelector parses a label selector string that can be either "key" or "key=value"
35 | func parseLabelSelector(selector string, logger *log.Entry) (string, string, bool) {
36 | logger.WithField("selector", selector).Debug("Parsing label selector")
37 |
38 | if strings.Contains(selector, "=") {
39 | parts := strings.SplitN(selector, "=", 2)
40 | logger.WithFields(log.Fields{
41 | "key": parts[0],
42 | "value": parts[1],
43 | }).Debug("Parsed key=value selector")
44 | return parts[0], parts[1], true
45 | }
46 |
47 | logger.WithField("key", selector).Debug("Parsed key-only selector")
48 | return selector, "", false
49 | }
50 |
51 | // nodeMatchesLabelSelectors checks if a node matches any (rather than all) of the label selectors
52 | // and excludes nodes that are already parked
53 | func nodeMatchesLabelSelectors(node *v1.Node, labelSelectors []string, upgradeStatusLabel string, logger *log.Entry) bool {
54 | nodeLogger := logger.WithField("nodeName", node.Name)
55 | nodeLogger.Debug("Checking if node matches label selectors")
56 |
57 | nodeLabels := node.Labels
58 | if nodeLabels == nil {
59 | nodeLogger.Debug("Node has no labels")
60 | return false
61 | }
62 |
63 | // First check if the node is already parked - if so, exclude it
64 | if upgradeStatusLabel != "" {
65 | if upgradeStatus, exists := nodeLabels[upgradeStatusLabel]; exists && upgradeStatus == "parked" {
66 | nodeLogger.Debug("Node is already parked, excluding from selection")
67 | return false
68 | }
69 | }
70 |
71 | for _, selector := range labelSelectors {
72 | selectorLogger := nodeLogger.WithField("selector", selector)
73 | key, value, hasValue := parseLabelSelector(selector, selectorLogger)
74 |
75 | if nodeValue, exists := nodeLabels[key]; exists {
76 | if !hasValue {
77 | // If the selector is just a key, match if the key exists
78 | selectorLogger.WithField("nodeValue", nodeValue).Info("Node matches key-only selector")
79 | return true
80 | } else if nodeValue == value {
81 | // If the selector has a value, match if key=value
82 | selectorLogger.WithFields(log.Fields{
83 | "expectedValue": value,
84 | "nodeValue": nodeValue,
85 | }).Info("Node matches key=value selector")
86 | return true
87 | } else {
88 | selectorLogger.WithFields(log.Fields{
89 | "expectedValue": value,
90 | "nodeValue": nodeValue,
91 | }).Debug("Node value doesn't match selector value")
92 | }
93 | } else {
94 | selectorLogger.Debug("Node doesn't have the selector key")
95 | }
96 | }
97 |
98 | nodeLogger.Debug("Node doesn't match any label selectors")
99 | return false
100 | }
101 |
102 | // FindNodesWithLabels scans the kubernetes cluster for nodes that match the specified label selectors
103 | // and excludes nodes that are already labeled as parked
104 | func FindNodesWithLabels(ctx context.Context, k8sClient kubernetes.Interface, cfg config.Config, logger *log.Entry) ([]NodeLabelInfo, error) {
105 | logger = logger.WithField("function", "FindNodesWithLabels")
106 |
107 | if len(cfg.NodeLabelsToDetect) == 0 {
108 | logger.Debug("No node labels configured for detection")
109 | return []NodeLabelInfo{}, nil
110 | }
111 |
112 | logger.WithField("labelSelectors", cfg.NodeLabelsToDetect).Debug("Listing nodes with specified labels")
113 |
114 | // List all nodes (we'll filter them using an OR condition in nodeMatchesLabelSelectors)
115 | listOptions := metav1.ListOptions{}
116 | nodeList, err := k8sClient.CoreV1().Nodes().List(ctx, listOptions)
117 | if err != nil {
118 | logger.WithError(err).Error("Failed to list nodes")
119 | return nil, errors.Wrap(err, "failed to list nodes")
120 | }
121 |
122 | logger.WithField("totalNodes", len(nodeList.Items)).Debug("Retrieved nodes list")
123 |
124 | var matchingNodes []NodeLabelInfo
125 |
126 | for _, node := range nodeList.Items {
127 | // Check if the node matches any of the label selectors (this now also excludes already parked nodes)
128 | if nodeMatchesLabelSelectors(&node, cfg.NodeLabelsToDetect, cfg.UpgradeStatusLabel, logger) {
129 | logger.WithField("nodeName", node.Name).Info("Found node matching label criteria")
130 |
131 | matchingNodes = append(matchingNodes, NodeLabelInfo{
132 | Name: node.Name,
133 | Labels: node.Labels,
134 | })
135 | }
136 | }
137 |
138 | logger.WithField("matchingCount", len(matchingNodes)).Info("Found nodes matching label criteria")
139 |
140 | return matchingNodes, nil
141 | }
142 |
143 | // ParkNodesWithLabels labels nodes that match the configured label selectors with the standard parking labels
144 | func ParkNodesWithLabels(ctx context.Context, k8sClient kubernetes.Interface, matchingNodes []NodeLabelInfo, cfg config.Config, dryRun bool, logger *log.Entry) error {
145 | logger = logger.WithField("function", "ParkNodesWithLabels")
146 |
147 | logger.WithField("matchingNodesCount", len(matchingNodes)).Info("Starting to park nodes with labels")
148 |
149 | // Convert NodeLabelInfo to NodeInfo for the common parking function
150 | var nodesToPark []NodeInfo
151 | for _, nodeInfo := range matchingNodes {
152 | logger.WithField("nodeName", nodeInfo.Name).Debug("Adding node to parking list")
153 | nodesToPark = append(nodesToPark, NodeInfo(nodeInfo))
154 | }
155 |
156 | logger.WithField("nodesToPark", len(nodesToPark)).Info("Converted labeled nodes to parking list")
157 |
158 | // Apply MaxParkedNodes limit if configured
159 | limitedNodes, err := LimitNodesToPark(ctx, k8sClient, nodesToPark, cfg.MaxParkedNodes, cfg.UpgradeStatusLabel, logger)
160 | if err != nil {
161 | logger.WithError(err).Error("Failed to apply MaxParkedNodes limit")
162 | return errors.Wrap(err, "failed to apply MaxParkedNodes limit")
163 | }
164 |
165 | if len(limitedNodes) == 0 {
166 | logger.Info("No nodes to park after applying MaxParkedNodes limit")
167 | return nil
168 | }
169 |
170 | // Use the common parking function
171 | return ParkNodes(ctx, k8sClient, limitedNodes, cfg, dryRun, "node-labels", logger)
172 | }
173 |
174 | // ProcessNodesWithLabels is the main function that combines finding nodes with specific labels and parking them
175 | func ProcessNodesWithLabels(ctx context.Context, appContext *AppContext, logger *log.Entry) error {
176 | logger = logger.WithField("function", "ProcessNodesWithLabels")
177 |
178 | logger.Info("Starting node label detection and parking process")
179 |
180 | // Start timing the processing duration
181 | startTime := time.Now()
182 |
183 | // Find nodes with specified labels
184 | matchingNodes, err := FindNodesWithLabels(ctx, appContext.K8sClient, appContext.Config, logger)
185 | if err != nil {
186 | logger.WithError(err).Error("Failed to find nodes with specified labels")
187 | return errors.Wrap(err, "failed to find nodes with specified labels")
188 | }
189 |
190 | // Update the matching nodes gauge
191 | metrics.ShredderNodeLabelMatchingNodesTotal.Set(float64(len(matchingNodes)))
192 |
193 | if len(matchingNodes) == 0 {
194 | logger.Info("No nodes found matching the specified label criteria")
195 | return nil
196 | }
197 |
198 | // Park the nodes that match the criteria
199 | err = ParkNodesWithLabels(ctx, appContext.K8sClient, matchingNodes, appContext.Config, appContext.IsDryRun(), logger)
200 | if err != nil {
201 | logger.WithError(err).Error("Failed to label nodes matching criteria")
202 | metrics.ShredderNodeLabelNodesParkingFailedTotal.Add(float64(len(matchingNodes)))
203 | metrics.ShredderNodesParkingFailedTotal.Add(float64(len(matchingNodes)))
204 | return errors.Wrap(err, "failed to label nodes matching criteria")
205 | }
206 |
207 | // Increment the successfully parked nodes counter
208 | metrics.ShredderNodeLabelNodesParkedTotal.Add(float64(len(matchingNodes)))
209 | metrics.ShredderNodesParkedTotal.Add(float64(len(matchingNodes)))
210 |
211 | // Record the processing duration
212 | metrics.ShredderNodeLabelProcessingDurationSeconds.Observe(time.Since(startTime).Seconds())
213 | metrics.ShredderProcessingDurationSeconds.Observe(time.Since(startTime).Seconds())
214 |
215 | logger.WithField("processedNodes", len(matchingNodes)).Info("Completed node label detection and parking process")
216 |
217 | return nil
218 | }
219 |
--------------------------------------------------------------------------------
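To make the OR-style matching and the parked-node exclusion above concrete, here is a small self-contained sketch that runs `FindNodesWithLabels` against a fake clientset. The `config.Config` field names (`NodeLabelsToDetect`, `UpgradeStatusLabel`) are taken from their usage in this file; the assumption that `config.Config` can be built as a plain struct literal, plus the node names and label values, are illustrative only:

```go
package main

import (
	"context"
	"fmt"

	"github.com/adobe/k8s-shredder/pkg/config"
	"github.com/adobe/k8s-shredder/pkg/utils"
	log "github.com/sirupsen/logrus"
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes/fake"
)

func main() {
	// worker-1 carries a watched label; worker-2 matches too but is already parked
	// and should therefore be excluded by nodeMatchesLabelSelectors.
	worker1 := &v1.Node{ObjectMeta: metav1.ObjectMeta{
		Name:   "worker-1",
		Labels: map[string]string{"maintenance": "scheduled"},
	}}
	worker2 := &v1.Node{ObjectMeta: metav1.ObjectMeta{
		Name: "worker-2",
		Labels: map[string]string{
			"maintenance": "scheduled",
			"shredder.ethos.adobe.net/upgrade-status": "parked",
		},
	}}

	client := fake.NewSimpleClientset(worker1, worker2)
	cfg := config.Config{
		NodeLabelsToDetect: []string{"maintenance=scheduled", "node.test.io/park"},
		UpgradeStatusLabel: "shredder.ethos.adobe.net/upgrade-status",
	}

	nodes, err := utils.FindNodesWithLabels(context.TODO(), client, cfg, log.WithField("example", true))
	if err != nil {
		log.Fatal(err)
	}
	for _, n := range nodes {
		fmt.Println(n.Name) // expected: only "worker-1"
	}
}
```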
/docs/metrics.md:
--------------------------------------------------------------------------------
1 | # k8s-shredder Metrics
2 |
3 | This document describes all the metrics exposed by k8s-shredder. These metrics are available at the `/metrics` endpoint and can be scraped by Prometheus or other monitoring systems.
4 |
5 | ## Overview
6 |
7 | k8s-shredder exposes metrics in Prometheus format to help operators monitor the health and performance of the node parking and eviction processes. The metrics are organized into several categories:
8 |
9 | - **Core Operation Metrics**: General operation counters and timing
10 | - **API Server Metrics**: Kubernetes API interaction metrics
11 | - **Node Processing Metrics**: Node parking and processing statistics
12 | - **Pod Processing Metrics**: Pod eviction and processing statistics
13 | - **Karpenter Integration Metrics**: Karpenter drift detection metrics
14 | - **Node Label Detection Metrics**: Node label-based detection metrics
15 | - **Shared Metrics**: Aggregated metrics across all detection methods
16 |
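A quick way to inspect the endpoint locally is to port-forward the k8s-shredder service (as shown in the test environment scripts) and read the exposition text. The Go sketch below fetches and decodes it with the Prometheus `expfmt` package; the `localhost:1234` URL assumes the `kubectl port-forward -n kube-system svc/k8s-shredder 1234:8080` command from those scripts and is otherwise illustrative:

```go
package main

import (
	"fmt"
	"log"
	"net/http"

	"github.com/prometheus/common/expfmt"
)

func main() {
	// Assumes: kubectl port-forward -n kube-system svc/k8s-shredder 1234:8080
	resp, err := http.Get("http://localhost:1234/metrics")
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	var parser expfmt.TextParser
	families, err := parser.TextToMetricFamilies(resp.Body)
	if err != nil {
		log.Fatal(err)
	}
	// Print one of the core counters if it is present in the scrape.
	if mf, ok := families["shredder_loops_total"]; ok && len(mf.GetMetric()) > 0 {
		fmt.Printf("%s = %v\n", mf.GetName(), mf.GetMetric()[0].GetCounter().GetValue())
	}
}
```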
17 | ## Core Operation Metrics
18 |
19 | ### `shredder_loops_total`
20 | - **Type**: Counter
21 | - **Description**: Total number of eviction loops completed
22 | - **Use Case**: Monitor the frequency of eviction loop execution and overall system activity
23 |
24 | ### `shredder_loops_duration_seconds`
25 | - **Type**: Summary
26 | - **Description**: Duration of eviction loops in seconds
27 | - **Objectives**: 0.5: 1200, 0.9: 900, 0.99: 600
28 | - **Use Case**: Monitor the performance of eviction loops and identify slow operations
29 |
30 | ### `shredder_errors_total`
31 | - **Type**: Counter
32 | - **Description**: Total number of errors encountered during operation
33 | - **Use Case**: Monitor system health and identify operational issues
34 |
35 | ## API Server Metrics
36 |
37 | ### `shredder_apiserver_requests_total`
38 | - **Type**: Counter Vector
39 | - **Labels**: `verb`, `resource`, `status`
40 | - **Description**: Total requests made to the Kubernetes API
41 | - **Use Case**: Monitor API usage patterns and identify potential rate limiting issues
42 |
43 | ### `shredder_apiserver_requests_duration_seconds`
44 | - **Type**: Summary Vector
45 | - **Labels**: `verb`, `resource`, `status`
46 | - **Description**: Duration of Kubernetes API requests in seconds
47 | - **Objectives**: 0.5: 0.05, 0.9: 0.01, 0.99: 0.001
48 | - **Use Case**: Monitor API performance and identify slow API calls
49 |
50 | ## Node Processing Metrics
51 |
52 | ### `shredder_processed_nodes_total`
53 | - **Type**: Counter
54 | - **Description**: Total number of nodes processed during eviction loops
55 | - **Use Case**: Monitor the volume of node processing activity
56 |
57 | ### `shredder_node_force_to_evict_time`
58 | - **Type**: Gauge Vector
59 | - **Labels**: `node_name`
60 | - **Description**: Unix timestamp when a node will be forcibly evicted
61 | - **Use Case**: Monitor when nodes are scheduled for forced eviction
62 |
63 | ## Pod Processing Metrics
64 |
65 | ### `shredder_processed_pods_total`
66 | - **Type**: Counter
67 | - **Description**: Total number of pods processed during eviction loops
68 | - **Use Case**: Monitor the volume of pod processing activity
69 |
70 | ### `shredder_pod_errors_total`
71 | - **Type**: Gauge Vector
72 | - **Labels**: `pod_name`, `namespace`, `reason`, `action`
73 | - **Description**: Total pod errors per eviction loop
74 | - **Use Case**: Monitor pod eviction failures and their reasons
75 |
76 | ### `shredder_pod_force_to_evict_time`
77 | - **Type**: Gauge Vector
78 | - **Labels**: `pod_name`, `namespace`
79 | - **Description**: Unix timestamp when a pod will be forcibly evicted
80 | - **Use Case**: Monitor when pods are scheduled for forced eviction
81 |
82 | ## Karpenter Integration Metrics
83 |
84 | ### `shredder_karpenter_drifted_nodes_total`
85 | - **Type**: Counter
86 | - **Description**: Total number of drifted Karpenter nodes detected
87 | - **Use Case**: Monitor the volume of Karpenter drift detection activity
88 |
89 | ### `shredder_karpenter_disrupted_nodes_total`
90 | - **Type**: Counter
91 | - **Description**: Total number of disrupted Karpenter nodes detected
92 | - **Use Case**: Monitor the volume of Karpenter disruption detection activity
93 |
94 | ### `shredder_karpenter_nodes_parked_total`
95 | - **Type**: Counter
96 | - **Description**: Total number of Karpenter nodes successfully parked
97 | - **Use Case**: Monitor successful Karpenter node parking operations
98 |
99 | ### `shredder_karpenter_nodes_parking_failed_total`
100 | - **Type**: Counter
101 | - **Description**: Total number of Karpenter nodes that failed to be parked
102 | - **Use Case**: Monitor Karpenter node parking failures
103 |
104 | ### `shredder_karpenter_processing_duration_seconds`
105 | - **Type**: Summary
106 | - **Description**: Duration of Karpenter node processing in seconds
107 | - **Objectives**: 0.5: 0.05, 0.9: 0.01, 0.99: 0.001
108 | - **Use Case**: Monitor the performance of Karpenter drift detection and parking operations
109 |
110 | ## Node Label Detection Metrics
111 |
112 | ### `shredder_node_label_nodes_parked_total`
113 | - **Type**: Counter
114 | - **Description**: Total number of nodes successfully parked via node label detection
115 | - **Use Case**: Monitor successful node label-based parking operations
116 |
117 | ### `shredder_node_label_nodes_parking_failed_total`
118 | - **Type**: Counter
119 | - **Description**: Total number of nodes that failed to be parked via node label detection
120 | - **Use Case**: Monitor node label-based parking failures
121 |
122 | ### `shredder_node_label_processing_duration_seconds`
123 | - **Type**: Summary
124 | - **Description**: Duration of node label detection and parking process in seconds
125 | - **Objectives**: 0.5: 0.05, 0.9: 0.01, 0.99: 0.001
126 | - **Use Case**: Monitor the performance of node label detection and parking operations
127 |
128 | ### `shredder_node_label_matching_nodes_total`
129 | - **Type**: Gauge
130 | - **Description**: Total number of nodes matching the label criteria
131 | - **Use Case**: Monitor the current number of nodes that match the configured label selectors
132 |
133 | ## Shared Metrics
134 |
135 | These metrics aggregate data across all detection methods (Karpenter and node label detection) to provide a unified view of node parking activity.
136 |
137 | ### `shredder_nodes_parked_total`
138 | - **Type**: Counter
139 | - **Description**: Total number of nodes successfully parked (shared across all detection methods)
140 | - **Use Case**: Monitor total node parking activity regardless of detection method
141 |
142 | ### `shredder_nodes_parking_failed_total`
143 | - **Type**: Counter
144 | - **Description**: Total number of nodes that failed to be parked (shared across all detection methods)
145 | - **Use Case**: Monitor total node parking failures regardless of detection method
146 |
147 | ### `shredder_processing_duration_seconds`
148 | - **Type**: Summary
149 | - **Description**: Duration of node processing in seconds (shared across all detection methods)
150 | - **Objectives**: 0.5: 0.05, 0.9: 0.01, 0.99: 0.001
151 | - **Use Case**: Monitor total node processing performance regardless of detection method
152 |
153 | ## Metric Relationships
154 |
155 | ### Detection Method Metrics
156 | - **Karpenter metrics** are incremented when `EnableKarpenterDriftDetection=true`
157 | - **Node label metrics** are incremented when `EnableNodeLabelDetection=true`
158 | - **Shared metrics** are incremented whenever either detection method processes nodes (see the sketch below)
159 |
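As an illustrative sketch of this pattern (the counter variable names and the `recordNodeParked` helper are assumptions, not the actual k8s-shredder code), a successful parking operation detected via node labels would increment both the method-specific and the shared counter:

```go
package metrics

import "github.com/prometheus/client_golang/prometheus"

var (
	// Method-specific counter, name taken from this document.
	NodeLabelNodesParkedTotal = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "shredder_node_label_nodes_parked_total",
		Help: "Total number of nodes successfully parked via node label detection",
	})
	// Shared counter, name taken from this document.
	NodesParkedTotal = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "shredder_nodes_parked_total",
		Help: "Total number of nodes successfully parked (shared across all detection methods)",
	})
)

// recordNodeParked is a hypothetical helper showing how a node parked via
// node label detection bumps both the method-specific and the shared counter.
func recordNodeParked() {
	NodeLabelNodesParkedTotal.Inc()
	NodesParkedTotal.Inc()
}
```
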
160 | ### Processing Flow
161 | 1. **Detection**: Nodes are identified via Karpenter drift or label matching
162 | 2. **Parking**: Nodes are labeled, cordoned, and tainted
163 | 3. **Eviction**: Pods are evicted from parked nodes over time
164 | 4. **Cleanup**: Nodes are eventually removed when all pods are evicted
165 |
166 | ## Alerting Recommendations
167 |
168 | ### High Error Rates
169 | ```promql
170 | rate(shredder_errors_total[5m]) > 0.1
171 | ```
172 |
173 | ### Slow Processing
174 | ```promql
175 | shredder_processing_duration_seconds{quantile="0.99"} > 30
176 | ```
177 |
178 | ### Failed Node Parking
179 | ```promql
180 | rate(shredder_nodes_parking_failed_total[5m]) > 0
181 | ```
182 |
183 | ### High API Latency
184 | ```promql
185 | shredder_apiserver_requests_duration_seconds{quantile="0.99"} > 5
186 | ```
187 |
188 | ### Parked Pods Alert
189 | ```promql
190 | # Alert when pods are running on parked nodes
191 | kube_ethos_upgrade:parked_pod > 0
192 | ```
193 |
194 | ## Example Queries
195 |
196 | ### Node Parking Success Rate
197 | ```promql
198 | rate(shredder_nodes_parked_total[5m]) / (rate(shredder_nodes_parked_total[5m]) + rate(shredder_nodes_parking_failed_total[5m]))
199 | ```
200 |
201 | ### Average Processing Duration
202 | ```promql
203 | rate(shredder_processing_duration_seconds_sum[5m]) / rate(shredder_processing_duration_seconds_count[5m])
204 | ```
205 |
206 | ### Nodes Parked by Detection Method
207 | ```promql
208 | # Karpenter nodes
209 | rate(shredder_karpenter_nodes_parked_total[5m])
210 |
211 | # Label-based nodes
212 | rate(shredder_node_label_nodes_parked_total[5m])
213 | ```
214 |
215 | ### Current Matching Nodes
216 | ```promql
217 | shredder_node_label_matching_nodes_total
218 | ```
219 |
220 | ## Configuration
221 |
222 | Metrics are exposed on the configured port (default: 8080) at the `/metrics` endpoint. The metrics server can be configured using the following options:
223 |
224 | - **Metrics Port**: Configure the port for metrics exposure
225 | - **Health Endpoint**: Available at `/healthz` for health checks
226 | - **OpenMetrics Format**: Enabled by default for better compatibility
227 |
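A minimal sketch of such a metrics server using `client_golang`'s `promhttp` handler and Go's `net/http` (the port and endpoint paths come from this document; everything else is illustrative, not the actual k8s-shredder implementation):

```go
package main

import (
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
	mux := http.NewServeMux()

	// Serve Prometheus metrics on /metrics; EnableOpenMetrics allows the
	// OpenMetrics exposition format when the scraper negotiates it.
	mux.Handle("/metrics", promhttp.HandlerFor(
		prometheus.DefaultGatherer,
		promhttp.HandlerOpts{EnableOpenMetrics: true},
	))

	// Simple health endpoint on /healthz.
	mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusOK)
		_, _ = w.Write([]byte("ok"))
	})

	// 8080 is the documented default; the real binary reads the port from
	// its configuration.
	_ = http.ListenAndServe(":8080", mux)
}
```
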
228 | For more information about configuring k8s-shredder, see the [main README](../README.md).
229 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: default help format lint vet security build build-prereq push unit-test local-test local-test-karpenter local-test-node-labels ci clean e2e-tests check-license helm-docs publish
2 |
3 | NAME ?= adobe/k8s-shredder
4 | K8S_SHREDDER_VERSION ?= "dev"
5 | KINDNODE_VERSION ?= "v1.34.0"
6 | COMMIT ?= $(shell git rev-parse --short HEAD)
7 | TEST_CLUSTERNAME ?= "k8s-shredder-test-cluster"
8 | TEST_CLUSTERNAME_KARPENTER ?= "k8s-shredder-test-cluster-karpenter"
9 | TEST_CLUSTERNAME_NODE_LABELS ?= "k8s-shredder-test-cluster-node-labels"
10 | KUBECONFIG_LOCALTEST ?= "kubeconfig-localtest"
11 | KUBECONFIG_KARPENTER ?= "kubeconfig-local-test-karpenter"
12 | KUBECONFIG_NODE_LABELS ?= "kubeconfig-local-test-node-labels"
13 |
14 | GOSEC=gosec -quiet -exclude=G107
15 |
16 | default: help
17 |
18 | help: ## Print this help text
19 | @printf "\n"
20 | @awk 'BEGIN {FS = ":.*?## "}; ($$2 && !/@awk/){printf "${CYAN}%-30s${NC} %s\n", $$1, $$2}' $(lastword ${MAKEFILE_LIST}) | sort
21 | @printf "\n"
22 |
23 | # CI
24 | # -----------
25 | format: helm-docs ## Format go code and YAML files
26 | @echo "Format go code..."
27 | @go fmt ./...
28 | @hash golangci-lint 2>/dev/null && { golangci-lint run --fix ./... ; } || { \
29 | echo >&2 "[WARN] I require golangci-lint but it's not installed (see https://github.com/golangci/golangci-lint). Skipping golangci-lint format."; \
30 | }
31 | @hash yamlfix 2>/dev/null && { \
32 | echo "Format YAML files..."; \
33 | find . -name "*.yaml" -o -name "*.yml" | grep -v "/templates/" | xargs yamlfix 2>/dev/null || true ; \
34 | echo "YAML files formatted!" ; \
35 | } || { \
36 | echo >&2 "[WARN] I require yamlfix but it's not installed (see https://github.com/lyz-code/yamlfix). Skipping YAML format."; \
37 | }
38 |
39 | lint: ## Lint go code and YAML files
40 | @hash golangci-lint 2>/dev/null && { \
41 | echo "Checking go code style..."; \
42 | echo "Run "make format" in case of failures!"; \
43 | golangci-lint run -v --timeout 5m --no-config ./... ; \
44 | echo "Go code style OK!" ; \
45 | } || { \
46 | echo >&2 "[WARN] I require golangci-lint but it's not installed (see https://github.com/golangci/golangci-lint). Skipping lint."; \
47 | }
48 | @hash yamlfix 2>/dev/null && { \
49 | echo "Checking YAML files..."; \
50 | find . -name "*.yaml" -o -name "*.yml" | grep -v "/templates/" | xargs yamlfix --check 2>/dev/null || { \
51 | echo "YAML files have formatting issues. Run 'make format' to fix them."; \
52 | exit 1; \
53 | } ; \
54 | echo "YAML files OK!" ; \
55 | } || { \
56 | echo >&2 "[WARN] I require yamlfix but it's not installed (see https://github.com/lyz-code/yamlfix). Skipping YAML lint."; \
57 | }
58 | @hash kubeconform 2>/dev/null && { \
59 | echo "Validating Kubernetes manifests with kubeconform..."; \
60 | find internal/testing -name "*.yaml" -o -name "*.yml" | xargs kubeconform -strict -skip CustomResourceDefinition,EC2NodeClass,NodePool,Rollout,Cluster || { \
61 | echo "Kubeconform found schema errors. Please fix them."; \
62 | exit 1; \
63 | } ; \
64 | echo "Kubeconform validation OK!" ; \
65 | } || { \
66 | echo >&2 "[WARN] I require kubeconform but it's not installed (see https://github.com/yannh/kubeconform). Skipping kubeconform lint."; \
67 | }
68 | @hash helm-docs 2>/dev/null && { \
69 | echo "Checking Helm documentation..."; \
70 | helm-docs --chart-search-root=charts --template-files=README.md.gotmpl --dry-run >/dev/null 2>&1 || { \
71 | echo "Helm documentation is out of date. Run 'make format' to update it."; \
72 | exit 1; \
73 | } ; \
74 | echo "Helm documentation OK!" ; \
75 | } || { \
76 | echo >&2 "[WARN] I require helm-docs but it's not installed (see https://github.com/norwoodj/helm-docs). Skipping Helm documentation lint."; \
77 | }
78 |
79 | vet: ## Vetting go code
80 | @echo 'Vetting go code to identify subtle source code issues...'
81 | @go vet ./...
82 | @echo 'No issues found in go codebase!'
83 |
84 | security: ## Inspects go source code for security problems
85 | @hash gosec 2>/dev/null && { \
86 | echo "Checking go source code for security problems..."; \
87 | $(GOSEC) ./... ; \
88 | echo "No security problems found in the go codebase!" ; \
89 | } || { \
90 | echo >&2 "[WARN] I require gosec but it's not installed (see https://github.com/securego/gosec). Skipping security inspections."; \
91 | }
92 | check-license: ## Check if all go files have the license header set
93 | @echo "Checking files for license header"
94 | @./internal/check_license.sh
95 |
96 | helm-docs: ## Generate Helm chart documentation
97 | @hash helm-docs 2>/dev/null && { \
98 | echo "Generating Helm chart documentation..."; \
99 | helm-docs --chart-search-root=charts --template-files=README.md.gotmpl ; \
100 | echo "Helm documentation generated!" ; \
101 | } || { \
102 | echo >&2 "[WARN] I require helm-docs but it's not installed (see https://github.com/norwoodj/helm-docs). Skipping documentation generation."; \
103 | }
104 |
105 | build: check-license lint vet security unit-test ## Builds the local Docker container for development
106 | @CGO_ENABLED=0 GOOS=linux go build \
107 | -ldflags="-s -w -X github.com/adobe/k8s-shredder/cmd.buildVersion=${K8S_SHREDDER_VERSION}-${COMMIT} -X github.com/adobe/k8s-shredder/cmd.gitSHA=${COMMIT} -X github.com/adobe/k8s-shredder/cmd.buildTime=$$(date)" \
108 | -o k8s-shredder
109 | @CGO_ENABLED=0 go build \
110 | -ldflags="-s -w" \
111 | -o park-node \
112 | ./cmd/park-node
113 | @DOCKER_BUILDKIT=1 docker build -t ${NAME}:${K8S_SHREDDER_VERSION} .
114 |
115 | # TEST
116 | # -----------
117 | local-test: build ## Test docker image in a kind cluster (with Karpenter drift and node label detection disabled)
118 | @hash kind 2>/dev/null && { \
119 | echo "Test docker image in a kind cluster..."; \
120 | ./internal/testing/local_env_prep_helm.sh "${K8S_SHREDDER_VERSION}" "${KINDNODE_VERSION}" "${TEST_CLUSTERNAME}" "${KUBECONFIG_LOCALTEST}" && \
121 | ./internal/testing/cluster_upgrade.sh "${TEST_CLUSTERNAME}" "${KUBECONFIG_LOCALTEST}" || \
122 | exit 1; \
123 | } || { \
124 | echo >&2 "[WARN] I require kind but it's not installed (see https://kind.sigs.k8s.io). Assuming a cluster is already accessible."; \
125 | }
126 |
127 | local-test-karpenter: build ## Test docker image in a kind cluster with Karpenter drift and disruption detection enabled
128 | @hash kind 2>/dev/null && { \
129 | echo "Test docker image in a kind cluster with Karpenter drift and disruption detection..."; \
130 | ./internal/testing/local_env_prep_karpenter_helm.sh "${K8S_SHREDDER_VERSION}" "${KINDNODE_VERSION}" "${TEST_CLUSTERNAME_KARPENTER}" "${KUBECONFIG_KARPENTER}" && \
131 | ./internal/testing/cluster_upgrade_karpenter.sh "${TEST_CLUSTERNAME_KARPENTER}" "${KUBECONFIG_KARPENTER}" || \
132 | exit 1; \
133 | } || { \
134 | echo >&2 "[WARN] I require kind but it's not installed (see https://kind.sigs.k8s.io). Assuming a cluster is already accessible."; \
135 | }
136 |
137 |
138 | local-test-node-labels: build ## Test docker image in a kind cluster with node label detection enabled
139 | @hash kind 2>/dev/null && { \
140 | echo "Test docker image in a kind cluster with node label detection..."; \
141 | ./internal/testing/local_env_prep_node_labels_helm.sh "${K8S_SHREDDER_VERSION}" "${KINDNODE_VERSION}" "${TEST_CLUSTERNAME_NODE_LABELS}" "${KUBECONFIG_NODE_LABELS}" && \
142 | ./internal/testing/cluster_upgrade_node_labels.sh "${TEST_CLUSTERNAME_NODE_LABELS}" "${KUBECONFIG_NODE_LABELS}" || \
143 | exit 1; \
144 | } || { \
145 | echo >&2 "[WARN] I require kind but it's not installed (see https://kind.sigs.k8s.io). Assuming a cluster is already accessible."; \
146 | }
147 |
148 | unit-test: ## Run unit tests
149 | @echo "Run unit tests for k8s-shredder..."
150 | @go test ./pkg/... -coverprofile=
151 |
152 |
153 | e2e-tests: ## Run e2e tests for k8s-shredder deployed in a local kind cluster
154 | @echo "Run e2e tests for k8s-shredder..."
155 | @if [ -f "${PWD}/${KUBECONFIG_KARPENTER}" ]; then \
156 | echo "Using Karpenter test cluster configuration..."; \
157 | PROJECT_ROOT=${PWD} KUBECONFIG=${PWD}/${KUBECONFIG_KARPENTER} go test internal/testing/e2e_test.go -v; \
158 | elif [ -f "${PWD}/${KUBECONFIG_NODE_LABELS}" ]; then \
159 | echo "Using node labels test cluster configuration..."; \
160 | PROJECT_ROOT=${PWD} KUBECONFIG=${PWD}/${KUBECONFIG_NODE_LABELS} go test internal/testing/e2e_test.go -v; \
161 | else \
162 | echo "Using default test cluster configuration..."; \
163 | PROJECT_ROOT=${PWD} KUBECONFIG=${PWD}/${KUBECONFIG_LOCALTEST} go test internal/testing/e2e_test.go -v; \
164 | fi
165 |
166 | # DEMO targets
167 | # -----------
168 | .PHONY: demo.prep demo.run demo.rollback
169 | demo.prep: build ## Setup demo cluster
170 | echo "Setup demo cluster..."
171 | ./internal/testing/local_env_prep_helm.sh "${K8S_SHREDDER_VERSION}" "${KINDNODE_VERSION}" "${TEST_CLUSTERNAME}"
172 |
173 | demo.run: ## Run demo
174 | ./internal/testing/cluster_upgrade.sh "${TEST_CLUSTERNAME}"
175 |
176 | demo.rollback: ## Rollback demo
177 | ./internal/testing/rollback_cluster_upgrade.sh "${TEST_CLUSTERNAME}"
178 |
179 |
180 | ci: local-test e2e-tests clean ## Run CI
181 |
182 | # PUBLISH
183 | # -----------
184 | publish: ## Release a new version
185 | @goreleaser release --clean
186 |
187 | # CLEANUP
188 | # -----------
189 | clean: ## Clean up local testing environment
190 | @echo "Cleaning up your local testing environment..."
191 | @kind delete cluster --name="${TEST_CLUSTERNAME}" ## > /dev/null 2>&1 || true
192 | @kind delete cluster --name="${TEST_CLUSTERNAME_KARPENTER}" ## > /dev/null 2>&1 || true
193 | @kind delete cluster --name="${TEST_CLUSTERNAME_NODE_LABELS}" ## > /dev/null 2>&1 || true
194 | @echo "Removing all generated files and directories"
195 | @rm -rf dist/ k8s-shredder park-node kubeconfig ${KUBECONFIG_LOCALTEST} ${KUBECONFIG_KARPENTER} ${KUBECONFIG_NODE_LABELS}
196 | @echo "Done!"
197 |
--------------------------------------------------------------------------------
/pkg/schedule/schedule.go:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright 2025 Adobe. All rights reserved.
3 | This file is licensed to you under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License. You may obtain a copy
5 | of the License at http://www.apache.org/licenses/LICENSE-2.0
6 | Unless required by applicable law or agreed to in writing, software distributed under
7 | the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
8 | OF ANY KIND, either express or implied. See the License for the specific language
9 | governing permissions and limitations under the License.
10 | */
11 |
12 | package schedule
13 |
14 | import (
15 | "fmt"
16 | "strings"
17 | "time"
18 |
19 | "github.com/pkg/errors"
20 | "github.com/robfig/cron/v3"
21 | )
22 |
23 | // Schedule represents a time window defined by a cron schedule and duration
24 | type Schedule struct {
25 | // CronSchedule is the cron expression (supports macros like @daily, @hourly, etc.)
26 | CronSchedule string
27 | // Duration is how long the window stays active after the schedule triggers
28 | Duration time.Duration
29 | // parser is the cron parser instance
30 | parser cron.Parser
31 | // schedule is the parsed cron schedule
32 | schedule cron.Schedule
33 | }
34 |
35 | // NewSchedule creates a new Schedule instance from a cron expression and duration string
36 | // The cron expression supports standard cron syntax and macros (@yearly, @monthly, @weekly, @daily, @hourly)
37 | // The duration string supports compound durations with minutes and hours (e.g., "10h5m", "30m", "160h")
38 | func NewSchedule(cronExpr string, durationStr string) (*Schedule, error) {
39 | if cronExpr == "" {
40 | return nil, errors.New("cron schedule cannot be empty")
41 | }
42 |
43 | if durationStr == "" {
44 | return nil, errors.New("duration cannot be empty")
45 | }
46 |
47 | // Parse duration - supports compound durations like "10h5m", "30m", "160h"
48 | duration, err := parseDuration(durationStr)
49 | if err != nil {
50 | return nil, errors.Wrapf(err, "failed to parse duration: %s", durationStr)
51 | }
52 |
53 | if duration <= 0 {
54 | return nil, errors.New("duration must be greater than zero")
55 | }
56 |
57 | // Create parser with support for standard cron format and macros
58 | // Try parsing with seconds first (6 fields), then without seconds (5 fields - Kubernetes format)
59 | var schedule cron.Schedule
60 | var parser cron.Parser
61 |
62 | // First try with seconds (6 fields: second minute hour dom month dow)
63 | parser6 := cron.NewParser(cron.Second | cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow | cron.Descriptor)
64 | schedule, err = parser6.Parse(cronExpr)
65 | if err != nil {
66 | // If that fails, try without seconds (5 fields: minute hour dom month dow - Kubernetes format)
67 | parser5 := cron.NewParser(cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow | cron.Descriptor)
68 | schedule, err = parser5.Parse(cronExpr)
69 | if err != nil {
70 | return nil, errors.Wrapf(err, "failed to parse cron schedule: %s", cronExpr)
71 | }
72 | // Use the 5-field parser for future operations
73 | parser = parser5
74 | } else {
75 | parser = parser6
76 | }
77 |
78 | return &Schedule{
79 | CronSchedule: cronExpr,
80 | Duration: duration,
81 | parser: parser,
82 | schedule: schedule,
83 | }, nil
84 | }
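// Illustrative usage (editor's sketch, not part of the original source):
//
//	// Allow parking/eviction work between 02:00 and 06:00 UTC every day.
//	s, err := NewSchedule("0 2 * * *", "4h")
//	if err != nil {
//		// handle configuration error
//	}
//	if s.IsActive(time.Now().UTC()) {
//		// operations are allowed inside the active window
//	}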
85 |
86 | // IsActive checks if the current time (or provided time) falls within the active window
87 | // The window is active from when the schedule triggers until Duration time has passed
88 | func (s *Schedule) IsActive(now time.Time) bool {
89 | if s.schedule == nil {
90 | return false
91 | }
92 |
93 | // Get the most recent time the schedule triggered (before or at now)
94 | // We need to find the last trigger time that is <= now
95 | lastTrigger := s.getLastTriggerTime(now)
96 |
97 | if lastTrigger.IsZero() {
98 | return false
99 | }
100 |
101 | // Check if we're still within the duration window
102 | windowEnd := lastTrigger.Add(s.Duration)
103 | return now.Before(windowEnd) || now.Equal(windowEnd)
104 | }
105 |
106 | // getLastTriggerTime finds the most recent time the schedule triggered before or at the given time
107 | func (s *Schedule) getLastTriggerTime(now time.Time) time.Time {
108 | // For macros, we can calculate directly for efficiency
109 | cronLower := strings.ToLower(s.CronSchedule)
110 | switch cronLower {
111 | case "@yearly", "@annually":
112 | // Triggers at 00:00:00 UTC on January 1st
113 | lastYear := time.Date(now.Year(), 1, 1, 0, 0, 0, 0, time.UTC)
114 | if lastYear.After(now) {
115 | lastYear = time.Date(now.Year()-1, 1, 1, 0, 0, 0, 0, time.UTC)
116 | }
117 | return lastYear
118 | case "@monthly":
119 | // Triggers at 00:00:00 UTC on the 1st of each month
120 | lastMonth := time.Date(now.Year(), now.Month(), 1, 0, 0, 0, 0, time.UTC)
121 | if lastMonth.After(now) {
122 | if now.Month() == 1 {
123 | lastMonth = time.Date(now.Year()-1, 12, 1, 0, 0, 0, 0, time.UTC)
124 | } else {
125 | lastMonth = time.Date(now.Year(), now.Month()-1, 1, 0, 0, 0, 0, time.UTC)
126 | }
127 | }
128 | return lastMonth
129 | case "@weekly":
130 | // Triggers at 00:00:00 UTC on Sunday
131 | lastWeek := now
132 | // Go back to the most recent Sunday
133 | for lastWeek.Weekday() != time.Sunday {
134 | lastWeek = lastWeek.AddDate(0, 0, -1)
135 | }
136 | lastWeek = time.Date(lastWeek.Year(), lastWeek.Month(), lastWeek.Day(), 0, 0, 0, 0, time.UTC)
137 | if lastWeek.After(now) {
138 | lastWeek = lastWeek.AddDate(0, 0, -7)
139 | }
140 | return lastWeek
141 | case "@daily", "@midnight":
142 | // Triggers at 00:00:00 UTC each day
143 | lastDay := time.Date(now.Year(), now.Month(), now.Day(), 0, 0, 0, 0, time.UTC)
144 | if lastDay.After(now) {
145 | lastDay = lastDay.AddDate(0, 0, -1)
146 | }
147 | return lastDay
148 | case "@hourly":
149 | // Triggers at the top of each hour
150 | lastHour := time.Date(now.Year(), now.Month(), now.Day(), now.Hour(), 0, 0, 0, time.UTC)
151 | if lastHour.After(now) {
152 | lastHour = lastHour.Add(-time.Hour)
153 | }
154 | return lastHour
155 | }
156 |
157 | // For standard cron expressions, iterate backwards to find the last trigger
158 | // We scan backwards from now, one minute at a time, bounded by getCheckWindow()
159 | checkWindow := s.getCheckWindow()
160 | checkTime := now
161 | maxIterations := 1000 // Safety limit
162 |
163 | for i := 0; i < maxIterations; i++ {
164 | // Get what the next trigger would be from (checkTime - 1 second)
165 | nextTrigger := s.schedule.Next(checkTime.Add(-time.Second))
166 |
167 | // If the next trigger is at or before now, we found our last trigger
168 | if !nextTrigger.After(now) {
169 | return nextTrigger
170 | }
171 |
172 | // Move back in time - use a smart increment based on the schedule
173 | // For most schedules, going back by the minimum interval works
174 | // We'll go back by minutes, but could optimize further
175 | checkTime = checkTime.Add(-time.Minute)
176 |
177 | // Safety check: don't go back too far
178 | if now.Sub(checkTime) > checkWindow {
179 | break
180 | }
181 | }
182 |
183 | return time.Time{}
184 | }
185 |
186 | // getCheckWindow returns the maximum time window to check backwards
187 | // This is optimized based on the schedule type
188 | func (s *Schedule) getCheckWindow() time.Duration {
189 | cronLower := strings.ToLower(s.CronSchedule)
190 |
191 | // Handle macros
192 | switch cronLower {
193 | case "@yearly", "@annually":
194 | return 2 * 365 * 24 * time.Hour
195 | case "@monthly":
196 | return 2 * 30 * 24 * time.Hour
197 | case "@weekly":
198 | return 2 * 7 * 24 * time.Hour
199 | case "@daily", "@midnight":
200 | return 2 * 24 * time.Hour
201 | case "@hourly":
202 | return 2 * time.Hour
203 | default:
204 | // For standard cron, check up to 7 days back
205 | // This should cover most common schedules
206 | return 7 * 24 * time.Hour
207 | }
208 | }
209 |
210 | // parseDuration parses a duration string supporting compound durations
211 | // Supports formats like "10h5m", "30m", "160h", "1h30m", etc.
212 | // Only supports hours and minutes as per Karpenter's duration format
213 | func parseDuration(durationStr string) (time.Duration, error) {
214 | durationStr = strings.TrimSpace(durationStr)
215 | if durationStr == "" {
216 | return 0, errors.New("duration string cannot be empty")
217 | }
218 |
219 | var totalDuration time.Duration
220 |
221 | // Parse hours
222 | if strings.Contains(durationStr, "h") {
223 | parts := strings.Split(durationStr, "h")
224 | if len(parts) > 0 && parts[0] != "" {
225 | var hours int64
226 | _, err := fmt.Sscanf(parts[0], "%d", &hours)
227 | if err != nil {
228 | return 0, errors.Wrapf(err, "invalid hours in duration: %s", durationStr)
229 | }
230 | totalDuration += time.Duration(hours) * time.Hour
231 | }
232 | // Remaining part might contain minutes
233 | if len(parts) > 1 && parts[1] != "" {
234 | durationStr = parts[1]
235 | } else {
236 | durationStr = ""
237 | }
238 | }
239 |
240 | // Parse minutes
241 | if strings.Contains(durationStr, "m") {
242 | parts := strings.Split(durationStr, "m")
243 | if len(parts) > 0 && parts[0] != "" {
244 | var minutes int64
245 | _, err := fmt.Sscanf(parts[0], "%d", &minutes)
246 | if err != nil {
247 | return 0, errors.Wrapf(err, "invalid minutes in duration: %s", durationStr)
248 | }
249 | totalDuration += time.Duration(minutes) * time.Minute
250 | }
251 | } else if durationStr != "" {
252 | // If there's remaining string that's not "m", it's invalid
253 | return 0, errors.Errorf("invalid duration format: %s (only hours 'h' and minutes 'm' are supported)", durationStr)
254 | }
255 |
256 | if totalDuration == 0 {
257 | return 0, errors.New("duration must be greater than zero")
258 | }
259 |
260 | return totalDuration, nil
261 | }
262 |
--------------------------------------------------------------------------------
/charts/k8s-shredder/README.md:
--------------------------------------------------------------------------------
1 | # k8s-shredder
2 |
3 |   
4 |
5 | a novel way of dealing with kubernetes nodes blocked from draining
6 |
7 | **Homepage:**
8 |
9 | ## Maintainers
10 |
11 | | Name | Email | Url |
12 | | ---- | ------ | --- |
13 | | adriananeci | | |
14 | | sfotony | | |
15 |
16 | ## Values
17 |
18 | | Key | Type | Default | Description |
19 | |-----|------|---------|-------------|
20 | | additionalContainers | list | `[]` | Additional containers to run alongside k8s-shredder in the same pod |
21 | | affinity | object | `{}` | Affinity rules for advanced pod scheduling (node affinity, pod affinity/anti-affinity) |
22 | | deploymentStrategy | object | `{}` | Deployment strategy for rolling updates (e.g., RollingUpdate, Recreate) |
23 | | dryRun | bool | `false` | Enable dry-run mode - when true, k8s-shredder will log actions but not execute them |
24 | | environmentVars | list | `[]` | Additional environment variables to set in the container |
25 | | fullnameOverride | string | `""` | Override the full name used for resources |
26 | | image | object | `{"pullPolicy":"IfNotPresent","registry":"ghcr.io/adobe/k8s-shredder","tag":"latest"}` | Container image configuration |
27 | | image.pullPolicy | string | `"IfNotPresent"` | Image pull policy - IfNotPresent, Always, or Never |
28 | | image.registry | string | `"ghcr.io/adobe/k8s-shredder"` | Container registry where the k8s-shredder image is hosted |
29 | | image.tag | string | `"latest"` | Image tag to use |
30 | | imagePullSecrets | list | `[]` | Secrets for pulling images from private registries |
31 | | initContainers | list | `[]` | Init containers to run before the main k8s-shredder container starts |
32 | | logFormat | string | `"text"` | Log output format: text (human-readable) or json (structured logging) |
33 | | logLevel | string | `"debug"` | Available log levels: panic, fatal, error, warn, warning, info, debug, trace |
34 | | nameOverride | string | `""` | Override the name of the chart |
35 | | nodeSelector | object | `{}` | Node selector to constrain pod scheduling to specific nodes |
36 | | podAnnotations | object | `{}` | Annotations to add to k8s-shredder pod(s) |
37 | | podLabels | object | `{}` | Additional labels to add to k8s-shredder pod(s) |
38 | | podMonitor | object | `{"enabled":false,"honorLabels":true,"interval":"60s","labels":{},"relabelings":[],"scrapeTimeout":"10s"}` | Prometheus monitoring configuration |
39 | | podMonitor.enabled | bool | `false` | Enable creation of a PodMonitor resource for Prometheus scraping |
40 | | podMonitor.honorLabels | bool | `true` | Whether to honor labels from the target |
41 | | podMonitor.interval | string | `"60s"` | How often Prometheus should scrape metrics |
42 | | podMonitor.labels | object | `{}` | Labels to apply to the PodMonitor resource |
43 | | podMonitor.relabelings | list | `[]` | Metric relabeling configuration |
44 | | podMonitor.scrapeTimeout | string | `"10s"` | Timeout for each scrape attempt |
45 | | podSecurityContext | object | `{}` | Security context applied to the entire pod |
46 | | priorityClassName | string | `"system-cluster-critical"` | Priority class for pod scheduling - system-cluster-critical ensures high priority |
47 | | rbac | object | `{"create":true}` | RBAC (Role-Based Access Control) configuration |
48 | | rbac.create | bool | `true` | Create RBAC resources (ClusterRole, ClusterRoleBinding) |
49 | | replicaCount | int | `1` | Number of k8s-shredder pods to run |
50 | | resources | object | `{"limits":{"cpu":"1","memory":"1Gi"},"requests":{"cpu":"250m","memory":"250Mi"}}` | Resource requests and limits for the k8s-shredder container |
51 | | resources.limits.cpu | string | `"1"` | Maximum CPU cores the container can use |
52 | | resources.limits.memory | string | `"1Gi"` | Maximum memory the container can use |
53 | | resources.requests.cpu | string | `"250m"` | CPU cores requested for the container (guaranteed allocation) |
54 | | resources.requests.memory | string | `"250Mi"` | Memory requested for the container (guaranteed allocation) |
55 | | securityContext | object | `{}` | Security context applied to the k8s-shredder container |
56 | | service | object | `{"annotations":{},"create":false,"labels":{},"port":8080,"targetPort":"metrics","type":"ClusterIP"}` | Kubernetes service configuration |
57 | | service.annotations | object | `{}` | Additional annotations for the service |
58 | | service.create | bool | `false` | Create a service for k8s-shredder metrics endpoint |
59 | | service.labels | object | `{}` | Additional labels for the service |
60 | | service.port | int | `8080` | Service port for metrics endpoint |
61 | | service.targetPort | string | `"metrics"` | Target port for metrics endpoint |
62 | | service.type | string | `"ClusterIP"` | Service type (ClusterIP, NodePort, LoadBalancer) |
63 | | serviceAccount | object | `{"annotations":{},"create":true,"name":"k8s-shredder"}` | Kubernetes service account configuration |
64 | | serviceAccount.annotations | object | `{}` | Additional annotations for the service account (useful for IAM roles, etc.) |
65 | | serviceAccount.create | bool | `true` | Create a service account for k8s-shredder |
66 | | serviceAccount.name | string | `"k8s-shredder"` | Name of the service account |
67 | | shredder | object | `{"AllowEvictionLabel":"shredder.ethos.adobe.net/allow-eviction","ArgoRolloutsAPIVersion":"v1alpha1","EnableKarpenterDisruptionDetection":false,"EnableKarpenterDriftDetection":false,"EnableNodeLabelDetection":false,"EvictionLoopDuration":"","EvictionLoopInterval":"1h","EvictionLoopSchedule":"","EvictionSafetyCheck":true,"ExpiresOnLabel":"shredder.ethos.adobe.net/parked-node-expires-on","ExtraParkingLabels":{},"MaxParkedNodes":"0","NamespacePrefixSkipInitialEviction":"ns-ethos-","NodeLabelsToDetect":[],"ParkedByLabel":"shredder.ethos.adobe.net/parked-by","ParkedByValue":"k8s-shredder","ParkedNodeTTL":"168h","ParkedNodeTaint":"shredder.ethos.adobe.net/upgrade-status=parked:NoSchedule","ParkingReasonLabel":"shredder.ethos.adobe.net/parked-reason","RestartedAtAnnotation":"shredder.ethos.adobe.net/restartedAt","RollingRestartThreshold":0.1,"ToBeDeletedTaint":"ToBeDeletedByClusterAutoscaler","UpgradeStatusLabel":"shredder.ethos.adobe.net/upgrade-status"}` | Core k8s-shredder configuration |
68 | | shredder.AllowEvictionLabel | string | `"shredder.ethos.adobe.net/allow-eviction"` | Label to explicitly allow eviction on specific resources |
69 | | shredder.ArgoRolloutsAPIVersion | string | `"v1alpha1"` | API version for Argo Rollouts integration |
70 | | shredder.EnableKarpenterDisruptionDetection | bool | `false` | Enable Karpenter disruption detection for node lifecycle management |
71 | | shredder.EnableKarpenterDriftDetection | bool | `false` | Enable Karpenter drift detection for node lifecycle management |
72 | | shredder.EnableNodeLabelDetection | bool | `false` | Enable detection of nodes based on specific labels |
73 | | shredder.EvictionLoopDuration | string | `""` | Duration for how long the scheduled window stays active after the schedule triggers. Only used when EvictionLoopSchedule is set. Supports compound durations with hours and minutes (e.g., "10h5m", "30m", "160h"). Example: "10h" (window stays active for 10 hours), "30m" (window stays active for 30 minutes). |
74 | | shredder.EvictionLoopInterval | string | `"1h"` | How often to run the main eviction loop |
75 | | shredder.EvictionLoopSchedule | string | `""` | Optional cron schedule for when eviction operations are allowed. If set, parking and shredding operations will only occur during the scheduled time window. Supports standard cron syntax and macros (@yearly, @monthly, @weekly, @daily, @hourly). Example: "@daily" (runs at midnight UTC), "0 2 * * *" (runs at 2 AM UTC daily). When omitted, operations run continuously. |
76 | | shredder.EvictionSafetyCheck | bool | `true` | Controls whether to perform safety checks before force eviction |
77 | | shredder.ExpiresOnLabel | string | `"shredder.ethos.adobe.net/parked-node-expires-on"` | Label used to track when a parked node expires |
78 | | shredder.ExtraParkingLabels | object | `{}` | Additional labels to apply to nodes and pods during parking |
79 | | shredder.MaxParkedNodes | string | `"0"` | Maximum number of nodes that can be parked simultaneously. Can be an integer (e.g., "5") or percentage (e.g., "20%"). Set to "0" for no limit |
80 | | shredder.NamespacePrefixSkipInitialEviction | string | `"ns-ethos-"` | Namespace prefix to skip during initial eviction (useful for system namespaces) |
81 | | shredder.NodeLabelsToDetect | list | `[]` | List of node labels to monitor for triggering shredder actions |
82 | | shredder.ParkedByLabel | string | `"shredder.ethos.adobe.net/parked-by"` | Label to track which component parked a node |
83 | | shredder.ParkedByValue | string | `"k8s-shredder"` | Value set in the ParkedByLabel to identify k8s-shredder as the parking agent |
84 | | shredder.ParkedNodeTTL | string | `"168h"` | How long parked nodes should remain before being eligible for deletion (7 days default) |
85 | | shredder.ParkedNodeTaint | string | `"shredder.ethos.adobe.net/upgrade-status=parked:NoSchedule"` | Taint applied to parked nodes to prevent new pod scheduling |
86 | | shredder.ParkingReasonLabel | string | `"shredder.ethos.adobe.net/parked-reason"` | Label used to track why a node or pod was parked |
87 | | shredder.RestartedAtAnnotation | string | `"shredder.ethos.adobe.net/restartedAt"` | Annotation to track when a workload was last restarted |
88 | | shredder.RollingRestartThreshold | float | `0.1` | Maximum percentage of nodes that can be restarted simultaneously during rolling restarts |
89 | | shredder.ToBeDeletedTaint | string | `"ToBeDeletedByClusterAutoscaler"` | Taint indicating nodes scheduled for deletion by cluster autoscaler |
90 | | shredder.UpgradeStatusLabel | string | `"shredder.ethos.adobe.net/upgrade-status"` | Label used to track node upgrade status |
91 | | tolerations | list | `[]` | Tolerations to allow scheduling on nodes with specific taints |
92 | | topologySpreadConstraints | list | `[]` | Helps ensure high availability by spreading pods across zones/nodes |
93 | | volumes | list | `[]` | Additional volumes to mount in the pod |
94 |
95 | ----------------------------------------------
96 | Autogenerated from chart metadata using [helm-docs v1.14.2](https://github.com/norwoodj/helm-docs/releases/v1.14.2)
97 |
--------------------------------------------------------------------------------
/docs/e2e_tests.md:
--------------------------------------------------------------------------------
1 | # k8s-shredder End-to-End Tests
2 |
3 | This document describes all the end-to-end tests for k8s-shredder, including their functionality, skip conditions, and execution environments.
4 |
5 | ## Test Overview
6 |
7 | The e2e tests verify various aspects of k8s-shredder functionality including node parking, pod eviction, metrics collection, and safety checks. Tests are designed to run in different environments (standard, Karpenter, node-labels) and have specific skip conditions.
8 |
9 | ## Test Environments
10 |
11 | ### Standard Environment (`local-test`)
12 | - **Purpose**: Basic k8s-shredder functionality testing
13 | - **Cluster**: 4 nodes (control-plane, worker, worker2, worker3-monitoring)
14 | - **Features**: Standard node parking and pod eviction
15 |
16 | ### Karpenter Environment (`local-test-karpenter`)
17 | - **Purpose**: Karpenter drift detection testing
18 | - **Cluster**: 4 nodes (control-plane, worker, worker2, worker3-monitoring)
19 | - **Features**: Mock Karpenter CRDs, drift detection simulation
20 |
21 | ### Node Labels Environment (`local-test-node-labels`)
22 | - **Purpose**: Node label detection testing
23 | - **Cluster**: 5 nodes (control-plane, worker, worker2, worker3, worker4-monitoring)
24 | - **Features**: Node label detection, automatic parking based on labels
25 |
26 | ## Test Cases
27 |
28 | ### TestNodeIsCleanedUp
29 | **Always Run**: ✅ Yes (in all environments)
30 |
31 | **Purpose**: Verifies that k8s-shredder properly cleans up parked nodes after their TTL expires.
32 |
33 | **Steps**:
34 | 1. Parks a worker node with a 1-minute TTL
35 | 2. Waits for the TTL to expire
36 | 3. Verifies that all user pods are evicted from the node
37 | 4. Collects metrics to verify operation
38 |
39 | **Expected Result**: The node should be parked, then after TTL expiration, all user pods should be evicted.
40 |
41 | **Skip Conditions**: None - runs in all environments
42 |
43 | ---
44 |
45 | ### TestShredderMetrics
46 | **Always Run**: ✅ Yes (in all environments)
47 |
48 | **Purpose**: Verifies that k8s-shredder metrics are properly collected and exposed via Prometheus.
49 |
50 | **Steps**:
51 | 1. Collects metrics from Prometheus
52 | 2. Verifies that expected metrics are present
53 | 3. Logs metric values for verification
54 |
55 | **Expected Result**: Should find metrics like `shredder_processed_pods_total`, `shredder_errors_total`, etc.
56 |
57 | **Skip Conditions**: None - runs in all environments
58 |
59 | **Note**: Requires Prometheus to be running on the dedicated monitoring node (worker3/worker4)
60 |
61 | ---
62 |
63 | ### TestArgoRolloutRestartAt
64 | **Always Run**: ✅ Yes (in all environments)
65 |
66 | **Purpose**: Verifies that k8s-shredder properly sets the `restartAt` field on Argo Rollouts.
67 |
68 | **Steps**:
69 | 1. Waits for the Argo Rollout to have its `restartAt` field set
70 | 2. Verifies the field is properly configured
71 |
72 | **Expected Result**: The Argo Rollout should have a `restartAt` field set to a future timestamp.
73 |
74 | **Skip Conditions**: None - runs in all environments
75 |
76 | ---
77 |
78 | ### TestKarpenterMetrics
79 | **Conditional Run**: Only in Karpenter environment
80 |
81 | **Purpose**: Verifies Karpenter-specific metrics when drift detection is enabled.
82 |
83 | **Steps**:
84 | 1. Collects Karpenter-specific metrics from Prometheus
85 | 2. Verifies expected Karpenter metrics are present
86 | 3. Logs metric values for verification
87 |
88 | **Expected Result**: Should find metrics like `shredder_karpenter_drifted_nodes_total`, `shredder_karpenter_nodes_parked_total`, etc.
89 |
90 | **Skip Conditions**:
91 | - ❌ Not running in Karpenter test environment
92 | - ❌ Prometheus not accessible
93 |
94 | ---
95 |
96 | ### TestNodeLabelMetrics
97 | **Conditional Run**: Only in node-labels environment
98 |
99 | **Purpose**: Verifies node label detection metrics when node label detection is enabled.
100 |
101 | **Steps**:
102 | 1. Collects node label detection metrics from Prometheus
103 | 2. Verifies expected node label metrics are present
104 | 3. Logs metric values for verification
105 |
106 | **Expected Result**: Should find metrics like `shredder_node_label_nodes_parked_total`, `shredder_node_label_matching_nodes_total`, etc.
107 |
108 | **Skip Conditions**:
109 | - ❌ Not running in node-labels test environment
110 | - ❌ Prometheus not accessible
111 |
112 | ---
113 |
114 | ### TestEvictionSafetyCheck
115 | **Conditional Run**: Only when EvictionSafetyCheck is enabled
116 |
117 | **Purpose**: Tests the EvictionSafetyCheck failure case - verifies that nodes are unparked when pods lack proper parking labels.
118 |
119 | **Steps**:
120 | 1. Scale k8s-shredder replicas to zero to disable actions
121 | 2. Park the worker2 node and all pods on it (properly labels all existing pods)
122 | 3. Create a new pod without proper parking labels on worker2
123 | 4. Create a PodDisruptionBudget to prevent soft eviction of the unlabeled pod
124 | 5. Scale k8s-shredder replicas to 1 to start the test
125 | 6. Monitor worker2 parking status - it should be unparked due to safety check failure
126 |
127 | **Expected Result**: The node should be unparked because the EvictionSafetyCheck detects that not all pods have proper parking labels.
128 |
129 | **Skip Conditions**:
130 | - ❌ EvictionSafetyCheck is disabled in k8s-shredder configuration
131 | - ❌ Running in Karpenter or node-labels test environments (different node structures)
132 | - ❌ Cannot access k8s-shredder-config configmap
133 |
134 | ---
135 |
136 | ### TestEvictionSafetyCheckPasses
137 | **Conditional Run**: Only when EvictionSafetyCheck is enabled
138 |
139 | **Purpose**: Tests the EvictionSafetyCheck success case - verifies that force eviction proceeds when all pods are properly labeled.
140 |
141 | **Steps**:
142 | 1. Scale k8s-shredder replicas to zero to disable actions
143 | 2. Park the worker2 node and all pods on it (this properly labels all pods)
144 | 3. Scale k8s-shredder replicas to 1 to start the test
145 | 4. Monitor worker2 parking status - it should remain parked until TTL expires, then get force evicted
146 |
147 | **Expected Result**: The node should remain parked and eventually be force evicted because all pods have proper parking labels.
148 |
149 | **Skip Conditions**:
150 | - ❌ EvictionSafetyCheck is disabled in k8s-shredder configuration
151 | - ❌ Running in Karpenter or node-labels test environments (different node structures)
152 | - ❌ Cannot access k8s-shredder-config configmap
153 |
154 | ## Running the Tests
155 |
156 | ### Prerequisites
157 |
158 | 1. A running kind cluster with k8s-shredder deployed
159 | 2. The `park-node` binary built and available
160 | 3. Prometheus running on the dedicated monitoring node
161 |
162 | ### Running All Tests
163 |
164 | ```bash
165 | # Build the park-node binary
166 | make build
167 |
168 | # Run all e2e tests
169 | make e2e-tests
170 | ```
171 |
172 | ### Running Specific Test Environments
173 |
174 | ```bash
175 | # Standard environment
176 | make local-test
177 |
178 | # Karpenter environment
179 | make local-test-karpenter
180 |
181 | # Node labels environment
182 | make local-test-node-labels
183 | ```
184 |
185 | ### Running Individual Tests
186 |
187 | ```bash
188 | # Run specific test
189 | PROJECT_ROOT=${PWD} KUBECONFIG=${PWD}/kubeconfig-localtest go test internal/testing/e2e_test.go -v -run TestShredderMetrics
190 |
191 | # Run all EvictionSafetyCheck tests
192 | PROJECT_ROOT=${PWD} KUBECONFIG=${PWD}/kubeconfig-localtest go test internal/testing/e2e_test.go -v -run 'TestEvictionSafetyCheck.*'
193 | ```
194 |
195 | ## Test Configuration
196 |
197 | ### EvictionSafetyCheck Configuration
198 |
199 | The EvictionSafetyCheck tests check the `k8s-shredder-config` ConfigMap in the `kube-system` namespace for the `EvictionSafetyCheck: true` setting. If this setting is not found or is set to `false`, the tests will be skipped.
200 |
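For illustration, a test could read that flag with client-go along these lines (a sketch only; the function name and the ConfigMap data key `config.yaml` are assumptions, not something this document specifies):

```go
package main

import (
	"context"
	"fmt"
	"strings"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

// evictionSafetyCheckEnabled reads the k8s-shredder-config ConfigMap in
// kube-system and reports whether EvictionSafetyCheck is set to true.
func evictionSafetyCheckEnabled(kubeconfig string) (bool, error) {
	cfg, err := clientcmd.BuildConfigFromFlags("", kubeconfig)
	if err != nil {
		return false, err
	}
	clientset, err := kubernetes.NewForConfig(cfg)
	if err != nil {
		return false, err
	}
	cm, err := clientset.CoreV1().ConfigMaps("kube-system").
		Get(context.Background(), "k8s-shredder-config", metav1.GetOptions{})
	if err != nil {
		return false, err
	}
	// "config.yaml" is an assumed data key; adjust to the deployed layout.
	return strings.Contains(cm.Data["config.yaml"], "EvictionSafetyCheck: true"), nil
}

func main() {
	enabled, err := evictionSafetyCheckEnabled("kubeconfig-localtest")
	fmt.Println(enabled, err)
}
```
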
201 | ### Prometheus Configuration
202 |
203 | All metrics tests require Prometheus to be running on the dedicated monitoring node. The monitoring node is configured with:
204 | - **Node Label**: `monitoring=dedicated`
205 | - **Node Taint**: `monitoring=dedicated:NoSchedule`
206 | - **Prometheus Node Selector**: `monitoring: dedicated`
207 | - **Prometheus Toleration**: For the `monitoring=dedicated:NoSchedule` taint
208 |
209 | This ensures Prometheus is never affected by k8s-shredder node parking operations.
210 |
211 | ## PodDisruptionBudget Usage
212 |
213 | In the `TestEvictionSafetyCheck` failure test case, a PodDisruptionBudget is created to prevent the unlabeled pod from being evicted by normal "soft" eviction mechanisms before the EvictionSafetyCheck runs. This ensures that:
214 |
215 | 1. The pod remains on the node when k8s-shredder performs the safety check
216 | 2. The safety check can properly detect the missing parking labels
217 | 3. The node gets unparked as expected
218 |
219 | The PDB uses `minAvailable: 1` and targets the specific test pod using the `test-pod: "true"` label selector.
220 |
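For reference, a sketch of how such a PDB could be created with client-go (the package name, object name, and namespace handling are assumptions; `minAvailable: 1` and the `test-pod: "true"` selector come from this document):

```go
package testutil

import (
	"context"

	policyv1 "k8s.io/api/policy/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/intstr"
	"k8s.io/client-go/kubernetes"
)

// createTestPodPDB creates a PodDisruptionBudget that blocks voluntary
// eviction of the test pod labeled test-pod: "true".
func createTestPodPDB(ctx context.Context, clientset kubernetes.Interface, namespace string) error {
	minAvailable := intstr.FromInt(1)
	pdb := &policyv1.PodDisruptionBudget{
		ObjectMeta: metav1.ObjectMeta{Name: "test-pod-pdb", Namespace: namespace},
		Spec: policyv1.PodDisruptionBudgetSpec{
			MinAvailable: &minAvailable,
			Selector: &metav1.LabelSelector{
				MatchLabels: map[string]string{"test-pod": "true"},
			},
		},
	}
	_, err := clientset.PolicyV1().PodDisruptionBudgets(namespace).Create(ctx, pdb, metav1.CreateOptions{})
	return err
}
```
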
221 | ## Test Results Interpretation
222 |
223 | ### Successful Test Results
224 | - **PASS**: Test completed successfully with expected behavior
225 | - **SKIP**: Test was skipped due to environment or configuration conditions
226 |
227 | ### Failed Test Results
228 | - **FAIL**: Test failed due to unexpected behavior or errors
229 | - **TIMEOUT**: Test exceeded maximum execution time
230 |
231 | ### Common Skip Reasons
232 | - `EvictionSafetyCheck is disabled in k8s-shredder configuration`
233 | - `not running in a Karpenter test environment`
234 | - `not running in a node labels test environment`
235 | - `Prometheus is not accessible after 30 retries`
236 |
237 | ## Troubleshooting
238 |
239 | ### Prometheus Issues
240 | If metrics tests are failing with "Prometheus port not set" errors:
241 | 1. Check that Prometheus is running: `kubectl get pods -n kube-system | grep prometheus`
242 | 2. Verify Prometheus is on the monitoring node: `kubectl get pods -n kube-system -o wide | grep prometheus`
243 | 3. Check node labels and taints: `kubectl describe node <node-name>`
244 |
245 | ### EvictionSafetyCheck Issues
246 | If EvictionSafetyCheck tests are being skipped:
247 | 1. Check the configmap: `kubectl get configmap k8s-shredder-config -n kube-system -o yaml`
248 | 2. Verify `EvictionSafetyCheck: true` is set in the configuration
249 | 3. Ensure you're running in the standard test environment (not Karpenter or node-labels)
250 |
251 | ### Node Parking Issues
252 | If node parking tests are failing:
253 | 1. Check k8s-shredder logs: `kubectl logs -n kube-system -l app.kubernetes.io/name=k8s-shredder`
254 | 2. Verify the park-node binary exists: `ls -la park-node`
255 | 3. Check node status: `kubectl get nodes`
256 |
--------------------------------------------------------------------------------