├── test ├── assets │ ├── uptime-reboot │ └── squid.conf ├── webhook-test-proxy │ ├── THIRD_PARTY_LICENSES.md │ ├── Dockerfile.windows │ ├── Dockerfile │ └── cmd │ │ └── webhook-test-proxy.go ├── readme-test │ ├── spellcheck-Dockerfile │ └── run-readme-spellcheck ├── k8s-local-cluster-test │ ├── kind-three-node-cluster.yaml │ ├── delete-cluster │ ├── psp-default.yaml │ └── psp-privileged.yaml ├── eks-cluster-test │ ├── cluster-spec.yaml │ ├── reset-cluster │ └── provision-cluster ├── helm-sync-test │ └── run-helm-version-sync-test ├── shellcheck │ └── run-shellcheck ├── k8s-compatibility-test │ └── run-k8s-compatibility-test.sh ├── helm │ ├── validate-chart-versions │ └── helm-lint └── e2e │ ├── spot-interruption-dry-run-test │ ├── maintenance-event-dry-run-test │ ├── emit-events-test │ ├── cordon-only-test │ ├── imds-v2-test │ └── webhook-secret-test ├── .dockerignore ├── NOTICE ├── templates └── third-party-licenses.tmpl ├── config └── helm │ ├── aws-node-termination-handler │ ├── example-values-imds-linux.yaml │ ├── example-values-imds-windows.yaml │ ├── example-values-queue.yaml │ ├── .helmignore │ ├── templates │ │ ├── serviceaccount.yaml │ │ ├── NOTES.txt │ │ ├── clusterrolebinding.yaml │ │ ├── pdb.yaml │ │ ├── service.yaml │ │ ├── clusterrole.yaml │ │ ├── podmonitor.yaml │ │ ├── servicemonitor.yaml │ │ ├── psp.yaml │ │ └── _helpers.tpl │ └── Chart.yaml │ ├── localstack │ ├── Chart.yaml │ ├── templates │ │ ├── clusterrole.yaml │ │ ├── service.yaml │ │ ├── serviceaccount.yaml │ │ ├── clusterrolebinding.yaml │ │ ├── psp.yaml │ │ ├── deployment.yaml │ │ └── _helpers.tpl │ ├── .helmignore │ └── values.yaml │ ├── squid │ ├── templates │ │ ├── clusterrole.yaml │ │ ├── service.yaml │ │ ├── serviceaccount.yaml │ │ ├── clusterrolebinding.yaml │ │ ├── daemonset.yaml │ │ ├── psp.yaml │ │ └── _helpers.tpl │ ├── Chart.yaml │ ├── values.yaml │ └── .helmignore │ └── webhook-test-proxy │ ├── templates │ ├── clusterrole.yaml │ ├── service.yaml │ ├── serviceaccount.yaml │ ├── 
clusterrolebinding.yaml │ ├── regular-pod-test.yaml │ ├── daemonset.yaml │ ├── psp.yaml │ └── _helpers.tpl │ ├── .helmignore │ ├── values.yaml │ └── Chart.yaml ├── CODEOWNERS ├── .github ├── PULL_REQUEST_TEMPLATE.md ├── dependabot.yml ├── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md └── workflows │ ├── stale.yml │ └── release.yaml ├── CODE_OF_CONDUCT.md ├── .gitignore ├── scripts ├── ecr-public-login ├── helm-login ├── ecr-template-for-helm-chart.json ├── sync-catalog-information-for-helm-chart ├── install-amazon-ecr-credential-helper ├── sync-readme-to-ecr-public ├── push-helm-chart ├── run-unit-tests-in-docker ├── retag-docker-images ├── build-docker-images ├── build-binaries └── upload-resources-to-github ├── pkg ├── uptime │ ├── uptime_darwin.go │ ├── uptime_linux.go │ ├── uptime_test.go │ ├── uptime_windows.go │ ├── common.go │ └── common_test.go ├── observability │ ├── probes_test.go │ └── probes.go ├── logging │ ├── routing.go │ ├── routing-integration_test.go │ ├── routing_test.go │ └── versioned.go ├── monitor │ ├── sqsevent │ │ ├── event-bridge.go │ │ ├── sqs-retryer.go │ │ ├── ec2-state-change-event.go │ │ ├── sqs-retryer_test.go │ │ ├── spot-itn-event.go │ │ └── rebalance-recommendation-event.go │ ├── types_test.go │ ├── types.go │ ├── asglifecycle │ │ ├── asg-lifecycle-monitor_internal_test.go │ │ └── asg-lifecycle-monitor.go │ ├── spotitn │ │ ├── spot-itn-monitor_internal_test.go │ │ └── spot-itn-monitor.go │ ├── rebalancerecommendation │ │ └── rebalance-recommendation-monitor_internal_test.go │ └── scheduledevent │ │ └── scheduled-event-monitor_internal_test.go ├── test │ └── helpers.go ├── ec2helper │ └── ec2helper.go ├── ec2metadata │ └── ec2metadata_internal_test.go ├── config │ └── config_internal_test.go └── interruptionevent │ └── internal │ └── common │ └── handler.go ├── Dockerfile ├── Dockerfile.windows ├── docs ├── cfn-template.yaml └── kubernetes_events.md ├── CONTRIBUTING.md └── BUILD.md /test/assets/uptime-reboot: 
-------------------------------------------------------------------------------- 1 | 5.01 10000 2 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | test/* 2 | config/* 3 | build/* -------------------------------------------------------------------------------- /test/webhook-test-proxy/THIRD_PARTY_LICENSES.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /test/readme-test/spellcheck-Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.20 2 | 3 | RUN go install github.com/client9/misspell/cmd/misspell@v0.3.4 4 | 5 | CMD [ "/go/bin/misspell" ] -------------------------------------------------------------------------------- /templates/third-party-licenses.tmpl: -------------------------------------------------------------------------------- 1 | # Third-party Licenses 2 | 3 | {{ range . 
-}} 4 | - {{ .Name }} {{ .Version }} [{{ .LicenseName }}]({{ .LicenseURL }}) 5 | {{ end -}} -------------------------------------------------------------------------------- /config/helm/aws-node-termination-handler/example-values-imds-linux.yaml: -------------------------------------------------------------------------------- 1 | enableSqsTerminationDraining: false 2 | 3 | targetNodeOs: linux 4 | 5 | enableProbesServer: true 6 | -------------------------------------------------------------------------------- /config/helm/aws-node-termination-handler/example-values-imds-windows.yaml: -------------------------------------------------------------------------------- 1 | enableSqsTerminationDraining: false 2 | 3 | targetNodeOs: windows 4 | 5 | enableProbesServer: true 6 | -------------------------------------------------------------------------------- /config/helm/localstack/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | name: localstack 3 | description: A very simplified helm chart for localstack (used in e2e tests) 4 | version: 0.0.1 5 | appVersion: 0.11.2 6 | home: https://github.com/localstack/localstack 7 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Require approvals from someone in the owner team before merging 2 | # More information here: https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners 3 | 4 | * @aws/ec2-guacamole 5 | -------------------------------------------------------------------------------- /config/helm/squid/templates/clusterrole.yaml: -------------------------------------------------------------------------------- 1 | kind: ClusterRole 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | metadata: 4 | name: {{ include "squid.fullname" . 
}} 5 | rules: 6 | - apiGroups: 7 | - "" 8 | resources: 9 | - nodes 10 | verbs: 11 | - get 12 | -------------------------------------------------------------------------------- /config/helm/squid/templates/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ .Values.squid.label }} 5 | spec: 6 | selector: 7 | app: {{ .Values.squid.label }} 8 | ports: 9 | - port: {{ .Values.squid.port }} 10 | protocol: TCP 11 | 12 | -------------------------------------------------------------------------------- /config/helm/localstack/templates/clusterrole.yaml: -------------------------------------------------------------------------------- 1 | kind: ClusterRole 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | metadata: 4 | name: {{ include "localstack.fullname" . }} 5 | rules: 6 | - apiGroups: 7 | - "" 8 | resources: 9 | - nodes 10 | verbs: 11 | - get 12 | -------------------------------------------------------------------------------- /config/helm/webhook-test-proxy/templates/clusterrole.yaml: -------------------------------------------------------------------------------- 1 | kind: ClusterRole 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | metadata: 4 | name: {{ include "webhook-test-proxy.fullname" . 
}} 5 | rules: 6 | - apiGroups: 7 | - "" 8 | resources: 9 | - nodes 10 | verbs: 11 | - get 12 | -------------------------------------------------------------------------------- /config/helm/localstack/templates/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ .Values.label }} 5 | spec: 6 | selector: 7 | app: {{ .Values.label }} 8 | ports: 9 | - port: {{ .Values.port }} 10 | targetPort: {{ .Values.containerPort }} 11 | protocol: TCP 12 | 13 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | **Issue #, if available:** 2 | 3 | **Description of changes:** 4 | 5 | **How you tested your changes:** 6 | Environment (Linux / Windows): 7 | Kubernetes Version: 8 | 9 | 10 | By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license. 11 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 
5 | -------------------------------------------------------------------------------- /config/helm/aws-node-termination-handler/example-values-queue.yaml: -------------------------------------------------------------------------------- 1 | serviceAccount: 2 | annotations: 3 | eks.amazonaws.com/role-arn: arn:aws:iam::99999999:role/nth-role 4 | 5 | resources: 6 | requests: 7 | cpu: 100m 8 | memory: 128Mi 9 | limits: 10 | cpu: 500m 11 | memory: 256Mi 12 | 13 | enableSqsTerminationDraining: true 14 | -------------------------------------------------------------------------------- /config/helm/squid/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | name: squid 3 | description: A Helm chart for Squid 4 | version: 0.0.1 5 | appVersion: 1.0.0 6 | home: https://github.com/aws/aws-node-termination-handler 7 | icon: https://raw.githubusercontent.com/aws/eks-charts/master/docs/logo/aws.png 8 | sources: 9 | - https://github.com/aws/aws-node-termination-handler/test/squid 10 | -------------------------------------------------------------------------------- /config/helm/squid/templates/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: {{ template "squid.serviceAccountName" . }} 5 | namespace: {{ .Release.Namespace }} 6 | {{- with .Values.serviceAccount.annotations }} 7 | annotations: 8 | {{ toYaml . | indent 4 }} 9 | {{- end }} 10 | labels: 11 | {{ include "squid.labels" . 
| indent 4 }} 12 | -------------------------------------------------------------------------------- /config/helm/webhook-test-proxy/templates/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ .Values.webhookTestProxy.label }} 5 | spec: 6 | selector: 7 | app: {{ .Values.webhookTestProxy.label }} 8 | ports: 9 | - port: {{ .Values.webhookTestProxy.port }} 10 | targetPort: {{ .Values.webhookTestProxy.containerPort }} 11 | protocol: TCP 12 | 13 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "gomod" 4 | allow: 5 | - dependency-name: "github.com/aws/aws-sdk-go" 6 | - dependency-name: "k8s.io/*" 7 | dependency-type: "direct" 8 | directory: "/" 9 | schedule: 10 | interval: "weekly" 11 | day: "tuesday" 12 | time: "09:00" 13 | timezone: "America/Chicago" 14 | -------------------------------------------------------------------------------- /config/helm/localstack/templates/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: {{ template "localstack.serviceAccountName" . }} 5 | namespace: {{ .Release.Namespace }} 6 | {{- with .Values.serviceAccount.annotations }} 7 | annotations: 8 | {{ toYaml . | indent 4 }} 9 | {{- end }} 10 | labels: 11 | {{ include "localstack.labels" . | indent 4 }} 12 | -------------------------------------------------------------------------------- /config/helm/webhook-test-proxy/templates/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: {{ template "webhook-test-proxy.serviceAccountName" . 
}} 5 | namespace: {{ .Release.Namespace }} 6 | {{- with .Values.serviceAccount.annotations }} 7 | annotations: 8 | {{ toYaml . | indent 4 }} 9 | {{- end }} 10 | labels: 11 | {{ include "webhook-test-proxy.labels" . | indent 4 }} 12 | -------------------------------------------------------------------------------- /config/helm/squid/values.yaml: -------------------------------------------------------------------------------- 1 | nameOverride: "" 2 | fullnameOverride: "" 3 | priorityClassName: system-node-critical 4 | podAnnotations: {} 5 | rbac: 6 | pspEnabled: true 7 | serviceAccount: 8 | name: squid-sa 9 | create: true 10 | annotations: {} 11 | squid: 12 | port: 3128 13 | label: squid 14 | configMap: squidConfigMap 15 | image: 16 | repository: squid 17 | tag: customtest 18 | 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Go ### 2 | # Binaries for programs and plugins 3 | *.exe 4 | *.exe~ 5 | *.dll 6 | *.so 7 | *.dylib 8 | 9 | # Test binary, built with `go test -c` 10 | *.test 11 | 12 | # Output of the go coverage tool, specifically when used with LiteIDE 13 | *.out 14 | .idea/ 15 | 16 | # Output of the go build for the cmd binary 17 | /node-termination-handler 18 | 19 | ### Go Patch ### 20 | /vendor/ 21 | /Godeps/ 22 | /build/ 23 | /bin/ 24 | -------------------------------------------------------------------------------- /config/helm/squid/templates/clusterrolebinding.yaml: -------------------------------------------------------------------------------- 1 | kind: ClusterRoleBinding 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | metadata: 4 | name: {{ include "squid.fullname" . }} 5 | subjects: 6 | - kind: ServiceAccount 7 | name: {{ template "squid.serviceAccountName" . }} 8 | namespace: {{ .Release.Namespace }} 9 | roleRef: 10 | kind: ClusterRole 11 | name: {{ include "squid.fullname" . 
}} 12 | apiGroup: rbac.authorization.k8s.io 13 | -------------------------------------------------------------------------------- /config/helm/localstack/templates/clusterrolebinding.yaml: -------------------------------------------------------------------------------- 1 | kind: ClusterRoleBinding 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | metadata: 4 | name: {{ include "localstack.fullname" . }} 5 | subjects: 6 | - kind: ServiceAccount 7 | name: {{ template "localstack.serviceAccountName" . }} 8 | namespace: {{ .Release.Namespace }} 9 | roleRef: 10 | kind: ClusterRole 11 | name: {{ include "localstack.fullname" . }} 12 | apiGroup: rbac.authorization.k8s.io 13 | -------------------------------------------------------------------------------- /config/helm/squid/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *~ 18 | # Various IDEs 19 | .project 20 | .idea/ 21 | *.tmproj 22 | .vscode/ 23 | -------------------------------------------------------------------------------- /config/helm/localstack/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 
4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *~ 18 | # Various IDEs 19 | .project 20 | .idea/ 21 | *.tmproj 22 | .vscode/ 23 | -------------------------------------------------------------------------------- /test/assets/squid.conf: -------------------------------------------------------------------------------- 1 | acl SSL_ports port 443 2 | acl CONNECT method CONNECT 3 | acl all src 0.0.0.0/0 4 | http_access allow all 5 | http_port 3128 6 | coredump_dir /var/spool/squid 7 | refresh_pattern ^ftp: 1440 20% 10080 8 | refresh_pattern ^gopher: 1440 0% 1440 9 | refresh_pattern -i (/cgi-bin/|\?) 0 0% 0 10 | refresh_pattern (Release|Packages(.gz)*)$ 0 20% 2880 11 | refresh_pattern . 0 20% 4320 12 | access_log stdio:/var/log/squid/access.log all 13 | -------------------------------------------------------------------------------- /test/k8s-local-cluster-test/kind-three-node-cluster.yaml: -------------------------------------------------------------------------------- 1 | kind: Cluster 2 | apiVersion: kind.x-k8s.io/v1alpha4 3 | nodes: 4 | - role: control-plane 5 | kubeadmConfigPatches: 6 | - | 7 | apiVersion: kubeadm.k8s.io/v1beta2 8 | kind: ClusterConfiguration 9 | metadata: 10 | name: config 11 | apiServer: 12 | extraArgs: 13 | "enable-admission-plugins": "NodeRestriction,PodSecurityPolicy" 14 | - role: worker 15 | - role: worker -------------------------------------------------------------------------------- /config/helm/webhook-test-proxy/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 
4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *~ 18 | # Various IDEs 19 | .project 20 | .idea/ 21 | *.tmproj 22 | .vscode/ 23 | -------------------------------------------------------------------------------- /config/helm/webhook-test-proxy/templates/clusterrolebinding.yaml: -------------------------------------------------------------------------------- 1 | kind: ClusterRoleBinding 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | metadata: 4 | name: {{ include "webhook-test-proxy.fullname" . }} 5 | subjects: 6 | - kind: ServiceAccount 7 | name: {{ template "webhook-test-proxy.serviceAccountName" . }} 8 | namespace: {{ .Release.Namespace }} 9 | roleRef: 10 | kind: ClusterRole 11 | name: {{ include "webhook-test-proxy.fullname" . }} 12 | apiGroup: rbac.authorization.k8s.io 13 | -------------------------------------------------------------------------------- /config/helm/aws-node-termination-handler/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 
4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *~ 18 | # Various IDEs 19 | .project 20 | .idea/ 21 | *.tmproj 22 | .vscode/ 23 | example-values*.yaml 24 | -------------------------------------------------------------------------------- /config/helm/aws-node-termination-handler/templates/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.serviceAccount.create -}} 2 | apiVersion: v1 3 | kind: ServiceAccount 4 | metadata: 5 | name: {{ template "aws-node-termination-handler.serviceAccountName" . }} 6 | namespace: {{ .Release.Namespace }} 7 | labels: 8 | {{- include "aws-node-termination-handler.labels" . | nindent 4 }} 9 | {{- with .Values.serviceAccount.annotations }} 10 | annotations: 11 | {{- toYaml . | nindent 4 }} 12 | {{- end }} 13 | {{- end -}} 14 | -------------------------------------------------------------------------------- /config/helm/localstack/values.yaml: -------------------------------------------------------------------------------- 1 | nameOverride: "" 2 | fullnameOverride: "" 3 | priorityClassName: system-node-critical 4 | podAnnotations: {} 5 | rbac: 6 | pspEnabled: true 7 | serviceAccount: 8 | name: ls-sa 9 | create: true 10 | annotations: {} 11 | 12 | label: localstack 13 | port: 80 14 | containerPort: 4566 15 | image: 16 | repository: localstack/localstack 17 | tag: 3.0.2 18 | pullPolicy: IfNotPresent 19 | tolerations: 20 | - operator: "Exists" 21 | nodeSelector: {} 22 | services: "events,sqs,ec2" 23 | defaultRegion: "us-east-1" -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest a feature/enhancement for this project 4 | title: '' 5 | 
labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the feature** 11 | A concise description of the feature and desired behavior. 12 | 13 | **Is the feature request related to a problem?** 14 | A description of what the problem is. For example: I'm frustrated when [...] 15 | 16 | **Describe alternatives you've considered** 17 | A description of any alternative solutions or features you've considered. 18 | -------------------------------------------------------------------------------- /scripts/ecr-public-login: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | # Logs the local docker client in to the ECR Public registry named by ECR_REGISTRY. 4 | SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" 5 | BUILD_DIR=$SCRIPTPATH/../build/ 6 | export PATH="${BUILD_DIR}:${PATH}" 7 | # Use ":-" so an unset variable reaches this check instead of tripping "set -u". 8 | if [[ -z "${ECR_REGISTRY:-}" ]]; then 9 | echo "The env var ECR_REGISTRY must be set" 10 | exit 1 11 | fi 12 | 13 | function exit_and_fail() { 14 | echo "❌ Failed to login to ECR Public Repo!" 15 | } 16 | 17 | trap exit_and_fail INT TERM ERR 18 | # Feed the token via stdin so it never appears in the process argument list. 19 | aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "${ECR_REGISTRY}" -------------------------------------------------------------------------------- /scripts/helm-login: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | # Logs the helm client in to the ECR Public registry named by ECR_REGISTRY. 4 | SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" 5 | BUILD_DIR=$SCRIPTPATH/../build/ 6 | export PATH="${BUILD_DIR}:${PATH}" 7 | # Use ":-" so an unset variable reaches this check instead of tripping "set -u". 8 | if [[ -z "${ECR_REGISTRY:-}" ]]; then 9 | echo "The env var ECR_REGISTRY must be set" 10 | exit 1 11 | fi 12 | 13 | function exit_and_fail() { 14 | echo "❌ Failed to login to ECR Public Repo!"
15 | } 16 | 17 | trap exit_and_fail INT TERM ERR 18 | 19 | export HELM_EXPERIMENTAL_OCI=1 20 | helm registry login --username AWS --password="$(aws ecr-public get-login-password --region us-east-1)" "${ECR_REGISTRY}" -------------------------------------------------------------------------------- /test/readme-test/run-readme-spellcheck: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" 5 | 6 | function exit_and_fail() { 7 | echo "❌ Test Failed! Found a markdown file with spelling errors." 8 | exit 1 9 | } 10 | trap exit_and_fail INT ERR TERM 11 | 12 | docker buildx build --load -t misspell -f $SCRIPTPATH/spellcheck-Dockerfile $SCRIPTPATH/ 13 | docker run -i --rm -v $SCRIPTPATH/../../:/app misspell /bin/bash -c 'find /app/ -type f -name "*.md" -not -path "build" | grep -v "/build/" | xargs misspell -error -debug' 14 | echo "✅ Markdown file spell check passed!" -------------------------------------------------------------------------------- /config/helm/aws-node-termination-handler/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | *********************************************************************** 2 | * AWS Node Termination Handler * 3 | *********************************************************************** 4 | Chart version: {{ .Chart.Version }} 5 | App version: {{ .Chart.AppVersion }} 6 | Image tag: {{ include "aws-node-termination-handler.image" . 
}} 7 | Mode : {{ if .Values.enableSqsTerminationDraining }}Queue Processor{{ else }}IMDS{{ end }} 8 | *********************************************************************** 9 | -------------------------------------------------------------------------------- /test/eks-cluster-test/cluster-spec.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: eksctl.io/v1alpha5 3 | kind: ClusterConfig 4 | metadata: 5 | name: nth-eks-cluster-test 6 | region: us-west-2 7 | version: '1.27' 8 | cloudWatch: 9 | clusterLogging: 10 | enableTypes: ["*"] 11 | managedNodeGroups: 12 | - name: linux-ng 13 | instanceType: t3.medium 14 | amiFamily: AmazonLinux2 15 | desiredCapacity: 2 16 | minSize: 2 17 | maxSize: 2 18 | spot: true 19 | nodeGroups: 20 | - name: windows-ng 21 | instanceType: m5.large 22 | minSize: 1 23 | maxSize: 1 24 | volumeSize: 100 25 | amiFamily: WindowsServer2022FullContainer -------------------------------------------------------------------------------- /config/helm/aws-node-termination-handler/templates/clusterrolebinding.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.rbac.create -}} 2 | kind: ClusterRoleBinding 3 | apiVersion: rbac.authorization.k8s.io/v1 4 | metadata: 5 | name: {{ include "aws-node-termination-handler.fullname" . }} 6 | labels: 7 | {{- include "aws-node-termination-handler.labels" . | nindent 4 }} 8 | roleRef: 9 | apiGroup: rbac.authorization.k8s.io 10 | kind: ClusterRole 11 | name: {{ include "aws-node-termination-handler.fullname" . }} 12 | subjects: 13 | - kind: ServiceAccount 14 | name: {{ template "aws-node-termination-handler.serviceAccountName" . 
}} 15 | namespace: {{ .Release.Namespace }} 16 | {{- end -}} 17 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A concise description of what the bug is. 12 | 13 | **Steps to reproduce** 14 | A step-by-step description on how to reproduce the problem. 15 | 16 | **Expected outcome** 17 | A concise description of what you expected to happen. 18 | 19 | **Application Logs** 20 | The log output when experiencing the issue. 21 | 22 | 23 | **Environment** 24 | 25 | * NTH App Version: 26 | * NTH Mode (IMDS/Queue processor): 27 | * OS/Arch: 28 | * Kubernetes version: 29 | * Installation method: 30 | -------------------------------------------------------------------------------- /config/helm/aws-node-termination-handler/templates/pdb.yaml: -------------------------------------------------------------------------------- 1 | {{- if and .Values.enableSqsTerminationDraining (and .Values.podDisruptionBudget (gt (int .Values.replicas) 1)) }} 2 | apiVersion: {{ include "aws-node-termination-handler.pdb.apiVersion" . }} 3 | kind: PodDisruptionBudget 4 | metadata: 5 | name: {{ include "aws-node-termination-handler.fullname" . }} 6 | namespace: {{ .Release.Namespace }} 7 | labels: 8 | {{- include "aws-node-termination-handler.labels" . | nindent 4 }} 9 | spec: 10 | selector: 11 | matchLabels: 12 | {{- include "aws-node-termination-handler.selectorLabelsDeployment" . 
| nindent 6 }} 13 | {{- toYaml .Values.podDisruptionBudget | nindent 2 }} 14 | {{- end }} 15 | -------------------------------------------------------------------------------- /config/helm/aws-node-termination-handler/templates/service.yaml: -------------------------------------------------------------------------------- 1 | {{- if and .Values.enableSqsTerminationDraining .Values.enablePrometheusServer -}} 2 | apiVersion: v1 3 | kind: Service 4 | metadata: 5 | name: {{ include "aws-node-termination-handler.fullname" . }} 6 | namespace: {{ .Release.Namespace }} 7 | labels: 8 | {{- include "aws-node-termination-handler.labelsDeployment" . | nindent 4 }} 9 | spec: 10 | type: ClusterIP 11 | selector: 12 | {{- include "aws-node-termination-handler.selectorLabelsDeployment" . | nindent 4 }} 13 | ports: 14 | - name: http-metrics 15 | port: {{ .Values.prometheusServerPort }} 16 | targetPort: http-metrics 17 | protocol: TCP 18 | {{- end -}} 19 | -------------------------------------------------------------------------------- /test/webhook-test-proxy/Dockerfile.windows: -------------------------------------------------------------------------------- 1 | ARG WINDOWS_VERSION=1903 2 | 3 | # Build the manager binary 4 | FROM --platform=windows/amd64 golang:1.22 AS builder 5 | 6 | ## GOLANG env 7 | ENV GO111MODULE="on" CGO_ENABLED="0" GOOS="windows" GOARCH="amd64" 8 | ARG GOPROXY="https://proxy.golang.org,direct" 9 | 10 | WORKDIR /webhook-test-proxy 11 | 12 | ## Build 13 | COPY . . 14 | RUN go build -a -o webhook-test-proxy cmd/webhook-test-proxy.go 15 | ENTRYPOINT ["webhook-test-proxy"] 16 | 17 | ## Copy binary to a thin image 18 | FROM mcr.microsoft.com/windows/nanoserver:${WINDOWS_VERSION} 19 | WORKDIR / 20 | COPY --from=builder /webhook-test-proxy . 21 | COPY THIRD_PARTY_LICENSES.md . 
22 | ENTRYPOINT ["/webhook-test-proxy"] 23 | -------------------------------------------------------------------------------- /config/helm/webhook-test-proxy/values.yaml: -------------------------------------------------------------------------------- 1 | 2 | nameOverride: "" 3 | fullnameOverride: "" 4 | priorityClassName: system-node-critical 5 | podAnnotations: {} 6 | rbac: 7 | pspEnabled: true 8 | serviceAccount: 9 | name: wtp-sa 10 | create: true 11 | annotations: {} 12 | # The webhook-test-proxy is for testing purposes 13 | webhookTestProxy: 14 | create: true 15 | label: webhook-test-proxy 16 | port: 80 17 | containerPort: 1441 18 | image: 19 | repository: webhook-test-proxy 20 | tag: customtest 21 | pullPolicy: IfNotPresent 22 | tolerations: [] 23 | regularPodTest: 24 | create: true 25 | label: regular-pod-test 26 | port: 1339 27 | targetNodeOs: "linux" 28 | nodeSelector: {} 29 | linuxNodeSelector: {} 30 | -------------------------------------------------------------------------------- /config/helm/webhook-test-proxy/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | name: webhook-test-proxy 3 | description: A Helm chart for the webhook test proxy (used in e2e tests) 4 | version: 0.3.1 5 | appVersion: 1.0.0 6 | home: https://github.com/aws/aws-node-termination-handler 7 | icon: https://raw.githubusercontent.com/aws/eks-charts/master/docs/logo/aws.png 8 | sources: 9 | - https://github.com/aws/aws-node-termination-handler/test/webhook-test-proxy 10 | maintainers: 11 | - name: Brandon Wagner 12 | url: https://github.com/bwagner5 13 | email: bwagner5@users.noreply.github.com 14 | - name: Jillian Montalvo 15 | url: https://github.com/jillmon 16 | email: jillmon@users.noreply.github.com 17 | keywords: 18 | - eks 19 | - ec2 20 | - node-termination 21 | - imds 22 | - spot 23 | -------------------------------------------------------------------------------- 
/config/helm/aws-node-termination-handler/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: aws-node-termination-handler 3 | description: A Helm chart for the AWS Node Termination Handler. 4 | type: application 5 | version: 0.27.3 6 | appVersion: 1.25.3 7 | kubeVersion: ">= 1.16-0" 8 | keywords: 9 | - aws 10 | - eks 11 | - ec2 12 | - node-termination 13 | - spot 14 | home: https://github.com/aws/aws-node-termination-handler/ 15 | icon: https://raw.githubusercontent.com/aws/eks-charts/master/docs/logo/aws.png 16 | sources: 17 | - https://github.com/aws/aws-node-termination-handler/ 18 | maintainers: 19 | - name: Brandon Wagner 20 | url: https://github.com/bwagner5 21 | email: bwagner5@users.noreply.github.com 22 | - name: Jillian Kuentz 23 | url: https://github.com/jillmon 24 | email: jillmon@users.noreply.github.com 25 | -------------------------------------------------------------------------------- /scripts/ecr-template-for-helm-chart.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Helm Charts for AWS Node Termination Handler", 3 | "aboutText": "# Helm Chart for Node Termination Handler\n\n AWS NTH gracefully handles EC2 instance shutdown within Kubernetes.\n\nThis repository contains helm-charts for Node Termination Handler.\n\nFor more information on this project, see the project repo at [AWS Node Termination Handler](https://github.com/aws/aws-node-termination-handler)", 4 | "usageText": "# We can install AWS NTH using the helm chart from this repository.\n\nWe need to authenticate our helm client to ECR registry and install NTH chart using helm chart URI, detailed information on how to install helm chart can be found here [HelmChart ReadMe](https://github.com/aws/aws-node-termination-handler/tree/main/config/helm/aws-node-termination-handler#readme)" 5 | } 
-------------------------------------------------------------------------------- /pkg/uptime/uptime_darwin.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"). You may 4 | // not use this file except in compliance with the License. A copy of the 5 | // License is located at 6 | // 7 | // http://aws.amazon.com/apache2.0/ 8 | // 9 | // or in the "license" file accompanying this file. This file is distributed 10 | // on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | // express or implied. See the License for the specific language governing 12 | // permissions and limitations under the License. 13 | 14 | package uptime 15 | 16 | import "errors" 17 | 18 | // Uptime returns an error on Darwin hosts. 19 | func Uptime() (int64, error) { 20 | return 0, errors.New("Not implemented on darwin platform") 21 | } 22 | -------------------------------------------------------------------------------- /pkg/uptime/uptime_linux.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"). You may 4 | // not use this file except in compliance with the License. A copy of the 5 | // License is located at 6 | // 7 | // http://aws.amazon.com/apache2.0/ 8 | // 9 | // or in the "license" file accompanying this file. This file is distributed 10 | // on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | // express or implied. See the License for the specific language governing 12 | // permissions and limitations under the License. 
13 | 14 | package uptime 15 | 16 | // Uptime reads system uptime from /proc/uptime and returns the number 17 | // of seconds since last system boot. 18 | func Uptime() (int64, error) { 19 | return UptimeFromFile("/proc/uptime") 20 | } 21 | -------------------------------------------------------------------------------- /scripts/sync-catalog-information-for-helm-chart: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" 5 | REPO_NAME="helm/aws-node-termination-handler" 6 | REPO_ROOT_PATH=$SCRIPTPATH/../ 7 | TEMPLATE_PATH=$REPO_ROOT_PATH/scripts/ecr-template-for-helm-chart.json 8 | CATALOG_DATA=$(cat "$TEMPLATE_PATH") 9 | 10 | if aws ecr-public describe-repositories --region us-east-1 --repository-names "$REPO_NAME" > /dev/null 2>&1; then 11 | echo "The repository $REPO_NAME exists, update it with template..." 12 | aws ecr-public put-repository-catalog-data --region us-east-1 --repository-name "$REPO_NAME" --catalog-data "$CATALOG_DATA" 13 | else 14 | echo "The repository $REPO_NAME does not exist, create it with template..." 15 | aws ecr-public create-repository --region us-east-1 --repository-name "$REPO_NAME" --catalog-data "$CATALOG_DATA" 16 | fi -------------------------------------------------------------------------------- /test/webhook-test-proxy/Dockerfile: -------------------------------------------------------------------------------- 1 | # Build the manager binary 2 | FROM golang:1.22-alpine as builder 3 | 4 | ## GOLANG env 5 | ARG GOPROXY="https://proxy.golang.org|direct" 6 | ARG GO111MODULE="on" 7 | ARG CGO_ENABLED=0 8 | ARG GOOS=linux 9 | ARG GOARCH=amd64 10 | 11 | # Copy go.mod and download dependencies 12 | WORKDIR /webhook-test-proxy 13 | 14 | # Build 15 | COPY . . 
16 | RUN go build -ldflags="-s -w" -a -o webhook-test-proxy cmd/webhook-test-proxy.go 17 | # In case the target is build for testing: 18 | # $ docker build --target=builder -t test . 19 | ENTRYPOINT ["webhook-test-proxy"] 20 | 21 | # Copy the webhook-test-proxy binary into a thin image 22 | FROM amazonlinux:2 as amazonlinux 23 | FROM scratch 24 | WORKDIR / 25 | COPY --from=builder /webhook-test-proxy . 26 | COPY --from=amazonlinux /etc/ssl/certs/ca-bundle.crt /etc/ssl/certs/ 27 | COPY THIRD_PARTY_LICENSES.md . 28 | ENTRYPOINT ["/webhook-test-proxy"] 29 | -------------------------------------------------------------------------------- /pkg/observability/probes_test.go: -------------------------------------------------------------------------------- 1 | package observability 2 | 3 | import ( 4 | "net/http" 5 | "net/http/httptest" 6 | "testing" 7 | ) 8 | // TestLivenessHandler checks the content type, status code and body written by livenessHandler. 9 | func TestLivenessHandler(t *testing.T) { 10 | req := httptest.NewRequest("GET", "/healthz", nil) 11 | rr := httptest.NewRecorder() 12 | handler := http.HandlerFunc(livenessHandler) 13 | 14 | handler.ServeHTTP(rr, req) 15 | 16 | if contentType := rr.Header().Get("Content-Type"); contentType != "application/json" { 17 | t.Errorf("handler returned wrong content type: got %v want %v", 18 | contentType, "application/json") 19 | } 20 | 21 | if status := rr.Code; status != http.StatusOK { 22 | t.Errorf("handler returned wrong status code: got %v want %v", 23 | status, http.StatusOK) 24 | } 25 | // Report the same literal the comparison uses, so a failure shows the real expectation. 26 | if body := rr.Body.String(); body != `{"health":"OK"}` { 27 | t.Errorf("handler returned wrong body: got %v want %v", 28 | body, `{"health":"OK"}`) 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /test/helm-sync-test/run-helm-version-sync-test: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" 5 | 6 | TAG=$(git describe --tags "`git
rev-list --tags --max-count=1`" | cut -d'v' -f2) 7 | CHART_VERSION=$(cat $SCRIPTPATH/../../config/helm/aws-node-termination-handler/Chart.yaml | grep 'appVersion:' | xargs | cut -d' ' -f2 | tr -d '[:space:]') 8 | DEFAULT_VALUE=$(cat $SCRIPTPATH/../../config/helm/aws-node-termination-handler/values.yaml | grep 'tag:' | xargs | cut -d' ' -f2 | tr -d '[:space:]') 9 | 10 | if [[ "$CHART_VERSION" != "$TAG" || "$DEFAULT_VALUE" != "v$TAG" ]]; then 11 | echo "NTH Version: $TAG" 12 | echo "❌ CHART VERSION: $CHART_VERSION" 13 | echo "❌ Values VERSION: $DEFAULT_VALUE" 14 | echo "❌ The values.yaml or Chart.yaml is not updated with the correct NTH version" 15 | exit 1 16 | fi 17 | echo "✅ Successfully checked that the Helm app versions are in-sync with the repo" 18 | -------------------------------------------------------------------------------- /pkg/uptime/uptime_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"). You may 4 | // not use this file except in compliance with the License. A copy of the 5 | // License is located at 6 | // 7 | // http://aws.amazon.com/apache2.0/ 8 | // 9 | // or in the "license" file accompanying this file. This file is distributed 10 | // on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | // express or implied. See the License for the specific language governing 12 | // permissions and limitations under the License. 
13 | 14 | // +build !darwin 15 | 16 | package uptime 17 | 18 | import ( 19 | "testing" 20 | 21 | h "github.com/aws/aws-node-termination-handler/pkg/test" 22 | ) 23 | 24 | func TestUptime(t *testing.T) { 25 | value, err := Uptime() 26 | h.Ok(t, err) 27 | h.Assert(t, value > 0, "Invalid system uptime") 28 | } 29 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM --platform=$BUILDPLATFORM golang:1.22 as builder 2 | 3 | ## GOLANG env 4 | ARG GOPROXY="https://proxy.golang.org|direct" 5 | ARG GO111MODULE="on" 6 | 7 | # Copy go.mod and download dependencies 8 | WORKDIR /node-termination-handler 9 | COPY go.mod . 10 | COPY go.sum . 11 | RUN go mod download 12 | 13 | ARG CGO_ENABLED=0 14 | ARG TARGETOS TARGETARCH 15 | ARG GOOS=$TARGETOS 16 | ARG GOARCH=$TARGETARCH 17 | 18 | # Build 19 | COPY . . 20 | RUN make build 21 | # In case the target is build for testing: 22 | # $ docker build --target=builder -t test . 23 | ENTRYPOINT ["/node-termination-handler/build/node-termination-handler"] 24 | 25 | # Build the final image with only the binary 26 | FROM amazonlinux:2 as amazonlinux 27 | FROM scratch 28 | WORKDIR / 29 | COPY --from=builder /node-termination-handler/build/node-termination-handler . 30 | COPY --from=amazonlinux /etc/ssl/certs/ca-bundle.crt /etc/ssl/certs/ 31 | COPY THIRD_PARTY_LICENSES.md . 
32 | USER 1000 33 | ENTRYPOINT ["/node-termination-handler"] 34 | -------------------------------------------------------------------------------- /test/shellcheck/run-shellcheck: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euo pipefail 4 | 5 | SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" 6 | BUILD_DIR="${SCRIPTPATH}/../../build" 7 | 8 | KERNEL=$(uname -s | tr '[:upper:]' '[:lower:]') 9 | SHELLCHECK_VERSION="0.7.1" 10 | 11 | function exit_and_fail() { 12 | echo "❌ Test Failed! Found a shell script with errors." 13 | exit 1 14 | } 15 | trap exit_and_fail INT ERR TERM 16 | 17 | curl -Lo ${BUILD_DIR}/shellcheck.tar.xz "https://github.com/koalaman/shellcheck/releases/download/v${SHELLCHECK_VERSION}/shellcheck-v${SHELLCHECK_VERSION}.${KERNEL}.x86_64.tar.xz" 18 | tar -C ${BUILD_DIR} -xvf "${BUILD_DIR}/shellcheck.tar.xz" 19 | export PATH="${BUILD_DIR}/shellcheck-v${SHELLCHECK_VERSION}:$PATH" 20 | 21 | shell_files=() 22 | while IFS='' read -r line; do 23 | shell_files+=("$line"); 24 | done < <(grep -Rnl --exclude-dir=build -e '#!.*/bin/bash' -e '#!.*/usr/bin/env bash' ${SCRIPTPATH}/../../) 25 | shellcheck -S warning "${shell_files[@]}" 26 | 27 | echo "✅ All shell scripts look good! 😎" -------------------------------------------------------------------------------- /config/helm/squid/templates/daemonset.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: {{ .Values.squid.label }} 5 | labels: 6 | app: {{ .Values.squid.label }} 7 | spec: 8 | selector: 9 | matchLabels: 10 | app: {{ .Values.squid.label }} 11 | template: 12 | metadata: 13 | labels: 14 | app: {{ .Values.squid.label }} 15 | spec: 16 | serviceAccountName: {{ template "squid.serviceAccountName" . 
}} 17 | containers: 18 | - name: {{ .Values.squid.label }} 19 | image: {{ .Values.squid.image.repository }}:{{ .Values.squid.image.tag }} 20 | imagePullPolicy: IfNotPresent 21 | ports: 22 | - containerPort: {{ .Values.squid.port }} 23 | hostPort: {{ .Values.squid.port }} 24 | volumeMounts: 25 | - name: squid-config 26 | mountPath: /etc/squid 27 | readOnly: true 28 | volumes: 29 | - name: squid-config 30 | configMap: 31 | name: {{ .Values.squid.configMap }} 32 | 33 | -------------------------------------------------------------------------------- /test/k8s-local-cluster-test/delete-cluster: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | USAGE=$(cat << 'EOM' 5 | Usage: delete-cluster [-c ] [-o] 6 | Deletes a kind cluster and context dir 7 | 8 | Example: delete-cluster -c build/tmp-cluster-1234 9 | 10 | Required: 11 | -c Cluster context directory 12 | 13 | Optional: 14 | -o Override path w/ your own kubectl and kind binaries 15 | EOM 16 | ) 17 | 18 | # Process our input arguments 19 | while getopts "c:o" opt; do 20 | case ${opt} in 21 | c ) # Cluster context directory 22 | TMP_DIR=$OPTARG 23 | CLUSTER_NAME=$(cat "$TMP_DIR/clustername") 24 | ;; 25 | o ) # Override path with your own kubectl and kind binaries 26 | export PATH=$PATH:$TMP_DIR 27 | ;; 28 | \? ) # Unknown option: print usage on stderr and fail 29 | echo "$USAGE" 1>&2 30 | exit 1 31 | ;; 32 | esac 33 | done 34 | 35 | echo "🥑 Deleting k8s cluster using \"kind\"" 36 | kind delete cluster --name "$CLUSTER_NAME" 37 | rm -r "$TMP_DIR" 38 | -------------------------------------------------------------------------------- /config/helm/aws-node-termination-handler/templates/clusterrole.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.rbac.create -}} 2 | kind: ClusterRole 3 | apiVersion: rbac.authorization.k8s.io/v1 4 | metadata: 5 | name: {{ include "aws-node-termination-handler.fullname" .
}} 6 | labels: 7 | {{- include "aws-node-termination-handler.labels" . | nindent 4 }} 8 | rules: 9 | - apiGroups: 10 | - "" 11 | resources: 12 | - nodes 13 | verbs: 14 | - get 15 | - list 16 | - patch 17 | - update 18 | - apiGroups: 19 | - "" 20 | resources: 21 | - pods 22 | verbs: 23 | - list 24 | - get 25 | - apiGroups: 26 | - "" 27 | resources: 28 | - pods/eviction 29 | verbs: 30 | - create 31 | - apiGroups: 32 | - extensions 33 | resources: 34 | - daemonsets 35 | verbs: 36 | - get 37 | - apiGroups: 38 | - apps 39 | resources: 40 | - daemonsets 41 | verbs: 42 | - get 43 | {{- if .Values.emitKubernetesEvents }} 44 | - apiGroups: 45 | - "" 46 | resources: 47 | - events 48 | verbs: 49 | - create 50 | - patch 51 | {{- end }} 52 | {{- end -}} 53 | -------------------------------------------------------------------------------- /Dockerfile.windows: -------------------------------------------------------------------------------- 1 | ARG WINDOWS_VERSION=1809 2 | 3 | # Build the manager binary 4 | FROM --platform=windows/amd64 golang:1.22 as builder 5 | 6 | ## GOLANG env 7 | ENV GO111MODULE="on" CGO_ENABLED="0" GOOS="windows" GOARCH="amd64" 8 | ARG GOPROXY="https://proxy.golang.org|direct" 9 | 10 | # Copy go.mod and download dependencies 11 | WORKDIR /node-termination-handler 12 | COPY go.mod . 13 | COPY go.sum . 14 | RUN go mod download -x 15 | 16 | # Build 17 | COPY . . 18 | RUN go build -ldflags="-s" -a -tags nth${GOOS} -o build/node-termination-handler cmd/node-termination-handler.go 19 | 20 | # In case the target is build for testing: 21 | # $ docker build --target=builder -t test . 22 | ENTRYPOINT ["/node-termination-handler/build/node-termination-handler"] 23 | 24 | # Copy the controller-manager into a thin image 25 | FROM mcr.microsoft.com/windows/nanoserver:${WINDOWS_VERSION} 26 | WORKDIR / 27 | COPY --from=builder /windows/system32/netapi32.dll /windows/system32/ 28 | COPY --from=builder /node-termination-handler/build/node-termination-handler . 
29 | COPY THIRD_PARTY_LICENSES.md . 30 | ENTRYPOINT ["/node-termination-handler"] 31 | 32 | -------------------------------------------------------------------------------- /config/helm/aws-node-termination-handler/templates/podmonitor.yaml: -------------------------------------------------------------------------------- 1 | {{- if and (not .Values.enableSqsTerminationDraining) (and .Values.enablePrometheusServer .Values.podMonitor.create) -}} 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: PodMonitor 4 | metadata: 5 | name: {{ template "aws-node-termination-handler.fullname" . }} 6 | namespace: {{ default .Release.Namespace .Values.podMonitor.namespace }} 7 | labels: 8 | {{- include "aws-node-termination-handler.labels" . | nindent 4 }} 9 | {{- with .Values.podMonitor.labels }} 10 | {{- toYaml . | nindent 4 }} 11 | {{- end }} 12 | spec: 13 | jobLabel: app.kubernetes.io/name 14 | namespaceSelector: 15 | matchNames: 16 | - {{ .Release.Namespace }} 17 | podMetricsEndpoints: 18 | - port: http-metrics 19 | path: /metrics 20 | {{- with .Values.podMonitor.interval }} 21 | interval: {{ . }} 22 | {{- end }} 23 | {{- with .Values.podMonitor.sampleLimit }} 24 | sampleLimit: {{ . }} 25 | {{- end }} 26 | selector: 27 | matchLabels: 28 | {{- include "aws-node-termination-handler.selectorLabelsDaemonset" . | nindent 6 }} 29 | {{- end -}} 30 | -------------------------------------------------------------------------------- /config/helm/aws-node-termination-handler/templates/servicemonitor.yaml: -------------------------------------------------------------------------------- 1 | {{- if and .Values.enableSqsTerminationDraining (and .Values.enablePrometheusServer .Values.serviceMonitor.create) -}} 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: ServiceMonitor 4 | metadata: 5 | name: {{ include "aws-node-termination-handler.fullname" . 
}} 6 | namespace: {{ default .Release.Namespace .Values.serviceMonitor.namespace }} 7 | labels: 8 | {{- include "aws-node-termination-handler.labels" . | nindent 4 }} 9 | {{- with .Values.serviceMonitor.labels }} 10 | {{- toYaml . | nindent 4 }} 11 | {{- end }} 12 | spec: 13 | jobLabel: app.kubernetes.io/name 14 | namespaceSelector: 15 | matchNames: 16 | - {{ .Release.Namespace }} 17 | endpoints: 18 | - port: http-metrics 19 | path: /metrics 20 | {{- with .Values.serviceMonitor.interval }} 21 | interval: {{ . }} 22 | {{- end }} 23 | {{- with .Values.serviceMonitor.sampleLimit }} 24 | sampleLimit: {{ . }} 25 | {{- end }} 26 | selector: 27 | matchLabels: 28 | {{- include "aws-node-termination-handler.selectorLabelsDeployment" . | nindent 6 }} 29 | {{- end -}} 30 | -------------------------------------------------------------------------------- /pkg/uptime/uptime_windows.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"). You may 4 | // not use this file except in compliance with the License. A copy of the 5 | // License is located at 6 | // 7 | // http://aws.amazon.com/apache2.0/ 8 | // 9 | // or in the "license" file accompanying this file. This file is distributed 10 | // on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | // express or implied. See the License for the specific language governing 12 | // permissions and limitations under the License. 13 | 14 | package uptime 15 | 16 | import ( 17 | "syscall" 18 | "time" 19 | 20 | "golang.org/x/sys/windows" 21 | ) 22 | 23 | var ( 24 | kernel32 = windows.NewLazySystemDLL("kernel32.dll") 25 | getTickCount = kernel32.NewProc("GetTickCount") 26 | ) 27 | 28 | // Uptime returns the number of seconds since last system boot. 
29 | func Uptime() (int64, error) { 30 | millis, _, err := syscall.Syscall(getTickCount.Addr(), 0, 0, 0, 0) 31 | if err != 0 { 32 | return 0, err 33 | } 34 | uptime := (time.Duration(millis) * time.Millisecond).Seconds() 35 | return int64(uptime), nil 36 | } 37 | -------------------------------------------------------------------------------- /pkg/logging/routing.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"). You may 4 | // not use this file except in compliance with the License. A copy of the 5 | // License is located at 6 | // 7 | // http://aws.amazon.com/apache2.0/ 8 | // 9 | // or in the "license" file accompanying this file. This file is distributed 10 | // on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | // express or implied. See the License for the specific language governing 12 | // permissions and limitations under the License. 13 | 14 | package logging 15 | 16 | import ( 17 | "io" 18 | 19 | "github.com/rs/zerolog" 20 | ) 21 | 22 | // RoutingLevelWriter writes data to one of two locations based on an 23 | // associated level value. 24 | type RoutingLevelWriter struct { 25 | io.Writer 26 | ErrWriter io.Writer 27 | } 28 | 29 | // WriteLevel if *l* is warning or higher then *b* is written to the error 30 | // location, otherwise it is written to the default location. 
31 | func (r RoutingLevelWriter) WriteLevel(l zerolog.Level, b []byte) (int, error) { 32 | if l < zerolog.WarnLevel { 33 | return r.Write(b) 34 | } 35 | return r.ErrWriter.Write(b) 36 | } 37 | -------------------------------------------------------------------------------- /pkg/observability/probes.go: -------------------------------------------------------------------------------- 1 | package observability 2 | 3 | import ( 4 | "net" 5 | "net/http" 6 | "strconv" 7 | "time" 8 | 9 | "github.com/rs/zerolog/log" 10 | ) 11 | 12 | // InitProbes will initialize, register and expose, via http server, the probes. 13 | func InitProbes(enabled bool, port int, endpoint string) error { 14 | if !enabled { 15 | return nil 16 | } 17 | 18 | http.HandleFunc(endpoint, livenessHandler) 19 | 20 | probes := &http.Server{ 21 | Addr: net.JoinHostPort("", strconv.Itoa(port)), 22 | ReadTimeout: 1 * time.Second, 23 | WriteTimeout: 1 * time.Second, 24 | } 25 | 26 | // Starts HTTP server exposing the probes path 27 | go func() { 28 | log.Info().Msgf("Starting to serve handler %s, port %d", endpoint, port) 29 | if err := probes.ListenAndServe(); err != nil && err != http.ErrServerClosed { 30 | log.Err(err).Msg("Failed to listen and serve http server") 31 | } 32 | }() 33 | 34 | return nil 35 | } 36 | // livenessHandler answers every request with HTTP 200 and the JSON body {"health":"OK"}. 37 | func livenessHandler(w http.ResponseWriter, r *http.Request) { 38 | w.Header().Add("Content-Type", "application/json") 39 | w.WriteHeader(http.StatusOK) 40 | _, err := w.Write([]byte(`{"health":"OK"}`)) 41 | if err != nil { 42 | // The 200 status line has already been sent; a second WriteHeader would be ignored, so only log. 43 | log.Warn().Err(err).Msg("Unable to write health response") 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /scripts/install-amazon-ecr-credential-helper: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -euo pipefail 4 | 5 | usage=$(cat << EOM 6 | Download and install amazon-ecr-credential-helper for Docker
client. 7 | 8 | usage: $(basename $0) [-h] VERSION 9 | 10 | Options: 11 | -h Print help message then exit 12 | 13 | Arguments: 14 | VERSION Version number of amazon-ecr-login-helper to download and install (e.g. 0.7.1) 15 | 16 | EOM 17 | ) 18 | 19 | function display_help { 20 | echo "${usage}" 1<&2 21 | } 22 | 23 | while getopts "h" arg; do 24 | case "${arg}" in 25 | h ) display_help 26 | exit 0 27 | ;; 28 | 29 | * ) display_help 30 | exit 1 31 | ;; 32 | esac 33 | done 34 | shift $((OPTIND-1)) 35 | 36 | version="${1:-}" 37 | if [[ -z "${version}" ]]; then 38 | echo "❌ no version given" 39 | display_help 40 | exit 1 41 | fi 42 | 43 | install_path="$(dirname "$(which docker-credential-wincred.exe)")" 44 | curl -Lo "${install_path}/docker-credential-ecr-login.exe" "https://amazon-ecr-credential-helper-releases.s3.us-east-2.amazonaws.com/${version}/windows-amd64/docker-credential-ecr-login.exe" 45 | 46 | # Update Docker to use ecr-login instead of wincred. 47 | modified_config="$(mktemp)" 48 | jq '.credsStore="ecr-login"' ~/.docker/config.json > "${modified_config}" 49 | mv -f "${modified_config}" ~/.docker/config.json 50 | -------------------------------------------------------------------------------- /scripts/sync-readme-to-ecr-public: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" 5 | REPO_NAME="aws-node-termination-handler" 6 | #about and usage section char max 7 | MAX_CHAR_COUNT=10240 8 | USAGE_TEXT="See About section" 9 | ADDITIONAL_MSG="... 10 | 11 | **truncated due to char limits**... 
12 | A complete version of the ReadMe can be found [here](https://github.com/aws/aws-node-termination-handler#aws-node-termination-handler)\"" 13 | 14 | 15 | if git --no-pager diff --name-only HEAD^ HEAD | grep 'README.md'; then 16 | #converting to json to insert esc chars, then replace newlines for proper markdown render 17 | raw_about=$(jq -n --arg msg "$(<$SCRIPTPATH/../README.md)" '{"usageText": $msg}' | jq '.usageText' | sed 's/\\n/\ 18 | /g') 19 | char_to_trunc="$(($MAX_CHAR_COUNT-${#ADDITIONAL_MSG}))" 20 | raw_truncated="${raw_about:0:$char_to_trunc}" 21 | raw_truncated+="$ADDITIONAL_MSG" 22 | # The CLI prints JSON on stdout, so judge success by its exit status; testing it 23 | # inside "if" also keeps "set -e" from aborting before the failure message. 24 | if ! aws ecr-public put-repository-catalog-data --repository-name="${REPO_NAME}" --catalog-data aboutText="${raw_truncated}",usageText="${USAGE_TEXT}" --region us-east-1 > /dev/null; then 25 | echo "README sync to ecr-public failed" 26 | exit 1 27 | else 28 | echo "README sync to ecr-public succeeded!" 29 | fi 30 | else 31 | echo "README.md did not change in the last commit. Not taking any action." 32 | fi -------------------------------------------------------------------------------- /pkg/uptime/common.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"). You may 4 | // not use this file except in compliance with the License. A copy of the 5 | // License is located at 6 | // 7 | // http://aws.amazon.com/apache2.0/ 8 | // 9 | // or in the "license" file accompanying this file. This file is distributed 10 | // on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | // express or implied. See the License for the specific language governing 12 | // permissions and limitations under the License.
13 | 14 | package uptime 15 | 16 | import ( 17 | "fmt" 18 | "os" 19 | "strconv" 20 | "strings" 21 | ) 22 | 23 | // UptimeFuncType cleans up function arguments or return type. 24 | type UptimeFuncType func() (int64, error) 25 | 26 | // UptimeFromFile reads system uptime information from filepath and returns 27 | // the number of seconds since last system boot. 28 | func UptimeFromFile(filepath string) (int64, error) { 29 | data, err := os.ReadFile(filepath) 30 | if err != nil { 31 | return 0, fmt.Errorf("Not able to read %s: %w", filepath, err) 32 | } 33 | 34 | uptime, err := strconv.ParseFloat(strings.Split(string(data), " ")[0], 64) 35 | if err != nil { 36 | return 0, fmt.Errorf("Not able to parse %s to int64: %w", filepath, err) 37 | } 38 | return int64(uptime), nil 39 | } 40 | -------------------------------------------------------------------------------- /test/k8s-compatibility-test/run-k8s-compatibility-test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" 5 | versions=("1.29" "1.30" "1.31" "1.32") 6 | E_CODE=0 7 | AFTER_FIRST_RUN_ARGS="" 8 | PASS_THRU_ARGS="" 9 | 10 | USAGE=$(cat << 'EOM' 11 | Usage: run-k8s-compatability-test [-h] 12 | Executes the spot termination integration test for each version of kubernetes (k8s 1.29 - 1.32 supported) 13 | 14 | Examples: 15 | # run test with direct download of go modules 16 | run-k8s-compatability-test -p "-d" 17 | 18 | Optional: 19 | -p Pass thru arguments to run-spot-termination-test.sh 20 | -h Display help 21 | EOM 22 | ) 23 | 24 | # Process our input arguments 25 | while getopts "p:" opt; do 26 | case ${opt} in 27 | p ) # PASS THRU ARGS 28 | PASS_THRU_ARGS="$OPTARG" 29 | ;; 30 | \? 
) 31 | echo "$USAGE" 1>&2 32 | exit 33 | ;; 34 | esac 35 | done 36 | 37 | for i in "${!versions[@]}"; do 38 | version=${versions[$i]} 39 | if $SCRIPTPATH/../k8s-local-cluster-test/run-test -b "test-${version//./-}" -v $version $PASS_THRU_ARGS $AFTER_FIRST_RUN_ARGS; then 40 | # run-test is the "if" condition so a failure is recorded here instead of "set -e" aborting the loop 41 | echo "✅ Passed test for K8s version $version" 42 | else 43 | echo "❌ Failed test for K8s version $version" 44 | E_CODE=1 45 | fi 46 | AFTER_FIRST_RUN_ARGS="-n node-termination-handler:customtest -w webhook-test-proxy:customtest" 47 | done 48 | 49 | exit $E_CODE 50 | -------------------------------------------------------------------------------- /test/eks-cluster-test/reset-cluster: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | function reset_cluster { 4 | echo "Resetting cluster" 5 | charts=$(helm ls --all --short) 6 | if [[ ! -z "$charts" ]]; then 7 | helm del $charts || : 8 | fi 9 | system_charts=$(helm ls --all --short --namespace kube-system) 10 | if [[ !
-z "$system_charts" ]]; then 11 | helm del $system_charts --namespace kube-system || : 12 | fi 13 | for node in $(kubectl get nodes | tail -n+2 | cut -d' ' -f1); do 14 | kubectl uncordon $node 15 | kubectl taint node $node aws-node-termination-handler/scheduled-maintenance- || true 16 | kubectl taint node $node aws-node-termination-handler/spot-itn- || true 17 | done 18 | remove_labels || : 19 | sleep 2 20 | } 21 | 22 | function remove_labels { 23 | echo "Removing labels from NTH cluster nodes" 24 | 25 | labels_to_remove=() 26 | while IFS='' read -r line; do 27 | labels_to_remove+=("$line"); 28 | done < <(kubectl get nodes -o json | jq '.items[].metadata.labels' | grep 'aws-node-termination-handler' | tr -d '[:blank:]' | tr -d '\"' | cut -d':' -f1) 29 | 30 | if [[ "${#labels_to_remove[@]}" -ne 0 ]]; then 31 | for l in "${labels_to_remove[@]}"; do 32 | for n in $(kubectl get nodes -o json | jq -r '.items[].metadata.name'); do 33 | echo "Deleting label $l on node $n" 34 | kubectl label node $n "$l"- 35 | done 36 | done 37 | fi 38 | } 39 | 40 | reset_cluster 41 | echo "✅ reset done" 42 | -------------------------------------------------------------------------------- /config/helm/webhook-test-proxy/templates/regular-pod-test.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.regularPodTest.create -}} 2 | {{- $isWindows := (contains "windows" .Values.targetNodeOs) -}} 3 | {{- $osSelector := printf "kubernetes.io/os" -}} 4 | apiVersion: apps/v1 5 | kind: Deployment 6 | metadata: 7 | name: {{ .Values.regularPodTest.label }} 8 | labels: 9 | app: {{ .Values.regularPodTest.label }} 10 | spec: 11 | selector: 12 | matchLabels: 13 | app: {{ .Values.regularPodTest.label }} 14 | template: 15 | metadata: 16 | labels: 17 | app: {{ .Values.regularPodTest.label }} 18 | spec: 19 | nodeSelector: 20 | {{ $osSelector }}: {{ $isWindows | ternary "windows" "linux" }} 21 | {{- with .Values.nodeSelector }} 22 | {{- toYaml . 
| nindent 8 }} 23 | {{- end }} 24 | {{- with .Values.linuxNodeSelector }} 25 | {{- toYaml . | nindent 8 }} 26 | {{- end }} 27 | {{- if (not $isWindows) }} 28 | serviceAccountName: {{ template "webhook-test-proxy.serviceAccountName" . }} 29 | securityContext: 30 | runAsUser: 1000 31 | runAsGroup: 3000 32 | fsGroup: 2000 33 | {{- end }} 34 | containers: 35 | - name: {{ .Values.regularPodTest.label }} 36 | image: {{ .Values.webhookTestProxy.image.repository }}:{{ .Values.webhookTestProxy.image.tag }} 37 | imagePullPolicy: {{ .Values.webhookTestProxy.image.pullPolicy }} 38 | env: 39 | - name: PORT 40 | value: {{ .Values.regularPodTest.port | quote }} 41 | {{- end -}} 42 | -------------------------------------------------------------------------------- /pkg/uptime/common_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"). You may 4 | // not use this file except in compliance with the License. A copy of the 5 | // License is located at 6 | // 7 | // http://aws.amazon.com/apache2.0/ 8 | // 9 | // or in the "license" file accompanying this file. This file is distributed 10 | // on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | // express or implied. See the License for the specific language governing 12 | // permissions and limitations under the License. 
13 | 14 | package uptime 15 | 16 | import ( 17 | "os" 18 | "testing" 19 | 20 | h "github.com/aws/aws-node-termination-handler/pkg/test" 21 | ) 22 | 23 | const testFile = "test.out" 24 | 25 | func TestUptimeFromFileSuccess(t *testing.T) { 26 | d1 := []byte("350735.47 234388.90") 27 | err := os.WriteFile(testFile, d1, 0644) 28 | h.Ok(t, err) 29 | 30 | value, err := UptimeFromFile(testFile) 31 | os.Remove(testFile) 32 | h.Ok(t, err) 33 | h.Equals(t, int64(350735), value) 34 | } 35 | 36 | func TestUptimeFromFileReadFail(t *testing.T) { 37 | _, err := UptimeFromFile("does-not-exist") 38 | h.Assert(t, err != nil, "Failed to return error when ReadFile failed") 39 | } 40 | 41 | func TestUptimeFromFileBadData(t *testing.T) { 42 | d1 := []byte("Something not time") 43 | err := os.WriteFile(testFile, d1, 0644) 44 | h.Ok(t, err) 45 | 46 | _, err = UptimeFromFile(testFile) 47 | os.Remove(testFile) 48 | h.Assert(t, err != nil, "Failed to return error for int64 parse") 49 | } 50 | -------------------------------------------------------------------------------- /test/webhook-test-proxy/cmd/webhook-test-proxy.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"). You may 4 | // not use this file except in compliance with the License. A copy of the 5 | // License is located at 6 | // 7 | // http://aws.amazon.com/apache2.0/ 8 | // 9 | // or in the "license" file accompanying this file. This file is distributed 10 | // on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | // express or implied. See the License for the specific language governing 12 | // permissions and limitations under the License. 
13 | 14 | package main 15 | 16 | import ( 17 | "log" 18 | "net/http" 19 | "os" 20 | ) 21 | 22 | // Get env var or default 23 | func getEnv(key, fallback string) string { 24 | if value, ok := os.LookupEnv(key); ok { 25 | return value 26 | } 27 | return fallback 28 | } 29 | 30 | // Get the port to listen on 31 | func getListenAddress() string { 32 | port := getEnv("PORT", "1338") 33 | return ":" + port 34 | } 35 | 36 | func handleRequest(res http.ResponseWriter, req *http.Request) { 37 | log.Println("GOT REQUEST: ", req.URL.Path) 38 | // support webhook test 39 | if req.Method == http.MethodPost { 40 | res.WriteHeader(http.StatusOK) 41 | return 42 | } 43 | res.WriteHeader(http.StatusBadRequest) 44 | } 45 | 46 | func main() { 47 | log.Println("The webhook-test-proxy started on port ", getListenAddress()) 48 | // start server 49 | http.HandleFunc("/", handleRequest) 50 | if err := http.ListenAndServe(getListenAddress(), nil); err != nil { 51 | panic(err) 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /config/helm/webhook-test-proxy/templates/daemonset.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.webhookTestProxy.create -}} 2 | {{- $isWindows := (contains "windows" .Values.targetNodeOs) -}} 3 | {{- $osSelector := printf "kubernetes.io/os" -}} 4 | apiVersion: apps/v1 5 | kind: DaemonSet 6 | metadata: 7 | name: {{ .Values.webhookTestProxy.label }} 8 | labels: 9 | app: {{ .Values.webhookTestProxy.label }} 10 | spec: 11 | selector: 12 | matchLabels: 13 | app: {{ .Values.webhookTestProxy.label }} 14 | template: 15 | metadata: 16 | labels: 17 | app: {{ .Values.webhookTestProxy.label }} 18 | spec: 19 | serviceAccountName: {{ template "webhook-test-proxy.serviceAccountName" . 
}} 20 | {{- if (not $isWindows) }} 21 | securityContext: 22 | runAsUser: 1000 23 | runAsGroup: 3000 24 | fsGroup: 2000 25 | {{- end }} 26 | nodeSelector: 27 | {{ $osSelector }}: {{ $isWindows | ternary "windows" "linux" }} 28 | containers: 29 | - name: {{ .Values.webhookTestProxy.label }} 30 | image: {{ .Values.webhookTestProxy.image.repository }}:{{ .Values.webhookTestProxy.image.tag }} 31 | imagePullPolicy: {{ .Values.webhookTestProxy.image.pullPolicy }} 32 | ports: 33 | - containerPort: {{ .Values.webhookTestProxy.containerPort }} 34 | env: 35 | - name: PORT 36 | value: {{ .Values.webhookTestProxy.containerPort | quote }} 37 | {{- if .Values.webhookTestProxy.tolerations }} 38 | tolerations: 39 | {{ toYaml .Values.webhookTestProxy.tolerations | indent 8 }} 40 | {{- end }} 41 | {{- end -}} 42 | -------------------------------------------------------------------------------- /pkg/monitor/sqsevent/event-bridge.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"). You may 4 | // not use this file except in compliance with the License. A copy of the 5 | // License is located at 6 | // 7 | // http://aws.amazon.com/apache2.0/ 8 | // 9 | // or in the "license" file accompanying this file. This file is distributed 10 | // on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | // express or implied. See the License for the specific language governing 12 | // permissions and limitations under the License. 
13 | 14 | package sqsevent 15 | 16 | import ( 17 | "encoding/json" 18 | "time" 19 | 20 | "github.com/rs/zerolog/log" 21 | ) 22 | 23 | // EventBridgeEvent is a structure to hold generic event details from Amazon EventBridge 24 | type EventBridgeEvent struct { 25 | Version string `json:"version"` 26 | ID string `json:"id"` 27 | DetailType string `json:"detail-type"` 28 | Source string `json:"source"` 29 | Account string `json:"account"` 30 | Time string `json:"time"` 31 | Region string `json:"region"` 32 | Resources []string `json:"resources"` 33 | Detail json.RawMessage `json:"detail"` 34 | } 35 | // getTime parses the event's Time field as RFC3339; on a parse failure it logs a warning (including the parse error) and returns the current time. 36 | func (e EventBridgeEvent) getTime() time.Time { 37 | terminationTime, err := time.Parse(time.RFC3339, e.Time) 38 | if err != nil { 39 | log.Warn().Err(err).Msgf("Unable to parse time as RFC3339 from event %s (%s), using current time instead.", e.DetailType, e.ID) 40 | return time.Now() 41 | } 42 | return terminationTime 43 | } 44 | -------------------------------------------------------------------------------- /config/helm/squid/templates/psp.yaml: -------------------------------------------------------------------------------- 1 | {{- if and (.Values.rbac.pspEnabled) (semverCompare "<1.25-0" .Capabilities.KubeVersion.GitVersion) }} 2 | apiVersion: policy/v1beta1 3 | kind: PodSecurityPolicy 4 | metadata: 5 | name: {{ template "squid.fullname" . }} 6 | labels: 7 | {{ include "squid.labels" .
| indent 4 }} 8 | annotations: 9 | seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*' 10 | spec: 11 | privileged: false 12 | hostIPC: false 13 | hostNetwork: true 14 | hostPorts: 15 | - min: 1024 16 | max: 65535 17 | hostPID: false 18 | readOnlyRootFilesystem: false 19 | allowPrivilegeEscalation: false 20 | allowedCapabilities: 21 | - '*' 22 | fsGroup: 23 | rule: RunAsAny 24 | runAsUser: 25 | rule: RunAsAny 26 | seLinux: 27 | rule: RunAsAny 28 | supplementalGroups: 29 | rule: RunAsAny 30 | volumes: 31 | - '*' 32 | --- 33 | kind: ClusterRole 34 | apiVersion: rbac.authorization.k8s.io/v1 35 | metadata: 36 | name: {{ template "squid.fullname" . }}-psp 37 | labels: 38 | {{ include "squid.labels" . | indent 4 }} 39 | rules: 40 | - apiGroups: ['policy'] 41 | resources: ['podsecuritypolicies'] 42 | verbs: ['use'] 43 | resourceNames: 44 | - {{ template "squid.fullname" . }} 45 | --- 46 | apiVersion: rbac.authorization.k8s.io/v1 47 | kind: RoleBinding 48 | metadata: 49 | name: {{ template "squid.fullname" . }}-psp 50 | labels: 51 | {{ include "squid.labels" . | indent 4 }} 52 | roleRef: 53 | apiGroup: rbac.authorization.k8s.io 54 | kind: ClusterRole 55 | name: {{ template "squid.fullname" . }}-psp 56 | subjects: 57 | - kind: ServiceAccount 58 | name: {{ template "squid.serviceAccountName" . }} 59 | namespace: {{ .Release.Namespace }} 60 | {{- end }} 61 | -------------------------------------------------------------------------------- /pkg/monitor/sqsevent/sqs-retryer.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"). You may 4 | // not use this file except in compliance with the License. A copy of the 5 | // License is located at 6 | // 7 | // http://aws.amazon.com/apache2.0/ 8 | // 9 | // or in the "license" file accompanying this file. 
This file is distributed 10 | // on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | // express or implied. See the License for the specific language governing 12 | // permissions and limitations under the License. 13 | 14 | package sqsevent 15 | 16 | import ( 17 | "strings" 18 | "time" 19 | 20 | "github.com/aws/aws-sdk-go/aws" 21 | "github.com/aws/aws-sdk-go/aws/client" 22 | "github.com/aws/aws-sdk-go/aws/request" 23 | "github.com/aws/aws-sdk-go/aws/session" 24 | "github.com/aws/aws-sdk-go/service/sqs" 25 | ) 26 | 27 | type SqsRetryer struct { 28 | client.DefaultRetryer 29 | } 30 | 31 | func (r SqsRetryer) ShouldRetry(req *request.Request) bool { 32 | return r.DefaultRetryer.ShouldRetry(req) || 33 | (req.Error != nil && strings.Contains(req.Error.Error(), "connection reset")) 34 | } 35 | 36 | func GetSqsClient(sess *session.Session) *sqs.SQS { 37 | return sqs.New(sess, &aws.Config{ 38 | Retryer: SqsRetryer{ 39 | DefaultRetryer: client.DefaultRetryer{ 40 | // Monitor continuously monitors SQS for events every 2 seconds 41 | NumMaxRetries: client.DefaultRetryerMaxNumRetries, 42 | MinRetryDelay: client.DefaultRetryerMinRetryDelay, 43 | MaxRetryDelay: 1200 * time.Millisecond, 44 | MinThrottleDelay: client.DefaultRetryerMinThrottleDelay, 45 | MaxThrottleDelay: 1200 * time.Millisecond, 46 | }, 47 | }, 48 | }) 49 | } 50 | -------------------------------------------------------------------------------- /config/helm/localstack/templates/psp.yaml: -------------------------------------------------------------------------------- 1 | {{- if and (.Values.rbac.pspEnabled) (semverCompare "<1.25-0" .Capabilities.KubeVersion.GitVersion) }} 2 | apiVersion: policy/v1beta1 3 | kind: PodSecurityPolicy 4 | metadata: 5 | name: {{ template "localstack.fullname" . }} 6 | labels: 7 | {{ include "localstack.labels" . 
| indent 4 }} 8 | annotations: 9 | seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*' 10 | spec: 11 | privileged: false 12 | hostIPC: false 13 | hostNetwork: false 14 | hostPorts: 15 | - min: 1024 16 | max: 65535 17 | hostPID: false 18 | readOnlyRootFilesystem: false 19 | allowPrivilegeEscalation: false 20 | allowedCapabilities: 21 | - '*' 22 | fsGroup: 23 | rule: RunAsAny 24 | runAsUser: 25 | rule: RunAsAny 26 | seLinux: 27 | rule: RunAsAny 28 | supplementalGroups: 29 | rule: RunAsAny 30 | volumes: 31 | - '*' 32 | --- 33 | kind: ClusterRole 34 | apiVersion: rbac.authorization.k8s.io/v1 35 | metadata: 36 | name: {{ template "localstack.fullname" . }}-psp 37 | labels: 38 | {{ include "localstack.labels" . | indent 4 }} 39 | rules: 40 | - apiGroups: ['policy'] 41 | resources: ['podsecuritypolicies'] 42 | verbs: ['use'] 43 | resourceNames: 44 | - {{ template "localstack.fullname" . }} 45 | --- 46 | apiVersion: rbac.authorization.k8s.io/v1 47 | kind: RoleBinding 48 | metadata: 49 | name: {{ template "localstack.fullname" . }}-psp 50 | labels: 51 | {{ include "localstack.labels" . | indent 4 }} 52 | roleRef: 53 | apiGroup: rbac.authorization.k8s.io 54 | kind: ClusterRole 55 | name: {{ template "localstack.fullname" . }}-psp 56 | subjects: 57 | - kind: ServiceAccount 58 | name: {{ template "localstack.serviceAccountName" . 
}} 59 | namespace: {{ .Release.Namespace }} 60 | {{- end }} 61 | -------------------------------------------------------------------------------- /test/k8s-local-cluster-test/psp-default.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: policy/v1beta1 2 | kind: PodSecurityPolicy 3 | metadata: 4 | annotations: 5 | seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*' 6 | name: default 7 | spec: 8 | allowedCapabilities: [] # default set of capabilities are implicitly allowed 9 | allowPrivilegeEscalation: false 10 | fsGroup: 11 | rule: 'MustRunAs' 12 | ranges: 13 | # Forbid adding the root group. 14 | - min: 1 15 | max: 65535 16 | hostIPC: false 17 | hostNetwork: false 18 | hostPID: false 19 | privileged: false 20 | readOnlyRootFilesystem: false 21 | runAsUser: 22 | rule: 'MustRunAsNonRoot' 23 | seLinux: 24 | rule: 'RunAsAny' 25 | supplementalGroups: 26 | rule: 'RunAsAny' 27 | ranges: 28 | # Forbid adding the root group. 29 | - min: 1 30 | max: 65535 31 | volumes: 32 | - 'configMap' 33 | - 'downwardAPI' 34 | - 'emptyDir' 35 | - 'persistentVolumeClaim' 36 | - 'projected' 37 | - 'secret' 38 | 39 | --- 40 | 41 | # Cluster role which grants access to the default pod security policy 42 | apiVersion: rbac.authorization.k8s.io/v1 43 | kind: ClusterRole 44 | metadata: 45 | name: default-psp 46 | rules: 47 | - apiGroups: 48 | - policy 49 | resourceNames: 50 | - default 51 | resources: 52 | - podsecuritypolicies 53 | verbs: 54 | - use 55 | 56 | --- 57 | 58 | # Cluster role binding for default pod security policy granting all authenticated users access 59 | apiVersion: rbac.authorization.k8s.io/v1 60 | kind: ClusterRoleBinding 61 | metadata: 62 | name: default-psp 63 | roleRef: 64 | apiGroup: rbac.authorization.k8s.io 65 | kind: ClusterRole 66 | name: default-psp 67 | subjects: 68 | - apiGroup: rbac.authorization.k8s.io 69 | kind: Group 70 | name: system:authenticated 71 | 
-------------------------------------------------------------------------------- /scripts/push-helm-chart: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | SCRIPTPATH="$( cd "$(dirname "$0")";pwd -P )" 5 | 6 | REPO_ROOT_PATH=$SCRIPTPATH/../ 7 | MAKE_FILE_PATH=$REPO_ROOT_PATH/Makefile 8 | CHART_VERSION=$(make -s -f $MAKE_FILE_PATH chart-version) 9 | HELM_CHART_PATH=$REPO_ROOT_PATH/config/helm/aws-node-termination-handler 10 | 11 | USAGE=$(cat << 'EOM' 12 | Usage: push-helm-chart 13 | Pushes helm charts 14 | Optional: 15 | -h HELM CHART REGISTRY: set the helm chart registry 16 | -v CHART VERSION: The chart version [DEFAULT: output of `make chart-version`] 17 | -r HELM CHART REPOSITORY: Set the helm chart repository 18 | EOM 19 | ) 20 | 21 | # Process our input arguments 22 | while getopts "r:v:h:" opt; do 23 | case ${opt} in 24 | r ) # Helm Chart Repository 25 | HELM_CHART_REPOSITORY="$OPTARG" 26 | ;; 27 | v ) # Chart Version 28 | CHART_VERSION="$OPTARG" 29 | ;; 30 | h ) # Helm Chart Registry 31 | ECR_REGISTRY="$OPTARG" 32 | ;; 33 | \? ) # Unknown option or missing option argument: print usage and fail 34 | echo "$USAGE" 1>&2 35 | exit 1 36 | ;; 37 | esac 38 | done 39 | 40 | CHART_EXISTS=$(aws ecr-public describe-images --repository-name "helm/$HELM_CHART_REPOSITORY" --region us-east-1 --query "imageDetails[?contains(imageTags, '$CHART_VERSION')].imageTags[]" --output text) 41 | 42 | if [[ -n "$CHART_EXISTS" ]]; then 43 | echo "chart with version $CHART_VERSION already exists in the repository, skipping pushing of chart..." 44 | exit 0 45 | fi 46 | 47 | echo "chart with version $CHART_VERSION not found in repository, pushing new chart..."
48 | #Package the chart 49 | helm package $HELM_CHART_PATH --destination $REPO_ROOT_PATH/build 50 | #Pushing helm chart 51 | helm push $REPO_ROOT_PATH/build/$HELM_CHART_REPOSITORY-$CHART_VERSION.tgz oci://$ECR_REGISTRY/helm 52 | -------------------------------------------------------------------------------- /config/helm/localstack/templates/deployment.yaml: -------------------------------------------------------------------------------- 1 | {{- $osSelector := printf "kubernetes.io/os" -}} 2 | apiVersion: apps/v1 3 | kind: Deployment 4 | metadata: 5 | name: {{ .Values.label }} 6 | labels: 7 | app: {{ .Values.label }} 8 | spec: 9 | replicas: 1 10 | selector: 11 | matchLabels: 12 | app: {{ .Values.label }} 13 | template: 14 | metadata: 15 | labels: 16 | app: {{ .Values.label }} 17 | spec: 18 | serviceAccountName: {{ template "localstack.serviceAccountName" . }} 19 | securityContext: 20 | runAsUser: 1000 21 | runAsGroup: 3000 22 | fsGroup: 2000 23 | nodeSelector: 24 | {{ $osSelector }}: "linux" 25 | {{- with .Values.nodeSelector }} 26 | {{- toYaml . | nindent 8 }} 27 | {{- end }} 28 | containers: 29 | - name: {{ .Values.label }} 30 | image: {{ .Values.image.repository }}:{{ .Values.image.tag }} 31 | imagePullPolicy: {{ .Values.image.pullPolicy }} 32 | ports: 33 | - containerPort: {{ .Values.containerPort }} 34 | livenessProbe: 35 | exec: 36 | command: 37 | - curl 38 | - http://localhost:{{ .Values.containerPort }}/health 39 | initialDelaySeconds: 15 40 | periodSeconds: 15 41 | env: 42 | - name: LOCALSTACK_EDGE_PORT 43 | value: {{ .Values.containerPort | quote }} 44 | - name: LOCALSTACK_SERVICES 45 | value: {{ .Values.services | quote }} 46 | - name: LOCALSTACK_DEFAULT_REGION 47 | value: {{ .Values.defaultRegion | quote }} 48 | - name: LOCALSTACK_START_WEB 49 | value: "0" 50 | {{- with .Values.tolerations }} 51 | tolerations: 52 | {{- toYaml . 
| nindent 8 }} 53 | {{- end }} 54 | -------------------------------------------------------------------------------- /config/helm/webhook-test-proxy/templates/psp.yaml: -------------------------------------------------------------------------------- 1 | {{- if and (.Values.rbac.pspEnabled) (semverCompare "<1.25-0" .Capabilities.KubeVersion.GitVersion) }} 2 | apiVersion: policy/v1beta1 3 | kind: PodSecurityPolicy 4 | metadata: 5 | name: {{ template "webhook-test-proxy.fullname" . }} 6 | labels: 7 | {{ include "webhook-test-proxy.labels" . | indent 4 }} 8 | annotations: 9 | seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*' 10 | spec: 11 | privileged: false 12 | hostIPC: false 13 | hostNetwork: true 14 | hostPorts: 15 | - min: 1024 16 | max: 65535 17 | hostPID: false 18 | readOnlyRootFilesystem: false 19 | allowPrivilegeEscalation: false 20 | allowedCapabilities: 21 | - '*' 22 | fsGroup: 23 | rule: RunAsAny 24 | runAsUser: 25 | rule: RunAsAny 26 | seLinux: 27 | rule: RunAsAny 28 | supplementalGroups: 29 | rule: RunAsAny 30 | volumes: 31 | - '*' 32 | --- 33 | kind: ClusterRole 34 | apiVersion: rbac.authorization.k8s.io/v1 35 | metadata: 36 | name: {{ template "webhook-test-proxy.fullname" . }}-psp 37 | labels: 38 | {{ include "webhook-test-proxy.labels" . | indent 4 }} 39 | rules: 40 | - apiGroups: ['policy'] 41 | resources: ['podsecuritypolicies'] 42 | verbs: ['use'] 43 | resourceNames: 44 | - {{ template "webhook-test-proxy.fullname" . }} 45 | --- 46 | apiVersion: rbac.authorization.k8s.io/v1 47 | kind: RoleBinding 48 | metadata: 49 | name: {{ template "webhook-test-proxy.fullname" . }}-psp 50 | labels: 51 | {{ include "webhook-test-proxy.labels" . | indent 4 }} 52 | roleRef: 53 | apiGroup: rbac.authorization.k8s.io 54 | kind: ClusterRole 55 | name: {{ template "webhook-test-proxy.fullname" . }}-psp 56 | subjects: 57 | - kind: ServiceAccount 58 | name: {{ template "webhook-test-proxy.serviceAccountName" . 
}} 59 | namespace: {{ .Release.Namespace }} 60 | {{- end }} 61 | -------------------------------------------------------------------------------- /test/helm/validate-chart-versions: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | 4 | NTH_HELM_DIR=config/helm/aws-node-termination-handler 5 | 6 | function version { echo "$@" | awk -F. '{ printf("%d%03d%03d%03d\n", $1,$2,$3,$4); }'; } 7 | 8 | EXIT_CODE=0 9 | # Helm yaml changes should increment Chart version EXCEPT FOR test.yaml 10 | HELM_FILES_CHANGED=$(git show --name-only | grep -E "$NTH_HELM_DIR" | grep .yaml | grep -v test.yaml || :) 11 | if [[ -z $HELM_FILES_CHANGED ]]; then 12 | echo "✅ No Helm file changes detected" 13 | exit $EXIT_CODE 14 | fi 15 | 16 | echo "📝 Helm chart changes detected! Checking for updated Chart versions since the last release" 17 | LATEST_COMMIT_HASH=$(git rev-parse HEAD) 18 | LATEST_COMMIT_CHART_VERSION=$(git --no-pager show "$LATEST_COMMIT_HASH":"$NTH_HELM_DIR"/Chart.yaml | grep 'version:' | xargs | cut -d' ' -f2 | tr -d '[:space:]') 19 | 20 | TAG=$(git describe HEAD --tags | grep -Eo "^v[0-9]+(\.[0-9]+)*") 21 | LAST_RELEASE_HASH=$(git rev-list -1 "$TAG") 22 | LAST_RELEASE_CHART_VERSION=$(git --no-pager show "$LAST_RELEASE_HASH":"$NTH_HELM_DIR"/Chart.yaml | grep 'version:' | xargs | cut -d' ' -f2 | tr -d '[:space:]') 23 | if [[ $LAST_RELEASE_CHART_VERSION == "$LATEST_COMMIT_CHART_VERSION" ]]; then 24 | echo "❌ This commit's NTH Chart has the same Chart version as the latest release $LATEST_COMMIT_CHART_VERSION -- please increment Chart version in NTH" 25 | EXIT_CODE=1 26 | elif [[ 10#$(version "$LATEST_COMMIT_CHART_VERSION") -lt 10#$(version "$LAST_RELEASE_CHART_VERSION") ]]; then # 10# forces base 10: version() output starts with 0 for 0.x charts and would otherwise be read as octal 27 | echo "❌ This commit's NTH Chart version $LATEST_COMMIT_CHART_VERSION is BEHIND the latest release's chart version $LAST_RELEASE_CHART_VERSION -- please increment Chart version in NTH" 28 | EXIT_CODE=1 29 | else 30 | echo "✅ This
commit's NTH Chart has a different version since the last release ($LAST_RELEASE_CHART_VERSION -> $LATEST_COMMIT_CHART_VERSION)" 31 | fi 32 | 33 | exit $EXIT_CODE -------------------------------------------------------------------------------- /scripts/run-unit-tests-in-docker: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -euo pipefail 4 | 5 | project_root_dir="$(cd "$(dirname "$0")/.." && pwd -P)" 6 | work_dir="/workplace/aws-node-termination-handler" 7 | container_name="nth_unit_test_on_linux" 8 | deps="go,git,make" 9 | recreate=0 10 | usage=$(cat <&2 37 | exit 38 | ;; 39 | esac 40 | done 41 | 42 | echo "unit tests will be run in docker container named $container_name" 43 | 44 | if [[ $recreate -eq 1 ]]; then 45 | docker container rm "$container_name" >/dev/null 2>&1 || true 46 | fi 47 | 48 | if ! [[ -n $(docker container ls -a | grep "$container_name") ]]; then 49 | echo "creating container ..." 50 | 51 | IFS=',' read -ra deps <<< "$deps" 52 | echo "dependencies to install: ${deps[*]}" 53 | 54 | docker container create \ 55 | --name "$container_name" \ 56 | --volume "$project_root_dir:$work_dir" \ 57 | --env GOPROXY=direct \ 58 | --env GO111MODULE=auto \ 59 | --workdir "$work_dir" \ 60 | --init \ 61 | alpine:latest \ 62 | sh -c "apk add ${deps[*]} && make clean unit-test" 63 | 64 | echo "container created" 65 | else 66 | echo "container exists" 67 | fi 68 | 69 | echo "running unit tests ..." 70 | docker container start --attach "$container_name" 71 | 72 | -------------------------------------------------------------------------------- /config/helm/squid/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* vim: set filetype=mustache: */}} 2 | {{/* 3 | Expand the name of the chart. 
4 | */}} 5 | {{- define "squid.name" -}} 6 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} 7 | {{- end -}} 8 | 9 | {{/* 10 | Create a default fully qualified app name. 11 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 12 | If release name contains chart name it will be used as a full name. 13 | */}} 14 | {{- define "squid.fullname" -}} 15 | {{- if .Values.fullnameOverride -}} 16 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} 17 | {{- else -}} 18 | {{- $name := default .Chart.Name .Values.nameOverride -}} 19 | {{- if contains $name .Release.Name -}} 20 | {{- .Release.Name | trunc 63 | trimSuffix "-" -}} 21 | {{- else -}} 22 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} 23 | {{- end -}} 24 | {{- end -}} 25 | {{- end -}} 26 | 27 | {{/* 28 | Common labels 29 | */}} 30 | {{- define "squid.labels" -}} 31 | app.kubernetes.io/name: {{ include "squid.name" . }} 32 | helm.sh/chart: {{ include "squid.chart" . }} 33 | app.kubernetes.io/instance: {{ .Release.Name }} 34 | k8s-app: squid 35 | {{- if .Chart.AppVersion }} 36 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 37 | {{- end }} 38 | app.kubernetes.io/managed-by: {{ .Release.Service }} 39 | {{- end -}} 40 | 41 | {{/* 42 | Create chart name and version as used by the chart label. 43 | */}} 44 | {{- define "squid.chart" -}} 45 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} 46 | {{- end -}} 47 | 48 | {{/* 49 | Create the name of the service account to use 50 | */}} 51 | {{- define "squid.serviceAccountName" -}} 52 | {{- if .Values.serviceAccount.create -}} 53 | {{ default (include "squid.fullname" .) 
.Values.serviceAccount.name }} 54 | {{- else -}} 55 | {{ default "default" .Values.serviceAccount.name }} 56 | {{- end -}} 57 | {{- end -}} 58 | -------------------------------------------------------------------------------- /test/k8s-local-cluster-test/psp-privileged.yaml: -------------------------------------------------------------------------------- 1 | # Should grant access to very few pods, i.e. kube-system system pods and possibly CNI pods 2 | apiVersion: policy/v1beta1 3 | kind: PodSecurityPolicy 4 | metadata: 5 | annotations: 6 | # See https://kubernetes.io/docs/concepts/policy/pod-security-policy/#seccomp 7 | seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*' 8 | name: privileged 9 | spec: 10 | allowedCapabilities: 11 | - '*' 12 | allowPrivilegeEscalation: true 13 | fsGroup: 14 | rule: 'RunAsAny' 15 | hostIPC: true 16 | hostNetwork: true 17 | hostPID: true 18 | hostPorts: 19 | - min: 0 20 | max: 65535 21 | privileged: true 22 | readOnlyRootFilesystem: false 23 | runAsUser: 24 | rule: 'RunAsAny' 25 | seLinux: 26 | rule: 'RunAsAny' 27 | supplementalGroups: 28 | rule: 'RunAsAny' 29 | volumes: 30 | - '*' 31 | 32 | --- 33 | 34 | # Cluster role which grants access to the privileged pod security policy 35 | apiVersion: rbac.authorization.k8s.io/v1 36 | kind: ClusterRole 37 | metadata: 38 | name: privileged-psp 39 | rules: 40 | - apiGroups: 41 | - policy 42 | resourceNames: 43 | - privileged 44 | resources: 45 | - podsecuritypolicies 46 | verbs: 47 | - use 48 | 49 | --- 50 | 51 | # Role binding for kube-system - allow nodes and kube-system service accounts - should take care of CNI i.e. 
flannel running in the kube-system namespace 52 | # Assumes access to the kube-system is restricted 53 | apiVersion: rbac.authorization.k8s.io/v1 54 | kind: RoleBinding 55 | metadata: 56 | name: kube-system-psp 57 | namespace: kube-system 58 | roleRef: 59 | apiGroup: rbac.authorization.k8s.io 60 | kind: ClusterRole 61 | name: privileged-psp 62 | subjects: 63 | # For the kubeadm kube-system nodes 64 | - apiGroup: rbac.authorization.k8s.io 65 | kind: Group 66 | name: system:nodes 67 | # For all service accounts in the kube-system namespace 68 | - apiGroup: rbac.authorization.k8s.io 69 | kind: Group 70 | name: system:serviceaccounts:kube-system 71 | -------------------------------------------------------------------------------- /config/helm/localstack/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* vim: set filetype=mustache: */}} 2 | {{/* 3 | Expand the name of the chart. 4 | */}} 5 | {{- define "localstack.name" -}} 6 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} 7 | {{- end -}} 8 | 9 | {{/* 10 | Create a default fully qualified app name. 11 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 12 | If release name contains chart name it will be used as a full name. 13 | */}} 14 | {{- define "localstack.fullname" -}} 15 | {{- if .Values.fullnameOverride -}} 16 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} 17 | {{- else -}} 18 | {{- $name := default .Chart.Name .Values.nameOverride -}} 19 | {{- if contains $name .Release.Name -}} 20 | {{- .Release.Name | trunc 63 | trimSuffix "-" -}} 21 | {{- else -}} 22 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} 23 | {{- end -}} 24 | {{- end -}} 25 | {{- end -}} 26 | 27 | {{/* 28 | Common labels 29 | */}} 30 | {{- define "localstack.labels" -}} 31 | app.kubernetes.io/name: {{ include "localstack.name" . 
}} 32 | helm.sh/chart: {{ include "localstack.chart" . }} 33 | app.kubernetes.io/instance: {{ .Release.Name }} 34 | k8s-app: localstack 35 | {{- if .Chart.AppVersion }} 36 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 37 | {{- end }} 38 | app.kubernetes.io/managed-by: {{ .Release.Service }} 39 | {{- end -}} 40 | 41 | {{/* 42 | Create chart name and version as used by the chart label. 43 | */}} 44 | {{- define "localstack.chart" -}} 45 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} 46 | {{- end -}} 47 | 48 | {{/* 49 | Create the name of the service account to use 50 | */}} 51 | {{- define "localstack.serviceAccountName" -}} 52 | {{- if .Values.serviceAccount.create -}} 53 | {{ default (include "localstack.fullname" .) .Values.serviceAccount.name }} 54 | {{- else -}} 55 | {{ default "default" .Values.serviceAccount.name }} 56 | {{- end -}} 57 | {{- end -}} -------------------------------------------------------------------------------- /config/helm/webhook-test-proxy/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* vim: set filetype=mustache: */}} 2 | {{/* 3 | Expand the name of the chart. 4 | */}} 5 | {{- define "webhook-test-proxy.name" -}} 6 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} 7 | {{- end -}} 8 | 9 | {{/* 10 | Create a default fully qualified app name. 11 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 12 | If release name contains chart name it will be used as a full name. 
13 | */}} 14 | {{- define "webhook-test-proxy.fullname" -}} 15 | {{- if .Values.fullnameOverride -}} 16 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} 17 | {{- else -}} 18 | {{- $name := default .Chart.Name .Values.nameOverride -}} 19 | {{- if contains $name .Release.Name -}} 20 | {{- .Release.Name | trunc 63 | trimSuffix "-" -}} 21 | {{- else -}} 22 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} 23 | {{- end -}} 24 | {{- end -}} 25 | {{- end -}} 26 | 27 | {{/* 28 | Common labels 29 | */}} 30 | {{- define "webhook-test-proxy.labels" -}} 31 | app.kubernetes.io/name: {{ include "webhook-test-proxy.name" . }} 32 | helm.sh/chart: {{ include "webhook-test-proxy.chart" . }} 33 | app.kubernetes.io/instance: {{ .Release.Name }} 34 | k8s-app: webhook-test-proxy 35 | {{- if .Chart.AppVersion }} 36 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 37 | {{- end }} 38 | app.kubernetes.io/managed-by: {{ .Release.Service }} 39 | {{- end -}} 40 | 41 | {{/* 42 | Create chart name and version as used by the chart label. 43 | */}} 44 | {{- define "webhook-test-proxy.chart" -}} 45 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} 46 | {{- end -}} 47 | 48 | {{/* 49 | Create the name of the service account to use 50 | */}} 51 | {{- define "webhook-test-proxy.serviceAccountName" -}} 52 | {{- if .Values.serviceAccount.create -}} 53 | {{ default (include "webhook-test-proxy.fullname" .) .Values.serviceAccount.name }} 54 | {{- else -}} 55 | {{ default "default" .Values.serviceAccount.name }} 56 | {{- end -}} 57 | {{- end -}} -------------------------------------------------------------------------------- /pkg/monitor/types_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"). 
You may 4 | // not use this file except in compliance with the License. A copy of the 5 | // License is located at 6 | // 7 | // http://aws.amazon.com/apache2.0/ 8 | // 9 | // or in the "license" file accompanying this file. This file is distributed 10 | // on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | // express or implied. See the License for the specific language governing 12 | // permissions and limitations under the License. 13 | 14 | package monitor_test 15 | 16 | import ( 17 | "testing" 18 | "time" 19 | 20 | "github.com/aws/aws-node-termination-handler/pkg/monitor" 21 | h "github.com/aws/aws-node-termination-handler/pkg/test" 22 | ) 23 | 24 | func TestTimeUntilEvent(t *testing.T) { 25 | startTime := time.Now().Add(time.Second * 10) 26 | expected := time.Until(startTime).Round(time.Second) 27 | 28 | event := &monitor.InterruptionEvent{ 29 | StartTime: startTime, 30 | } 31 | 32 | result := event.TimeUntilEvent() 33 | h.Equals(t, expected, result.Round(time.Second)) 34 | } 35 | 36 | func TestIsRebalanceRecommendation_Monitor_Success(t *testing.T) { 37 | monitorEventId := "rebalance-recommendation-" 38 | event := &monitor.InterruptionEvent{ 39 | EventID: monitorEventId, 40 | } 41 | 42 | h.Equals(t, true, event.IsRebalanceRecommendation()) 43 | } 44 | 45 | func TestIsRebalanceRecommendation_SQS_Success(t *testing.T) { 46 | sqsEventId := "rebalance-recommendation-event-" 47 | event := &monitor.InterruptionEvent{ 48 | EventID: sqsEventId, 49 | } 50 | 51 | h.Equals(t, true, event.IsRebalanceRecommendation()) 52 | } 53 | 54 | func TestIsRebalanceRecommendation_Failure(t *testing.T) { 55 | eventId := "reblaance-recommendation" 56 | event := &monitor.InterruptionEvent{ 57 | EventID: eventId, 58 | } 59 | 60 | h.Equals(t, false, event.IsRebalanceRecommendation()) 61 | } 62 | 63 | func TestIsRebalanceRecommendation_Empty_Failure(t *testing.T) { 64 | event := &monitor.InterruptionEvent{} 65 | h.Equals(t, false, 
# Marks inactive pull requests and issues as "stale" and later closes them,
# using two runs of actions/stale with different thresholds.
name: Stale Issues / PRs

on:
  schedule:
    # GitHub evaluates cron schedules in UTC: 17:00 UTC is 12:00 PM US Central
    # during daylight time and 11:00 AM during standard time.
    - cron: "0 17 * * *" # Runs every day at 17:00 UTC

jobs:
  stale:
    runs-on: ubuntu-24.04
    steps:
      # 15+5 day stale policy for PRs
      # * Except PRs marked as "stalebot-ignore"
      # * Issue thresholds are set to -1 so this step only processes PRs
      - name: Stale PRs policy
        uses: actions/stale@v4.0.0
        with:
          repo-token: ${{ secrets.GITHUB_TOKEN }}
          exempt-pr-labels: "stalebot-ignore"
          days-before-stale: 15
          days-before-close: 5
          days-before-issue-stale: -1
          days-before-issue-close: -1
          remove-stale-when-updated: true
          stale-pr-label: "stale"
          operations-per-run: 100
          stale-pr-message: >
            This PR has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs.
            If you want this PR to never become stale, please ask a maintainer to apply the "stalebot-ignore" label.
          close-pr-message: >
            This PR was closed because it has become stale with no activity.

      # 30+5 day stale policy for open issues
      # * Except Issues marked as "stalebot-ignore"
      # * PR thresholds are set to -1 so this step only processes issues
      - name: Stale Issues policy
        uses: actions/stale@v4.0.0
        with:
          repo-token: ${{ secrets.GITHUB_TOKEN }}
          exempt-issue-labels: "stalebot-ignore"
          days-before-stale: 30
          days-before-close: 5
          days-before-pr-stale: -1
          days-before-pr-close: -1
          remove-stale-when-updated: true
          stale-issue-label: "stale"
          operations-per-run: 100
          stale-issue-message: >
            This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs.
            If you want this issue to never become stale, please ask a maintainer to apply the "stalebot-ignore" label.
          close-issue-message: >
            This issue was closed because it has become stale with no activity.
source: 76 | - aws.ec2 77 | detail-type: 78 | - EC2 Instance State-change Notification 79 | Targets: 80 | - Id: 1 81 | Arn: !GetAtt Queue.Arn 82 | Outputs: 83 | QueueURL: 84 | Description: Queue url for AWS NTH controller 85 | Value: !Ref Queue 86 | -------------------------------------------------------------------------------- /pkg/logging/routing-integration_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"). You may 4 | // not use this file except in compliance with the License. A copy of the 5 | // License is located at 6 | // 7 | // http://aws.amazon.com/apache2.0/ 8 | // 9 | // or in the "license" file accompanying this file. This file is distributed 10 | // on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | // express or implied. See the License for the specific language governing 12 | // permissions and limitations under the License. 
13 | 14 | package logging_test 15 | 16 | import ( 17 | "strings" 18 | "testing" 19 | 20 | "github.com/aws/aws-node-termination-handler/pkg/logging" 21 | h "github.com/aws/aws-node-termination-handler/pkg/test" 22 | 23 | "github.com/rs/zerolog/log" 24 | ) 25 | 26 | func TestIntegration_zerologInfo(t *testing.T) { 27 | buf := &strings.Builder{} 28 | errBuf := &strings.Builder{} 29 | 30 | l := log.Output(logging.RoutingLevelWriter{Writer: buf, ErrWriter: errBuf}) 31 | 32 | const s = "this is a test" 33 | l.Info().Msg(s) 34 | 35 | h.Equals(t, errBuf.Len(), 0) 36 | 37 | h.Assert(t, buf.Len() > 0, "no message was written to the default location") 38 | h.Assert(t, strings.Contains(buf.String(), s), "expected message not found in default location") 39 | } 40 | 41 | func TestIntegration_zerologWarn(t *testing.T) { 42 | buf := &strings.Builder{} 43 | errBuf := &strings.Builder{} 44 | 45 | l := log.Output(logging.RoutingLevelWriter{Writer: buf, ErrWriter: errBuf}) 46 | 47 | const s = "this is a test" 48 | l.Warn().Msg(s) 49 | 50 | h.Equals(t, buf.Len(), 0) 51 | 52 | h.Assert(t, errBuf.Len() > 0, "no message was written to the error location") 53 | h.Assert(t, strings.Contains(errBuf.String(), s), "expected message not found in error location") 54 | } 55 | 56 | func TestIntegration_zerologError(t *testing.T) { 57 | buf := &strings.Builder{} 58 | errBuf := &strings.Builder{} 59 | 60 | l := log.Output(logging.RoutingLevelWriter{Writer: buf, ErrWriter: errBuf}) 61 | 62 | const s = "this is a test" 63 | l.Error().Msg(s) 64 | 65 | h.Equals(t, buf.Len(), 0) 66 | 67 | h.Assert(t, errBuf.Len() > 0, "no message was written to the error location") 68 | h.Assert(t, strings.Contains(errBuf.String(), s), "expected message not found in error location") 69 | } 70 | -------------------------------------------------------------------------------- /scripts/retag-docker-images: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 
# Report the failure and terminate with a non-zero status.
# Installed below as the handler for INT, TERM and ERR. Without the explicit
# `exit`, bash would resume the retag loop after handling INT or TERM.
function exit_and_fail() {
    echo "❌ Failed retagging docker images"
    exit 1
}

trap "exit_and_fail" INT TERM ERR

# Both repositories are needed to build the image references. Fail with the
# usage text instead of an "unbound variable" error from `set -u`.
if [[ -z "${OLD_IMAGE_REPO:-}" || -z "${NEW_IMAGE_REPO:-}" ]]; then
    echo "Both -o <old image repo> and -n <new image repo> are required" 1>&2
    echo "$USAGE" 1>&2
    exit 1
fi

# Retag <old repo>:<version>-<os>-<arch> as <new repo>:<version>-<os>-<arch>
# for every requested platform pair.
for os_arch in "${PLATFORMS[@]}"; do
    os=$(echo "$os_arch" | cut -d'/' -f1)
    arch=$(echo "$os_arch" | cut -d'/' -f2)

    old_img_tag="$OLD_IMAGE_REPO:$VERSION-$os-$arch"
    new_img_tag="$NEW_IMAGE_REPO:$VERSION-$os-$arch"

    # Windows tooling can leave a carriage return inside the reference, which
    # docker rejects as an invalid reference format. Bash parameter expansion
    # strips every CR and behaves the same on Linux, macOS and Windows, so no
    # per-platform sed handling is needed.
    old_img_tag="${old_img_tag//$'\r'/}"
    new_img_tag="${new_img_tag//$'\r'/}"

    docker tag "${old_img_tag}" "${new_img_tag}"
    echo "✅ Successfully retagged docker image $old_img_tag to $new_img_tag"
done

echo "✅ Done Retagging!"
| nindent 4 }} 8 | annotations: 9 | seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*' 10 | spec: 11 | privileged: false 12 | hostIPC: false 13 | hostNetwork: {{ .Values.useHostNetwork }} 14 | hostPID: false 15 | {{- if and (and (not .Values.enableSqsTerminationDraining) .Values.useHostNetwork ) (or .Values.enablePrometheusServer .Values.enableProbesServer) }} 16 | hostPorts: 17 | {{- if .Values.enablePrometheusServer }} 18 | - min: {{ .Values.prometheusServerPort }} 19 | max: {{ .Values.prometheusServerPort }} 20 | {{- end }} 21 | {{- if .Values.enableProbesServer }} 22 | - min: {{ .Values.probes.httpGet.port }} 23 | max: {{ .Values.probes.httpGet.port }} 24 | {{- end }} 25 | {{- end }} 26 | readOnlyRootFilesystem: false 27 | allowPrivilegeEscalation: false 28 | allowedCapabilities: 29 | - '*' 30 | fsGroup: 31 | rule: RunAsAny 32 | runAsUser: 33 | rule: RunAsAny 34 | seLinux: 35 | rule: RunAsAny 36 | supplementalGroups: 37 | rule: RunAsAny 38 | volumes: 39 | - '*' 40 | --- 41 | apiVersion: rbac.authorization.k8s.io/v1 42 | kind: Role 43 | metadata: 44 | name: {{ template "aws-node-termination-handler.fullname" . }}-psp 45 | namespace: {{ .Release.Namespace }} 46 | labels: 47 | {{- include "aws-node-termination-handler.labels" . | nindent 4 }} 48 | rules: 49 | - apiGroups: ['policy'] 50 | resources: ['podsecuritypolicies'] 51 | verbs: ['use'] 52 | resourceNames: 53 | - {{ template "aws-node-termination-handler.fullname" . }} 54 | --- 55 | apiVersion: rbac.authorization.k8s.io/v1 56 | kind: RoleBinding 57 | metadata: 58 | name: {{ template "aws-node-termination-handler.fullname" . }}-psp 59 | namespace: {{ .Release.Namespace }} 60 | labels: 61 | {{- include "aws-node-termination-handler.labels" . | nindent 4 }} 62 | roleRef: 63 | apiGroup: rbac.authorization.k8s.io 64 | kind: Role 65 | name: {{ template "aws-node-termination-handler.fullname" . 
// Assert fails the test immediately when condition is false.
// The caller's file name and line number are printed in red, followed by msg
// formatted with the optional arguments v.
func Assert(tb testing.TB, condition bool, msg string, v ...interface{}) {
	if condition {
		return
	}

	_, callerFile, callerLine, _ := runtime.Caller(1)

	// The file and line come first so they line up with the "%s:%d: " prefix.
	args := make([]interface{}, 0, len(v)+2)
	args = append(args, filepath.Base(callerFile), callerLine)
	args = append(args, v...)

	fmt.Printf("\033[31m%s:%d: "+msg+"\033[39m\n\n", args...)
	tb.FailNow()
}
// Equals fails the test immediately when exp and act are not deeply equal
// (reflect.DeepEqual). Both values are printed in red with the caller's file
// name and line number.
func Equals(tb testing.TB, exp, act interface{}) {
	if reflect.DeepEqual(exp, act) {
		return
	}

	_, callerFile, callerLine, _ := runtime.Caller(1)
	fmt.Printf("\033[31m%s:%d:\n\n\texp: %#v\n\n\tgot: %#v\033[39m\n\n", filepath.Base(callerFile), callerLine, exp, act)
	tb.FailNow()
}
# The image repo has no default. Fail with a clear message instead of the
# "unbound variable" error that `set -u` would raise further down.
if [[ -z "${IMAGE_REPO:-}" ]]; then
    echo "IMAGE_REPO is not set; pass -r <image repo>" 1>&2
    echo "$USAGE" 1>&2
    exit 1
fi

# Build one image per requested platform pair.
#   * windows-<version>/<arch>: plain `docker build` with Dockerfile.windows,
#     tagged <repo>:<version>-windows-<windows version>-<arch>.
#   * everything else: `docker buildx build --load` on a throw-away builder,
#     tagged <repo>:<version>-<os>-<arch>.
for os_arch in "${PLATFORMS[@]}"; do
    os=$(echo "$os_arch" | cut -d'/' -f1)
    arch=$(echo "$os_arch" | cut -d'/' -f2)

    dockerfile="$DOCKERFILE_PATH"
    if [[ $os == "windows"* ]]; then
        # The os field has the form "windows-<version>"; split it apart.
        windows_version=$(echo "$os" | cut -d'-' -f2)
        os=$(echo "$os" | cut -d'-' -f1)
        img_tag="$IMAGE_REPO:$VERSION-$os-$windows_version-$arch"
        dockerfile="${dockerfile}.windows"
        docker build \
            --file "${dockerfile}" \
            --build-arg GOOS="${os}" \
            --build-arg GOARCH="${arch}" \
            --build-arg WINDOWS_VERSION="${windows_version}" \
            --build-arg GOPROXY="${GOPROXY}" \
            --tag "${img_tag}" \
            "${REPO_ROOT_PATH}"
    else
        # Launch a docker buildx instance and save its name so we can terminate it later
        img_tag="$IMAGE_REPO:$VERSION-$os-$arch"
        buildx_instance_name=$(docker buildx create --use)

        # Capture the build status instead of letting `set -e` abort here, so
        # the builder instance is removed even when the build fails.
        build_status=0
        docker buildx build \
            --load \
            --file "${dockerfile}" \
            --build-arg GOPROXY="${GOPROXY}" \
            --tag "${img_tag}" \
            --platform "${os_arch}" \
            "${REPO_ROOT_PATH}" || build_status=$?

        docker buildx rm "${buildx_instance_name}"
        if [[ ${build_status} -ne 0 ]]; then
            exit "${build_status}"
        fi
    fi
done
A copy of the 5 | // License is located at 6 | // 7 | // http://aws.amazon.com/apache2.0/ 8 | // 9 | // or in the "license" file accompanying this file. This file is distributed 10 | // on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | // express or implied. See the License for the specific language governing 12 | // permissions and limitations under the License. 13 | 14 | package ec2helper 15 | 16 | import ( 17 | "fmt" 18 | 19 | "github.com/aws/aws-sdk-go/aws" 20 | "github.com/aws/aws-sdk-go/service/ec2" 21 | "github.com/aws/aws-sdk-go/service/ec2/ec2iface" 22 | ) 23 | 24 | type IEC2Helper interface { 25 | GetInstanceIdsMapByTagKey(tag string) (map[string]bool, error) 26 | } 27 | 28 | type EC2Helper struct { 29 | ec2ServiceClient ec2iface.EC2API 30 | } 31 | 32 | func New(ec2 ec2iface.EC2API) EC2Helper { 33 | return EC2Helper{ 34 | ec2ServiceClient: ec2, 35 | } 36 | } 37 | 38 | func (h EC2Helper) GetInstanceIdsByTagKey(tag string) ([]string, error) { 39 | ids := []string{} 40 | var nextToken string 41 | 42 | for { 43 | result, err := h.ec2ServiceClient.DescribeInstances(&ec2.DescribeInstancesInput{ 44 | Filters: []*ec2.Filter{ 45 | { 46 | Name: aws.String("tag-key"), 47 | Values: []*string{aws.String(tag)}, 48 | }, 49 | }, 50 | NextToken: &nextToken, 51 | }) 52 | 53 | if err != nil { 54 | return nil, err 55 | } 56 | 57 | if result == nil || result.Reservations == nil { 58 | return nil, fmt.Errorf("describe instances success but return empty response for tag key: %s", tag) 59 | } 60 | 61 | for _, reservation := range result.Reservations { 62 | if reservation.Instances == nil { 63 | continue 64 | } 65 | for _, instance := range reservation.Instances { 66 | if instance == nil || instance.InstanceId == nil { 67 | continue 68 | } 69 | ids = append(ids, *instance.InstanceId) 70 | } 71 | } 72 | 73 | if result.NextToken == nil { 74 | break 75 | } 76 | nextToken = *result.NextToken 77 | } 78 | 79 | return ids, nil 80 | } 81 | 82 | func (h 
EC2Helper) GetInstanceIdsMapByTagKey(tag string) (map[string]bool, error) { 83 | idMap := map[string]bool{} 84 | ids, err := h.GetInstanceIdsByTagKey(tag) 85 | if err != nil { 86 | return nil, err 87 | } 88 | 89 | if ids == nil { 90 | return nil, fmt.Errorf("get instance ids success but return empty response for tag key: %s", tag) 91 | } 92 | 93 | for _, id := range ids { 94 | idMap[id] = true 95 | } 96 | 97 | return idMap, nil 98 | } 99 | -------------------------------------------------------------------------------- /scripts/build-binaries: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" 5 | 6 | REPO_ROOT_PATH=$SCRIPTPATH/../ 7 | MAKE_FILE_PATH=$REPO_ROOT_PATH/Makefile 8 | BIN_DIR=$SCRIPTPATH/../build/bin 9 | mkdir -p $BIN_DIR 10 | 11 | VERSION=$(make -s -f $MAKE_FILE_PATH version) 12 | BASE_BIN_NAME=$(make -s -f $MAKE_FILE_PATH binary-name) 13 | PLATFORMS=("linux/amd64") 14 | 15 | USAGE=$(cat << 'EOM' 16 | Usage: build-binaries [-p ] 17 | Builds static binaries for the platform pairs passed in 18 | 19 | Example: build-binaries -p "linux/amd64,linux/arm" 20 | Optional: 21 | -b Base bin name [DEFAULT: output of "make binary-name"] 22 | -p Platform pair list (os/architecture) [DEFAULT: linux/amd64] 23 | -v VERSION: The application version of the docker image [DEFAULT: output of `make version`] 24 | EOM 25 | ) 26 | 27 | # Process our input arguments 28 | while getopts "p:v:b:" opt; do 29 | case ${opt} in 30 | p ) # Platform Pairs 31 | IFS=',' read -ra PLATFORMS <<< "$OPTARG" 32 | ;; 33 | v ) # Image Version 34 | VERSION="$OPTARG" 35 | ;; 36 | b ) # Base bin name 37 | BASE_BIN_NAME="$OPTARG" 38 | ;; 39 | \? 
# For every requested platform pair: build the image, copy the binary out of a
# (never started) container created from it, then package the binary as a
# .zip (windows) or .tar.gz (everything else) in $BIN_DIR.
for os_arch in "${PLATFORMS[@]}"; do
    os=$(echo "$os_arch" | cut -d'/' -f1)
    arch=$(echo "$os_arch" | cut -d'/' -f2)
    container_name="build-$BASE_BIN_NAME-$os-$arch"
    repo_name="bin-build"

    if [[ $os == "windows"* ]]; then
        bin_name="$BASE_BIN_NAME-$os-$arch.exe"
    else
        bin_name="$BASE_BIN_NAME-$os-$arch"
    fi

    # Remove any container left over from a previous (possibly aborted) run.
    docker container rm "$container_name" || :
    "$SCRIPTPATH/build-docker-images" -p "$os_arch" -v "$VERSION" -r "$repo_name"
    docker container create --rm --name "$container_name" "$repo_name:$VERSION-$os-$arch"
    docker container cp "$container_name:/${BASE_BIN_NAME}" "$BIN_DIR/$bin_name"
    # `--rm` only applies when a container exits, and this one is never
    # started, so remove it explicitly once the binary has been copied out.
    docker container rm "$container_name" || :

    if [[ $os == "windows"* ]]; then
        ## Create zip archive with binary taking into account windows .exe
        cp "${BIN_DIR}/${bin_name}" "${BIN_DIR}/${BASE_BIN_NAME}.exe"
        ## Can't reuse bin_name below because it includes .exe
        ## Run zip in a subshell so the caller's working directory is untouched.
        (
            cd "${BIN_DIR}"
            zip -9 -q "${BASE_BIN_NAME}-$os-$arch.zip" "${BASE_BIN_NAME}.exe"
        )
        rm -f "${BIN_DIR}/${BASE_BIN_NAME}.exe"
    else
        ## Create tar.gz archive with binary
        cp "${BIN_DIR}/${bin_name}" "${BIN_DIR}/${BASE_BIN_NAME}"
        tar -zcvf "${BIN_DIR}/${bin_name}.tar.gz" -C "${BIN_DIR}" "${BASE_BIN_NAME}"
        rm -f "${BIN_DIR}/${BASE_BIN_NAME}"
    fi
done
echo "=============================================================================="
echo " Linting Helm Chart"
echo "=============================================================================="
helm lint "${HELM_DIR}/aws-node-termination-handler/"

echo "✅ Helm Linting has successfully completed!"

# render_chart DESCRIPTION [HELM_ARGS...]
# Prints a banner naming the value set, then renders the chart with
# `helm template`. Only the exit status matters, so the rendered manifests are
# discarded. Extra arguments (e.g. -f <values file>) go straight to helm.
render_chart() {
    local description="$1"
    shift

    echo "=============================================================================="
    echo " Generate Template from Helm Chart with ${description}"
    echo "=============================================================================="

    helm template nth "${HELM_DIR}/aws-node-termination-handler" --namespace=kube-system --debug "$@" > /dev/null
}

render_chart "default values"
render_chart "queue-processor values" -f "${HELM_DIR}/aws-node-termination-handler/example-values-queue.yaml"
render_chart "Linux IMDS values" -f "${HELM_DIR}/aws-node-termination-handler/example-values-imds-linux.yaml"
render_chart "Windows IMDS values" -f "${HELM_DIR}/aws-node-termination-handler/example-values-imds-windows.yaml"

echo "✅ Helm template generation has successfully completed!"
See the License for the specific language governing 12 | // permissions and limitations under the License 13 | 14 | package ec2metadata 15 | 16 | import ( 17 | "bytes" 18 | "errors" 19 | "io" 20 | "net/http" 21 | "net/http/httptest" 22 | "strconv" 23 | "testing" 24 | "time" 25 | 26 | h "github.com/aws/aws-node-termination-handler/pkg/test" 27 | ) 28 | 29 | func TestRetry(t *testing.T) { 30 | var numRetries int = 3 31 | var errorMsg string = "Request failed" 32 | var requestCount int 33 | 34 | request := func() (*http.Response, error) { 35 | requestCount++ 36 | return &http.Response{ 37 | StatusCode: 400, 38 | Body: io.NopCloser(bytes.NewBufferString(`OK`)), 39 | Header: make(http.Header), 40 | }, errors.New(errorMsg) 41 | } 42 | 43 | resp, err := retry(numRetries, time.Microsecond, request) 44 | h.Assert(t, err != nil, "Should have gotten a \"Request failed\" error") 45 | defer resp.Body.Close() 46 | 47 | h.Equals(t, errorMsg, err.Error()) 48 | h.Equals(t, numRetries, requestCount) 49 | } 50 | 51 | func TestGetV2Token(t *testing.T) { 52 | server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) { 53 | h.Equals(t, req.Header.Get(tokenTTLHeader), strconv.Itoa(tokenTTL)) 54 | h.Equals(t, req.URL.String(), tokenRefreshPath) 55 | rw.Header().Set(tokenTTLHeader, "100") 56 | _, err := rw.Write([]byte(`token`)) 57 | h.Ok(t, err) 58 | })) 59 | defer server.Close() 60 | imds := New(server.URL, 1) 61 | 62 | token, ttl, err := imds.getV2Token() 63 | h.Ok(t, err) 64 | h.Equals(t, "token", token) 65 | h.Equals(t, 100, ttl) 66 | } 67 | 68 | func TestGetV2TokenBadURL(t *testing.T) { 69 | imds := New(string([]byte{0x7f}), 1) 70 | _, _, err := imds.getV2Token() 71 | h.Assert(t, err != nil, "Should error on invalid metadata URL") 72 | } 73 | 74 | func TestGetV2TokenBadTTLHeader(t *testing.T) { 75 | server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) { 76 | h.Equals(t, req.Header.Get(tokenTTLHeader), 
strconv.Itoa(tokenTTL)) 77 | h.Equals(t, req.URL.String(), tokenRefreshPath) 78 | rw.Header().Set(tokenTTLHeader, "badttl") 79 | _, err := rw.Write([]byte(`token`)) 80 | h.Ok(t, err) 81 | })) 82 | defer server.Close() 83 | imds := New(server.URL, 1) 84 | 85 | _, _, err := imds.getV2Token() 86 | h.Assert(t, err != nil, "Non-int TTL should have caused an error") 87 | } 88 | -------------------------------------------------------------------------------- /pkg/config/config_internal_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"). You may 4 | // not use this file except in compliance with the License. A copy of the 5 | // License is located at 6 | // 7 | // http://aws.amazon.com/apache2.0/ 8 | // 9 | // or in the "license" file accompanying this file. This file is distributed 10 | // on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | // express or implied. See the License for the specific language governing 12 | // permissions and limitations under the License. 
13 | 14 | package config 15 | 16 | import ( 17 | "flag" 18 | "strconv" 19 | "testing" 20 | 21 | h "github.com/aws/aws-node-termination-handler/pkg/test" 22 | ) 23 | 24 | // All of these needed for TestIsConfigProvided 25 | var location string 26 | var cliArgName = "name" 27 | var envVarName = "NAME_TEST" 28 | var value = "haugenj" 29 | 30 | func init() { 31 | flag.StringVar(&location, cliArgName, value, value) 32 | } 33 | 34 | func TestGetEnv(t *testing.T) { 35 | var key = "STRING_TEST" 36 | var successVal = "success" 37 | var failVal = "failure" 38 | 39 | t.Setenv(key, successVal) 40 | 41 | result := getEnv(key+"bla", failVal) 42 | h.Equals(t, failVal, result) 43 | 44 | result = getEnv(key, failVal) 45 | h.Equals(t, successVal, result) 46 | } 47 | 48 | func TestGetIntEnv(t *testing.T) { 49 | var key = "INT_TEST" 50 | var successVal = 1 51 | var failVal = 0 52 | 53 | t.Setenv(key, strconv.Itoa(successVal)) 54 | 55 | result := getIntEnv(key+"bla", failVal) 56 | h.Equals(t, failVal, result) 57 | 58 | result = getIntEnv(key, failVal) 59 | h.Equals(t, successVal, result) 60 | 61 | defer func() { 62 | if r := recover(); r == nil { 63 | t.Errorf("getIntEnv did not panic") 64 | } 65 | }() 66 | t.Setenv(key, "hi") 67 | getIntEnv(key, 0) 68 | } 69 | 70 | func TestGetBoolEnv(t *testing.T) { 71 | var key = "BOOL_TEST" 72 | var successVal = true 73 | var failVal = false 74 | 75 | t.Setenv(key, strconv.FormatBool(successVal)) 76 | 77 | result := getBoolEnv(key+"bla", failVal) 78 | h.Equals(t, failVal, result) 79 | 80 | result = getBoolEnv(key, failVal) 81 | h.Equals(t, successVal, result) 82 | 83 | defer func() { 84 | if r := recover(); r == nil { 85 | t.Errorf("getBoolEnv did not panic") 86 | } 87 | }() 88 | t.Setenv(key, "hi") 89 | getBoolEnv(key, false) 90 | } 91 | 92 | func TestIsConfigProvided(t *testing.T) { 93 | result := isConfigProvided(cliArgName, envVarName) 94 | h.Equals(t, false, result) 95 | 96 | err := flag.Set(cliArgName, value) 97 | h.Ok(t, err) 98 | result = 
isConfigProvided(cliArgName, envVarName) 99 | h.Equals(t, true, result) 100 | 101 | t.Setenv(envVarName, value) 102 | result = isConfigProvided(cliArgName, envVarName) 103 | h.Equals(t, true, result) 104 | } 105 | -------------------------------------------------------------------------------- /.github/workflows/release.yaml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - "v1.*.*" 7 | 8 | permissions: 9 | contents: write # required for uploading releases 10 | id-token: write 11 | 12 | env: 13 | DEFAULT_GO_VERSION: ^1.22.0 14 | GITHUB_USERNAME: ${{ secrets.EC2_BOT_GITHUB_USERNAME }} 15 | GITHUB_TOKEN: ${{ secrets.EC2_BOT_GITHUB_TOKEN }} 16 | WEBHOOK_URL: ${{ secrets.WEBHOOK_URL }} 17 | 18 | jobs: 19 | releaseLinux: 20 | name: Release Linux 21 | runs-on: ubuntu-24.04 22 | steps: 23 | - name: Set up Go 1.x 24 | uses: actions/setup-go@v2 25 | with: 26 | go-version: ${{ env.DEFAULT_GO_VERSION }} 27 | 28 | - name: Check out code into the Go module directory 29 | uses: actions/checkout@v2 30 | 31 | - name: Configure AWS credentials 32 | uses: aws-actions/configure-aws-credentials@v4 33 | with: 34 | role-to-assume: ${{ secrets.WF_ROLE_ARN }} 35 | role-session-name: "nth-release-linux-${{ github.run_id }}" 36 | aws-region: us-east-1 37 | 38 | - name: Release Linux Assets 39 | run: make release 40 | 41 | releaseWindows: 42 | name: Release Windows 43 | needs: [releaseLinux] 44 | strategy: 45 | matrix: 46 | version: [2022] 47 | runs-on: windows-${{matrix.version}} 48 | steps: 49 | - name: Set up Go 1.x 50 | uses: actions/setup-go@v2 51 | with: 52 | go-version: ${{ env.DEFAULT_GO_VERSION }} 53 | 54 | - name: Check out code into the Go module directory 55 | uses: actions/checkout@v2 56 | 57 | - name: Configure AWS credentials 58 | uses: aws-actions/configure-aws-credentials@v4 59 | with: 60 | role-to-assume: ${{ secrets.WF_ROLE_ARN }} 61 | role-session-name: "nth-release-windows-${{ 
github.run_id }}" 62 | aws-region: us-east-1 63 | 64 | - name: Release Windows Assets 65 | run: | 66 | $env:ChocolateyInstall = Convert-Path "$((Get-Command choco).Path)\..\.." 67 | Import-Module "$env:ChocolateyInstall\helpers\chocolateyProfile.psm1" 68 | refreshenv 69 | 70 | choco install make && choco install zip && make release-windows-${{matrix.version}} 71 | 72 | release: 73 | name: Release 74 | runs-on: ubuntu-24.04 75 | needs: [releaseLinux, releaseWindows] 76 | steps: 77 | - name: Set up Go 1.x 78 | uses: actions/setup-go@v2 79 | with: 80 | go-version: ${{ env.DEFAULT_GO_VERSION }} 81 | 82 | - name: Check out code into the Go module directory 83 | uses: actions/checkout@v2 84 | 85 | - name: Configure AWS credentials 86 | uses: aws-actions/configure-aws-credentials@v4 87 | with: 88 | role-to-assume: ${{ secrets.WF_ROLE_ARN }} 89 | role-session-name: "nth-release-${{ github.run_id }}" 90 | aws-region: us-east-1 91 | 92 | - name: Sync Helm Chart Catalog information 93 | run: make sync-catalog-information-for-helm-chart 94 | 95 | - name: Sync Helm Chart to ECR Public 96 | run: make push-helm-chart 97 | 98 | - name: Sync Readme to ECR Public 99 | run: make sync-readme-to-ecr-public -------------------------------------------------------------------------------- /test/eks-cluster-test/provision-cluster: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -euo pipefail 4 | 5 | SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" 6 | ACCOUNT_ID=$(aws sts get-caller-identity | jq -r '.Account') 7 | REGION="us-west-2" 8 | NTH_REPO_NAME="node-termination-handler" 9 | WEBHOOK_REPO_NAME="webhook-test-proxy" 10 | TEST_ID=$(uuidgen | cut -d'-' -f1 | tr '[:upper:]' '[:lower:]') 11 | ECR_TAG="test-$TEST_ID" 12 | DEFAULT_NODE_TERMINATION_HANDLER_DOCKER_IMG="node-termination-handler:customtest-$TEST_OS" 13 | NODE_TERMINATION_HANDLER_DOCKER_IMG="" 14 | DEFAULT_WEBHOOK_DOCKER_IMG="webhook-test-proxy:customtest-$TEST_OS" 15 | 
WEBHOOK_DOCKER_IMG="" 16 | DOCKER_ARGS="--build-arg GOPROXY=direct" 17 | CLUSTER_NAME="nth-eks-cluster-test" 18 | 19 | ## Provision cluster 20 | cluster_exists=$(eksctl get cluster --region $REGION | grep "$CLUSTER_NAME" || :) 21 | if [[ -z $cluster_exists ]]; then 22 | echo "🥑 Provisioning EKS cluster" 23 | eksctl create cluster -f $CLUSTER_CONFIG_FILE 24 | else 25 | echo "🥑 $CLUSTER_NAME already exists; continuing with test run" 26 | fi 27 | 28 | ## Build Docker images 29 | echo "🥑 Building the node-termination-handler docker image" 30 | docker buildx build --load $DOCKER_ARGS -t $DEFAULT_NODE_TERMINATION_HANDLER_DOCKER_IMG "$SCRIPTPATH/../../." 31 | NODE_TERMINATION_HANDLER_DOCKER_IMG="$DEFAULT_NODE_TERMINATION_HANDLER_DOCKER_IMG" 32 | echo "👍 Built the node-termination-handler docker image" 33 | 34 | echo "🥑 Building the webhook-test-proxy docker image" 35 | docker buildx build --load $DOCKER_ARGS -t $DEFAULT_WEBHOOK_DOCKER_IMG "$SCRIPTPATH/../webhook-test-proxy/." 36 | WEBHOOK_DOCKER_IMG="$DEFAULT_WEBHOOK_DOCKER_IMG" 37 | echo "👍 Built the webhook-test-proxy docker image" 38 | 39 | ## ECR setup and push 40 | echo "🥑 Setting up ECR repos" 41 | 42 | aws ecr get-login-password --region $REGION | docker login --username AWS --password-stdin "$ACCOUNT_ID".dkr.ecr."$REGION".amazonaws.com 43 | echo "👍 Docker-ECR Authentication successful" 44 | 45 | NTH_ECR_REPO_URI=$(aws ecr --region $REGION describe-repositories --repository-names "${NTH_REPO_NAME}" --query "repositories[0].repositoryUri" --output text 2>/dev/null || \ 46 | aws ecr --region $REGION create-repository --repository-name "${NTH_REPO_NAME}" --query "repository.repositoryUri" --output text) 47 | 48 | WEBHOOK_ECR_REPO_URI=$(aws ecr --region $REGION describe-repositories --repository-names "${WEBHOOK_REPO_NAME}" --query "repositories[0].repositoryUri" --output text 2>/dev/null || \ 49 | aws ecr --region $REGION create-repository --repository-name "${WEBHOOK_REPO_NAME}" --query "repository.repositoryUri" 
--output text) 50 | echo "👍 ECR repos created" 51 | 52 | docker tag $NODE_TERMINATION_HANDLER_DOCKER_IMG $NTH_ECR_REPO_URI:$ECR_TAG 53 | docker tag $WEBHOOK_DOCKER_IMG $WEBHOOK_ECR_REPO_URI:$ECR_TAG 54 | docker push $NTH_ECR_REPO_URI:$ECR_TAG 55 | docker push $WEBHOOK_ECR_REPO_URI:$ECR_TAG 56 | echo "👍 Docker images pushed to ECR repos with tag: $ECR_TAG" 57 | 58 | export REGION 59 | export NTH_REPO_NAME 60 | export WEBHOOK_REPO_NAME 61 | export CLUSTER_NAME 62 | export NODE_TERMINATION_HANDLER_DOCKER_REPO=$NTH_ECR_REPO_URI 63 | export NODE_TERMINATION_HANDLER_DOCKER_TAG=$ECR_TAG 64 | export WEBHOOK_DOCKER_REPO=$WEBHOOK_ECR_REPO_URI 65 | export WEBHOOK_DOCKER_TAG=$ECR_TAG -------------------------------------------------------------------------------- /pkg/logging/routing_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"). You may 4 | // not use this file except in compliance with the License. A copy of the 5 | // License is located at 6 | // 7 | // http://aws.amazon.com/apache2.0/ 8 | // 9 | // or in the "license" file accompanying this file. This file is distributed 10 | // on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | // express or implied. See the License for the specific language governing 12 | // permissions and limitations under the License. 
13 | 14 | package logging_test 15 | 16 | import ( 17 | "strings" 18 | "testing" 19 | 20 | "github.com/aws/aws-node-termination-handler/pkg/logging" 21 | h "github.com/aws/aws-node-termination-handler/pkg/test" 22 | 23 | "github.com/rs/zerolog" 24 | ) 25 | 26 | func TestWrite(t *testing.T) { 27 | buf := &strings.Builder{} 28 | errBuf := &strings.Builder{} 29 | 30 | r := logging.RoutingLevelWriter{Writer: buf, ErrWriter: errBuf} 31 | 32 | const s = "this is a test" 33 | p := []byte(s) 34 | n, err := r.Write(p) 35 | 36 | h.Ok(t, err) 37 | h.Equals(t, len(p), n) 38 | 39 | h.Equals(t, errBuf.Len(), 0) 40 | 41 | h.Assert(t, buf.Len() > 0, "no message was written to the default location") 42 | h.Assert(t, strings.Contains(buf.String(), s), "expected message not found in default location") 43 | } 44 | 45 | func TestWriteLevel_lessThanWarning(t *testing.T) { 46 | buf := &strings.Builder{} 47 | errBuf := &strings.Builder{} 48 | 49 | r := logging.RoutingLevelWriter{Writer: buf, ErrWriter: errBuf} 50 | 51 | const s = "this is a test" 52 | p := []byte(s) 53 | n, err := r.WriteLevel(zerolog.InfoLevel, p) 54 | 55 | h.Ok(t, err) 56 | h.Equals(t, len(p), n) 57 | 58 | h.Equals(t, errBuf.Len(), 0) 59 | 60 | h.Assert(t, buf.Len() > 0, "no message was written to the default location") 61 | h.Assert(t, strings.Contains(buf.String(), s), "expected message not found in default location") 62 | } 63 | 64 | func TestWriteLevel_warning(t *testing.T) { 65 | buf := &strings.Builder{} 66 | errBuf := &strings.Builder{} 67 | 68 | r := logging.RoutingLevelWriter{Writer: buf, ErrWriter: errBuf} 69 | 70 | const s = "this is a test" 71 | p := []byte(s) 72 | n, err := r.WriteLevel(zerolog.WarnLevel, p) 73 | 74 | h.Ok(t, err) 75 | h.Equals(t, len(p), n) 76 | 77 | h.Equals(t, buf.Len(), 0) 78 | 79 | h.Assert(t, errBuf.Len() > 0, "no message was written to the error location") 80 | h.Assert(t, strings.Contains(errBuf.String(), s), "expected message not found in error location") 81 | } 82 | 83 | func 
TestWriteLevel_greaterThanWarning(t *testing.T) { 84 | buf := &strings.Builder{} 85 | errBuf := &strings.Builder{} 86 | 87 | r := logging.RoutingLevelWriter{Writer: buf, ErrWriter: errBuf} 88 | 89 | const s = "this is a test" 90 | p := []byte(s) 91 | n, err := r.WriteLevel(zerolog.ErrorLevel, p) 92 | 93 | h.Ok(t, err) 94 | h.Equals(t, len(p), n) 95 | 96 | h.Equals(t, buf.Len(), 0) 97 | 98 | h.Assert(t, errBuf.Len() > 0, "no message was written to the error location") 99 | h.Assert(t, strings.Contains(errBuf.String(), s), "expected message not found in error location") 100 | } 101 | -------------------------------------------------------------------------------- /pkg/monitor/types.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"). You may 4 | // not use this file except in compliance with the License. A copy of the 5 | // License is located at 6 | // 7 | // http://aws.amazon.com/apache2.0/ 8 | // 9 | // or in the "license" file accompanying this file. This file is distributed 10 | // on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | // express or implied. See the License for the specific language governing 12 | // permissions and limitations under the License. 
13 | 14 | package monitor 15 | 16 | import ( 17 | "strings" 18 | "time" 19 | 20 | "github.com/aws/aws-node-termination-handler/pkg/node" 21 | ) 22 | 23 | const ( 24 | // SpotITNKind is a const to define a Spot ITN kind of interruption event 25 | SpotITNKind = "SPOT_ITN" 26 | // ScheduledEventKind is a const to define a scheduled event kind of interruption event 27 | ScheduledEventKind = "SCHEDULED_EVENT" 28 | // RebalanceRecommendationKind is a const to define a Rebalance Recommendation kind of interruption event 29 | RebalanceRecommendationKind = "REBALANCE_RECOMMENDATION" 30 | // StateChangeKind is a const to define an EC2 State Change kind of interruption event 31 | StateChangeKind = "STATE_CHANGE" 32 | // ASGLifecycleKind is a const to define an ASG Lifecycle kind of interruption event 33 | ASGLifecycleKind = "ASG_LIFECYCLE" 34 | // ASGLaunchLifecycleKind is a const to define an ASG Launch Lifecycle kind of interruption event 35 | ASGLaunchLifecycleKind = "ASG_LAUNCH_LIFECYCLE" 36 | // SQSTerminateKind is a const to define an SQS termination kind of interruption event 37 | SQSTerminateKind = "SQS_TERMINATE" 38 | ) 39 | 40 | // DrainTask defines a task to be run when draining a node 41 | type DrainTask func(InterruptionEvent, node.Node) error 42 | 43 | // InterruptionEvent gives more context of the interruption event 44 | type InterruptionEvent struct { 45 | EventID string 46 | Kind string 47 | Monitor string 48 | Description string 49 | State string 50 | AutoScalingGroupName string 51 | NodeName string 52 | NodeLabels map[string]string 53 | Pods []string 54 | InstanceID string 55 | ProviderID string 56 | InstanceType string 57 | IsManaged bool 58 | StartTime time.Time 59 | EndTime time.Time 60 | NodeProcessed bool 61 | InProgress bool 62 | PreDrainTask DrainTask `json:"-"` 63 | PostDrainTask DrainTask `json:"-"` 64 | CancelDrainTask DrainTask `json:"-"` 65 | } 66 | 67 | // TimeUntilEvent returns the duration until the event start time 68 | func (e *InterruptionEvent) 
TimeUntilEvent() time.Duration { 69 | return time.Until(e.StartTime) 70 | } 71 | 72 | // IsRebalanceRecommendation returns true if the interruption event is a rebalance recommendation 73 | func (e *InterruptionEvent) IsRebalanceRecommendation() bool { 74 | return strings.Contains(e.EventID, "rebalance-recommendation") 75 | } 76 | 77 | // Monitor is an interface which can be implemented for various sources of interruption events 78 | type Monitor interface { 79 | Monitor() error 80 | Kind() string 81 | } 82 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. 
You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue. 
55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | 61 | -------------------------------------------------------------------------------- /pkg/monitor/sqsevent/ec2-state-change-event.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"). You may 4 | // not use this file except in compliance with the License. A copy of the 5 | // License is located at 6 | // 7 | // http://aws.amazon.com/apache2.0/ 8 | // 9 | // or in the "license" file accompanying this file. This file is distributed 10 | // on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | // express or implied. See the License for the specific language governing 12 | // permissions and limitations under the License. 
13 | 14 | package sqsevent 15 | 16 | import ( 17 | "encoding/json" 18 | "fmt" 19 | "strings" 20 | 21 | "github.com/aws/aws-node-termination-handler/pkg/monitor" 22 | "github.com/aws/aws-node-termination-handler/pkg/node" 23 | "github.com/aws/aws-sdk-go/service/sqs" 24 | ) 25 | 26 | /* Example EC2 State Change Event: 27 | { 28 | "version": "0", 29 | "id": "7bf73129-1428-4cd3-a780-95db273d1602", 30 | "detail-type": "EC2 Instance State-change Notification", 31 | "source": "aws.ec2", 32 | "account": "123456789012", 33 | "time": "2015-11-11T21:29:54Z", 34 | "region": "us-east-1", 35 | "resources": [ 36 | "arn:aws:ec2:us-east-1:123456789012:instance/i-abcd1111" 37 | ], 38 | "detail": { 39 | "instance-id": "i-abcd1111", 40 | "state": "pending" 41 | } 42 | } 43 | */ 44 | 45 | // EC2StateChangeDetail holds the event details for EC2 state change events from Amazon EventBridge 46 | type EC2StateChangeDetail struct { 47 | InstanceID string `json:"instance-id"` 48 | State string `json:"state"` 49 | } 50 | 51 | const instanceStatesToDrain = "stopping,stopped,shutting-down,terminated" 52 | 53 | func (m SQSMonitor) ec2StateChangeToInterruptionEvent(event *EventBridgeEvent, message *sqs.Message) (*monitor.InterruptionEvent, error) { 54 | ec2StateChangeDetail := &EC2StateChangeDetail{} 55 | err := json.Unmarshal(event.Detail, ec2StateChangeDetail) 56 | if err != nil { 57 | return nil, err 58 | } 59 | 60 | if !strings.Contains(instanceStatesToDrain, strings.ToLower(ec2StateChangeDetail.State)) { 61 | return nil, nil 62 | } 63 | 64 | nodeInfo, err := m.getNodeInfo(ec2StateChangeDetail.InstanceID) 65 | if err != nil { 66 | return nil, err 67 | } 68 | interruptionEvent := monitor.InterruptionEvent{ 69 | EventID: fmt.Sprintf("ec2-state-change-event-%x", event.ID), 70 | Kind: monitor.StateChangeKind, 71 | Monitor: SQSMonitorKind, 72 | StartTime: event.getTime(), 73 | NodeName: nodeInfo.Name, 74 | IsManaged: nodeInfo.IsManaged, 75 | AutoScalingGroupName: nodeInfo.AsgName, 76 | InstanceID: 
ec2StateChangeDetail.InstanceID, 77 | ProviderID: nodeInfo.ProviderID, 78 | InstanceType: nodeInfo.InstanceType, 79 | Description: fmt.Sprintf("EC2 State Change event received. Instance %s went into %s at %s \n", ec2StateChangeDetail.InstanceID, ec2StateChangeDetail.State, event.getTime()), 80 | } 81 | 82 | interruptionEvent.PostDrainTask = func(interruptionEvent monitor.InterruptionEvent, n node.Node) error { 83 | errs := m.deleteMessages([]*sqs.Message{message}) 84 | if errs != nil { 85 | return errs[0] 86 | } 87 | return nil 88 | } 89 | return &interruptionEvent, nil 90 | } 91 | -------------------------------------------------------------------------------- /BUILD.md: -------------------------------------------------------------------------------- 1 | # Build 2 | If you would like to build and run the project locally you can follow these steps: 3 | 4 | Clone the repo: 5 | ``` 6 | git clone https://github.com/aws/aws-node-termination-handler.git 7 | ``` 8 | Build the latest version of the docker image for `linux/amd64`: 9 | ``` 10 | make docker-build 11 | ``` 12 | 13 | ### Multi-Target 14 | 15 | If you instead want to build for all supported Linux architectures (`linux/amd64` and `linux/arm64`), you can run this make target: 16 | ``` 17 | make build-docker-images 18 | ``` 19 | 20 | Under the hood, this passes each architecture as the `--platform` argument to `docker buildx build`, like this: 21 | ``` 22 | docker buildx create --use 23 | docker buildx build --load --platform "linux/amd64" -t ${USER}/aws-node-termination-handler-amd64:v1.0.0 . 24 | docker buildx build --load --platform "linux/arm64" -t ${USER}/aws-node-termination-handler-arm64:v1.0.0 . 25 | ``` 26 | 27 | To push a multi-arch image, you can use the helper tool [manifest-tool](https://github.com/estesp/manifest-tool). 
28 | 29 | ``` 30 | cat << EOF > manifest.yaml 31 | image: ${USER}/aws-node-termination-handler:v1.0.0 32 | manifests: 33 | - 34 | image: ${USER}/aws-node-termination-handler-amd64:v1.0.0 35 | platform: 36 | architecture: amd64 37 | os: linux 38 | - 39 | image: ${USER}/aws-node-termination-handler-arm64:v1.0.0 40 | platform: 41 | architecture: arm64 42 | os: linux 43 | EOF 44 | manifest-tool push from-spec manifest.yaml 45 | ``` 46 | 47 | ### Building for Windows 48 | 49 | You can build the Windows docker image with the following command: 50 | ``` 51 | make build-docker-images-windows 52 | ``` 53 | Currently, our `windows/amd64` builds use the older `docker build` system, not `docker buildx build` because it does not seem to be well supported. We hope to unify them in the future. 54 | 55 | ### Go Module Proxy 56 | 57 | By default, Go 1.13+ uses the proxy.golang.org proxy for go module downloads. You can change this to a different go module proxy or revert back to pre-go 1.13 default which was "direct". `GOPROXY=direct` will pull from the VCS provider directly instead of going through a proxy at all. 58 | 59 | ``` 60 | ## No Proxy 61 | docker buildx build --load --build-arg=GOPROXY=direct -t ${USER}/aws-node-termination-handler:v1.0.0 . 62 | 63 | ## My Corp Proxy 64 | docker buildx build --load --build-arg=GOPROXY=go-proxy.mycorp.com -t ${USER}/aws-node-termination-handler:v1.0.0 . 65 | ``` 66 | 67 | ### Kubernetes Object Files 68 | 69 | We use Kustomize to create a master Kubernetes yaml file. You can apply the base (default config), use the provided overlays, or write your own custom overlays. 70 | 71 | *NOTE: Kustomize was built into kubectl starting with kubernetes 1.14. 
If you are using an older version of kubernetes or `kubectl`, you can download the `kustomize` binary for your platform on their github releases page: https://github.com/kubernetes-sigs/kustomize/releases* 72 | 73 | ``` 74 | ## Apply base kustomize directly to kubernetes 75 | kubectl apply -k $REPO_ROOT/config/base 76 | 77 | ## OR apply an overlay specifying a node selector to run the daemonset only on spot instances 78 | ## This will use the base and add a node selector into the daemonset K8s object definition 79 | kubectl apply -k $REPO_ROOT/config/overlays/spot-node-selector 80 | ``` 81 | 82 | Read more about Kustomize and Overlays: https://kustomize.io 83 | -------------------------------------------------------------------------------- /pkg/monitor/asglifecycle/asg-lifecycle-monitor_internal_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"). You may 4 | // not use this file except in compliance with the License. A copy of the 5 | // License is located at 6 | // 7 | // http://aws.amazon.com/apache2.0/ 8 | // 9 | // or in the "license" file accompanying this file. This file is distributed 10 | // on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | // express or implied. See the License for the specific language governing 12 | // permissions and limitations under the License. 
13 | 14 | package asglifecycle 15 | 16 | import ( 17 | "context" 18 | "testing" 19 | "time" 20 | 21 | "github.com/rs/zerolog/log" 22 | 23 | "github.com/aws/aws-node-termination-handler/pkg/config" 24 | "github.com/aws/aws-node-termination-handler/pkg/monitor" 25 | "github.com/aws/aws-node-termination-handler/pkg/node" 26 | h "github.com/aws/aws-node-termination-handler/pkg/test" 27 | "github.com/aws/aws-node-termination-handler/pkg/uptime" 28 | v1 "k8s.io/api/core/v1" 29 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 30 | "k8s.io/client-go/kubernetes/fake" 31 | "k8s.io/kubectl/pkg/drain" 32 | ) 33 | 34 | const nodeName = "NAME" 35 | 36 | func getDrainHelper(client *fake.Clientset) *drain.Helper { 37 | return &drain.Helper{ 38 | Client: client, 39 | Force: true, 40 | GracePeriodSeconds: -1, 41 | IgnoreAllDaemonSets: true, 42 | DeleteEmptyDirData: true, 43 | Timeout: time.Duration(120) * time.Second, 44 | Out: log.Logger, 45 | ErrOut: log.Logger, 46 | } 47 | } 48 | 49 | func TestSetInterruptionTaint(t *testing.T) { 50 | drainEvent := monitor.InterruptionEvent{ 51 | EventID: "some-id-that-is-very-long-for-some-reason-and-is-definitely-over-63-characters", 52 | } 53 | nthConfig := config.Config{ 54 | DryRun: true, 55 | NodeName: nodeName, 56 | } 57 | 58 | client := fake.NewSimpleClientset() 59 | _, err := client.CoreV1().Nodes().Create(context.Background(), &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: nodeName}}, metav1.CreateOptions{}) 60 | h.Ok(t, err) 61 | 62 | tNode, err := node.NewWithValues(nthConfig, getDrainHelper(client), uptime.Uptime) 63 | h.Ok(t, err) 64 | 65 | err = setInterruptionTaint(drainEvent, *tNode) 66 | 67 | h.Ok(t, err) 68 | } 69 | 70 | func TestInterruptionTaintAlreadyPresent(t *testing.T) { 71 | drainEvent := monitor.InterruptionEvent{ 72 | EventID: "some-id-that-is-very-long-for-some-reason-and-is-definitely-over-63-characters", 73 | } 74 | nthConfig := config.Config{ 75 | DryRun: false, 76 | NodeName: nodeName, 77 | } 78 | 79 | client := 
fake.NewSimpleClientset() 80 | newNode := &v1.Node{ 81 | ObjectMeta: metav1.ObjectMeta{Name: nodeName}, 82 | Spec: v1.NodeSpec{Taints: []v1.Taint{{ 83 | Key: node.ASGLifecycleTerminationTaint, 84 | Value: drainEvent.EventID[:63], 85 | Effect: v1.TaintEffectNoSchedule, 86 | }, 87 | }}, 88 | } 89 | 90 | _, err := client.CoreV1().Nodes().Create(context.Background(), newNode, metav1.CreateOptions{}) 91 | h.Ok(t, err) 92 | 93 | tNode, err := node.NewWithValues(nthConfig, getDrainHelper(client), uptime.Uptime) 94 | h.Ok(t, err) 95 | 96 | err = setInterruptionTaint(drainEvent, *tNode) 97 | 98 | h.Ok(t, err) 99 | } 100 | -------------------------------------------------------------------------------- /pkg/monitor/spotitn/spot-itn-monitor_internal_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"). You may 4 | // not use this file except in compliance with the License. A copy of the 5 | // License is located at 6 | // 7 | // http://aws.amazon.com/apache2.0/ 8 | // 9 | // or in the "license" file accompanying this file. This file is distributed 10 | // on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | // express or implied. See the License for the specific language governing 12 | // permissions and limitations under the License. 
13 | 14 | package spotitn 15 | 16 | import ( 17 | "context" 18 | "testing" 19 | "time" 20 | 21 | "github.com/rs/zerolog/log" 22 | 23 | "github.com/aws/aws-node-termination-handler/pkg/config" 24 | "github.com/aws/aws-node-termination-handler/pkg/monitor" 25 | "github.com/aws/aws-node-termination-handler/pkg/node" 26 | h "github.com/aws/aws-node-termination-handler/pkg/test" 27 | "github.com/aws/aws-node-termination-handler/pkg/uptime" 28 | v1 "k8s.io/api/core/v1" 29 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 30 | "k8s.io/client-go/kubernetes/fake" 31 | "k8s.io/kubectl/pkg/drain" 32 | ) 33 | 34 | var spotNodeName = "NAME" 35 | 36 | func getSpotDrainHelper(client *fake.Clientset) *drain.Helper { 37 | return &drain.Helper{ 38 | Client: client, 39 | Force: true, 40 | GracePeriodSeconds: -1, 41 | IgnoreAllDaemonSets: true, 42 | DeleteEmptyDirData: true, 43 | Timeout: time.Duration(120) * time.Second, 44 | Out: log.Logger, 45 | ErrOut: log.Logger, 46 | } 47 | } 48 | 49 | func TestSetInterruptionTaint(t *testing.T) { 50 | drainEvent := monitor.InterruptionEvent{ 51 | EventID: "some-id-that-is-very-long-for-some-reason-and-is-definitely-over-63-characters", 52 | } 53 | nthConfig := config.Config{ 54 | DryRun: true, 55 | NodeName: spotNodeName, 56 | } 57 | 58 | client := fake.NewSimpleClientset() 59 | _, err := client.CoreV1().Nodes().Create(context.Background(), &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: spotNodeName}}, metav1.CreateOptions{}) 60 | h.Ok(t, err) 61 | 62 | tNode, err := node.NewWithValues(nthConfig, getSpotDrainHelper(client), uptime.Uptime) 63 | h.Ok(t, err) 64 | 65 | err = setInterruptionTaint(drainEvent, *tNode) 66 | 67 | h.Ok(t, err) 68 | } 69 | 70 | func TestInterruptionTaintAlreadyPresent(t *testing.T) { 71 | drainEvent := monitor.InterruptionEvent{ 72 | EventID: "some-id-that-is-very-long-for-some-reason-and-is-definitely-over-63-characters", 73 | } 74 | nthConfig := config.Config{ 75 | DryRun: false, 76 | NodeName: spotNodeName, 77 | } 78 | 79 
| client := fake.NewSimpleClientset() 80 | newNode := &v1.Node{ 81 | ObjectMeta: metav1.ObjectMeta{Name: spotNodeName}, 82 | Spec: v1.NodeSpec{Taints: []v1.Taint{{ 83 | Key: node.SpotInterruptionTaint, 84 | Value: drainEvent.EventID[:63], 85 | Effect: v1.TaintEffectNoSchedule, 86 | }, 87 | }}, 88 | } 89 | 90 | _, err := client.CoreV1().Nodes().Create(context.Background(), newNode, metav1.CreateOptions{}) 91 | h.Ok(t, err) 92 | 93 | tNode, err := node.NewWithValues(nthConfig, getSpotDrainHelper(client), uptime.Uptime) 94 | h.Ok(t, err) 95 | 96 | err = setInterruptionTaint(drainEvent, *tNode) 97 | 98 | h.Ok(t, err) 99 | } 100 | -------------------------------------------------------------------------------- /docs/kubernetes_events.md: -------------------------------------------------------------------------------- 1 | # AWS Node Termination Handler Kubernetes events 2 | 3 | AWS Node Termination Handler has the ability to emit a Kubernetes event every time an interruption signal is sent from AWS and also every time an operation is attempted on a node. More information on how to get events can be found [here](https://kubernetes.io/docs/tasks/debug-application-cluster/debug-application-introspection/). 4 | 5 | ## Configuration 6 | 7 | There are two relevant parameters: 8 | 9 | * `emit-kubernetes-events`: 10 | 11 | If true, Kubernetes events will be emitted when interruption events are received and when actions are taken on Kubernetes nodes. Defaults to `false`. 12 | 13 | * `kubernetes-events-extra-annotations`: 14 | 15 | A comma-separated list of `key=value` extra annotations to attach to all emitted Kubernetes events. Example: 16 | 17 | `"first=annotation,sample.annotation/number=two"` 18 | 19 | ## Event reasons 20 | 21 | There are a number of events that can be emitted, each one with a reason that can be used to quickly identify the event nature and for filtering. Each event will also have a message with extended information. 
Here's a reasons summary: 22 | 23 | AWS interruption event reasons: 24 | 25 | * `RebalanceRecommendation` 26 | * `ScheduledEvent` 27 | * `SQSTermination` 28 | * `SpotInterruption` 29 | 30 | Node action reasons: 31 | 32 | * `Cordon` 33 | * `CordonError` 34 | * `CordonAndDrain` 35 | * `CordonAndDrainError` 36 | * `PreDrain` 37 | * `PreDrainError` 38 | * `PostDrain` 39 | * `PostDrainError` 40 | * `Uncordon` 41 | * `UncordonError` 42 | * `MonitorError` 43 | 44 | ## Default IMDS mode annotations 45 | 46 | If `emit-kubernetes-events` is enabled and `enable-sqs-termination-draining` is disabled (meaning we're operating in IMDS mode), AWS Node Termination Handler will automatically inject a set of annotations to each event it emits. Such annotations are gathered from the underlying host's IMDS endpoint and enrich each event with information about the host that emitted it. 47 | 48 | _**NOTE**: In Queue Processor mode, the default IMDS mode annotations will be disabled but you can still define a set of extra annotations._ 49 | 50 | The default IMDS mode annotations are: 51 | 52 | Name | Example value 53 | --- | --- 54 | `account-id` | `123456789012` 55 | `availability-zone` | `us-west-2a` 56 | `instance-id` | `i-abcdef12345678901` 57 | `instance-life-cycle` | `spot` 58 | `instance-type` | `m5.8xlarge` 59 | `local-hostname` | `ip-10-1-2-3.us-west-2.compute.internal` 60 | `local-ipv4` | `10.1.2.3` 61 | `public-hostname` | `my-example.host.net` 62 | `public-ipv4` | `42.42.42.42` 63 | `region` | `us-west-2` 64 | 65 | If `kubernetes-events-extra-annotations` are specified they will be appended to the above. In case of collision, the user-defined annotation wins. 66 | 67 | ## How to get events 68 | 69 | All events are about Kubernetes `Node` objects so they belong in the `default` namespace. The event source is `aws-node-termination-handler`. 
From command line, use `kubectl` to get the events as follows: 70 | 71 | ```sh 72 | kubectl get events --field-selector "source=aws-node-termination-handler" 73 | ``` 74 | 75 | To narrow down the search you can use multiple field selectors, like: 76 | 77 | ```sh 78 | kubectl get events --field-selector "reason=SpotInterruption,involvedObject.name=ip-10-1-2-3.us-west-2.compute.internal" 79 | ``` 80 | 81 | Results can also be printed out in JSON or YAML format and piped to processors like `jq` or `yq`. Then, the above annotations can also be used for discovery and filtering. 82 | -------------------------------------------------------------------------------- /pkg/monitor/rebalancerecommendation/rebalance-recommendation-monitor_internal_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"). You may 4 | // not use this file except in compliance with the License. A copy of the 5 | // License is located at 6 | // 7 | // http://aws.amazon.com/apache2.0/ 8 | // 9 | // or in the "license" file accompanying this file. This file is distributed 10 | // on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | // express or implied. See the License for the specific language governing 12 | // permissions and limitations under the License. 
13 | 14 | package rebalancerecommendation 15 | 16 | import ( 17 | "context" 18 | "testing" 19 | "time" 20 | 21 | "github.com/rs/zerolog/log" 22 | 23 | "github.com/aws/aws-node-termination-handler/pkg/config" 24 | "github.com/aws/aws-node-termination-handler/pkg/monitor" 25 | "github.com/aws/aws-node-termination-handler/pkg/node" 26 | h "github.com/aws/aws-node-termination-handler/pkg/test" 27 | "github.com/aws/aws-node-termination-handler/pkg/uptime" 28 | v1 "k8s.io/api/core/v1" 29 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 30 | "k8s.io/client-go/kubernetes/fake" 31 | "k8s.io/kubectl/pkg/drain" 32 | ) 33 | 34 | const spotNodeName = "NAME" 35 | 36 | func getSpotDrainHelper(client *fake.Clientset) *drain.Helper { 37 | return &drain.Helper{ 38 | Client: client, 39 | Force: true, 40 | GracePeriodSeconds: -1, 41 | IgnoreAllDaemonSets: true, 42 | DeleteEmptyDirData: true, 43 | Timeout: time.Duration(120) * time.Second, 44 | Out: log.Logger, 45 | ErrOut: log.Logger, 46 | } 47 | } 48 | 49 | func TestSetInterruptionTaint(t *testing.T) { 50 | drainEvent := monitor.InterruptionEvent{ 51 | EventID: "some-id-that-is-very-long-for-some-reason-and-is-definitely-over-63-characters", 52 | } 53 | nthConfig := config.Config{ 54 | DryRun: true, 55 | NodeName: spotNodeName, 56 | } 57 | 58 | client := fake.NewSimpleClientset() 59 | _, err := client.CoreV1().Nodes().Create(context.Background(), &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: spotNodeName}}, metav1.CreateOptions{}) 60 | h.Ok(t, err) 61 | 62 | tNode, err := node.NewWithValues(nthConfig, getSpotDrainHelper(client), uptime.Uptime) 63 | h.Ok(t, err) 64 | 65 | err = setInterruptionTaint(drainEvent, *tNode) 66 | 67 | h.Ok(t, err) 68 | } 69 | 70 | func TestInterruptionTaintAlreadyPresent(t *testing.T) { 71 | drainEvent := monitor.InterruptionEvent{ 72 | EventID: "some-id-that-is-very-long-for-some-reason-and-is-definitely-over-63-characters", 73 | } 74 | nthConfig := config.Config{ 75 | DryRun: false, 76 | NodeName: 
spotNodeName, 77 | } 78 | 79 | client := fake.NewSimpleClientset() 80 | newNode := &v1.Node{ 81 | ObjectMeta: metav1.ObjectMeta{Name: spotNodeName}, 82 | Spec: v1.NodeSpec{Taints: []v1.Taint{{ 83 | Key: node.RebalanceRecommendationTaint, 84 | Value: drainEvent.EventID[:63], 85 | Effect: v1.TaintEffectNoSchedule, 86 | }, 87 | }}, 88 | } 89 | 90 | _, err := client.CoreV1().Nodes().Create(context.Background(), newNode, metav1.CreateOptions{}) 91 | h.Ok(t, err) 92 | 93 | tNode, err := node.NewWithValues(nthConfig, getSpotDrainHelper(client), uptime.Uptime) 94 | h.Ok(t, err) 95 | 96 | err = setInterruptionTaint(drainEvent, *tNode) 97 | 98 | h.Ok(t, err) 99 | } 100 | -------------------------------------------------------------------------------- /pkg/monitor/sqsevent/sqs-retryer_test.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"). You may 4 | // not use this file except in compliance with the License. A copy of the 5 | // License is located at 6 | // 7 | // http://aws.amazon.com/apache2.0/ 8 | // 9 | // or in the "license" file accompanying this file. This file is distributed 10 | // on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | // express or implied. See the License for the specific language governing 12 | // permissions and limitations under the License. 
13 | 14 | package sqsevent_test 15 | 16 | import ( 17 | "fmt" 18 | "net" 19 | "testing" 20 | "time" 21 | 22 | "github.com/aws/aws-node-termination-handler/pkg/monitor/sqsevent" 23 | h "github.com/aws/aws-node-termination-handler/pkg/test" 24 | "github.com/aws/aws-sdk-go/aws" 25 | "github.com/aws/aws-sdk-go/aws/awserr" 26 | "github.com/aws/aws-sdk-go/aws/client" 27 | "github.com/aws/aws-sdk-go/aws/request" 28 | "github.com/aws/aws-sdk-go/aws/session" 29 | ) 30 | 31 | type temporaryError struct { 32 | error 33 | temp bool 34 | } 35 | 36 | func TestGetSqsClient(t *testing.T) { 37 | retryer := getSqsRetryer(t) 38 | 39 | h.Equals(t, client.DefaultRetryerMaxNumRetries, retryer.NumMaxRetries) 40 | h.Equals(t, time.Duration(1200*time.Millisecond), retryer.MaxRetryDelay) 41 | } 42 | 43 | func TestShouldRetry(t *testing.T) { 44 | retryer := getSqsRetryer(t) 45 | 46 | testCases := []struct { 47 | name string 48 | req *request.Request 49 | shouldRetry bool 50 | }{ 51 | { 52 | name: "AWS throttling error", 53 | req: &request.Request{ 54 | Error: awserr.New("ThrottlingException", "Rate exceeded", nil), 55 | }, 56 | shouldRetry: true, 57 | }, 58 | { 59 | name: "AWS validation error", 60 | req: &request.Request{ 61 | Error: awserr.New("ValidationError", "Invalid parameter", nil), 62 | }, 63 | shouldRetry: false, 64 | }, 65 | { 66 | name: "read connection reset by peer error", 67 | req: &request.Request{ 68 | Error: &temporaryError{ 69 | error: &net.OpError{ 70 | Op: "read", 71 | Err: fmt.Errorf("read: connection reset by peer"), 72 | }, 73 | temp: false, 74 | }}, 75 | shouldRetry: true, 76 | }, 77 | { 78 | name: "read unknown error", 79 | req: &request.Request{ 80 | Error: &temporaryError{ 81 | error: &net.OpError{ 82 | Op: "read", 83 | Err: fmt.Errorf("read unknown error"), 84 | }, 85 | temp: false, 86 | }}, 87 | shouldRetry: false, 88 | }, 89 | } 90 | 91 | for _, tc := range testCases { 92 | t.Run(tc.name, func(t *testing.T) { 93 | result := retryer.ShouldRetry(tc.req) 94 | 
h.Equals(t, tc.shouldRetry, result) 95 | }) 96 | } 97 | } 98 | 99 | func getSqsRetryer(t *testing.T) sqsevent.SqsRetryer { 100 | sess, err := session.NewSession(&aws.Config{ 101 | Region: aws.String("us-east-1"), 102 | }) 103 | h.Ok(t, err) 104 | 105 | sqsClient := sqsevent.GetSqsClient(sess) 106 | h.Assert(t, sqsClient.Client.Config.Region != nil, "Region should not be nil") 107 | h.Equals(t, "us-east-1", *sqsClient.Client.Config.Region) 108 | 109 | retryer, ok := sqsClient.Client.Config.Retryer.(sqsevent.SqsRetryer) 110 | h.Assert(t, ok, "Retryer should be of type SqsRetryer") 111 | return retryer 112 | } 113 | 114 | func (e *temporaryError) Temporary() bool { 115 | return e.temp 116 | } 117 | -------------------------------------------------------------------------------- /pkg/interruptionevent/internal/common/handler.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"). You may 4 | // not use this file except in compliance with the License. A copy of the 5 | // License is located at 6 | // 7 | // http://aws.amazon.com/apache2.0/ 8 | // 9 | // or in the "license" file accompanying this file. This file is distributed 10 | // on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | // express or implied. 
See the License for the specific language governing 12 | // permissions and limitations under the License 13 | 14 | package common 15 | 16 | import ( 17 | "fmt" 18 | 19 | "github.com/aws/aws-node-termination-handler/pkg/config" 20 | "github.com/aws/aws-node-termination-handler/pkg/interruptioneventstore" 21 | "github.com/aws/aws-node-termination-handler/pkg/monitor" 22 | "github.com/aws/aws-node-termination-handler/pkg/node" 23 | "github.com/aws/aws-node-termination-handler/pkg/observability" 24 | "github.com/rs/zerolog/log" 25 | ) 26 | 27 | type Handler struct { 28 | InterruptionEventStore *interruptioneventstore.Store 29 | Node node.Node 30 | NthConfig config.Config 31 | Metrics observability.Metrics 32 | Recorder observability.K8sEventRecorder 33 | } 34 | 35 | func (h *Handler) GetNodeName(drainEvent *monitor.InterruptionEvent) (string, error) { 36 | if !h.NthConfig.UseProviderId { 37 | return drainEvent.NodeName, nil 38 | } 39 | 40 | nodeName, err := h.Node.GetNodeNameFromProviderID(drainEvent.ProviderID) 41 | if err != nil { 42 | return "", fmt.Errorf("parse node name from providerID=%q: %w", drainEvent.ProviderID, err) 43 | } 44 | return nodeName, nil 45 | } 46 | 47 | func (h *Handler) RunPreDrainTask(nodeName string, drainEvent *monitor.InterruptionEvent) { 48 | err := drainEvent.PreDrainTask(*drainEvent, h.Node) 49 | if err != nil { 50 | log.Err(err).Msg("There was a problem executing the pre-drain task") 51 | h.Recorder.Emit(nodeName, observability.Warning, observability.PreDrainErrReason, observability.PreDrainErrMsgFmt, err.Error()) 52 | } else { 53 | h.Recorder.Emit(nodeName, observability.Normal, observability.PreDrainReason, observability.PreDrainMsg) 54 | } 55 | h.Metrics.NodeActionsInc("pre-drain", nodeName, drainEvent.EventID, err) 56 | } 57 | 58 | func (h *Handler) RunCancelDrainTask(nodeName string, drainEvent *monitor.InterruptionEvent) { 59 | err := drainEvent.CancelDrainTask(*drainEvent, h.Node) 60 | if err != nil { 61 | 
log.Err(err).Msg("There was a problem executing the early exit task") 62 | h.Recorder.Emit(nodeName, observability.Warning, observability.CancelDrainErrReason, observability.CancelDrainErrMsgFmt, err.Error()) 63 | } else { 64 | h.Recorder.Emit(nodeName, observability.Normal, observability.CancelDrainReason, observability.CancelDrainMsg) 65 | } 66 | } 67 | 68 | func (h *Handler) RunPostDrainTask(nodeName string, drainEvent *monitor.InterruptionEvent) { 69 | err := drainEvent.PostDrainTask(*drainEvent, h.Node) 70 | if err != nil { 71 | log.Err(err).Msg("There was a problem executing the post-drain task") 72 | h.Recorder.Emit(nodeName, observability.Warning, observability.PostDrainErrReason, observability.PostDrainErrMsgFmt, err.Error()) 73 | } else { 74 | h.Recorder.Emit(nodeName, observability.Normal, observability.PostDrainReason, observability.PostDrainMsg) 75 | } 76 | h.Metrics.NodeActionsInc("post-drain", nodeName, drainEvent.EventID, err) 77 | } 78 | 79 | func IsAllowedKind(kind string, allowedKinds ...string) bool { 80 | for _, allowedKind := range allowedKinds { 81 | if kind == allowedKind { 82 | return true 83 | } 84 | } 85 | return false 86 | } 87 | -------------------------------------------------------------------------------- /test/e2e/spot-interruption-dry-run-test: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | # Available env vars: 5 | # $TMP_DIR 6 | # $CLUSTER_NAME 7 | # $KUBECONFIG 8 | # $NODE_TERMINATION_HANDLER_DOCKER_REPO 9 | # $NODE_TERMINATION_HANDLER_DOCKER_TAG 10 | # $WEBHOOK_DOCKER_REPO 11 | # $WEBHOOK_DOCKER_TAG 12 | # $AEMM_URL 13 | # $AEMM_VERSION 14 | 15 | function fail_and_exit { 16 | echo "❌ Spot Interruption Dry Run test failed $CLUSTER_NAME ❌" 17 | exit "${1:-1}" 18 | } 19 | 20 | echo "Starting Maintenance Events Dry-Run Test for Node Termination Handler" 21 | 22 | SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" 23 | 24 | common_helm_args=() 25 | [[ 
"${TEST_WINDOWS-}" == "true" ]] && common_helm_args+=(--set targetNodeOs="windows") 26 | [[ -n "${NTH_WORKER_LABEL-}" ]] && common_helm_args+=(--set nodeSelector."$NTH_WORKER_LABEL") 27 | 28 | anth_helm_args=( 29 | upgrade 30 | --install 31 | --namespace kube-system 32 | "$CLUSTER_NAME-anth" 33 | "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" 34 | --set instanceMetadataURL="${INSTANCE_METADATA_URL:-"http://$AEMM_URL:$IMDS_PORT"}" 35 | --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" 36 | --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" 37 | --set dryRun="true" 38 | --set enableSpotInterruptionDraining="true" 39 | --set enableScheduledEventDraining="true" 40 | --wait 41 | --force 42 | ) 43 | [[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && 44 | anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") 45 | [[ ${#common_helm_args[@]} -gt 0 ]] && 46 | anth_helm_args+=("${common_helm_args[@]}") 47 | 48 | set -x 49 | helm "${anth_helm_args[@]}" 50 | set +x 51 | 52 | emtp_helm_args=( 53 | upgrade 54 | --install 55 | --namespace default 56 | "$CLUSTER_NAME-emtp" 57 | "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" 58 | --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" 59 | --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" 60 | --wait 61 | ) 62 | [[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && 63 | emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") 64 | [[ ${#common_helm_args[@]} -gt 0 ]] && 65 | emtp_helm_args+=("${common_helm_args[@]}") 66 | 67 | set -x 68 | helm "${emtp_helm_args[@]}" 69 | set +x 70 | 71 | aemm_helm_args=( 72 | upgrade 73 | --install 74 | --namespace default 75 | "$CLUSTER_NAME-aemm" 76 | "$AEMM_DL_URL" 77 | --set servicePort="$IMDS_PORT" 78 | --set arguments='{spot}' 79 | --wait 80 | ) 81 | [[ ${#common_helm_args[@]} -gt 0 ]] && 82 | aemm_helm_args+=("${common_helm_args[@]}") 83 | 84 | set -x 85 | retry 5 helm 
"${aemm_helm_args[@]}" 86 | set +x 87 | 88 | TAINT_CHECK_CYCLES=15 89 | TAINT_CHECK_SLEEP=15 90 | 91 | logs=0 92 | pod_id=$(get_nth_worker_pod) 93 | test_node="${TEST_NODE:-$CLUSTER_NAME-worker}" 94 | for i in $(seq 1 $TAINT_CHECK_CYCLES); do 95 | if [[ $logs -eq 0 && ! -z $(kubectl logs "${pod_id}" -n kube-system | grep -i -e 'would have been cordoned and drained') ]]; then 96 | echo "✅ Verified the dryrun logs were executed" 97 | logs=1 98 | fi 99 | 100 | if [[ $logs -eq 1 ]] && kubectl get nodes "${test_node}" --no-headers | grep -v SchedulingDisabled >/dev/null; then 101 | echo "✅ Verified the worker node was not cordoned!" 102 | echo "✅ Spot Interruption Dry Run Test Passed $CLUSTER_NAME! ✅" 103 | exit 0 104 | fi 105 | echo "Assertion Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds" 106 | sleep $TAINT_CHECK_SLEEP 107 | done 108 | 109 | if [[ $logs -eq 0 ]]; then 110 | echo "❌ dryrun logs were not executed" 111 | else 112 | echo "❌ Worker node was cordoned" 113 | fi 114 | 115 | fail_and_exit 1 116 | -------------------------------------------------------------------------------- /test/e2e/maintenance-event-dry-run-test: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | # Available env vars: 5 | # $TMP_DIR 6 | # $CLUSTER_NAME 7 | # $KUBECONFIG 8 | # $NODE_TERMINATION_HANDLER_DOCKER_REPO 9 | # $NODE_TERMINATION_HANDLER_DOCKER_TAG 10 | # $WEBHOOK_DOCKER_REPO 11 | # $WEBHOOK_DOCKER_TAG 12 | # $AEMM_URL 13 | # $AEMM_VERSION 14 | 15 | 16 | function fail_and_exit { 17 | echo "❌ Scheduled Maintenance Events Dry-Run Test failed $CLUSTER_NAME ❌" 18 | exit "${1:-1}" 19 | } 20 | 21 | echo "Starting Maintenance Events Dry-Run Test for Node Termination Handler" 22 | 23 | SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" 24 | 25 | common_helm_args=() 26 | [[ "${TEST_WINDOWS-}" == "true" ]] && common_helm_args+=(--set targetNodeOs="windows") 27 | [[ -n "${NTH_WORKER_LABEL-}" ]] && 
common_helm_args+=(--set nodeSelector."$NTH_WORKER_LABEL") 28 | 29 | anth_helm_args=( 30 | upgrade 31 | --install 32 | --namespace kube-system 33 | "$CLUSTER_NAME-anth" 34 | "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" 35 | --set instanceMetadataURL="${INSTANCE_METADATA_URL:-"http://$AEMM_URL:$IMDS_PORT"}" 36 | --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" 37 | --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" 38 | --set dryRun="true" 39 | --set enableSpotInterruptionDraining="true" 40 | --set enableScheduledEventDraining="true" 41 | --wait 42 | --force 43 | ) 44 | [[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && 45 | anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") 46 | [[ ${#common_helm_args[@]} -gt 0 ]] && 47 | anth_helm_args+=("${common_helm_args[@]}") 48 | 49 | set -x 50 | helm "${anth_helm_args[@]}" 51 | set +x 52 | 53 | emtp_helm_args=( 54 | upgrade 55 | --install 56 | --namespace default 57 | "$CLUSTER_NAME-emtp" 58 | "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" 59 | --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" 60 | --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" 61 | --wait 62 | --force 63 | ) 64 | [[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && 65 | emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") 66 | [[ ${#common_helm_args[@]} -gt 0 ]] && 67 | emtp_helm_args+=("${common_helm_args[@]}") 68 | 69 | set -x 70 | helm "${emtp_helm_args[@]}" 71 | set +x 72 | 73 | aemm_helm_args=( 74 | upgrade 75 | --install 76 | --namespace default 77 | "$CLUSTER_NAME-aemm" 78 | "$AEMM_DL_URL" 79 | --set servicePort="$IMDS_PORT" 80 | --set arguments='{events}' 81 | --wait 82 | ) 83 | [[ ${#common_helm_args[@]} -gt 0 ]] && 84 | aemm_helm_args+=("${common_helm_args[@]}") 85 | 86 | set -x 87 | retry 5 helm "${aemm_helm_args[@]}" 88 | set +x 89 | 90 | TAINT_CHECK_CYCLES=15 91 | TAINT_CHECK_SLEEP=15 92 | 93 | logs=0 94 | 
pod_id="$(get_nth_worker_pod)" 95 | test_node="${TEST_NODE:-$CLUSTER_NAME-worker}" 96 | for i in $(seq 1 $TAINT_CHECK_CYCLES); do 97 | if [[ $logs -eq 0 && ! -z $(kubectl logs "${pod_id}" -n kube-system | grep -i -e 'would have been cordoned and drained') ]]; then 98 | echo "✅ Verified the dryrun logs were executed" 99 | logs=1 100 | fi 101 | 102 | if [[ $logs -eq 1 ]] && kubectl get nodes "${test_node}" --no-headers | grep -v SchedulingDisabled >/dev/null; then 103 | echo "✅ Verified the worker node was not cordoned!" 104 | echo "✅ Scheduled Maintenance Event Dry Run Test Passed $CLUSTER_NAME! ✅" 105 | exit 0 106 | fi 107 | echo "Assertion Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds" 108 | sleep $TAINT_CHECK_SLEEP 109 | done 110 | 111 | if [[ $logs -eq 0 ]]; then 112 | echo "❌ Dryrun logs were not executed" 113 | else 114 | echo "❌ Worker node was cordoned" 115 | fi 116 | 117 | fail_and_exit 1 118 | -------------------------------------------------------------------------------- /pkg/monitor/scheduledevent/scheduled-event-monitor_internal_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"). You may 4 | // not use this file except in compliance with the License. A copy of the 5 | // License is located at 6 | // 7 | // http://aws.amazon.com/apache2.0/ 8 | // 9 | // or in the "license" file accompanying this file. This file is distributed 10 | // on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | // express or implied. See the License for the specific language governing 12 | // permissions and limitations under the License.
13 | 14 | package scheduledevent 15 | 16 | import ( 17 | "context" 18 | "testing" 19 | "time" 20 | 21 | "github.com/rs/zerolog/log" 22 | 23 | "github.com/aws/aws-node-termination-handler/pkg/config" 24 | "github.com/aws/aws-node-termination-handler/pkg/monitor" 25 | "github.com/aws/aws-node-termination-handler/pkg/node" 26 | h "github.com/aws/aws-node-termination-handler/pkg/test" 27 | "github.com/aws/aws-node-termination-handler/pkg/uptime" 28 | v1 "k8s.io/api/core/v1" 29 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 30 | "k8s.io/client-go/kubernetes/fake" 31 | "k8s.io/kubectl/pkg/drain" 32 | ) 33 | 34 | var nodeName = "NAME" 35 | 36 | func getDrainHelper(client *fake.Clientset) *drain.Helper { 37 | return &drain.Helper{ 38 | Client: client, 39 | Force: true, 40 | GracePeriodSeconds: -1, 41 | IgnoreAllDaemonSets: true, 42 | DeleteEmptyDirData: true, 43 | Timeout: time.Duration(120) * time.Second, 44 | Out: log.Logger, 45 | ErrOut: log.Logger, 46 | } 47 | } 48 | 49 | func getNode(t *testing.T, drainHelper *drain.Helper) *node.Node { 50 | nthConfig := config.Config{ 51 | NodeName: nodeName, 52 | } 53 | tNode, err := node.NewWithValues(nthConfig, drainHelper, uptime.Uptime) 54 | if err != nil { 55 | t.Fatalf("failed to create node: %v", err) 56 | } 57 | return tNode 58 | } 59 | 60 | func TestUncordonAfterRebootPreDrainSuccess(t *testing.T) { 61 | drainEvent := monitor.InterruptionEvent{ 62 | EventID: "some-id-that-is-very-long-for-some-reason-and-is-definitely-over-63-characters", 63 | } 64 | nthConfig := config.Config{ 65 | DryRun: true, 66 | NodeName: nodeName, 67 | } 68 | 69 | client := fake.NewSimpleClientset() 70 | _, err := client.CoreV1().Nodes().Create(context.Background(), &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: nodeName}}, metav1.CreateOptions{}) 71 | h.Ok(t, err) 72 | 73 | tNode, err := node.NewWithValues(nthConfig, getDrainHelper(client), uptime.Uptime) 74 | h.Ok(t, err) 75 | 76 | err = uncordonAfterRebootPreDrain(drainEvent, *tNode) 77 | 78 | h.Ok(t, err)
79 | } 80 | 81 | func TestUncordonAfterRebootPreDrainMarkWithEventIDFailure(t *testing.T) { 82 | tNode := getNode(t, getDrainHelper(fake.NewSimpleClientset())) 83 | err := uncordonAfterRebootPreDrain(monitor.InterruptionEvent{}, *tNode) 84 | h.Assert(t, err != nil, "Failed to return error on MarkWithEventID failing to fetch node") 85 | } 86 | 87 | func TestUncordonAfterRebootPreDrainNodeAlreadyMarkedSuccess(t *testing.T) { 88 | nthConfig := config.Config{ 89 | DryRun: true, 90 | NodeName: nodeName, 91 | } 92 | 93 | client := fake.NewSimpleClientset() 94 | _, err := client.CoreV1().Nodes().Create(context.Background(), 95 | &v1.Node{ 96 | ObjectMeta: metav1.ObjectMeta{ 97 | Name: nodeName, 98 | }, 99 | Spec: v1.NodeSpec{ 100 | Unschedulable: true, 101 | }, 102 | }, 103 | metav1.CreateOptions{}) 104 | h.Ok(t, err) 105 | 106 | tNode, err := node.NewWithValues(nthConfig, getDrainHelper(client), uptime.Uptime) 107 | h.Ok(t, err) 108 | 109 | err = uncordonAfterRebootPreDrain(monitor.InterruptionEvent{}, *tNode) 110 | h.Ok(t, err) 111 | } 112 | -------------------------------------------------------------------------------- /pkg/logging/versioned.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"). You may 4 | // not use this file except in compliance with the License. A copy of the 5 | // License is located at 6 | // 7 | // http://aws.amazon.com/apache2.0/ 8 | // 9 | // or in the "license" file accompanying this file. This file is distributed 10 | // on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | // express or implied. See the License for the specific language governing 12 | // permissions and limitations under the License. 
13 | 14 | package logging 15 | 16 | import ( 17 | "fmt" 18 | 19 | "github.com/aws/aws-node-termination-handler/pkg/monitor" 20 | "github.com/rs/zerolog/log" 21 | ) 22 | 23 | type versionedMsgsV1 struct{} 24 | 25 | func (versionedMsgsV1) MonitoringStarted(monitorKind string) { 26 | log.Info().Str("event_type", monitorKind).Msg("Started monitoring for events") 27 | } 28 | 29 | func (versionedMsgsV1) ProblemMonitoringForEvents(monitorKind string, err error) { 30 | log.Warn().Str("event_type", monitorKind).Err(err).Msg("There was a problem monitoring for events") 31 | } 32 | 33 | func (versionedMsgsV1) ProcessingInterruptionEvent(event *monitor.InterruptionEvent) { 34 | var message string 35 | switch event.Kind { 36 | case monitor.ASGLaunchLifecycleKind: 37 | message = "Waiting for node to be ready before completing ASG launch lifecycle" 38 | default: 39 | message = "Requesting instance drain" 40 | } 41 | 42 | log.Info(). 43 | Str("event-id", event.EventID). 44 | Str("kind", event.Kind). 45 | Str("node-name", event.NodeName). 46 | Str("instance-id", event.InstanceID). 47 | Str("provider-id", event.ProviderID). 
48 | Msg(message) 49 | } 50 | 51 | func (versionedMsgsV1) SendingInterruptionEventToChannel(_ string) { 52 | log.Debug().Msg("Sending SQS_TERMINATE interruption event to the interruption channel") 53 | } 54 | 55 | type versionedMsgsV2 struct{} 56 | 57 | func (versionedMsgsV2) MonitoringStarted(monitorKind string) { 58 | log.Info().Str("monitor_type", monitorKind).Msg("Started monitoring for events") 59 | } 60 | 61 | func (versionedMsgsV2) ProblemMonitoringForEvents(monitorKind string, err error) { 62 | log.Warn().Str("monitor_type", monitorKind).Err(err).Msg("There was a problem monitoring for events") 63 | } 64 | 65 | func (versionedMsgsV2) ProcessingInterruptionEvent(event *monitor.InterruptionEvent) { 66 | var message string 67 | switch event.Kind { 68 | case monitor.ASGLaunchLifecycleKind: 69 | message = "Waiting for node to be ready before completing ASG launch lifecycle" 70 | default: 71 | message = "Requesting instance drain" 72 | } 73 | 74 | log.Info(). 75 | Str("event-id", event.EventID). 76 | Str("kind", event.Kind). 77 | Str("monitor", event.Monitor). 78 | Str("node-name", event.NodeName). 79 | Str("instance-id", event.InstanceID). 80 | Str("provider-id", event.ProviderID). 
81 | Msg(message) 82 | } 83 | 84 | func (versionedMsgsV2) SendingInterruptionEventToChannel(eventKind string) { 85 | log.Debug().Msgf("Sending %s interruption event to the interruption channel", eventKind) 86 | } 87 | 88 | var VersionedMsgs interface { 89 | MonitoringStarted(monitorKind string) 90 | ProblemMonitoringForEvents(monitorKind string, err error) 91 | ProcessingInterruptionEvent(event *monitor.InterruptionEvent) 92 | SendingInterruptionEventToChannel(eventKind string) 93 | } = versionedMsgsV1{} 94 | 95 | func SetFormatVersion(version int) error { 96 | switch version { 97 | case 1: 98 | VersionedMsgs = versionedMsgsV1{} 99 | return nil 100 | case 2: 101 | VersionedMsgs = versionedMsgsV2{} 102 | return nil 103 | default: 104 | VersionedMsgs = versionedMsgsV1{} 105 | return fmt.Errorf("Unrecognized log format version: %d, using version 1", version) 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /test/e2e/emit-events-test: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | # Available env vars: 5 | # $TMP_DIR 6 | # $CLUSTER_NAME 7 | # $KUBECONFIG 8 | # $NODE_TERMINATION_HANDLER_DOCKER_REPO 9 | # $NODE_TERMINATION_HANDLER_DOCKER_TAG 10 | # $WEBHOOK_DOCKER_REPO 11 | # $WEBHOOK_DOCKER_TAG 12 | # $AEMM_URL 13 | # $AEMM_VERSION 14 | 15 | function fail_and_exit { 16 | echo "❌ K8s Emit Events Test failed $CLUSTER_NAME ❌" 17 | exit "${1:-1}" 18 | } 19 | 20 | echo "Starting K8s Emit Events Test for Node Termination Handler" 21 | 22 | SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" 23 | 24 | common_helm_args=() 25 | [[ "${TEST_WINDOWS-}" == "true" ]] && common_helm_args+=(--set targetNodeOs="windows") 26 | [[ -n "${NTH_WORKER_LABEL-}" ]] && common_helm_args+=(--set nodeSelector."$NTH_WORKER_LABEL") 27 | 28 | aemm_helm_args=( 29 | upgrade 30 | --install 31 | --namespace default 32 | "$CLUSTER_NAME-aemm" 33 | "$AEMM_DL_URL" 34 | --set 
aemm.IMDSv2="true" 35 | --set servicePort="$IMDS_PORT" 36 | --wait 37 | ) 38 | [[ ${#common_helm_args[@]} -gt 0 ]] && 39 | aemm_helm_args+=("${common_helm_args[@]}") 40 | 41 | set -x 42 | retry 5 helm "${aemm_helm_args[@]}" 43 | set +x 44 | 45 | sleep 5 46 | 47 | anth_helm_args=( 48 | upgrade 49 | --install 50 | --namespace kube-system 51 | "$CLUSTER_NAME-anth" 52 | "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" 53 | --set instanceMetadataURL="${INSTANCE_METADATA_URL:-"http://$AEMM_URL:$IMDS_PORT"}" 54 | --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" 55 | --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" 56 | --set enableSpotInterruptionDraining="true" 57 | --set enableScheduledEventDraining="true" 58 | --set emitKubernetesEvents="true" 59 | --wait 60 | --force 61 | ) 62 | [[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && 63 | anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") 64 | [[ ${#common_helm_args[@]} -gt 0 ]] && 65 | anth_helm_args+=("${common_helm_args[@]}") 66 | 67 | set -x 68 | helm "${anth_helm_args[@]}" 69 | set +x 70 | 71 | emtp_helm_args=( 72 | upgrade 73 | --install 74 | --namespace default 75 | "$CLUSTER_NAME-emtp" 76 | "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" 77 | --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" 78 | --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" 79 | --wait 80 | ) 81 | [[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && 82 | emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") 83 | [[ ${#common_helm_args[@]} -gt 0 ]] && 84 | emtp_helm_args+=("${common_helm_args[@]}") 85 | 86 | set -x 87 | helm "${emtp_helm_args[@]}" 88 | set +x 89 | 90 | TAINT_CHECK_CYCLES=15 91 | TAINT_CHECK_SLEEP=15 92 | 93 | DEPLOYED=0 94 | 95 | for i in $(seq 1 $TAINT_CHECK_CYCLES); do 96 | if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then 97 | echo "✅ 
Verified regular-pod-test pod was scheduled and started!" 98 | DEPLOYED=1 99 | break 100 | fi 101 | echo "Setup Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds" 102 | sleep $TAINT_CHECK_SLEEP 103 | done 104 | 105 | if [[ $DEPLOYED -eq 0 ]]; then 106 | echo "❌ regular-pod-test pod deployment failed" 107 | fail_and_exit 2 108 | fi 109 | 110 | test_node="${TEST_NODE:-$CLUSTER_NAME-worker}" 111 | for i in $(seq 1 $TAINT_CHECK_CYCLES); do 112 | if kubectl get events | tr -s " " | grep "CordonAndDrain node/${test_node} Node successfully cordoned and drained" >/dev/null; then 113 | echo "✅ Verified CordonAndDrain was emitted as a k8s event! (success event)" 114 | echo "✅ K8s Emit Events Test Passed $CLUSTER_NAME! ✅" 115 | exit 0 116 | fi 117 | 118 | echo "Assertion Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds" 119 | sleep $TAINT_CHECK_SLEEP 120 | done 121 | 122 | 123 | echo "❌ k8s CordonAndDrain event was not emitted to k8s" 124 | 125 | fail_and_exit 1 126 | -------------------------------------------------------------------------------- /pkg/monitor/sqsevent/spot-itn-event.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"). You may 4 | // not use this file except in compliance with the License. A copy of the 5 | // License is located at 6 | // 7 | // http://aws.amazon.com/apache2.0/ 8 | // 9 | // or in the "license" file accompanying this file. This file is distributed 10 | // on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | // express or implied. See the License for the specific language governing 12 | // permissions and limitations under the License. 
13 | 14 | package sqsevent 15 | 16 | import ( 17 | "encoding/json" 18 | "fmt" 19 | 20 | "github.com/aws/aws-node-termination-handler/pkg/monitor" 21 | "github.com/aws/aws-node-termination-handler/pkg/node" 22 | "github.com/aws/aws-sdk-go/service/sqs" 23 | "github.com/rs/zerolog/log" 24 | ) 25 | 26 | /* Example Spot ITN Event: 27 | { 28 | "version": "0", 29 | "id": "1e5527d7-bb36-4607-3370-4164db56a40e", 30 | "detail-type": "EC2 Spot Instance Interruption Warning", 31 | "source": "aws.ec2", 32 | "account": "", 33 | "time": "1970-01-01T00:00:00Z", 34 | "region": "us-east-1", 35 | "resources": [ 36 | "arn:aws:ec2:us-east-1b:instance/i-0b662ef9931388ba0" 37 | ], 38 | "detail": { 39 | "instance-id": "i-0b662ef9931388ba0", 40 | "instance-action": "terminate" 41 | } 42 | } 43 | */ 44 | 45 | // SpotInterruptionDetail holds the event details for spot interruption events from Amazon EventBridge 46 | type SpotInterruptionDetail struct { 47 | InstanceID string `json:"instance-id"` 48 | InstanceAction string `json:"instance-action"` 49 | } 50 | 51 | func (m SQSMonitor) spotITNTerminationToInterruptionEvent(event *EventBridgeEvent, message *sqs.Message) (*monitor.InterruptionEvent, error) { 52 | spotInterruptionDetail := &SpotInterruptionDetail{} 53 | err := json.Unmarshal(event.Detail, spotInterruptionDetail) 54 | if err != nil { 55 | return nil, err 56 | } 57 | 58 | nodeInfo, err := m.getNodeInfo(spotInterruptionDetail.InstanceID) 59 | if err != nil { 60 | return nil, err 61 | } 62 | interruptionEvent := monitor.InterruptionEvent{ 63 | EventID: fmt.Sprintf("spot-itn-event-%x", event.ID), 64 | Kind: monitor.SpotITNKind, 65 | Monitor: SQSMonitorKind, 66 | AutoScalingGroupName: nodeInfo.AsgName, 67 | StartTime: event.getTime(), 68 | NodeName: nodeInfo.Name, 69 | IsManaged: nodeInfo.IsManaged, 70 | InstanceID: spotInterruptionDetail.InstanceID, 71 | ProviderID: nodeInfo.ProviderID, 72 | InstanceType: nodeInfo.InstanceType, 73 | Description: fmt.Sprintf("Spot Interruption notice 
for instance %s was sent at %s \n", spotInterruptionDetail.InstanceID, event.getTime()), 74 | } 75 | interruptionEvent.PostDrainTask = func(interruptionEvent monitor.InterruptionEvent, n node.Node) error { 76 | errs := m.deleteMessages([]*sqs.Message{message}) 77 | if errs != nil { 78 | return errs[0] 79 | } 80 | return nil 81 | } 82 | interruptionEvent.PreDrainTask = func(interruptionEvent monitor.InterruptionEvent, n node.Node) error { 83 | // Use provider ID to resolve the actual Kubernetes node name if UseProviderId is configured 84 | nthConfig := n.GetNthConfig() 85 | nodeName := interruptionEvent.NodeName 86 | if nthConfig.UseProviderId && interruptionEvent.ProviderID != "" { 87 | resolvedNodeName, err := n.GetNodeNameFromProviderID(interruptionEvent.ProviderID) 88 | if err != nil { 89 | log.Warn().Err(err).Str("provider_id", interruptionEvent.ProviderID).Msg("Failed to resolve node name from provider ID, falling back to NodeName from event") 90 | } else { 91 | nodeName = resolvedNodeName 92 | } 93 | } 94 | 95 | err := n.TaintSpotItn(nodeName, interruptionEvent.EventID) 96 | if err != nil { 97 | log.Err(err).Msgf("Unable to taint node with taint %s:%s", node.SpotInterruptionTaint, interruptionEvent.EventID) 98 | } 99 | return nil 100 | } 101 | return &interruptionEvent, nil 102 | } 103 | -------------------------------------------------------------------------------- /pkg/monitor/sqsevent/rebalance-recommendation-event.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"). You may 4 | // not use this file except in compliance with the License. A copy of the 5 | // License is located at 6 | // 7 | // http://aws.amazon.com/apache2.0/ 8 | // 9 | // or in the "license" file accompanying this file. 
This file is distributed 10 | // on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | // express or implied. See the License for the specific language governing 12 | // permissions and limitations under the License. 13 | 14 | package sqsevent 15 | 16 | import ( 17 | "encoding/json" 18 | "fmt" 19 | 20 | "github.com/aws/aws-node-termination-handler/pkg/monitor" 21 | "github.com/aws/aws-node-termination-handler/pkg/node" 22 | "github.com/aws/aws-sdk-go/service/sqs" 23 | "github.com/rs/zerolog/log" 24 | ) 25 | 26 | /* Example Rebalance Recommendation Event: 27 | { 28 | "version": "0", 29 | "id": "5d5555d5-dd55-5555-5555-5555dd55d55d", 30 | "detail-type": "EC2 Instance Rebalance Recommendation", 31 | "source": "aws.ec2", 32 | "account": "123456789012", 33 | "time": "2020-10-26T14:14:14Z", 34 | "region": "us-east-1", 35 | "resources": [ 36 | "arn:aws:ec2:us-east-1b:instance/i-0b662ef9931388ba0" 37 | ], 38 | "detail": { 39 | "instance-id": "i-0b662ef9931388ba0" 40 | } 41 | } 42 | */ 43 | 44 | // RebalanceRecommendationDetail holds the event details for rebalance recommendation events from Amazon EventBridge 45 | type RebalanceRecommendationDetail struct { 46 | InstanceID string `json:"instance-id"` 47 | } 48 | 49 | func (m SQSMonitor) rebalanceRecommendationToInterruptionEvent(event *EventBridgeEvent, message *sqs.Message) (*monitor.InterruptionEvent, error) { 50 | rebalanceRecDetail := &RebalanceRecommendationDetail{} 51 | err := json.Unmarshal(event.Detail, rebalanceRecDetail) 52 | if err != nil { 53 | return nil, err 54 | } 55 | 56 | nodeInfo, err := m.getNodeInfo(rebalanceRecDetail.InstanceID) 57 | if err != nil { 58 | return nil, err 59 | } 60 | interruptionEvent := monitor.InterruptionEvent{ 61 | EventID: fmt.Sprintf("rebalance-recommendation-event-%x", event.ID), 62 | Kind: monitor.RebalanceRecommendationKind, 63 | Monitor: SQSMonitorKind, 64 | AutoScalingGroupName: nodeInfo.AsgName, 65 | StartTime: event.getTime(), 66 | NodeName: 
nodeInfo.Name, 67 | IsManaged: nodeInfo.IsManaged, 68 | InstanceID: nodeInfo.InstanceID, 69 | ProviderID: nodeInfo.ProviderID, 70 | InstanceType: nodeInfo.InstanceType, 71 | Description: fmt.Sprintf("Rebalance recommendation event received. Instance %s will be cordoned at %s \n", rebalanceRecDetail.InstanceID, event.getTime()), 72 | } 73 | interruptionEvent.PostDrainTask = func(interruptionEvent monitor.InterruptionEvent, n node.Node) error { 74 | errs := m.deleteMessages([]*sqs.Message{message}) 75 | if errs != nil { 76 | return errs[0] 77 | } 78 | return nil 79 | } 80 | interruptionEvent.PreDrainTask = func(interruptionEvent monitor.InterruptionEvent, n node.Node) error { 81 | // Use provider ID to resolve the actual Kubernetes node name if UseProviderId is configured 82 | nthConfig := n.GetNthConfig() 83 | nodeName := interruptionEvent.NodeName 84 | if nthConfig.UseProviderId && interruptionEvent.ProviderID != "" { 85 | resolvedNodeName, err := n.GetNodeNameFromProviderID(interruptionEvent.ProviderID) 86 | if err != nil { 87 | log.Warn().Err(err).Str("provider_id", interruptionEvent.ProviderID).Msg("Failed to resolve node name from provider ID, falling back to NodeName from event") 88 | } else { 89 | nodeName = resolvedNodeName 90 | } 91 | } 92 | 93 | err := n.TaintRebalanceRecommendation(nodeName, interruptionEvent.EventID) 94 | if err != nil { 95 | log.Err(err).Msgf("Unable to taint node with taint %s:%s", node.RebalanceRecommendationTaint, interruptionEvent.EventID) 96 | } 97 | return nil 98 | } 99 | return &interruptionEvent, nil 100 | } 101 | -------------------------------------------------------------------------------- /test/e2e/cordon-only-test: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | # Available env vars: 5 | # $TMP_DIR 6 | # $CLUSTER_NAME 7 | # $KUBECONFIG 8 | # $NODE_TERMINATION_HANDLER_DOCKER_REPO 9 | # $NODE_TERMINATION_HANDLER_DOCKER_TAG 10 | # 
$WEBHOOK_DOCKER_REPO 11 | # $WEBHOOK_DOCKER_TAG 12 | # $AEMM_URL 13 | # $AEMM_VERSION 14 | 15 | function fail_and_exit { 16 | echo "❌ Cordon Only Test failed $CLUSTER_NAME ❌" 17 | exit "${1:-1}" 18 | } 19 | 20 | echo "Starting Cordon Only Test for Node Termination Handler" 21 | 22 | SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" 23 | 24 | common_helm_args=() 25 | [[ "${TEST_WINDOWS-}" == "true" ]] && common_helm_args+=(--set targetNodeOs="windows") 26 | [[ -n "${NTH_WORKER_LABEL-}" ]] && common_helm_args+=(--set nodeSelector."$NTH_WORKER_LABEL") 27 | 28 | anth_helm_args=( 29 | upgrade 30 | --install 31 | --namespace kube-system 32 | "$CLUSTER_NAME-anth" 33 | "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" 34 | --set instanceMetadataURL="${INSTANCE_METADATA_URL:-"http://$AEMM_URL:$IMDS_PORT"}" 35 | --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" 36 | --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" 37 | --set cordonOnly="true" 38 | --wait 39 | ) 40 | [[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && 41 | anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") 42 | [[ ${#common_helm_args[@]} -gt 0 ]] && 43 | anth_helm_args+=("${common_helm_args[@]}") 44 | 45 | set -x 46 | helm "${anth_helm_args[@]}" 47 | set +x 48 | 49 | aemm_helm_args=( 50 | upgrade 51 | --install 52 | --namespace default 53 | "$CLUSTER_NAME-aemm" 54 | "$AEMM_DL_URL" 55 | --set servicePort="$IMDS_PORT" 56 | --wait 57 | ) 58 | [[ ${#common_helm_args[@]} -gt 0 ]] && 59 | aemm_helm_args+=("${common_helm_args[@]}") 60 | 61 | set -x 62 | retry 5 helm "${aemm_helm_args[@]}" 63 | set +x 64 | 65 | emtp_helm_args=( 66 | upgrade 67 | --install 68 | --namespace default 69 | "$CLUSTER_NAME-emtp" 70 | "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" 71 | --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" 72 | --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" 73 | --wait 74 | ) 75 | [[ -n 
"${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && 76 | emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") 77 | [[ ${#common_helm_args[@]} -gt 0 ]] && 78 | emtp_helm_args+=("${common_helm_args[@]}") 79 | 80 | set -x 81 | helm "${emtp_helm_args[@]}" 82 | set +x 83 | 84 | TAINT_CHECK_CYCLES=15 85 | TAINT_CHECK_SLEEP=15 86 | 87 | DEPLOYED=0 88 | 89 | for i in $(seq 1 $TAINT_CHECK_CYCLES); do 90 | if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then 91 | echo "✅ Verified regular-pod-test pod was scheduled and started!" 92 | DEPLOYED=1 93 | break 94 | fi 95 | echo "Setup Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds" 96 | sleep $TAINT_CHECK_SLEEP 97 | done 98 | 99 | if [[ $DEPLOYED -eq 0 ]]; then 100 | echo "❌ regular-pod-test pod deployment failed" 101 | fail_and_exit 2 102 | fi 103 | 104 | cordoned=0 105 | test_node="${TEST_NODE:-$CLUSTER_NAME-worker}" 106 | for i in $(seq 1 $TAINT_CHECK_CYCLES); do 107 | if [[ $cordoned -eq 0 ]] && kubectl get nodes "${test_node}" | grep SchedulingDisabled > /dev/null; then 108 | echo "✅ Verified the worker node was cordoned!" 109 | cordoned=1 110 | fi 111 | 112 | if [[ $cordoned -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then 113 | echo "✅ Verified the regular-pod-test pod was NOT evicted!" 114 | echo "✅ Cordon Only Test Passed $CLUSTER_NAME! 
✅" 115 | exit 0 116 | fi 117 | echo "Assertion Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds" 118 | sleep $TAINT_CHECK_SLEEP 119 | done 120 | 121 | if [[ $cordoned -eq 0 ]]; then 122 | echo "❌ Worker node was not cordoned" 123 | else 124 | echo "❌ regular-pod-test was evicted" 125 | fi 126 | 127 | fail_and_exit 1 128 | -------------------------------------------------------------------------------- /test/e2e/imds-v2-test: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | # Available env vars: 5 | # $TMP_DIR 6 | # $CLUSTER_NAME 7 | # $KUBECONFIG 8 | # $NODE_TERMINATION_HANDLER_DOCKER_REPO 9 | # $NODE_TERMINATION_HANDLER_DOCKER_TAG 10 | # $WEBHOOK_DOCKER_REPO 11 | # $WEBHOOK_DOCKER_TAG 12 | # $AEMM_URL 13 | # $AEMM_VERSION 14 | 15 | function fail_and_exit { 16 | echo "❌ IMDSv2 Test failed $CLUSTER_NAME ❌" 17 | exit "${1:-1}" 18 | } 19 | 20 | echo "Starting IMDSv2 Test for Node Termination Handler" 21 | 22 | SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" 23 | 24 | common_helm_args=() 25 | [[ "${TEST_WINDOWS-}" == "true" ]] && common_helm_args+=(--set targetNodeOs="windows") 26 | [[ -n "${NTH_WORKER_LABEL-}" ]] && common_helm_args+=(--set nodeSelector."$NTH_WORKER_LABEL") 27 | 28 | anth_helm_args=( 29 | upgrade 30 | --install 31 | --namespace kube-system 32 | "$CLUSTER_NAME-anth" 33 | "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" 34 | --set instanceMetadataURL="${INSTANCE_METADATA_URL:-"http://$AEMM_URL:$IMDS_PORT"}" 35 | --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" 36 | --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" 37 | --set enableSpotInterruptionDraining="true" 38 | --set enableScheduledEventDraining="true" 39 | --wait 40 | --force 41 | ) 42 | [[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && 43 | anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") 44 | [[ 
${#common_helm_args[@]} -gt 0 ]] && 45 | anth_helm_args+=("${common_helm_args[@]}") 46 | 47 | set -x 48 | helm "${anth_helm_args[@]}" 49 | set +x 50 | 51 | emtp_helm_args=( 52 | upgrade 53 | --install 54 | --namespace default 55 | "$CLUSTER_NAME-emtp" 56 | "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" 57 | --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" 58 | --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" 59 | --wait 60 | ) 61 | [[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && 62 | emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") 63 | [[ ${#common_helm_args[@]} -gt 0 ]] && 64 | emtp_helm_args+=("${common_helm_args[@]}") 65 | 66 | set -x 67 | helm "${emtp_helm_args[@]}" 68 | set +x 69 | 70 | aemm_helm_args=( 71 | upgrade 72 | --install 73 | --namespace default 74 | "$CLUSTER_NAME-aemm" 75 | "$AEMM_DL_URL" 76 | --set aemm.imdsv2="true" 77 | --set servicePort="$IMDS_PORT" 78 | --wait 79 | ) 80 | [[ ${#common_helm_args[@]} -gt 0 ]] && 81 | aemm_helm_args+=("${common_helm_args[@]}") 82 | 83 | set -x 84 | retry 5 helm "${aemm_helm_args[@]}" 85 | set +x 86 | 87 | TAINT_CHECK_CYCLES=15 88 | TAINT_CHECK_SLEEP=15 89 | 90 | DEPLOYED=0 91 | 92 | for i in $(seq 1 $TAINT_CHECK_CYCLES); do 93 | if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then 94 | echo "✅ Verified regular-pod-test pod was scheduled and started!" 
95 | DEPLOYED=1 96 | break 97 | fi 98 | echo "Setup Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds" 99 | sleep $TAINT_CHECK_SLEEP 100 | done 101 | 102 | if [[ $DEPLOYED -eq 0 ]]; then 103 | echo "❌ regular-pod-test pod deployment failed" 104 | fail_and_exit 2 105 | fi 106 | 107 | cordoned=0 108 | test_node="${TEST_NODE:-$CLUSTER_NAME-worker}" 109 | for i in $(seq 1 $TAINT_CHECK_CYCLES); do 110 | if [[ $cordoned -eq 0 ]] && kubectl get nodes "${test_node}" | grep SchedulingDisabled >/dev/null; then 111 | echo "✅ Verified the worker node was cordoned!" 112 | cordoned=1 113 | fi 114 | 115 | if [[ $cordoned -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then 116 | echo "✅ Verified the regular-pod-test pod was evicted!" 117 | echo "✅ IMDSv2 Test Passed $CLUSTER_NAME! ✅" 118 | exit 0 119 | fi 120 | echo "Assertion Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds" 121 | sleep $TAINT_CHECK_SLEEP 122 | done 123 | 124 | if [[ $cordoned -eq 0 ]]; then 125 | echo "❌ Worker node was not cordoned" 126 | else 127 | echo "❌ regular-pod-test pod was not evicted" 128 | fi 129 | 130 | fail_and_exit 1 131 | -------------------------------------------------------------------------------- /scripts/upload-resources-to-github: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | # Script to upload release assets to Github. 5 | # This script cleans up after itself in cases of partial failures. i.e. 
either all assets are uploaded or none 6 | 7 | SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" 8 | VERSION=$(make -s -f $SCRIPTPATH/../Makefile version) 9 | BUILD_DIR=$SCRIPTPATH/../build/k8s-resources/$VERSION 10 | BINARY_DIR=$SCRIPTPATH/../build/bin 11 | BINARIES_ONLY="false" 12 | K8s_ASSETS_ONLY="false" 13 | SUFFIX="" 14 | 15 | USAGE=$(cat << 'EOM' 16 | Usage: upload-resources-to-github [-b] 17 | Upload release assets to GitHub. Release assets include binaries for supported platforms and K8s resources for supported versions. 18 | 19 | Example: upload-resources-to-github -b 20 | Optional: 21 | -b Upload binaries only [DEFAULT: upload all the assets] 22 | -k Upload K8s assets only 23 | -s SUFFIX String appended to resource file names 24 | EOM 25 | ) 26 | 27 | # Process our input arguments 28 | while getopts "bks:" opt; do 29 | case ${opt} in 30 | b ) # Binaries only 31 | BINARIES_ONLY="true" 32 | ;; 33 | k ) # K8s assets only 34 | K8s_ASSETS_ONLY="true" 35 | ;; 36 | s) # Suffix 37 | SUFFIX=$OPTARG 38 | ;; 39 | \? ) 40 | echo "$USAGE" 1>&2 41 | exit 42 | ;; 43 | esac 44 | done 45 | 46 | INDV_K8S_RESOURCES=$BUILD_DIR/individual-resources${SUFFIX}.tar 47 | AGG_RESOURCES_YAML=$BUILD_DIR/all-resources${SUFFIX}.yaml 48 | QP_INDV_K8S_RESOURCES=$BUILD_DIR/individual-resources-queue-processor${SUFFIX}.tar 49 | QP_AGG_RESOURCES_YAML=$BUILD_DIR/all-resources-queue-processor${SUFFIX}.yaml 50 | 51 | RELEASE_ID=$(curl -s -H "Authorization: token $GITHUB_TOKEN" \ 52 | https://api.github.com/repos/aws/aws-node-termination-handler/releases | \ 53 | jq --arg VERSION "$VERSION" '.[] | select(.tag_name==$VERSION) | .id') 54 | 55 | ASSET_IDS_UPLOADED=() 56 | 57 | trap 'handle_errors_and_cleanup $?' 
EXIT 58 | 59 | handle_errors_and_cleanup() { 60 | if [ $1 -eq 0 ]; then 61 | exit 0 62 | fi 63 | 64 | if [[ ${#ASSET_IDS_UPLOADED[@]} -ne 0 ]]; then 65 | echo -e "\nCleaning up assets uploaded in the current execution of the script" 66 | for asset_id in "${ASSET_IDS_UPLOADED[@]}"; do 67 | echo "Deleting asset $asset_id" 68 | curl -X DELETE \ 69 | -H "Authorization: token $GITHUB_TOKEN" \ 70 | "https://api.github.com/repos/aws/aws-node-termination-handler/releases/assets/$asset_id" 71 | done 72 | exit $1 73 | fi 74 | } 75 | 76 | gather_assets_to_upload() { 77 | local resources=() 78 | if [ $K8s_ASSETS_ONLY != "true" ]; then 79 | for binary in $BINARY_DIR/*; do 80 | resources+=("$binary") 81 | done 82 | fi 83 | if [ $BINARIES_ONLY != "true" ]; then 84 | resources+=("$INDV_K8S_RESOURCES" "$AGG_RESOURCES_YAML" "$QP_INDV_K8S_RESOURCES" "$QP_AGG_RESOURCES_YAML") 85 | fi 86 | echo "${resources[@]}" 87 | } 88 | 89 | # $1: absolute path to asset 90 | upload_asset() { 91 | resp=$(curl --write-out '%{http_code}' --silent \ 92 | -H "Authorization: token $GITHUB_TOKEN" \ 93 | -H "Content-Type: $(file -b --mime-type $1)" \ 94 | --data-binary @$1 \ 95 | "https://uploads.github.com/repos/aws/aws-node-termination-handler/releases/$RELEASE_ID/assets?name=$(basename $1)") 96 | 97 | response_code=$(echo $resp | sed 's/\(.*\)}//') 98 | response_content=$(echo $resp | sed "s/$response_code//") 99 | 100 | # HTTP success code expected - 201 Created 101 | if [[ $response_code -eq 201 ]]; then 102 | asset_id=$(echo $response_content | jq '.id') 103 | ASSET_IDS_UPLOADED+=("$asset_id") 104 | echo "Created asset ID $asset_id successfully" 105 | else 106 | echo -e "❌ Upload failed with response code $response_code and message \n$response_content ❌" 107 | exit 1 108 | fi 109 | } 110 | 111 | ASSETS=$(gather_assets_to_upload) 112 | COUNT=1 113 | echo -e "\nUploading release assets for release id '$RELEASE_ID' to Github" 114 | for asset in $ASSETS; do 115 | name=$(echo $asset | tr '/' '\n' | tail 
-1) 116 | echo -e "\n $((COUNT++)). $name" 117 | upload_asset $asset 118 | done -------------------------------------------------------------------------------- /pkg/monitor/spotitn/spot-itn-monitor.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"). You may 4 | // not use this file except in compliance with the License. A copy of the 5 | // License is located at 6 | // 7 | // http://aws.amazon.com/apache2.0/ 8 | // 9 | // or in the "license" file accompanying this file. This file is distributed 10 | // on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | // express or implied. See the License for the specific language governing 12 | // permissions and limitations under the License. 13 | 14 | package spotitn 15 | 16 | import ( 17 | "crypto/sha256" 18 | "fmt" 19 | "time" 20 | 21 | "github.com/aws/aws-node-termination-handler/pkg/ec2metadata" 22 | "github.com/aws/aws-node-termination-handler/pkg/monitor" 23 | "github.com/aws/aws-node-termination-handler/pkg/node" 24 | ) 25 | 26 | // SpotITNMonitorKind is a const to define this monitor kind 27 | const SpotITNMonitorKind = "SPOT_ITN_MONITOR" 28 | 29 | // SpotInterruptionMonitor is a struct definition which facilitates monitoring of spot ITNs from IMDS 30 | type SpotInterruptionMonitor struct { 31 | IMDS *ec2metadata.Service 32 | InterruptionChan chan<- monitor.InterruptionEvent 33 | CancelChan chan<- monitor.InterruptionEvent 34 | NodeName string 35 | } 36 | 37 | // NewSpotInterruptionMonitor creates an instance of a spot ITN IMDS monitor 38 | func NewSpotInterruptionMonitor(imds *ec2metadata.Service, interruptionChan chan<- monitor.InterruptionEvent, cancelChan chan<- monitor.InterruptionEvent, nodeName string) SpotInterruptionMonitor { 39 | return SpotInterruptionMonitor{ 40 | IMDS: imds, 41 | 
InterruptionChan: interruptionChan, 42 | CancelChan: cancelChan, 43 | NodeName: nodeName, 44 | } 45 | } 46 | 47 | // Monitor continuously monitors metadata for spot ITNs and sends interruption events to the passed in channel 48 | func (m SpotInterruptionMonitor) Monitor() error { 49 | interruptionEvent, err := m.checkForSpotInterruptionNotice() 50 | if err != nil { 51 | return err 52 | } 53 | if interruptionEvent != nil && interruptionEvent.Kind == monitor.SpotITNKind { 54 | m.InterruptionChan <- *interruptionEvent 55 | } 56 | return nil 57 | } 58 | 59 | // Kind denotes the kind of monitor 60 | func (m SpotInterruptionMonitor) Kind() string { 61 | return SpotITNMonitorKind 62 | } 63 | 64 | // checkForSpotInterruptionNotice Checks EC2 instance metadata for a spot interruption termination notice 65 | func (m SpotInterruptionMonitor) checkForSpotInterruptionNotice() (*monitor.InterruptionEvent, error) { 66 | instanceAction, err := m.IMDS.GetSpotITNEvent() 67 | if instanceAction == nil && err == nil { 68 | // if there are no spot itns and no errors 69 | return nil, nil 70 | } 71 | if err != nil { 72 | return nil, fmt.Errorf("There was a problem checking for spot ITNs: %w", err) 73 | } 74 | nodeName := m.NodeName 75 | interruptionTime, err := time.Parse(time.RFC3339, instanceAction.Time) 76 | if err != nil { 77 | return nil, fmt.Errorf("Could not parse time from spot interruption notice metadata json: %w", err) 78 | } 79 | 80 | // There's no EventID returned so we'll create it using a hash to prevent duplicates. 
81 | hash := sha256.New() 82 | _, err = hash.Write([]byte(fmt.Sprintf("%v", instanceAction))) 83 | if err != nil { 84 | return nil, fmt.Errorf("There was a problem creating an event ID from the event: %w", err) 85 | } 86 | 87 | return &monitor.InterruptionEvent{ 88 | EventID: fmt.Sprintf("spot-itn-%x", hash.Sum(nil)), 89 | Kind: monitor.SpotITNKind, 90 | Monitor: SpotITNMonitorKind, 91 | StartTime: interruptionTime, 92 | NodeName: nodeName, 93 | Description: fmt.Sprintf("Spot ITN received. Instance will be interrupted at %s \n", instanceAction.Time), 94 | PreDrainTask: setInterruptionTaint, 95 | }, nil 96 | } 97 | 98 | func setInterruptionTaint(interruptionEvent monitor.InterruptionEvent, n node.Node) error { 99 | err := n.TaintSpotItn(interruptionEvent.NodeName, interruptionEvent.EventID) 100 | if err != nil { 101 | return fmt.Errorf("Unable to taint node with taint %s:%s: %w", node.SpotInterruptionTaint, interruptionEvent.EventID, err) 102 | } 103 | 104 | return nil 105 | } 106 | -------------------------------------------------------------------------------- /test/e2e/webhook-secret-test: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | # Available env vars: 5 | # $TMP_DIR 6 | # $CLUSTER_NAME 7 | # $KUBECONFIG 8 | # $NODE_TERMINATION_HANDLER_DOCKER_REPO 9 | # $NODE_TERMINATION_HANDLER_DOCKER_TAG 10 | # $WEBHOOK_DOCKER_REPO 11 | # $WEBHOOK_DOCKER_TAG 12 | 13 | echo "Starting Webhook URL Secret Test for Node Termination Handler" 14 | 15 | SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" 16 | 17 | WEBHOOKURL_LITERAL="webhookurl=${WEBHOOK_URL}" 18 | WEBHOOK_NAME="webhooksecret" 19 | 20 | function cleanup { 21 | kubectl delete secret -n kube-system "${WEBHOOK_NAME}" || : 22 | } 23 | 24 | kubectl create secret -n kube-system generic "${WEBHOOK_NAME}" --from-literal="${WEBHOOKURL_LITERAL}" 25 | 26 | trap "cleanup" EXIT INT TERM ERR 27 | 28 | common_helm_args=() 29 | [[ "${TEST_WINDOWS-}" == 
"true" ]] && common_helm_args+=(--set targetNodeOs="windows") 30 | [[ -n "${NTH_WORKER_LABEL-}" ]] && common_helm_args+=(--set nodeSelector."$NTH_WORKER_LABEL") 31 | 32 | aemm_helm_args=( 33 | upgrade 34 | --install 35 | --namespace default 36 | "$CLUSTER_NAME-aemm" 37 | "$AEMM_DL_URL" 38 | --set servicePort="$IMDS_PORT" 39 | --wait 40 | ) 41 | [[ ${#common_helm_args[@]} -gt 0 ]] && 42 | aemm_helm_args+=("${common_helm_args[@]}") 43 | 44 | set -x 45 | retry 5 helm "${aemm_helm_args[@]}" 46 | set +x 47 | 48 | emtp_helm_args=( 49 | upgrade 50 | --install 51 | --namespace default 52 | "$CLUSTER_NAME-emtp" 53 | "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" 54 | --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" 55 | --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" 56 | --wait 57 | ) 58 | [[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && 59 | emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") 60 | [[ ${#common_helm_args[@]} -gt 0 ]] && 61 | emtp_helm_args+=("${common_helm_args[@]}") 62 | 63 | set -x 64 | helm "${emtp_helm_args[@]}" 65 | set +x 66 | 67 | anth_helm_args=( 68 | upgrade 69 | --install 70 | --namespace kube-system 71 | "$CLUSTER_NAME-anth" 72 | "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" 73 | --set instanceMetadataURL="http://$AEMM_URL:$IMDS_PORT" 74 | --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" 75 | --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" 76 | --set enableSpotInterruptionDraining="true" 77 | --set enableScheduledEventDraining="true" 78 | --set webhookURLSecretName=webhooksecret \ 79 | --set webhookTemplate="\{\"Content\":\"[NTH][Instance Interruption] InstanceId: \{\{ \.InstanceID \}\} - Node: \{\{ \.NodeName \}\} - InstanceType: \{\{ \.InstanceType \}\} - Kind: \{\{ \.Kind \}\} - Start Time: \{\{ \.StartTime \}\}\"\}" 80 | --force 81 | --wait 82 | ) 83 | [[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && 84 | anth_helm_args+=(--set 
image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") 85 | [[ ${#common_helm_args[@]} -gt 0 ]] && 86 | anth_helm_args+=("${common_helm_args[@]}") 87 | 88 | set -x 89 | helm "${anth_helm_args[@]}" 90 | set +x 91 | 92 | TAINT_CHECK_CYCLES=15 93 | TAINT_CHECK_SLEEP=15 94 | 95 | DEPLOYED=0 96 | for i in $(seq 1 $TAINT_CHECK_CYCLES); do 97 | if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then 98 | echo "✅ Verified regular-pod-test pod was scheduled and started!" 99 | DEPLOYED=1 100 | break 101 | fi 102 | echo "Setup Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds" 103 | sleep $TAINT_CHECK_SLEEP 104 | done 105 | 106 | if [[ $DEPLOYED -eq 0 ]]; then 107 | exit 2 108 | fi 109 | 110 | for i in $(seq 1 $TAINT_CHECK_CYCLES); do 111 | if kubectl get nodes "$CLUSTER_NAME-worker" | grep SchedulingDisabled; then 112 | echo "✅ Verified the worker node was cordoned!" 113 | NTH_POD_NAME=$(get_nth_worker_pod) 114 | if kubectl logs "${NTH_POD_NAME}" -n kube-system | grep 'Webhook Success'; then 115 | echo "✅ Verified the webhook message was sent!" 116 | echo "✅ Webhook URL as a Secret Test Passed $CLUSTER_NAME! ✅" 117 | exit 0 118 | fi 119 | fi 120 | echo "Assertion Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds" 121 | sleep $TAINT_CHECK_SLEEP 122 | done 123 | 124 | exit 1 125 | -------------------------------------------------------------------------------- /pkg/monitor/asglifecycle/asg-lifecycle-monitor.go: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"). You may 4 | // not use this file except in compliance with the License. A copy of the 5 | // License is located at 6 | // 7 | // http://aws.amazon.com/apache2.0/ 8 | // 9 | // or in the "license" file accompanying this file. 
This file is distributed 10 | // on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | // express or implied. See the License for the specific language governing 12 | // permissions and limitations under the License. 13 | 14 | package asglifecycle 15 | 16 | import ( 17 | "crypto/sha256" 18 | "fmt" 19 | "time" 20 | 21 | "github.com/aws/aws-node-termination-handler/pkg/ec2metadata" 22 | "github.com/aws/aws-node-termination-handler/pkg/monitor" 23 | "github.com/aws/aws-node-termination-handler/pkg/node" 24 | ) 25 | 26 | // ASGLifecycleMonitorKind is a const to define this monitor kind 27 | const ASGLifecycleMonitorKind = "ASG_LIFECYCLE_MONITOR" 28 | 29 | // ASGLifecycleMonitor is a struct definition which facilitates monitoring of ASG target lifecycle state from IMDS 30 | type ASGLifecycleMonitor struct { 31 | IMDS *ec2metadata.Service 32 | InterruptionChan chan<- monitor.InterruptionEvent 33 | CancelChan chan<- monitor.InterruptionEvent 34 | NodeName string 35 | } 36 | 37 | // NewASGLifecycleMonitor creates an instance of an ASG lifecycle IMDS monitor 38 | func NewASGLifecycleMonitor(imds *ec2metadata.Service, interruptionChan chan<- monitor.InterruptionEvent, cancelChan chan<- monitor.InterruptionEvent, nodeName string) ASGLifecycleMonitor { 39 | return ASGLifecycleMonitor{ 40 | IMDS: imds, 41 | InterruptionChan: interruptionChan, 42 | CancelChan: cancelChan, 43 | NodeName: nodeName, 44 | } 45 | } 46 | 47 | // Monitor continuously monitors metadata for ASG target lifecycle state and sends interruption events to the passed in channel 48 | func (m ASGLifecycleMonitor) Monitor() error { 49 | interruptionEvent, err := m.checkForASGTargetLifecycleStateNotice() 50 | if err != nil { 51 | return err 52 | } 53 | if interruptionEvent != nil && interruptionEvent.Kind == monitor.ASGLifecycleKind { 54 | m.InterruptionChan <- *interruptionEvent 55 | } 56 | return nil 57 | } 58 | 59 | // Kind denotes the kind of monitor 60 | func (m ASGLifecycleMonitor) 
Kind() string { 61 | return ASGLifecycleMonitorKind 62 | } 63 | 64 | // checkForASGTargetLifecycleStateNotice Checks EC2 instance metadata for a asg lifecycle termination notice 65 | func (m ASGLifecycleMonitor) checkForASGTargetLifecycleStateNotice() (*monitor.InterruptionEvent, error) { 66 | state, err := m.IMDS.GetASGTargetLifecycleState() 67 | if err != nil { 68 | return nil, fmt.Errorf("There was a problem checking for ASG target lifecycle state: %w", err) 69 | } 70 | if state != "Terminated" { 71 | // if the state is not "Terminated", we can skip. State can also be empty (no hook configured). 72 | return nil, nil 73 | } 74 | 75 | nodeName := m.NodeName 76 | // there is no time in the response, we just set time to the latest check 77 | interruptionTime := time.Now() 78 | 79 | // There's no EventID returned, so we'll create it using a hash to prevent duplicates. 80 | hash := sha256.New() 81 | if _, err = hash.Write([]byte(fmt.Sprintf("%s:%s", state, interruptionTime))); err != nil { 82 | return nil, fmt.Errorf("There was a problem creating an event ID from the event: %w", err) 83 | } 84 | 85 | return &monitor.InterruptionEvent{ 86 | EventID: fmt.Sprintf("target-lifecycle-state-terminated-%x", hash.Sum(nil)), 87 | Kind: monitor.ASGLifecycleKind, 88 | Monitor: ASGLifecycleMonitorKind, 89 | StartTime: interruptionTime, 90 | NodeName: nodeName, 91 | Description: "AST target lifecycle state received. 
Instance will be terminated\n", 92 | PreDrainTask: setInterruptionTaint, 93 | }, nil 94 | } 95 | 96 | func setInterruptionTaint(interruptionEvent monitor.InterruptionEvent, n node.Node) error { 97 | err := n.TaintASGLifecycleTermination(interruptionEvent.NodeName, interruptionEvent.EventID) 98 | if err != nil { 99 | return fmt.Errorf("Unable to taint node with taint %s:%s: %w", node.ASGLifecycleTerminationTaint, interruptionEvent.EventID, err) 100 | } 101 | 102 | return nil 103 | } 104 | -------------------------------------------------------------------------------- /config/helm/aws-node-termination-handler/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* vim: set filetype=mustache: */}} 2 | 3 | {{/* 4 | Expand the name of the chart. 5 | */}} 6 | {{- define "aws-node-termination-handler.name" -}} 7 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} 8 | {{- end -}} 9 | 10 | {{/* 11 | Create a default fully qualified app name. 12 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 13 | If release name contains chart name it will be used as a full name. 14 | */}} 15 | {{- define "aws-node-termination-handler.fullname" -}} 16 | {{- if .Values.fullnameOverride -}} 17 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} 18 | {{- else -}} 19 | {{- $name := default .Chart.Name .Values.nameOverride -}} 20 | {{- if contains $name .Release.Name -}} 21 | {{- .Release.Name | trunc 63 | trimSuffix "-" -}} 22 | {{- else -}} 23 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} 24 | {{- end -}} 25 | {{- end -}} 26 | {{- end -}} 27 | 28 | {{/* 29 | Equivalent to "aws-node-termination-handler.fullname" except that "-win" indicator is appended to the end. 30 | Name will not exceed 63 characters. 
31 | */}}
32 | {{- define "aws-node-termination-handler.fullnameWindows" -}}
33 | {{- include "aws-node-termination-handler.fullname" . | trunc 59 | trimSuffix "-" | printf "%s-win" -}}
34 | {{- end -}}
35 | 
36 | {{/*
37 | Create chart name and version as used by the chart label ("+" is replaced with "_" and the result is truncated to 63 characters).
38 | */}}
39 | {{- define "aws-node-termination-handler.chart" -}}
40 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
41 | {{- end -}}
42 | 
43 | {{/*
44 | Common labels: the selector labels plus version (only when .Chart.AppVersion is set), part-of, managed-by, chart, and any .Values.customLabels.
45 | */}}
46 | {{- define "aws-node-termination-handler.labels" -}}
47 | {{ include "aws-node-termination-handler.selectorLabels" . }}
48 | {{- if .Chart.AppVersion }}
49 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
50 | {{- end }}
51 | app.kubernetes.io/part-of: {{ .Release.Name }}
52 | app.kubernetes.io/managed-by: {{ .Release.Service }}
53 | helm.sh/chart: {{ include "aws-node-termination-handler.chart" . }}
54 | {{- with .Values.customLabels }}
55 | {{ toYaml . }}
56 | {{- end }}
57 | {{- end -}}
58 | 
59 | {{/*
60 | Deployment labels: the common labels plus app.kubernetes.io/component set to "deployment".
61 | */}}
62 | {{- define "aws-node-termination-handler.labelsDeployment" -}}
63 | {{ include "aws-node-termination-handler.labels" . }}
64 | app.kubernetes.io/component: deployment
65 | {{- end -}}
66 | 
67 | {{/*
68 | Daemonset labels: the common labels plus app.kubernetes.io/component set to "daemonset".
69 | */}}
70 | {{- define "aws-node-termination-handler.labelsDaemonset" -}}
71 | {{ include "aws-node-termination-handler.labels" . }}
72 | app.kubernetes.io/component: daemonset
73 | {{- end -}}
74 | 
75 | {{/*
76 | Selector labels: app.kubernetes.io/name (the chart name helper) and app.kubernetes.io/instance (the release name).
77 | */}}
78 | {{- define "aws-node-termination-handler.selectorLabels" -}}
79 | app.kubernetes.io/name: {{ include "aws-node-termination-handler.name" . }}
80 | app.kubernetes.io/instance: {{ .Release.Name }}
81 | {{- end -}}
82 | 
83 | {{/*
84 | Selector labels for the deployment: the selector labels plus app.kubernetes.io/component set to "deployment".
85 | */}}
86 | {{- define "aws-node-termination-handler.selectorLabelsDeployment" -}}
87 | {{ include "aws-node-termination-handler.selectorLabels" . }}
88 | app.kubernetes.io/component: deployment
89 | {{- end -}}
90 | 
91 | {{/*
92 | Selector labels for the daemonset: the selector labels plus app.kubernetes.io/component set to "daemonset".
93 | */}}
94 | {{- define "aws-node-termination-handler.selectorLabelsDaemonset" -}}
95 | {{ include "aws-node-termination-handler.selectorLabels" . }}
96 | app.kubernetes.io/component: daemonset
97 | {{- end -}}
98 | 
99 | {{/*
100 | Create the name of the service account to use: .Values.serviceAccount.name when set; otherwise the chart fullname if .Values.serviceAccount.create is true, else "default".
101 | */}}
102 | {{- define "aws-node-termination-handler.serviceAccountName" -}}
103 | {{- if .Values.serviceAccount.create -}}
104 | {{ default (include "aws-node-termination-handler.fullname" .) .Values.serviceAccount.name }}
105 | {{- else -}}
106 | {{ default "default" .Values.serviceAccount.name }}
107 | {{- end -}}
108 | {{- end -}}
109 | 
110 | {{/*
111 | The image to use: .Values.image.repository and .Values.image.tag joined by ":", with the tag defaulting to "v" followed by .Chart.AppVersion.
112 | */}}
113 | {{- define "aws-node-termination-handler.image" -}}
114 | {{- printf "%s:%s" .Values.image.repository (default (printf "v%s" .Chart.AppVersion) .Values.image.tag) }}
115 | {{- end }}
116 | 
117 | {{/* Get PodDisruptionBudget API Version: policy/v1 when the cluster advertises it and the Kubernetes version is >= 1.21, otherwise policy/v1beta1 */}}
118 | {{- define "aws-node-termination-handler.pdb.apiVersion" -}}
119 | {{- if and (.Capabilities.APIVersions.Has "policy/v1") (semverCompare ">= 1.21-0" .Capabilities.KubeVersion.Version) -}}
120 | {{- print "policy/v1" -}}
121 | {{- else -}}
122 | {{- print "policy/v1beta1" -}}
123 | {{- end -}}
124 | {{- end -}}
125 | 
--------------------------------------------------------------------------------