├── .dockerignore ├── .gitignore ├── .github ├── dependabot.yml └── workflows │ ├── test.yml │ ├── publish-release.yml │ └── publish-latest.yml ├── Dockerfile ├── k8s ├── client_test.go ├── k8s.go ├── util.go ├── client.go └── util_test.go ├── metrics ├── metrics_test.go └── metrics.go ├── config ├── config_test.go └── config.go ├── cloud ├── aws_test.go └── aws.go ├── k8stest └── k8stest.go ├── go.mod ├── cloudtest └── cloudtest.go ├── LICENSE ├── README.md ├── go.sum ├── main.go └── main_test.go /.dockerignore: -------------------------------------------------------------------------------- 1 | .github 2 | .idea -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # IDE 2 | *.iml 3 | .idea 4 | .vscode 5 | 6 | # OS 7 | .DS_Store 8 | 9 | # JS 10 | node_modules 11 | 12 | # Go 13 | /vendor 14 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | labels: ["dependencies"] 6 | schedule: 7 | interval: "weekly" 8 | day: "friday" 9 | - package-ecosystem: "gomod" 10 | directory: "/" 11 | labels: ["dependencies"] 12 | schedule: 13 | interval: "weekly" 14 | day: "friday" 15 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: test 2 | on: 3 | pull_request: 4 | paths-ignore: 5 | - '*.md' 6 | push: 7 | branches: 8 | - master 9 | paths-ignore: 10 | - '*.md' 11 | jobs: 12 | test: 13 | name: test 14 | runs-on: ubuntu-latest 15 | timeout-minutes: 5 16 | steps: 17 | - uses: actions/setup-go@v5 18 | with: 19 | go-version: 1.23.5 20 | - uses: actions/checkout@v6 21 | - run: go build 22 | - run: go test ./... -cover 23 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Build the go application into a binary 2 | FROM golang:alpine as builder 3 | WORKDIR /app 4 | ADD . ./ 5 | RUN go mod tidy 6 | RUN CGO_ENABLED=0 GOOS=linux go build -buildvcs=false -a -installsuffix cgo -o aws-eks-asg-rolling-update-handler . 7 | RUN apk --update add ca-certificates 8 | 9 | # Run the binary on an empty container 10 | FROM scratch 11 | COPY --from=builder /app/aws-eks-asg-rolling-update-handler . 
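# A hedged local-build sketch (not taken from this repository's CI workflows; the image tag and the example
# values are illustrative, only the environment variable names come from config/config.go, and the handler
# still needs AWS credentials and cluster access at runtime to do anything useful):
#   docker build -t aws-eks-asg-rolling-update-handler:dev .
#   docker run --rm -e AUTO_SCALING_GROUP_NAMES=my-asg -e AWS_REGION=us-west-2 aws-eks-asg-rolling-update-handler:dev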
12 | COPY --from=builder /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt 13 | ENTRYPOINT ["/aws-eks-asg-rolling-update-handler"] 14 | -------------------------------------------------------------------------------- /k8s/client_test.go: -------------------------------------------------------------------------------- 1 | package k8s 2 | 3 | import ( 4 | "testing" 5 | 6 | v1 "k8s.io/api/core/v1" 7 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 8 | fakekubernetes "k8s.io/client-go/kubernetes/fake" 9 | ) 10 | 11 | func TestClient_Drain(t *testing.T) { 12 | fakeKubernetesClient := fakekubernetes.NewSimpleClientset(&v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "default"}}) 13 | kc := NewClient(fakeKubernetesClient) 14 | if err := kc.Cordon("default"); err != nil { 15 | t.Errorf("Unexpected error: %v", err) 16 | } 17 | if err := kc.Drain("default", true, true, -1); err != nil { 18 | t.Errorf("Unexpected error: %v", err) 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /.github/workflows/publish-release.yml: -------------------------------------------------------------------------------- 1 | name: publish-release 2 | on: 3 | release: 4 | types: [published] 5 | jobs: 6 | publish-release: 7 | name: publish-release 8 | runs-on: ubuntu-latest 9 | timeout-minutes: 60 10 | steps: 11 | - uses: actions/checkout@v6 12 | - name: Get image repository 13 | run: echo IMAGE_REPOSITORY=$(echo ${{ secrets.DOCKER_USERNAME }}/${{ github.event.repository.name }} | tr '[:upper:]' '[:lower:]') >> $GITHUB_ENV 14 | - name: Get the release 15 | run: echo RELEASE=${GITHUB_REF/refs\/tags\//} >> $GITHUB_ENV 16 | - name: Set up QEMU 17 | uses: docker/setup-qemu-action@v3 18 | - name: Set up Docker Buildx 19 | uses: docker/setup-buildx-action@v3 20 | - name: Login to Docker Registry 21 | uses: docker/login-action@v3 22 | with: 23 | username: ${{ secrets.DOCKER_USERNAME }} 24 | password: ${{ secrets.DOCKER_PASSWORD }} 25 | - name: Build and push docker image 26 | uses: docker/build-push-action@v6 27 | with: 28 | platforms: linux/amd64,linux/arm/v7,linux/arm64 29 | pull: true 30 | push: true 31 | tags: | 32 | ${{ env.IMAGE_REPOSITORY }}:${{ env.RELEASE }} 33 | ${{ env.IMAGE_REPOSITORY }}:stable 34 | ${{ env.IMAGE_REPOSITORY }}:latest 35 | -------------------------------------------------------------------------------- /k8s/k8s.go: -------------------------------------------------------------------------------- 1 | package k8s 2 | 3 | import ( 4 | "os" 5 | "path/filepath" 6 | 7 | "github.com/TwiN/aws-eks-asg-rolling-update-handler/config" 8 | "k8s.io/client-go/kubernetes" 9 | "k8s.io/client-go/rest" 10 | "k8s.io/client-go/tools/clientcmd" 11 | ) 12 | 13 | // CreateClientSet Creates a Kubernetes ClientSet for authenticating with a cluster 14 | // If the current environment is dev, use the user's kubeconfig 15 | // If it isn't, then it means that the application is inside the cluster, which means 16 | // we'll use the service account token 17 | func CreateClientSet() (*kubernetes.Clientset, error) { 18 | var cfg *rest.Config 19 | if config.Get().Environment == "dev" { 20 | var kubeConfig string 21 | if home := homeDir(); home != "" { 22 | kubeConfig = filepath.Join(home, ".kube", "config") 23 | } else { 24 | panic("Home directory not found") 25 | } 26 | // use the current context in kubeconfig 27 | clientConfig, err := clientcmd.BuildConfigFromFlags("", kubeConfig) 28 | if err != nil { 29 | return nil, err 30 | } 31 | cfg = clientConfig 32 | cfg.UserAgent = 
"aws-eks-asg-rolling-update-handler/1.0" 33 | } else { 34 | clientConfig, err := rest.InClusterConfig() 35 | if err != nil { 36 | return nil, err 37 | } 38 | cfg = clientConfig 39 | } 40 | return kubernetes.NewForConfig(cfg) 41 | } 42 | 43 | func homeDir() string { 44 | if home := os.Getenv("HOME"); home != "" { 45 | return home 46 | } 47 | return os.Getenv("USERPROFILE") // windows 48 | } 49 | -------------------------------------------------------------------------------- /.github/workflows/publish-latest.yml: -------------------------------------------------------------------------------- 1 | name: publish-latest 2 | on: 3 | workflow_run: 4 | workflows: [test] 5 | branches: [master] 6 | types: [completed] 7 | concurrency: 8 | group: ${{ github.event.workflow_run.head_repository.full_name }}::${{ github.event.workflow_run.head_branch }}::${{ github.workflow }} 9 | cancel-in-progress: true 10 | jobs: 11 | publish-latest: 12 | name: publish-latest 13 | runs-on: ubuntu-latest 14 | if: ${{ (github.event.workflow_run.conclusion == 'success') && (github.event.workflow_run.head_repository.full_name == github.repository) }} 15 | timeout-minutes: 60 16 | steps: 17 | - uses: actions/checkout@v6 18 | - name: Get image repository 19 | run: echo IMAGE_REPOSITORY=$(echo ${{ secrets.DOCKER_USERNAME }}/${{ github.event.repository.name }} | tr '[:upper:]' '[:lower:]') >> $GITHUB_ENV 20 | - name: Set up QEMU 21 | uses: docker/setup-qemu-action@v3 22 | - name: Set up Docker Buildx 23 | uses: docker/setup-buildx-action@v3 24 | - name: Login to Docker Registry 25 | uses: docker/login-action@v3 26 | with: 27 | username: ${{ secrets.DOCKER_USERNAME }} 28 | password: ${{ secrets.DOCKER_PASSWORD }} 29 | - name: Build and push docker image 30 | uses: docker/build-push-action@v6 31 | with: 32 | platforms: linux/amd64,linux/arm64 33 | pull: true 34 | push: true 35 | tags: | 36 | ${{ env.IMAGE_REPOSITORY }}:latest 37 | -------------------------------------------------------------------------------- /metrics/metrics_test.go: -------------------------------------------------------------------------------- 1 | package metrics 2 | 3 | import ( 4 | "bytes" 5 | "testing" 6 | 7 | "github.com/prometheus/client_golang/prometheus" 8 | "github.com/prometheus/client_golang/prometheus/testutil" 9 | ) 10 | 11 | func TestMetricServer(t *testing.T) { 12 | 13 | Server.NodeGroups.WithLabelValues().Set(5) 14 | Server.Errors.Add(2) 15 | Server.ScaledUpNodes.WithLabelValues("nodeg-1").Inc() 16 | Server.ScaledUpNodes.WithLabelValues("nodeg-2").Inc() 17 | Server.ScaledDownNodes.WithLabelValues("nodeg-1").Inc() 18 | Server.ScaledDownNodes.WithLabelValues("nodeg-2").Inc() 19 | Server.OutdatedNodes.WithLabelValues("nodeg-1").Set(1) 20 | Server.OutdatedNodes.WithLabelValues("nodeg-2").Set(1) 21 | Server.UpdatedNodes.WithLabelValues("nodeg-1").Set(1) 22 | Server.UpdatedNodes.WithLabelValues("nodeg-2").Set(1) 23 | Server.DrainedNodes.WithLabelValues("nodeg-1").Inc() 24 | Server.DrainedNodes.WithLabelValues("nodeg-2").Inc() 25 | 26 | err := testutil.GatherAndCompare(prometheus.Gatherers{Server.registry}, bytes.NewBufferString(` 27 | # HELP rolling_update_handler_drained_nodes_total The total number of drained nodes 28 | # TYPE rolling_update_handler_drained_nodes_total counter 29 | rolling_update_handler_drained_nodes_total{node_group="nodeg-1"} 1 30 | rolling_update_handler_drained_nodes_total{node_group="nodeg-2"} 1 31 | # HELP rolling_update_handler_errors The total number of errors 32 | # TYPE rolling_update_handler_errors counter 33 | 
rolling_update_handler_errors 2 34 | # HELP rolling_update_handler_node_groups The total number of node groups managed 35 | # TYPE rolling_update_handler_node_groups gauge 36 | rolling_update_handler_node_groups 5 37 | # HELP rolling_update_handler_outdated_nodes The number of outdated nodes 38 | # TYPE rolling_update_handler_outdated_nodes gauge 39 | rolling_update_handler_outdated_nodes{node_group="nodeg-1"} 1 40 | rolling_update_handler_outdated_nodes{node_group="nodeg-2"} 1 41 | # HELP rolling_update_handler_scaled_down_nodes The total number of nodes scaled down 42 | # TYPE rolling_update_handler_scaled_down_nodes counter 43 | rolling_update_handler_scaled_down_nodes{node_group="nodeg-1"} 1 44 | rolling_update_handler_scaled_down_nodes{node_group="nodeg-2"} 1 45 | # HELP rolling_update_handler_scaled_up_nodes The total number of nodes scaled up 46 | # TYPE rolling_update_handler_scaled_up_nodes counter 47 | rolling_update_handler_scaled_up_nodes{node_group="nodeg-1"} 1 48 | rolling_update_handler_scaled_up_nodes{node_group="nodeg-2"} 1 49 | # HELP rolling_update_handler_updated_nodes The number of updated nodes 50 | # TYPE rolling_update_handler_updated_nodes gauge 51 | rolling_update_handler_updated_nodes{node_group="nodeg-1"} 1 52 | rolling_update_handler_updated_nodes{node_group="nodeg-2"} 1 53 | `)) 54 | 55 | if err != nil { 56 | t.Errorf("Expected no errors but got: %v", err) 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /metrics/metrics.go: -------------------------------------------------------------------------------- 1 | package metrics 2 | 3 | import ( 4 | "net/http" 5 | "reflect" 6 | "strconv" 7 | 8 | "github.com/prometheus/client_golang/prometheus" 9 | "github.com/prometheus/client_golang/prometheus/promhttp" 10 | ) 11 | 12 | var ( 13 | namespace = "rolling_update_handler" 14 | Server *metricServer 15 | ) 16 | 17 | type metricServer struct { 18 | registry *prometheus.Registry 19 | 20 | NodeGroups *prometheus.GaugeVec 21 | OutdatedNodes *prometheus.GaugeVec 22 | UpdatedNodes *prometheus.GaugeVec 23 | ScaledUpNodes *prometheus.CounterVec 24 | ScaledDownNodes *prometheus.CounterVec 25 | DrainedNodes *prometheus.CounterVec 26 | Errors prometheus.Counter 27 | } 28 | 29 | func init() { 30 | Server = newMetricServer() 31 | } 32 | 33 | func newMetricServer() *metricServer { 34 | m := &metricServer{ 35 | registry: prometheus.NewPedanticRegistry(), 36 | NodeGroups: prometheus.NewGaugeVec(prometheus.GaugeOpts{ 37 | Namespace: namespace, 38 | Name: "node_groups", 39 | Help: "The total number of node groups managed"}, 40 | []string{}), 41 | OutdatedNodes: prometheus.NewGaugeVec(prometheus.GaugeOpts{ 42 | Namespace: namespace, 43 | Name: "outdated_nodes", 44 | Help: "The number of outdated nodes", 45 | }, []string{"node_group"}), 46 | UpdatedNodes: prometheus.NewGaugeVec(prometheus.GaugeOpts{ 47 | Namespace: namespace, 48 | Name: "updated_nodes", 49 | Help: "The number of updated nodes", 50 | }, []string{"node_group"}), 51 | ScaledUpNodes: prometheus.NewCounterVec(prometheus.CounterOpts{ 52 | Namespace: namespace, 53 | Name: "scaled_up_nodes", 54 | Help: "The total number of nodes scaled up", 55 | }, []string{"node_group"}), 56 | ScaledDownNodes: prometheus.NewCounterVec(prometheus.CounterOpts{ 57 | Namespace: namespace, 58 | Name: "scaled_down_nodes", 59 | Help: "The total number of nodes scaled down", 60 | }, []string{"node_group"}), 61 | DrainedNodes: prometheus.NewCounterVec(prometheus.CounterOpts{ 62 | Namespace: namespace, 63 | 
Name: "drained_nodes_total", 64 | Help: "The total number of drained nodes", 65 | }, []string{"node_group"}), 66 | Errors: prometheus.NewCounter(prometheus.CounterOpts{ 67 | Namespace: namespace, 68 | Name: "errors", 69 | Help: "The total number of errors", 70 | }), 71 | } 72 | m.register() 73 | return m 74 | } 75 | 76 | func (m *metricServer) register() { 77 | v := reflect.ValueOf(*m) 78 | for i := 0; i < v.NumField(); i++ { 79 | if v.Field(i).CanInterface() { 80 | if metric, ok := v.Field(i).Interface().(prometheus.Collector); ok { 81 | m.registry.MustRegister(metric) 82 | } 83 | } 84 | } 85 | } 86 | 87 | func (m *metricServer) Listen(port int) error { 88 | gatherers := prometheus.Gatherers{prometheus.DefaultGatherer, m.registry} 89 | http.Handle("/metrics", promhttp.HandlerFor(gatherers, promhttp.HandlerOpts{})) 90 | return http.ListenAndServe(":"+strconv.Itoa(port), nil) 91 | } 92 | -------------------------------------------------------------------------------- /config/config_test.go: -------------------------------------------------------------------------------- 1 | package config 2 | 3 | import ( 4 | "os" 5 | "reflect" 6 | "testing" 7 | ) 8 | 9 | func TestInitialize(t *testing.T) { 10 | _ = os.Setenv(EnvAutoScalingGroupNames, "asg-a,asg-b,asg-c") 11 | _ = os.Setenv(EnvIgnoreDaemonSets, "false") 12 | _ = os.Setenv(EnvDeleteLocalData, "false") 13 | _ = os.Setenv(EnvSlowMode, "true") 14 | defer os.Clearenv() 15 | _ = Initialize() 16 | config := Get() 17 | if len(config.AutoScalingGroupNames) != 3 { 18 | t.Error() 19 | } 20 | if config.IgnoreDaemonSets { 21 | t.Error("IgnoreDaemonSets should be false") 22 | } 23 | if config.DeleteEmptyDirData { 24 | t.Error("DeleteEmptyDirData should be false") 25 | } 26 | if !config.SlowMode { 27 | t.Error("SlowMode should be true") 28 | } 29 | } 30 | 31 | func TestInitialize_withDefaultNonRequiredValues(t *testing.T) { 32 | _ = os.Setenv(EnvAutoScalingGroupNames, "asg-a,asg-b,asg-c") 33 | defer os.Clearenv() 34 | _ = Initialize() 35 | config := Get() 36 | if len(config.AutoScalingGroupNames) != 3 { 37 | t.Error() 38 | } 39 | if !config.IgnoreDaemonSets { 40 | t.Error("should've defaulted to ignoring daemon sets") 41 | } 42 | if !config.DeleteEmptyDirData { 43 | t.Error("should've defaulted to deleting local data") 44 | } 45 | if config.SlowMode { 46 | t.Error("SlowMode should be false") 47 | } 48 | } 49 | 50 | func TestInitialize_withMissingRequiredValues(t *testing.T) { 51 | if err := Initialize(); err == nil { 52 | t.Error("expected error because required environment variables are missing") 53 | } 54 | } 55 | 56 | func TestSet(t *testing.T) { 57 | Set([]string{"asg-a", "asg-b", "asg-c"}, true, true, false, false) 58 | config := Get() 59 | if len(config.AutoScalingGroupNames) != 3 { 60 | t.Error() 61 | } 62 | if !config.IgnoreDaemonSets { 63 | t.Error() 64 | } 65 | if !config.DeleteEmptyDirData { 66 | t.Error() 67 | } 68 | } 69 | 70 | func TestInitialize_withClusterName(t *testing.T) { 71 | _ = os.Setenv(EnvClusterName, "foo") 72 | _ = os.Setenv(EnvAutodiscoveryTags, "foo=bar") 73 | _ = os.Setenv(EnvAutoScalingGroupNames, "foo,bar") 74 | defer os.Clearenv() 75 | _ = Initialize() 76 | config := Get() 77 | if config.AutodiscoveryTags != "k8s.io/cluster-autoscaler/foo=owned,k8s.io/cluster-autoscaler/enabled=true" { 78 | t.Error() 79 | } else if len(config.AutoScalingGroupNames) != 0 { 80 | t.Error() 81 | } 82 | } 83 | 84 | func TestInitialize_withAutodiscoveryTags(t *testing.T) { 85 | _ = os.Unsetenv(EnvClusterName) 86 | _ = 
os.Setenv(EnvAutodiscoveryTags, "foo=bar,foobar=true") 87 | _ = os.Setenv(EnvAutoScalingGroupNames, "foo,bar") 88 | defer os.Clearenv() 89 | _ = Initialize() 90 | config := Get() 91 | if config.AutodiscoveryTags != "foo=bar,foobar=true" { 92 | t.Error() 93 | } else if len(config.AutoScalingGroupNames) != 0 { 94 | t.Error() 95 | } 96 | } 97 | 98 | func TestInitialize_withAutoScalingGroupNames(t *testing.T) { 99 | _ = os.Unsetenv(EnvClusterName) 100 | _ = os.Unsetenv(EnvAutodiscoveryTags) 101 | _ = os.Setenv(EnvAutoScalingGroupNames, "foo,bar") 102 | defer os.Clearenv() 103 | _ = Initialize() 104 | config := Get() 105 | if !reflect.DeepEqual(config.AutoScalingGroupNames, []string{"foo", "bar"}) { 106 | t.Error() 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /cloud/aws_test.go: -------------------------------------------------------------------------------- 1 | package cloud_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/TwiN/aws-eks-asg-rolling-update-handler/cloud" 7 | "github.com/TwiN/aws-eks-asg-rolling-update-handler/cloudtest" 8 | "github.com/aws/aws-sdk-go/service/autoscaling" 9 | ) 10 | 11 | func TestDescribeEnabledAutoScalingGroupsByTags(t *testing.T) { 12 | type testCase struct { 13 | autoScalingGroups []struct { 14 | name string 15 | tags map[string]string 16 | } 17 | inputTags string 18 | name string 19 | outputNames []string 20 | } 21 | 22 | testCases := []testCase{ 23 | { 24 | name: "match foo but not bar", 25 | inputTags: "foo=bar", 26 | outputNames: []string{"foo"}, 27 | autoScalingGroups: []struct { 28 | name string 29 | tags map[string]string 30 | }{ 31 | { 32 | name: "bar", 33 | tags: map[string]string{ 34 | "bar": "foo", 35 | }, 36 | }, 37 | { 38 | name: "foo", 39 | tags: map[string]string{ 40 | "foo": "bar", 41 | }, 42 | }, 43 | }, 44 | }, 45 | { 46 | name: "match foo and bar", 47 | inputTags: "foo=bar", 48 | outputNames: []string{"foo", "bar"}, 49 | autoScalingGroups: []struct { 50 | name string 51 | tags map[string]string 52 | }{ 53 | { 54 | name: "bar", 55 | tags: map[string]string{ 56 | "foo": "bar", 57 | }, 58 | }, 59 | { 60 | name: "foo", 61 | tags: map[string]string{ 62 | "foo": "bar", 63 | }, 64 | }, 65 | }, 66 | }, 67 | { 68 | name: "match foo but not bar with multiple input tags", 69 | inputTags: "foo=bar,foobar=true", 70 | outputNames: []string{"foo"}, 71 | autoScalingGroups: []struct { 72 | name string 73 | tags map[string]string 74 | }{ 75 | { 76 | name: "bar", 77 | tags: map[string]string{ 78 | "bar": "foo", 79 | }, 80 | }, 81 | { 82 | name: "foo", 83 | tags: map[string]string{ 84 | "foo": "bar", 85 | "foobar": "true", 86 | }, 87 | }, 88 | }, 89 | }, 90 | { 91 | name: "match foo and bar with multiple input tags", 92 | inputTags: "foo=bar,foobar=true", 93 | outputNames: []string{"foo", "bar"}, 94 | autoScalingGroups: []struct { 95 | name string 96 | tags map[string]string 97 | }{ 98 | { 99 | name: "bar", 100 | tags: map[string]string{ 101 | "foo": "bar", 102 | "foobar": "true", 103 | }, 104 | }, 105 | { 106 | name: "foo", 107 | tags: map[string]string{ 108 | "foo": "bar", 109 | "foobar": "true", 110 | }, 111 | }, 112 | }, 113 | }, 114 | } 115 | 116 | for _, test := range testCases { 117 | autoScalingGroups := []*autoscaling.Group{} 118 | for i, asg := range test.autoScalingGroups { 119 | autoScalingGroup := autoscaling.Group{AutoScalingGroupName: &test.autoScalingGroups[i].name} 120 | for k, v := range asg.tags { 121 | key := k 122 | value := v 123 | autoScalingGroup.Tags = 
append(autoScalingGroup.Tags, &autoscaling.TagDescription{ 124 | Key: &key, 125 | Value: &value, 126 | }) 127 | } 128 | autoScalingGroups = append(autoScalingGroups, &autoScalingGroup) 129 | } 130 | svc := cloudtest.NewMockAutoScalingService(autoScalingGroups) 131 | output, err := cloud.DescribeEnabledAutoScalingGroupsByTags(svc, test.inputTags) 132 | if err != nil { 133 | t.Error(err) 134 | } 135 | 136 | outMap := map[string]bool{} 137 | for _, outputAutoScalingGroup := range output { 138 | outMap[*outputAutoScalingGroup.AutoScalingGroupName] = false 139 | } 140 | for _, name := range test.outputNames { 141 | if _, ok := outMap[name]; ok { 142 | outMap[name] = true 143 | } else { 144 | t.Errorf("in '%s', expected '%s' to be present in output: %v", test.name, name, output) 145 | } 146 | } 147 | for name, v := range outMap { 148 | if !v { 149 | t.Errorf("in '%s', not expected '%s' to be present in output: %v", test.name, name, output) 150 | } 151 | } 152 | } 153 | } 154 | -------------------------------------------------------------------------------- /k8stest/k8stest.go: -------------------------------------------------------------------------------- 1 | package k8stest 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | 7 | "github.com/aws/aws-sdk-go/aws" 8 | "github.com/aws/aws-sdk-go/service/autoscaling" 9 | v1 "k8s.io/api/core/v1" 10 | "k8s.io/apimachinery/pkg/api/resource" 11 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 12 | ) 13 | 14 | // TODO: replace this by Kubernetes' official fake client (k8s.io/client-go/kubernetes/fake) 15 | 16 | type MockClient struct { 17 | Counter map[string]int64 18 | Nodes map[string]v1.Node 19 | Pods map[string]v1.Pod 20 | } 21 | 22 | func NewMockClient(nodes []v1.Node, pods []v1.Pod) *MockClient { 23 | client := &MockClient{ 24 | Counter: make(map[string]int64), 25 | Nodes: make(map[string]v1.Node), 26 | Pods: make(map[string]v1.Pod), 27 | } 28 | for _, node := range nodes { 29 | client.Nodes[node.Name] = node 30 | } 31 | for _, pod := range pods { 32 | client.Pods[pod.Name] = pod 33 | } 34 | return client 35 | } 36 | 37 | func (mock *MockClient) GetNodes() ([]v1.Node, error) { 38 | mock.Counter["GetNodes"]++ 39 | var nodes []v1.Node 40 | for _, node := range mock.Nodes { 41 | nodes = append(nodes, node) 42 | } 43 | return nodes, nil 44 | } 45 | 46 | func (mock *MockClient) GetPodsInNode(node string) ([]v1.Pod, error) { 47 | mock.Counter["GetPodsInNode"]++ 48 | var pods []v1.Pod 49 | for _, pod := range mock.Pods { 50 | if pod.Spec.NodeName == node { 51 | pods = append(pods, pod) 52 | } 53 | } 54 | return pods, nil 55 | } 56 | 57 | func (mock *MockClient) GetNodeByAutoScalingInstance(instance *autoscaling.Instance) (*v1.Node, error) { 58 | mock.Counter["GetNodeByAutoScalingInstance"]++ 59 | nodes, _ := mock.GetNodes() 60 | return mock.FilterNodeByAutoScalingInstance(nodes, instance) 61 | } 62 | 63 | func (mock *MockClient) FilterNodeByAutoScalingInstance(nodes []v1.Node, instance *autoscaling.Instance) (*v1.Node, error) { 64 | mock.Counter["FilterNodeByAutoScalingInstance"]++ 65 | for _, node := range nodes { 66 | if node.Spec.ProviderID == fmt.Sprintf("aws:///%s/%s", aws.StringValue(instance.AvailabilityZone), aws.StringValue(instance.InstanceId)) { 67 | return &node, nil 68 | } 69 | } 70 | return nil, errors.New("not found") 71 | } 72 | 73 | func (mock *MockClient) UpdateNode(node *v1.Node) error { 74 | mock.Counter["UpdateNode"]++ 75 | mock.Nodes[node.Name] = *node 76 | return nil 77 | } 78 | 79 | func (mock *MockClient) Cordon(nodeName string) error { 80 
| mock.Counter["Cordon"]++ 81 | return nil 82 | } 83 | 84 | func (mock *MockClient) Drain(nodeName string, ignoreDaemonSets, deleteLocalData bool, podTerminationGracePeriod int) error { 85 | mock.Counter["Drain"]++ 86 | return nil 87 | } 88 | 89 | func CreateTestNode(name, availabilityZone, instanceId, allocatableCpu, allocatableMemory string) v1.Node { 90 | node := v1.Node{ 91 | Spec: v1.NodeSpec{ 92 | ProviderID: fmt.Sprintf("aws:///%s/%s", availabilityZone, instanceId), 93 | }, 94 | Status: v1.NodeStatus{ 95 | Allocatable: map[v1.ResourceName]resource.Quantity{ 96 | v1.ResourceCPU: resource.MustParse(allocatableCpu), 97 | v1.ResourceMemory: resource.MustParse(allocatableMemory), 98 | }, 99 | }, 100 | } 101 | node.SetName(name) 102 | node.SetAnnotations(make(map[string]string)) 103 | node.SetLabels(make(map[string]string)) 104 | return node 105 | } 106 | 107 | func CreateTestPod(name, nodeName, cpuRequest, cpuMemory string, isDaemonSet bool, podPhase v1.PodPhase) v1.Pod { 108 | pod := v1.Pod{ 109 | Spec: v1.PodSpec{ 110 | NodeName: nodeName, 111 | Containers: []v1.Container{{ 112 | Name: name, 113 | Resources: v1.ResourceRequirements{ 114 | Requests: v1.ResourceList{ 115 | v1.ResourceCPU: resource.MustParse(cpuRequest), 116 | v1.ResourceMemory: resource.MustParse(cpuMemory), 117 | }, 118 | }, 119 | }}, 120 | }, 121 | Status: v1.PodStatus{Phase: podPhase}, 122 | } 123 | pod.SetName(name) 124 | if isDaemonSet { 125 | pod.SetOwnerReferences([]metav1.OwnerReference{{Kind: "DaemonSet"}}) 126 | } else { 127 | pod.SetOwnerReferences([]metav1.OwnerReference{{Kind: "ReplicaSet"}}) 128 | } 129 | return pod 130 | } 131 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/TwiN/aws-eks-asg-rolling-update-handler 2 | 3 | go 1.24.4 4 | 5 | require ( 6 | github.com/TwiN/gocache/v2 v2.4.0 7 | github.com/aws/aws-sdk-go v1.55.7 8 | github.com/prometheus/client_golang v1.23.2 9 | k8s.io/api v0.34.3 10 | k8s.io/apimachinery v0.34.3 11 | k8s.io/client-go v0.34.3 12 | k8s.io/kubectl v0.34.3 13 | ) 14 | 15 | require ( 16 | github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 // indirect 17 | github.com/MakeNowJust/heredoc v1.0.0 // indirect 18 | github.com/beorn7/perks v1.0.1 // indirect 19 | github.com/blang/semver/v4 v4.0.0 // indirect 20 | github.com/cespare/xxhash/v2 v2.3.0 // indirect 21 | github.com/chai2010/gettext-go v1.0.2 // indirect 22 | github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect 23 | github.com/emicklei/go-restful/v3 v3.12.2 // indirect 24 | github.com/exponent-io/jsonpath v0.0.0-20210407135951-1de76d718b3f // indirect 25 | github.com/fxamacker/cbor/v2 v2.9.0 // indirect 26 | github.com/go-errors/errors v1.4.2 // indirect 27 | github.com/go-logr/logr v1.4.2 // indirect 28 | github.com/go-openapi/jsonpointer v0.21.0 // indirect 29 | github.com/go-openapi/jsonreference v0.20.2 // indirect 30 | github.com/go-openapi/swag v0.23.0 // indirect 31 | github.com/gogo/protobuf v1.3.2 // indirect 32 | github.com/google/btree v1.1.3 // indirect 33 | github.com/google/gnostic-models v0.7.0 // indirect 34 | github.com/google/uuid v1.6.0 // indirect 35 | github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect 36 | github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 // indirect 37 | github.com/inconshreveable/mousetrap v1.1.0 // indirect 38 | github.com/jmespath/go-jmespath v0.4.0 // indirect 39 | 
github.com/josharian/intern v1.0.0 // indirect 40 | github.com/json-iterator/go v1.1.12 // indirect 41 | github.com/kylelemons/godebug v1.1.0 // indirect 42 | github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de // indirect 43 | github.com/mailru/easyjson v0.7.7 // indirect 44 | github.com/mitchellh/go-wordwrap v1.0.1 // indirect 45 | github.com/moby/spdystream v0.5.0 // indirect 46 | github.com/moby/term v0.5.0 // indirect 47 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect 48 | github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect 49 | github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 // indirect 50 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect 51 | github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect 52 | github.com/peterbourgon/diskv v2.0.1+incompatible // indirect 53 | github.com/pkg/errors v0.9.1 // indirect 54 | github.com/prometheus/client_model v0.6.2 // indirect 55 | github.com/prometheus/common v0.66.1 // indirect 56 | github.com/prometheus/procfs v0.16.1 // indirect 57 | github.com/russross/blackfriday/v2 v2.1.0 // indirect 58 | github.com/spf13/cobra v1.9.1 // indirect 59 | github.com/spf13/pflag v1.0.6 // indirect 60 | github.com/x448/float16 v0.8.4 // indirect 61 | github.com/xlab/treeprint v1.2.0 // indirect 62 | go.yaml.in/yaml/v2 v2.4.2 // indirect 63 | go.yaml.in/yaml/v3 v3.0.4 // indirect 64 | golang.org/x/net v0.43.0 // indirect 65 | golang.org/x/oauth2 v0.30.0 // indirect 66 | golang.org/x/sync v0.16.0 // indirect 67 | golang.org/x/sys v0.35.0 // indirect 68 | golang.org/x/term v0.34.0 // indirect 69 | golang.org/x/text v0.28.0 // indirect 70 | golang.org/x/time v0.9.0 // indirect 71 | google.golang.org/protobuf v1.36.8 // indirect 72 | gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect 73 | gopkg.in/inf.v0 v0.9.1 // indirect 74 | gopkg.in/yaml.v3 v3.0.1 // indirect 75 | k8s.io/cli-runtime v0.34.3 // indirect 76 | k8s.io/component-base v0.34.3 // indirect 77 | k8s.io/klog/v2 v2.130.1 // indirect 78 | k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b // indirect 79 | k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 // indirect 80 | sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect 81 | sigs.k8s.io/kustomize/api v0.20.1 // indirect 82 | sigs.k8s.io/kustomize/kyaml v0.20.1 // indirect 83 | sigs.k8s.io/randfill v1.0.0 // indirect 84 | sigs.k8s.io/structured-merge-diff/v6 v6.3.0 // indirect 85 | sigs.k8s.io/yaml v1.6.0 // indirect 86 | ) 87 | -------------------------------------------------------------------------------- /k8s/util.go: -------------------------------------------------------------------------------- 1 | package k8s 2 | 3 | import ( 4 | "log" 5 | 6 | "github.com/aws/aws-sdk-go/service/autoscaling" 7 | v1 "k8s.io/api/core/v1" 8 | ) 9 | 10 | // CheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode calculates the resources available in the target nodes 11 | // and compares them with the resources that would be required if the old node were to be drained 12 | // 13 | // This is not foolproof: 2 targetNodes with 1G available in each would cause the assumption that you can fit 14 | // a 2G pod in the targetNodes when you obviously can't (you'd need 1 node with 2G available, not 2 with 1G) 15 | // That's alright, because the purpose is to provide a smooth rolling upgrade, not a flawless experience, and 16 | // while the latter is definitely possible, it would slow down the process by quite a 
bit. In a way, this is 17 | // the beauty of co-existing with the cluster autoscaler; an extra node will be spun up to handle the leftovers, 18 | // if any. 19 | func CheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode(client ClientAPI, oldNode *v1.Node, targetNodes []*v1.Node) bool { 20 | totalAvailableTargetCPU := int64(0) 21 | totalAvailableTargetMemory := int64(0) 22 | // Get resources available in target nodes 23 | for _, targetNode := range targetNodes { 24 | availableTargetCPU := targetNode.Status.Allocatable.Cpu().MilliValue() 25 | availableTargetMemory := targetNode.Status.Allocatable.Memory().MilliValue() 26 | podsInNode, err := client.GetPodsInNode(targetNode.Name) 27 | if err != nil { 28 | continue 29 | } 30 | for _, podInNode := range podsInNode { 31 | // Skip pods that have terminated (e.g. "Evicted" pods that haven't been cleaned up) 32 | if podInNode.Status.Phase == v1.PodFailed { 33 | continue 34 | } 35 | for _, container := range podInNode.Spec.Containers { 36 | if container.Resources.Requests.Cpu() != nil { 37 | // Subtract the cpu request of the pod from the node's total allocatable cpu 38 | availableTargetCPU -= container.Resources.Requests.Cpu().MilliValue() 39 | } 40 | if container.Resources.Requests.Memory() != nil { 41 | // Subtract the memory request of the pod from the node's total allocatable memory 42 | totalAvailableTargetMemory -= container.Resources.Requests.Memory().MilliValue() 43 | } 44 | } 45 | } 46 | totalAvailableTargetCPU += availableTargetCPU 47 | totalAvailableTargetMemory += availableTargetMemory 48 | } 49 | cpuNeeded := int64(0) 50 | memoryNeeded := int64(0) 51 | // Get resources requested in old node 52 | podsInNode, err := client.GetPodsInNode(oldNode.Name) 53 | if err != nil { 54 | log.Printf("Unable to determine resources needed for old node, assuming that enough resources are available") 55 | return true 56 | } 57 | for _, podInNode := range podsInNode { 58 | // Skip pods that have terminated (e.g. 
"Evicted" pods that haven't been cleaned up) 59 | if podInNode.Status.Phase == v1.PodFailed { 60 | continue 61 | } 62 | // Ignore DaemonSets in the old node, because these pods will also be present in the target nodes 63 | hasDaemonSetOwnerReference := false 64 | for _, owner := range podInNode.GetOwnerReferences() { 65 | if owner.Kind == "DaemonSet" { 66 | hasDaemonSetOwnerReference = true 67 | break 68 | } 69 | } 70 | if hasDaemonSetOwnerReference { 71 | continue 72 | } 73 | for _, container := range podInNode.Spec.Containers { 74 | if container.Resources.Requests.Cpu() != nil { 75 | // Subtract the cpu request of the pod from the node's total allocatable 76 | cpuNeeded += container.Resources.Requests.Cpu().MilliValue() 77 | } 78 | if container.Resources.Requests.Memory() != nil { 79 | // Subtract the cpu request of the pod from the node's total allocatable 80 | memoryNeeded += container.Resources.Requests.Memory().MilliValue() 81 | } 82 | } 83 | } 84 | leftOverCPU := totalAvailableTargetCPU - cpuNeeded 85 | leftOverMemory := totalAvailableTargetMemory - memoryNeeded 86 | return leftOverCPU >= 0 && leftOverMemory >= 0 87 | } 88 | 89 | // AnnotateNodeByAutoScalingInstance adds an annotation to the Kubernetes node represented by a given AWS instance 90 | func AnnotateNodeByAutoScalingInstance(client ClientAPI, instance *autoscaling.Instance, key, value string) error { 91 | node, err := client.GetNodeByAutoScalingInstance(instance) 92 | if err != nil { 93 | return err 94 | } 95 | annotations := node.GetAnnotations() 96 | if currentValue := annotations[key]; currentValue != value { 97 | annotations[key] = value 98 | node.SetAnnotations(annotations) 99 | err = client.UpdateNode(node) 100 | if err != nil { 101 | return err 102 | } 103 | } 104 | return nil 105 | } 106 | 107 | // LabelNodeByAutoScalingInstance adds a Label to the Kubernetes node represented by a given AWS instance 108 | func LabelNodeByAutoScalingInstance(client ClientAPI, instance *autoscaling.Instance, key, value string) error { 109 | node, err := client.GetNodeByAutoScalingInstance(instance) 110 | if err != nil { 111 | return err 112 | } 113 | labels := node.GetLabels() 114 | if currentValue := labels[key]; currentValue != value { 115 | labels[key] = value 116 | node.SetLabels(labels) 117 | err = client.UpdateNode(node) 118 | if err != nil { 119 | return err 120 | } 121 | } 122 | return nil 123 | } 124 | -------------------------------------------------------------------------------- /cloudtest/cloudtest.go: -------------------------------------------------------------------------------- 1 | package cloudtest 2 | 3 | import ( 4 | "errors" 5 | 6 | "github.com/aws/aws-sdk-go/aws" 7 | "github.com/aws/aws-sdk-go/service/autoscaling" 8 | "github.com/aws/aws-sdk-go/service/autoscaling/autoscalingiface" 9 | "github.com/aws/aws-sdk-go/service/ec2" 10 | "github.com/aws/aws-sdk-go/service/ec2/ec2iface" 11 | ) 12 | 13 | type MockEC2Service struct { 14 | ec2iface.EC2API 15 | 16 | Counter map[string]int64 17 | Templates []*ec2.LaunchTemplate 18 | } 19 | 20 | func NewMockEC2Service(templates []*ec2.LaunchTemplate) *MockEC2Service { 21 | return &MockEC2Service{ 22 | Counter: make(map[string]int64), 23 | Templates: templates, 24 | } 25 | } 26 | 27 | func (m *MockEC2Service) DescribeLaunchTemplates(_ *ec2.DescribeLaunchTemplatesInput) (*ec2.DescribeLaunchTemplatesOutput, error) { 28 | m.Counter["DescribeLaunchTemplates"]++ 29 | output := &ec2.DescribeLaunchTemplatesOutput{ 30 | LaunchTemplates: m.Templates, 31 | } 32 | return output, nil 33 | 
} 34 | 35 | func (m *MockEC2Service) DescribeLaunchTemplateByID(input *ec2.DescribeLaunchTemplatesInput) (*ec2.LaunchTemplate, error) { 36 | m.Counter["DescribeLaunchTemplateByID"]++ 37 | for _, template := range m.Templates { 38 | if template.LaunchTemplateId == input.LaunchTemplateIds[0] { 39 | return template, nil 40 | } 41 | if template.LaunchTemplateName == input.LaunchTemplateNames[0] { 42 | return template, nil 43 | } 44 | } 45 | return nil, errors.New("not found") 46 | } 47 | 48 | func CreateTestEc2Instance(id string) *ec2.Instance { 49 | instance := &ec2.Instance{ 50 | InstanceId: aws.String(id), 51 | } 52 | return instance 53 | } 54 | 55 | type MockAutoScalingService struct { 56 | autoscalingiface.AutoScalingAPI 57 | 58 | Counter map[string]int64 59 | AutoScalingGroups map[string]*autoscaling.Group 60 | } 61 | 62 | func NewMockAutoScalingService(autoScalingGroups []*autoscaling.Group) *MockAutoScalingService { 63 | service := &MockAutoScalingService{ 64 | Counter: make(map[string]int64), 65 | AutoScalingGroups: make(map[string]*autoscaling.Group), 66 | } 67 | for _, autoScalingGroup := range autoScalingGroups { 68 | service.AutoScalingGroups[aws.StringValue(autoScalingGroup.AutoScalingGroupName)] = autoScalingGroup 69 | } 70 | return service 71 | } 72 | 73 | func (m *MockAutoScalingService) TerminateInstanceInAutoScalingGroup(_ *autoscaling.TerminateInstanceInAutoScalingGroupInput) (*autoscaling.TerminateInstanceInAutoScalingGroupOutput, error) { 74 | m.Counter["TerminateInstanceInAutoScalingGroup"]++ 75 | return &autoscaling.TerminateInstanceInAutoScalingGroupOutput{}, nil 76 | } 77 | 78 | func (m *MockAutoScalingService) DescribeAutoScalingGroups(input *autoscaling.DescribeAutoScalingGroupsInput) (*autoscaling.DescribeAutoScalingGroupsOutput, error) { 79 | m.Counter["DescribeAutoScalingGroups"]++ 80 | var autoScalingGroups []*autoscaling.Group 81 | for _, autoScalingGroupName := range input.AutoScalingGroupNames { 82 | for _, autoScalingGroup := range m.AutoScalingGroups { 83 | if aws.StringValue(autoScalingGroupName) == aws.StringValue(autoScalingGroup.AutoScalingGroupName) { 84 | autoScalingGroups = append(autoScalingGroups, autoScalingGroup) 85 | } 86 | } 87 | } 88 | return &autoscaling.DescribeAutoScalingGroupsOutput{ 89 | AutoScalingGroups: autoScalingGroups, 90 | }, nil 91 | } 92 | 93 | func (m *MockAutoScalingService) DescribeAutoScalingGroupsPages(input *autoscaling.DescribeAutoScalingGroupsInput, f func(*autoscaling.DescribeAutoScalingGroupsOutput, bool) bool) error { 94 | idx := 0 95 | for _, asg := range m.AutoScalingGroups { 96 | x := &autoscaling.DescribeAutoScalingGroupsOutput{AutoScalingGroups: []*autoscaling.Group{asg}} 97 | idx++ 98 | f(x, idx == len(m.AutoScalingGroups)) 99 | } 100 | return nil 101 | } 102 | 103 | func (m *MockAutoScalingService) SetDesiredCapacity(input *autoscaling.SetDesiredCapacityInput) (*autoscaling.SetDesiredCapacityOutput, error) { 104 | m.Counter["SetDesiredCapacity"]++ 105 | m.AutoScalingGroups[aws.StringValue(input.AutoScalingGroupName)].SetDesiredCapacity(aws.Int64Value(input.DesiredCapacity)) 106 | return &autoscaling.SetDesiredCapacityOutput{}, nil 107 | } 108 | 109 | func (m *MockAutoScalingService) UpdateAutoScalingGroup(_ *autoscaling.UpdateAutoScalingGroupInput) (*autoscaling.UpdateAutoScalingGroupOutput, error) { 110 | m.Counter["UpdateAutoScalingGroup"]++ 111 | return &autoscaling.UpdateAutoScalingGroupOutput{}, nil 112 | } 113 | 114 | func CreateTestAutoScalingGroup(name, launchConfigurationName string, 
launchTemplateSpecification *autoscaling.LaunchTemplateSpecification, instances []*autoscaling.Instance, withMixedInstancesPolicy bool) *autoscaling.Group { 115 | asg := &autoscaling.Group{ 116 | AutoScalingGroupName: aws.String(name), 117 | Instances: instances, 118 | DesiredCapacity: aws.Int64(int64(len(instances))), 119 | MinSize: aws.Int64(0), 120 | MaxSize: aws.Int64(999), 121 | } 122 | if len(launchConfigurationName) != 0 { 123 | asg.SetLaunchConfigurationName(launchConfigurationName) 124 | } 125 | if withMixedInstancesPolicy { 126 | asg.SetMixedInstancesPolicy(&autoscaling.MixedInstancesPolicy{ 127 | LaunchTemplate: &autoscaling.LaunchTemplate{ 128 | LaunchTemplateSpecification: launchTemplateSpecification, 129 | Overrides: []*autoscaling.LaunchTemplateOverrides{ 130 | {InstanceType: aws.String("c5.2xlarge")}, 131 | {InstanceType: aws.String("c5n.2xlarge")}, 132 | {InstanceType: aws.String("c5d.2xlarge")}, 133 | }, 134 | }, 135 | }) 136 | } else { 137 | if launchTemplateSpecification != nil { 138 | asg.SetLaunchTemplate(launchTemplateSpecification) 139 | } 140 | } 141 | return asg 142 | } 143 | 144 | func CreateTestAutoScalingInstance(id, launchConfigurationName string, launchTemplateSpecification *autoscaling.LaunchTemplateSpecification, lifeCycleState string) *autoscaling.Instance { 145 | instance := &autoscaling.Instance{ 146 | LifecycleState: aws.String(lifeCycleState), 147 | InstanceId: aws.String(id), 148 | InstanceType: aws.String("c5.2xlarge"), 149 | } 150 | if len(launchConfigurationName) != 0 { 151 | instance.SetLaunchConfigurationName(launchConfigurationName) 152 | } 153 | if launchTemplateSpecification != nil { 154 | instance.SetLaunchTemplate(launchTemplateSpecification) 155 | } 156 | return instance 157 | } 158 | -------------------------------------------------------------------------------- /cloud/aws.go: -------------------------------------------------------------------------------- 1 | package cloud 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "strings" 7 | 8 | "github.com/aws/aws-sdk-go/aws" 9 | "github.com/aws/aws-sdk-go/aws/session" 10 | "github.com/aws/aws-sdk-go/service/autoscaling" 11 | "github.com/aws/aws-sdk-go/service/autoscaling/autoscalingiface" 12 | "github.com/aws/aws-sdk-go/service/ec2" 13 | "github.com/aws/aws-sdk-go/service/ec2/ec2iface" 14 | ) 15 | 16 | var ( 17 | ErrCannotIncreaseDesiredCountAboveMax = errors.New("cannot increase ASG desired size above max ASG size") 18 | ) 19 | 20 | // GetServices returns an instance of a EC2 client with a session as well as 21 | // an instance of an Autoscaling client with a session 22 | func GetServices(awsRegion string) (ec2iface.EC2API, autoscalingiface.AutoScalingAPI, error) { 23 | awsSession, err := session.NewSession(&aws.Config{Region: aws.String(awsRegion)}) 24 | if err != nil { 25 | return nil, nil, err 26 | } 27 | return ec2.New(awsSession), autoscaling.New(awsSession), nil 28 | } 29 | 30 | func DescribeAutoScalingGroupsByNames(svc autoscalingiface.AutoScalingAPI, names []string) ([]*autoscaling.Group, error) { 31 | input := &autoscaling.DescribeAutoScalingGroupsInput{ 32 | AutoScalingGroupNames: aws.StringSlice(names), 33 | MaxRecords: aws.Int64(100), 34 | } 35 | result, err := svc.DescribeAutoScalingGroups(input) 36 | if err != nil { 37 | return nil, err 38 | } 39 | return result.AutoScalingGroups, nil 40 | } 41 | 42 | func filterAutoScalingGroupsByTag(autoScalingGroups []*autoscaling.Group, filter func([]*autoscaling.TagDescription) bool) (ret []*autoscaling.Group) { 43 | for _, autoScalingGroup := 
range autoScalingGroups { 44 | if filter(autoScalingGroup.Tags) { 45 | ret = append(ret, autoScalingGroup) 46 | } 47 | } 48 | return 49 | } 50 | 51 | // DescribeEnabledAutoScalingGroupsByTags Gets AutoScalingGroups that match the given tags 52 | func DescribeEnabledAutoScalingGroupsByTags(svc autoscalingiface.AutoScalingAPI, autodiscoveryTags string) ([]*autoscaling.Group, error) { 53 | input := &autoscaling.DescribeAutoScalingGroupsInput{} 54 | var result []*autoscaling.Group 55 | err := svc.DescribeAutoScalingGroupsPages(input, func(page *autoscaling.DescribeAutoScalingGroupsOutput, lastPage bool) bool { 56 | tagFilter := func(tagDescriptions []*autoscaling.TagDescription) bool { 57 | var matches []bool 58 | for _, tag := range strings.Split(autodiscoveryTags, ",") { 59 | kv := strings.Split(tag, "=") 60 | match := false 61 | for _, tagDescription := range tagDescriptions { 62 | if aws.StringValue(tagDescription.Key) == kv[0] && aws.StringValue(tagDescription.Value) == kv[1] { 63 | match = true 64 | break 65 | } 66 | } 67 | matches = append(matches, match) 68 | } 69 | for _, match := range matches { 70 | if !match { 71 | return false 72 | } 73 | } 74 | return true 75 | } 76 | result = append(result, filterAutoScalingGroupsByTag(page.AutoScalingGroups, tagFilter)...) 77 | return !lastPage 78 | }) 79 | if err != nil { 80 | return nil, err 81 | } 82 | return result, nil 83 | } 84 | 85 | func DescribeLaunchTemplateByID(svc ec2iface.EC2API, id string) (*ec2.LaunchTemplate, error) { 86 | input := &ec2.DescribeLaunchTemplatesInput{ 87 | LaunchTemplateIds: []*string{ 88 | aws.String(id), 89 | }, 90 | } 91 | return DescribeLaunchTemplate(svc, input) 92 | } 93 | 94 | func DescribeLaunchTemplateByName(svc ec2iface.EC2API, name string) (*ec2.LaunchTemplate, error) { 95 | input := &ec2.DescribeLaunchTemplatesInput{ 96 | LaunchTemplateNames: []*string{ 97 | aws.String(name), 98 | }, 99 | } 100 | return DescribeLaunchTemplate(svc, input) 101 | } 102 | 103 | func DescribeLaunchTemplate(svc ec2iface.EC2API, input *ec2.DescribeLaunchTemplatesInput) (*ec2.LaunchTemplate, error) { 104 | templatesOutput, err := svc.DescribeLaunchTemplates(input) 105 | descriptiveMsg := fmt.Sprintf("%v / %v", aws.StringValueSlice(input.LaunchTemplateIds), aws.StringValueSlice(input.LaunchTemplateNames)) 106 | if err != nil { 107 | return nil, fmt.Errorf("unable to get description for Launch Templates %s: %v", descriptiveMsg, err) 108 | } 109 | if len(templatesOutput.LaunchTemplates) < 1 { 110 | return nil, nil 111 | } 112 | return templatesOutput.LaunchTemplates[0], nil 113 | } 114 | 115 | // IncrementAutoScalingGroupDesiredCount retrieves the latest definition of the ASG and increments its current 116 | // desired capacity by 1. The reason why we retrieve the ASG again even though we already have it is to avoid a 117 | // scenario in which the ASG had already been scaled up or down since the last time it was retrieved. 118 | // See https://github.com/TwiN/aws-eks-asg-rolling-update-handler/issues/129 for more information. 119 | func IncrementAutoScalingGroupDesiredCount(svc autoscalingiface.AutoScalingAPI, autoScalingGroupName string) error { 120 | latestASGs, err := DescribeAutoScalingGroupsByNames(svc, []string{autoScalingGroupName}) 121 | if err != nil { 122 | return fmt.Errorf("failed to retrieve latest asg with name '%s': %w", autoScalingGroupName, err) 123 | } 124 | if len(latestASGs) != 1 { 125 | // ASG names are unique per region and account, so if there isn't exactly one ASG, there's a problem. 
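// A worked example of the capacity guard a few lines below (the numbers are hypothetical): if the freshly
// fetched ASG reports DesiredCapacity=3 and MaxSize=3, newDesiredCapacity becomes 4, which exceeds MaxSize,
// so ErrCannotIncreaseDesiredCountAboveMax is returned without calling SetDesiredCapacity; with MaxSize=5,
// the same call would request a desired capacity of 4 with HonorCooldown set to true.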
126 | return errors.New("failed to retrieve latest asg with name: " + autoScalingGroupName) 127 | } 128 | asg := latestASGs[0] 129 | newDesiredCapacity := aws.Int64Value(asg.DesiredCapacity) + 1 130 | if newDesiredCapacity > aws.Int64Value(asg.MaxSize) { 131 | return ErrCannotIncreaseDesiredCountAboveMax 132 | } 133 | desiredInput := &autoscaling.SetDesiredCapacityInput{ 134 | AutoScalingGroupName: asg.AutoScalingGroupName, 135 | DesiredCapacity: aws.Int64(newDesiredCapacity), 136 | HonorCooldown: aws.Bool(true), 137 | } 138 | _, err = svc.SetDesiredCapacity(desiredInput) 139 | if err != nil { 140 | return fmt.Errorf("unable to increase ASG %s desired count to %d: %w", autoScalingGroupName, newDesiredCapacity, err) 141 | } 142 | return nil 143 | } 144 | 145 | func TerminateEc2Instance(svc autoscalingiface.AutoScalingAPI, instance *autoscaling.Instance, shouldDecrementDesiredCapacity bool) error { 146 | _, err := svc.TerminateInstanceInAutoScalingGroup(&autoscaling.TerminateInstanceInAutoScalingGroupInput{ 147 | InstanceId: instance.InstanceId, 148 | ShouldDecrementDesiredCapacity: aws.Bool(shouldDecrementDesiredCapacity), 149 | }) 150 | return err 151 | } 152 | -------------------------------------------------------------------------------- /k8s/client.go: -------------------------------------------------------------------------------- 1 | package k8s 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "log" 7 | "time" 8 | 9 | "github.com/TwiN/gocache/v2" 10 | "github.com/aws/aws-sdk-go/aws" 11 | "github.com/aws/aws-sdk-go/service/autoscaling" 12 | v1 "k8s.io/api/core/v1" 13 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 14 | "k8s.io/client-go/kubernetes" 15 | "k8s.io/kubectl/pkg/drain" 16 | ) 17 | 18 | const ( 19 | AnnotationRollingUpdateStartedTimestamp = "aws-eks-asg-rolling-update-handler.twin.sh/started-at" 20 | AnnotationRollingUpdateDrainedTimestamp = "aws-eks-asg-rolling-update-handler.twin.sh/drained-at" 21 | AnnotationRollingUpdateTerminatedTimestamp = "aws-eks-asg-rolling-update-handler.twin.sh/terminated-at" 22 | 23 | LabelExcludeFromExternalLoadBalancers = "node.kubernetes.io/exclude-from-external-load-balancers" 24 | 25 | nodesCacheKey = "nodes" 26 | ) 27 | 28 | var ( 29 | cache = gocache.NewCache().WithMaxSize(1000).WithEvictionPolicy(gocache.LeastRecentlyUsed) 30 | ) 31 | 32 | type ClientAPI interface { 33 | GetNodes() ([]v1.Node, error) 34 | GetPodsInNode(nodeName string) ([]v1.Pod, error) 35 | GetNodeByAutoScalingInstance(instance *autoscaling.Instance) (*v1.Node, error) 36 | FilterNodeByAutoScalingInstance(nodes []v1.Node, instance *autoscaling.Instance) (*v1.Node, error) 37 | UpdateNode(node *v1.Node) error 38 | Cordon(nodeName string) error 39 | Drain(nodeName string, ignoreDaemonSets, deleteEmptyDirData bool, podTerminationGracePeriod int) error 40 | } 41 | 42 | type Client struct { 43 | client kubernetes.Interface 44 | } 45 | 46 | // NewClient creates a new Client 47 | func NewClient(client kubernetes.Interface) *Client { 48 | return &Client{ 49 | client: client, 50 | } 51 | } 52 | 53 | // GetNodes retrieves all nodes from the cluster 54 | func (k *Client) GetNodes() ([]v1.Node, error) { 55 | nodes, exists := cache.Get(nodesCacheKey) 56 | if exists { 57 | if v1Nodes, ok := nodes.([]v1.Node); ok { 58 | // Return cached nodes 59 | return v1Nodes, nil 60 | } else { 61 | log.Println("[k8s.GetNodes] Failed to cast cached nodes to []v1.Node; retrieving nodes from API instead") 62 | cache.Delete(nodesCacheKey) 63 | } 64 | } 65 | nodeList, err := 
k.client.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{}) 66 | if err != nil { 67 | return nil, err 68 | } 69 | cache.SetWithTTL(nodesCacheKey, nodeList.Items, 10*time.Second) 70 | return nodeList.Items, nil 71 | } 72 | 73 | // GetPodsInNode retrieves all pods from a given node 74 | func (k *Client) GetPodsInNode(node string) ([]v1.Pod, error) { 75 | podList, err := k.client.CoreV1().Pods("").List(context.TODO(), metav1.ListOptions{ 76 | FieldSelector: "spec.nodeName=" + node, 77 | ResourceVersion: "0", 78 | }) 79 | if err != nil { 80 | return nil, err 81 | } 82 | return podList.Items, nil 83 | } 84 | 85 | // GetNodeByAutoScalingInstance gets the Kubernetes node matching an AWS AutoScaling instance 86 | // Because we cannot filter by spec.providerID, the entire list of nodes is fetched every time 87 | // this function is called 88 | func (k *Client) GetNodeByAutoScalingInstance(instance *autoscaling.Instance) (*v1.Node, error) { 89 | nodes, err := k.GetNodes() 90 | if err != nil { 91 | return nil, err 92 | } 93 | return k.FilterNodeByAutoScalingInstance(nodes, instance) 94 | } 95 | 96 | // FilterNodeByAutoScalingInstance extracts the Kubernetes node belonging to a given AWS instance from a list of nodes 97 | func (k *Client) FilterNodeByAutoScalingInstance(nodes []v1.Node, instance *autoscaling.Instance) (*v1.Node, error) { 98 | providerId := fmt.Sprintf("aws:///%s/%s", aws.StringValue(instance.AvailabilityZone), aws.StringValue(instance.InstanceId)) 99 | for _, node := range nodes { 100 | if node.Spec.ProviderID == providerId { 101 | return &node, nil 102 | } 103 | } 104 | return nil, fmt.Errorf("node with providerID \"%s\" not found", providerId) 105 | } 106 | 107 | // UpdateNode updates a node 108 | func (k *Client) UpdateNode(node *v1.Node) error { 109 | api := k.client.CoreV1().Nodes() 110 | _, err := api.Update(context.TODO(), node, metav1.UpdateOptions{}) 111 | return err 112 | } 113 | 114 | // Cordon disables scheduling new pods onto the given node 115 | func (k *Client) Cordon(nodeName string) error { 116 | node, err := k.client.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{}) 117 | if err != nil { 118 | return err 119 | } 120 | drainer := &drain.Helper{ 121 | Client: k.client, 122 | Ctx: context.TODO(), 123 | } 124 | if err := drain.RunCordonOrUncordon(drainer, node, true); err != nil { 125 | log.Printf("[%s][CORDONER] Failed to cordon node: %v", node.Name, err) 126 | return err 127 | } 128 | return nil 129 | } 130 | 131 | // Drain gracefully deletes all pods from a given node 132 | func (k *Client) Drain(nodeName string, ignoreDaemonSets, deleteEmptyDirData bool, podTerminationGracePeriod int) error { 133 | node, err := k.client.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{}) 134 | if err != nil { 135 | return err 136 | } 137 | drainer := &drain.Helper{ 138 | Client: k.client, 139 | Force: true, // Continue even if there are pods not managed by a ReplicationController, ReplicaSet, Job, DaemonSet or StatefulSet 140 | IgnoreAllDaemonSets: ignoreDaemonSets, 141 | DeleteEmptyDirData: deleteEmptyDirData, 142 | GracePeriodSeconds: podTerminationGracePeriod, 143 | Timeout: 5 * time.Minute, 144 | Ctx: context.TODO(), 145 | Out: drainLogger{NodeName: nodeName}, 146 | ErrOut: drainLogger{NodeName: nodeName}, 147 | OnPodDeletedOrEvicted: func(pod *v1.Pod, usingEviction bool) { 148 | log.Printf("[%s][DRAINER] evicted pod %s/%s", nodeName, pod.Namespace, pod.Name) 149 | }, 150 | } 151 | if !node.Spec.Unschedulable { 152 | // Cordon the node 
if it's not already unschedulable 153 | if err := drain.RunCordonOrUncordon(drainer, node, true); err != nil { 154 | log.Printf("[%s][DRAINER] Failed to cordon node: %v", node.Name, err) 155 | return err 156 | } 157 | } 158 | if err := drain.RunNodeDrain(drainer, node.Name); err != nil { 159 | log.Printf("[%s][DRAINER] Failed to drain node: %v", node.Name, err) 160 | return err 161 | } 162 | return nil 163 | } 164 | 165 | type drainLogger struct { 166 | NodeName string 167 | } 168 | 169 | func (l drainLogger) Write(p []byte) (n int, err error) { 170 | log.Printf("[%s][DRAINER] %s", l.NodeName, string(p)) 171 | return len(p), nil 172 | } 173 | -------------------------------------------------------------------------------- /k8s/util_test.go: -------------------------------------------------------------------------------- 1 | package k8s 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/TwiN/aws-eks-asg-rolling-update-handler/k8stest" 7 | "k8s.io/api/core/v1" 8 | ) 9 | 10 | func TestCheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode(t *testing.T) { 11 | // allocatable cpu & memory aren't used for the old node. 12 | // They're only used by the target nodes (newNode, in this case) to calculate if the leftover resources from moving 13 | // the pods from the old node to the new node are positive (if the leftover is negative, it means there's not enough 14 | // space in the target nodes) 15 | oldNode := k8stest.CreateTestNode("old-node", "us-west-2a", "i-034fa1dfbfd35f8bb", "0m", "0m") 16 | newNode := k8stest.CreateTestNode("new-node-1", "us-west-2b", "i-07550830aef9e4179", "1000m", "1000Mi") 17 | oldNodePod := k8stest.CreateTestPod("old-pod-1", oldNode.Name, "100m", "100Mi", false, v1.PodRunning) 18 | mockClient := k8stest.NewMockClient([]v1.Node{oldNode, newNode}, []v1.Pod{oldNodePod}) 19 | 20 | hasEnoughResources := CheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode(mockClient, &oldNode, []*v1.Node{&newNode}) 21 | if !hasEnoughResources { 22 | t.Error("should've had enough space in node") 23 | } 24 | if mockClient.Counter["GetPodsInNode"] != 2 { 25 | t.Error("GetPodInNode should've been called twice") 26 | } 27 | } 28 | 29 | func TestCheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode_whenNotEnoughSpaceInNewNodes(t *testing.T) { 30 | oldNode := k8stest.CreateTestNode("old-node", "us-west-2a", "i-034fa1dfbfd35f8bb", "0m", "0m") 31 | newNode := k8stest.CreateTestNode("new-node-1", "us-west-2c", "i-0b22d79604221412c", "1000m", "1000Mi") 32 | oldNodePod := k8stest.CreateTestPod("old-pod-1", oldNode.Name, "200m", "200Mi", false, v1.PodRunning) 33 | newNodePod := k8stest.CreateTestPod("new-pod-1", newNode.Name, "900m", "200Mi", false, v1.PodRunning) 34 | mockClient := k8stest.NewMockClient([]v1.Node{oldNode, newNode}, []v1.Pod{oldNodePod, newNodePod}) 35 | 36 | hasEnoughResources := CheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode(mockClient, &oldNode, []*v1.Node{&newNode}) 37 | if hasEnoughResources { 38 | t.Error("shouldn't have had enough space in node") 39 | } 40 | if mockClient.Counter["GetPodsInNode"] != 2 { 41 | t.Error("GetPodInNode should've been called twice") 42 | } 43 | } 44 | 45 | func TestCheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode_withMultiplePods(t *testing.T) { 46 | oldNode := k8stest.CreateTestNode("old-node", "us-west-2c", "i-0b22d79604221412c", "0m", "0m") 47 | newNode := k8stest.CreateTestNode("new-node-1", "us-west-2b", "i-07550830aef9e4179", "1000m", "1000Mi") 48 | oldNodeFirstPod := 
k8stest.CreateTestPod("old-pod-1", oldNode.Name, "300m", "0", false, v1.PodRunning) 49 | oldNodeSecondPod := k8stest.CreateTestPod("old-pod-2", oldNode.Name, "300m", "0", false, v1.PodRunning) 50 | oldNodeThirdPod := k8stest.CreateTestPod("old-pod-3", oldNode.Name, "300m", "0", false, v1.PodRunning) 51 | newNodePod := k8stest.CreateTestPod("new-pod-1", newNode.Name, "200m", "200Mi", false, v1.PodRunning) 52 | mockClient := k8stest.NewMockClient([]v1.Node{oldNode, newNode}, []v1.Pod{oldNodeFirstPod, oldNodeSecondPod, oldNodeThirdPod, newNodePod}) 53 | 54 | hasEnoughResources := CheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode(mockClient, &oldNode, []*v1.Node{&newNode}) 55 | if hasEnoughResources { 56 | t.Error("shouldn't have had enough space in node") 57 | } 58 | if mockClient.Counter["GetPodsInNode"] != 2 { 59 | t.Error("GetPodInNode should've been called twice") 60 | } 61 | } 62 | 63 | func TestCheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode_withMultipleTargetNodes(t *testing.T) { 64 | oldNode := k8stest.CreateTestNode("old-node", "us-west-2b", "i-07550830aef9e4179", "0m", "0m") 65 | firstNewNode := k8stest.CreateTestNode("new-node-1", "us-west-2a", "i-034fa1dfbfd35f8bb", "1000m", "1000Mi") 66 | secondNewNode := k8stest.CreateTestNode("new-node-2", "us-west-2b", "i-0918aff89347cef0c", "1000m", "1000Mi") 67 | oldNodeFirstPod := k8stest.CreateTestPod("old-node-pod-1", oldNode.Name, "500m", "0", false, v1.PodRunning) 68 | oldNodeSecondPod := k8stest.CreateTestPod("old-node-pod-2", oldNode.Name, "500m", "0", false, v1.PodRunning) 69 | oldNodeThirdPod := k8stest.CreateTestPod("old-node-pod-3", oldNode.Name, "500m", "0", false, v1.PodRunning) 70 | mockClient := k8stest.NewMockClient([]v1.Node{oldNode, firstNewNode, secondNewNode}, []v1.Pod{oldNodeFirstPod, oldNodeSecondPod, oldNodeThirdPod}) 71 | 72 | hasEnoughResources := CheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode(mockClient, &oldNode, []*v1.Node{&firstNewNode, &secondNewNode}) 73 | if !hasEnoughResources { 74 | t.Error("should've had enough space in node") 75 | } 76 | if mockClient.Counter["GetPodsInNode"] != 3 { 77 | t.Error("GetPodInNode should've been called thrice") 78 | } 79 | } 80 | 81 | func TestCheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode_withPodsSpreadAcrossMultipleTargetNodes(t *testing.T) { 82 | oldNode := k8stest.CreateTestNode("old-node", "us-west-2a", "i-034fa1dfbfd35f8bb", "0m", "0m") 83 | firstNewNode := k8stest.CreateTestNode("new-node-1", "us-west-2a", "i-07550830aef9e4179", "1000m", "1000Mi") 84 | secondNewNode := k8stest.CreateTestNode("new-node-2", "us-west-2a", "i-0147ad0816c210dae", "1000m", "1000Mi") 85 | firstNewNodePod := k8stest.CreateTestPod("new-node-1-pod-1", oldNode.Name, "0", "300Mi", false, v1.PodRunning) 86 | secondNewNodePod := k8stest.CreateTestPod("new-node-2-pod-1", oldNode.Name, "0", "300Mi", false, v1.PodRunning) 87 | oldNodeFirstPod := k8stest.CreateTestPod("old-node-pod-1", oldNode.Name, "0", "500Mi", false, v1.PodRunning) 88 | oldNodeSecondPod := k8stest.CreateTestPod("old-node-pod-2", oldNode.Name, "0", "500Mi", false, v1.PodRunning) 89 | oldNodeThirdPod := k8stest.CreateTestPod("old-node-pod-3", oldNode.Name, "0", "500Mi", false, v1.PodRunning) 90 | mockClient := k8stest.NewMockClient([]v1.Node{oldNode, firstNewNode, secondNewNode}, []v1.Pod{oldNodeFirstPod, oldNodeSecondPod, oldNodeThirdPod, firstNewNodePod, secondNewNodePod}) 91 | 92 | hasEnoughResources := 
CheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode(mockClient, &oldNode, []*v1.Node{&firstNewNode, &secondNewNode}) 93 | if hasEnoughResources { 94 | t.Error("shouldn't have had enough space in node") 95 | } 96 | if mockClient.Counter["GetPodsInNode"] != 3 { 97 | t.Error("GetPodInNode should've been called thrice") 98 | } 99 | } 100 | 101 | func TestCheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode_withNoTargetNodes(t *testing.T) { 102 | oldNode := k8stest.CreateTestNode("old-node", "us-west-2a", "i-034fa1dfbfd35f8bb", "0m", "0m") 103 | oldNodePod := k8stest.CreateTestPod("old-node-pod-1", oldNode.Name, "500Mi", "500Mi", false, v1.PodRunning) 104 | mockClient := k8stest.NewMockClient([]v1.Node{oldNode}, []v1.Pod{oldNodePod}) 105 | 106 | hasEnoughResources := CheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode(mockClient, &oldNode, []*v1.Node{}) 107 | if hasEnoughResources { 108 | t.Error("there's no target nodes; there definitely shouldn't have been enough space") 109 | } 110 | } 111 | 112 | func TestCheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode_withNoTargetNodesButOldNodeOnlyHasPodsFromDaemonSets(t *testing.T) { 113 | oldNode := k8stest.CreateTestNode("old-node", "us-west-2a", "i-034fa1dfbfd35f8bb", "0m", "0m") 114 | oldNodePod := k8stest.CreateTestPod("old-node-pod-1", oldNode.Name, "500Mi", "500Mi", true, v1.PodRunning) 115 | mockClient := k8stest.NewMockClient([]v1.Node{oldNode}, []v1.Pod{oldNodePod}) 116 | 117 | hasEnoughResources := CheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode(mockClient, &oldNode, []*v1.Node{}) 118 | if !hasEnoughResources { 119 | t.Error("there's no target nodes, but the only pods in the old node are from daemon sets") 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /config/config.go: -------------------------------------------------------------------------------- 1 | package config 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "os" 7 | "strconv" 8 | "strings" 9 | "time" 10 | ) 11 | 12 | var cfg *config 13 | 14 | const ( 15 | EnvEnvironment = "ENVIRONMENT" 16 | EnvDebug = "DEBUG" 17 | EnvIgnoreDaemonSets = "IGNORE_DAEMON_SETS" 18 | EnvDeleteLocalData = "DELETE_LOCAL_DATA" // Deprecated: in favor of DeleteEmptyDirData (DELETE_EMPTY_DIR_DATA) 19 | EnvDeleteEmptyDirData = "DELETE_EMPTY_DIR_DATA" 20 | EnvClusterName = "CLUSTER_NAME" 21 | EnvAutodiscoveryTags = "AUTODISCOVERY_TAGS" 22 | EnvAutoScalingGroupNames = "AUTO_SCALING_GROUP_NAMES" 23 | EnvAwsRegion = "AWS_REGION" 24 | EnvExecutionInterval = "EXECUTION_INTERVAL" 25 | EnvExecutionTimeout = "EXECUTION_TIMEOUT" 26 | EnvPodTerminationGracePeriod = "POD_TERMINATION_GRACE_PERIOD" 27 | EnvMetrics = "METRICS" 28 | EnvMetricsPort = "METRICS_PORT" 29 | EnvSlowMode = "SLOW_MODE" 30 | EnvEagerCordoning = "EAGER_CORDONING" 31 | EnvExcludeFromExternalLoadBalancers = "EXCLUDE_FROM_EXTERNAL_LOAD_BALANCERS" 32 | ) 33 | 34 | type config struct { 35 | Environment string // Optional 36 | Debug bool // Defaults to false 37 | AutoScalingGroupNames []string // Required if AutodiscoveryTags not provided 38 | AutodiscoveryTags string // Required if AutoScalingGroupNames not provided 39 | AwsRegion string // Defaults to us-west-2 40 | IgnoreDaemonSets bool // Defaults to true 41 | DeleteEmptyDirData bool // Defaults to true 42 | ExecutionInterval time.Duration // Defaults to 20s 43 | ExecutionTimeout time.Duration // Defaults to 900s 44 | PodTerminationGracePeriod int // Defaults to -1 45 | Metrics 
bool // Defaults to false 46 | MetricsPort int // Defaults to 8080 47 | SlowMode bool // Defaults to false 48 | EagerCordoning bool // Defaults to false 49 | ExcludeFromExternalLoadBalancers bool // Defaults to false 50 | } 51 | 52 | // Initialize is used to initialize the application's configuration 53 | func Initialize() error { 54 | cfg = &config{ 55 | Environment: strings.ToLower(os.Getenv(EnvEnvironment)), 56 | Debug: strings.ToLower(os.Getenv(EnvDebug)) == "true", 57 | SlowMode: strings.ToLower(os.Getenv(EnvSlowMode)) == "true", 58 | EagerCordoning: strings.ToLower(os.Getenv(EnvEagerCordoning)) == "true", 59 | ExcludeFromExternalLoadBalancers: strings.ToLower(os.Getenv(EnvExcludeFromExternalLoadBalancers)) == "true", 60 | } 61 | if clusterName := os.Getenv(EnvClusterName); len(clusterName) > 0 { 62 | // See "Prerequisites" in https://docs.aws.amazon.com/eks/latest/userguide/autoscaling.html 63 | cfg.AutodiscoveryTags = fmt.Sprintf("k8s.io/cluster-autoscaler/%s=owned,k8s.io/cluster-autoscaler/enabled=true", clusterName) 64 | } else if autodiscoveryTags := os.Getenv(EnvAutodiscoveryTags); len(autodiscoveryTags) > 0 { 65 | cfg.AutodiscoveryTags = autodiscoveryTags 66 | } else if autoScalingGroupNames := os.Getenv(EnvAutoScalingGroupNames); len(autoScalingGroupNames) > 0 { 67 | cfg.AutoScalingGroupNames = strings.Split(strings.TrimSpace(autoScalingGroupNames), ",") 68 | } else { 69 | return fmt.Errorf("environment variables '%s', '%s' or '%s' are not set", EnvAutoScalingGroupNames, EnvClusterName, EnvAutodiscoveryTags) 70 | } 71 | if ignoreDaemonSets := strings.ToLower(os.Getenv(EnvIgnoreDaemonSets)); len(ignoreDaemonSets) == 0 || ignoreDaemonSets == "true" { 72 | cfg.IgnoreDaemonSets = true 73 | } 74 | // if the deprecated EnvDeleteLocalData is set, we need to set EnvDeleteEmptyDirData to its value 75 | if deleteLocalData := strings.ToLower(os.Getenv(EnvDeleteLocalData)); len(deleteLocalData) > 0 { 76 | log.Println("NOTICE: Environment variable '" + EnvDeleteLocalData + "' has been deprecated in favor of '" + EnvDeleteEmptyDirData + "'.") 77 | log.Println("NOTICE: Make sure to update your configuration, as said deprecated environment variable will be removed in a future release.") 78 | if len(os.Getenv(EnvDeleteEmptyDirData)) == 0 { 79 | _ = os.Setenv(EnvDeleteEmptyDirData, deleteLocalData) 80 | } else { 81 | log.Println("WARNING: Both '" + EnvDeleteLocalData + "' and '" + EnvDeleteEmptyDirData + "' are set. 
The former is deprecated, and will be ignored.") 82 | } 83 | } 84 | if deleteEmptyDirData := strings.ToLower(os.Getenv(EnvDeleteEmptyDirData)); len(deleteEmptyDirData) == 0 || deleteEmptyDirData == "true" { 85 | cfg.DeleteEmptyDirData = true 86 | } 87 | if awsRegion := strings.ToLower(os.Getenv(EnvAwsRegion)); len(awsRegion) == 0 { 88 | log.Printf("Environment variable '%s' not specified, defaulting to us-west-2", EnvAwsRegion) 89 | cfg.AwsRegion = "us-west-2" 90 | } else { 91 | cfg.AwsRegion = awsRegion 92 | } 93 | if metricsPort := os.Getenv(EnvMetricsPort); len(metricsPort) == 0 { 94 | log.Printf("Environment variable '%s' not specified, defaulting to 8080", EnvMetricsPort) 95 | cfg.MetricsPort = 8080 96 | } else { 97 | port, err := strconv.Atoi(metricsPort) 98 | if err != nil { 99 | return fmt.Errorf("invalid value for '%s': %s", EnvMetricsPort, err) 100 | } 101 | cfg.MetricsPort = port 102 | } 103 | if metrics := strings.ToLower(os.Getenv(EnvMetrics)); len(metrics) != 0 { 104 | cfg.Metrics = true 105 | } 106 | if executionInterval := os.Getenv(EnvExecutionInterval); len(executionInterval) > 0 { 107 | if interval, err := strconv.Atoi(executionInterval); err != nil { 108 | return fmt.Errorf("environment variable '%s' must be an integer", EnvExecutionInterval) 109 | } else { 110 | cfg.ExecutionInterval = time.Second * time.Duration(interval) 111 | } 112 | } else { 113 | log.Printf("Environment variable '%s' not specified, defaulting to 20 seconds", EnvExecutionInterval) 114 | cfg.ExecutionInterval = time.Second * 20 115 | } 116 | if executionTimeout := os.Getenv(EnvExecutionTimeout); len(executionTimeout) > 0 { 117 | if timeout, err := strconv.Atoi(executionTimeout); err != nil { 118 | return fmt.Errorf("environment variable '%s' must be an integer", EnvExecutionTimeout) 119 | } else { 120 | cfg.ExecutionTimeout = time.Second * time.Duration(timeout) 121 | } 122 | } else { 123 | log.Printf("Environment variable '%s' not specified, defaulting to 900 seconds", EnvExecutionTimeout) 124 | cfg.ExecutionTimeout = time.Second * 900 125 | } 126 | if terminationGracePeriod := os.Getenv(EnvPodTerminationGracePeriod); len(terminationGracePeriod) > 0 { 127 | if gracePeriod, err := strconv.Atoi(terminationGracePeriod); err != nil { 128 | return fmt.Errorf("environment variable '%s' must be an integer", EnvPodTerminationGracePeriod) 129 | } else { 130 | cfg.PodTerminationGracePeriod = gracePeriod 131 | } 132 | } else { 133 | log.Printf("Environment variable '%s' not specified, defaulting to -1 (pod's terminationGracePeriodSeconds)", EnvPodTerminationGracePeriod) 134 | cfg.PodTerminationGracePeriod = -1 135 | } 136 | return nil 137 | } 138 | 139 | // Set sets the application's configuration and is intended to be used for testing purposes. 140 | // See Initialize() for production 141 | func Set(autoScalingGroupNames []string, ignoreDaemonSets, deleteEmptyDirData, eagerCordoning bool, excludeFromExternalLoadBalancers bool) { 142 | cfg = &config{ 143 | AutoScalingGroupNames: autoScalingGroupNames, 144 | IgnoreDaemonSets: ignoreDaemonSets, 145 | DeleteEmptyDirData: deleteEmptyDirData, 146 | EagerCordoning: eagerCordoning, 147 | ExcludeFromExternalLoadBalancers: excludeFromExternalLoadBalancers, 148 | ExecutionInterval: time.Second * 20, 149 | ExecutionTimeout: time.Second * 900, 150 | } 151 | } 152 | 153 | func Get() *config { 154 | if cfg == nil { 155 | log.Println("Config wasn't initialized prior to being called. 
Assuming this is a test.") 156 | Set(nil, true, true, false, false) 157 | } 158 | return cfg 159 | } 160 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # aws-eks-asg-rolling-update-handler 2 | 3 | ![test](https://github.com/TwiN/aws-eks-asg-rolling-update-handler/actions/workflows/test.yml/badge.svg) 4 | [![Go Report Card](https://goreportcard.com/badge/github.com/TwiN/aws-eks-asg-rolling-update-handler)](https://goreportcard.com/report/github.com/TwiN/aws-eks-asg-rolling-update-handler) 5 | [![Docker pulls](https://img.shields.io/docker/pulls/twinproduction/aws-eks-asg-rolling-update-handler.svg)](https://cloud.docker.com/repository/docker/twinproduction/aws-eks-asg-rolling-update-handler) 6 | 7 | This application handles rolling upgrades for AWS ASGs for EKS by replacing outdated nodes with new nodes. 8 | Outdated nodes are defined as nodes whose current configuration does not match their ASG's current launch 9 | template version or launch configuration. 10 | 11 | Inspired by aws-asg-roller, this application only has one purpose: Scale down outdated nodes gracefully. 12 | 13 | Unlike aws-asg-roller, it will not attempt to control the number of nodes at all; it will scale up enough new nodes 14 | to move the pods from the old nodes to the new nodes, and then evict the old nodes. 15 | 16 | It will not adjust the desired size back to its initial desired size like aws-asg-roller does; it simply leaves 17 | everything else up to cluster-autoscaler. 18 | 19 | Note that unlike other solutions, this application actually uses the resource requests to determine how many instances should 20 | be spun up before draining the old nodes. This is much better, because simply using the initial number of instances is 21 | completely useless in the event that the ASG's launch configuration/template update changes the instance type. 22 | 23 | 24 | ## Behavior 25 | 26 | On interval, this application: 27 | 1. Iterates over each ASG discovered by the `CLUSTER_NAME` or `AUTODISCOVERY_TAGS` environment variables, or the ones defined in the `AUTO_SCALING_GROUP_NAMES` environment variable, in that order. 28 | 2. Iterates over each instance of each ASG 29 | 3. Checks if there's any instance with an outdated launch template version 30 | 4.
**If ASG uses MixedInstancesPolicy**, checks if there are any instances with an instance type that isn't part of the list of instance type overrides 31 | 5. Checks if there's any instance with an outdated launch configuration 32 | 6. If any of the conditions defined in steps 3, 4 or 5 are met for any instance, begins the rolling update process for that instance 33 | 34 | The steps of each action are persisted directly on the old nodes via annotations (i.e. when the old node starts rolling out, gets drained, and gets scheduled for termination). 35 | Therefore, this application will not run into any issues if it is restarted, rescheduled or stopped at any point in time. 36 | 37 | 38 | **NOTE**: Ensure that your PodDisruptionBudgets - if you have any - are properly configured. This usually means having at least 1 allowed disruption at all times (i.e. at least `minAvailable: 1` with at least 2 replicas OR `maxUnavailable: 1`) 39 | 40 | 41 | ## Usage 42 | 43 | | Environment variable | Description | Required | Default | 44 | |:-------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------|:------------| 45 | | CLUSTER_NAME | Name of the EKS cluster, used in place of `AUTODISCOVERY_TAGS` and `AUTO_SCALING_GROUP_NAMES`. Checks for `k8s.io/cluster-autoscaler/<CLUSTER_NAME>: owned` and `k8s.io/cluster-autoscaler/enabled: true` tags on the ASG | yes | `""` | 46 | | AUTODISCOVERY_TAGS | Comma-separated key-value string with tags to autodiscover ASGs, used in place of `CLUSTER_NAME` and `AUTO_SCALING_GROUP_NAMES`. | yes | `""` | 47 | | AUTO_SCALING_GROUP_NAMES | Comma-separated list of ASGs; `CLUSTER_NAME` takes priority. | yes | `""` | 48 | | IGNORE_DAEMON_SETS | Whether to ignore DaemonSets when draining the nodes | no | `true` | 49 | | DELETE_EMPTY_DIR_DATA | Whether to delete emptyDir data when draining the nodes | no | `true` | 50 | | AWS_REGION | Self-explanatory | no | `us-west-2` | 51 | | ENVIRONMENT | If set to `dev`, will try to create the Kubernetes client using your local kubeconfig. Any other value will use the in-cluster configuration | no | `""` | 52 | | EXECUTION_INTERVAL | Duration to sleep between each execution in seconds | no | `20` | 53 | | EXECUTION_TIMEOUT | Maximum execution duration before timing out in seconds | no | `900` | 54 | | POD_TERMINATION_GRACE_PERIOD | How long to wait for a pod to terminate in seconds; 0 means "delete immediately"; set to a negative value to use the pod's terminationGracePeriodSeconds. | no | `-1` | 55 | | METRICS_PORT | Port to bind the metrics server to | no | `8080` | 56 | | METRICS | Expose metrics in Prometheus format at `:${METRICS_PORT}/metrics` | no | `""` | 57 | | SLOW_MODE | If enabled, every time a node is terminated during an execution, the current execution will stop rather than continuing to the next ASG | no | `false` | 58 | | EAGER_CORDONING | If enabled, all outdated nodes will get cordoned before any rolling update action. The default mode is to cordon a node just before draining it. See [#41](https://github.com/TwiN/aws-eks-asg-rolling-update-handler/issues/41) for possible consequences of enabling this. | no | `false` | 59 | | EXCLUDE_FROM_EXTERNAL_LOAD_BALANCERS | If enabled, the node label `node.kubernetes.io/exclude-from-external-load-balancers=true` will be added to nodes before draining.
See [#131](https://github.com/TwiN/aws-eks-asg-rolling-update-handler/pull/131) for more information | no | `false` | 60 | 61 | **NOTE:** Only one of `CLUSTER_NAME`, `AUTODISCOVERY_TAGS` or `AUTO_SCALING_GROUP_NAMES` must be set. 62 | 63 | 64 | ## Metrics 65 | 66 | | Metric name | Metric type | Labels | Description | 67 | |--------------------------------------------|-------------|--------------|---------------------------------------| 68 | | rolling_update_handler_node_groups | Gauge | | Node groups managed by the handler | 69 | | rolling_update_handler_outdated_nodes | Gauge | `node_group` | The number of outdated nodes | 70 | | rolling_update_handler_updated_nodes | Gauge | `node_group` | The number of updated nodes | 71 | | rolling_update_handler_scaled_up_nodes | Counter | `node_group` | The total number of nodes scaled up | 72 | | rolling_update_handler_scaled_down_nodes | Counter | `node_group` | The total number of nodes scaled down | 73 | | rolling_update_handler_drained_nodes_total | Counter | `node_group` | The total number of drained nodes | 74 | | rolling_update_handler_errors | Counter | | The total number of errors | 75 | 76 | 77 | ## Permissions 78 | 79 | To function properly, this application requires the following permissions on AWS: 80 | - autoscaling:DescribeAutoScalingGroups 81 | - autoscaling:DescribeAutoScalingInstances 82 | - autoscaling:DescribeLaunchConfigurations 83 | - autoscaling:SetDesiredCapacity 84 | - autoscaling:TerminateInstanceInAutoScalingGroup 85 | - autoscaling:UpdateAutoScalingGroup 86 | - ec2:DescribeLaunchTemplates 87 | - ec2:DescribeInstances 88 | 89 | 90 | ## Deploying on Kubernetes 91 | 92 | ```yaml 93 | apiVersion: v1 94 | kind: ServiceAccount 95 | metadata: 96 | name: aws-eks-asg-rolling-update-handler 97 | namespace: kube-system 98 | labels: 99 | app: aws-eks-asg-rolling-update-handler 100 | --- 101 | apiVersion: rbac.authorization.k8s.io/v1 102 | kind: ClusterRole 103 | metadata: 104 | name: aws-eks-asg-rolling-update-handler 105 | labels: 106 | app: aws-eks-asg-rolling-update-handler 107 | rules: 108 | - apiGroups: 109 | - "*" 110 | resources: 111 | - "*" 112 | verbs: 113 | - get 114 | - list 115 | - watch 116 | - apiGroups: 117 | - "*" 118 | resources: 119 | - nodes 120 | verbs: 121 | - get 122 | - list 123 | - watch 124 | - update 125 | - patch 126 | - apiGroups: 127 | - "*" 128 | resources: 129 | - pods/eviction 130 | verbs: 131 | - get 132 | - list 133 | - create 134 | - apiGroups: 135 | - "*" 136 | resources: 137 | - pods 138 | verbs: 139 | - get 140 | - list 141 | --- 142 | apiVersion: rbac.authorization.k8s.io/v1 143 | kind: ClusterRoleBinding 144 | metadata: 145 | name: aws-eks-asg-rolling-update-handler 146 | labels: 147 | app: aws-eks-asg-rolling-update-handler 148 | roleRef: 149 | kind: ClusterRole 150 | name: aws-eks-asg-rolling-update-handler 151 | apiGroup: rbac.authorization.k8s.io 152 | subjects: 153 | - kind: ServiceAccount 154 | name: aws-eks-asg-rolling-update-handler 155 | namespace: kube-system 156 | --- 157 | apiVersion: apps/v1 158 | kind: Deployment 159 | metadata: 160 | name: aws-eks-asg-rolling-update-handler 161 | namespace: kube-system 162 | labels: 163 | app: aws-eks-asg-rolling-update-handler 164 | spec: 165 | replicas: 1 166 | selector: 167 | matchLabels: 168 | app: aws-eks-asg-rolling-update-handler 169 | template: 170 | metadata: 171 | labels: 172 | app: aws-eks-asg-rolling-update-handler 173 | spec: 174 | automountServiceAccountToken: true 175 | serviceAccountName: aws-eks-asg-rolling-update-handler 176 | 
restartPolicy: Always 177 | dnsPolicy: Default 178 | containers: 179 | - name: aws-eks-asg-rolling-update-handler 180 | image: twinproduction/aws-eks-asg-rolling-update-handler 181 | imagePullPolicy: Always 182 | env: 183 | - name: AUTO_SCALING_GROUP_NAMES 184 | value: "asg-1,asg-2,asg-3" # REPLACE THESE VALUES FOR THE NAMES OF THE ASGs 185 | ``` 186 | 187 | 188 | ## Deploying with Helm 189 | 190 | For the chart associated to this project, see [TwiN/helm-charts](https://github.com/TwiN/helm-charts): 191 | ```sh 192 | helm repo add twin https://twin.github.io/helm-charts 193 | helm repo update 194 | helm install aws-eks-asg-rolling-update-handler twin/aws-eks-asg-rolling-update-handler 195 | ``` 196 | 197 | 198 | ## Developing 199 | 200 | To run the application locally, make sure your local kubeconfig file is configured properly (i.e. you can use kubectl). 201 | 202 | Once you've done that, set the local environment variable `ENVIRONMENT` to `dev` and `AUTO_SCALING_GROUP_NAMES` 203 | to a comma-separated list of auto scaling group names. 204 | 205 | Your local aws credentials must also be valid (i.e. you can use `awscli`) 206 | 207 | 208 | ## Special thanks 209 | 210 | I had originally worked on [deitch/aws-asg-roller](https://github.com/deitch/aws-asg-roller), but due to the numerous conflicts it had with cluster-autoscaler, 211 | I decided to make a project that heavily relies on cluster-autoscaler rather than simply coexist with it, with a much bigger emphasis on maintaining 212 | high availability during rolling upgrades. 213 | 214 | In any case, this project was inspired by aws-asg-roller and the code for comparing launch template versions also comes from there, hence why this special thanks section exists. 215 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25UVaW/CKtUDjefjrs0SPonmDGUVOYP0= 2 | github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= 3 | github.com/MakeNowJust/heredoc v1.0.0 h1:cXCdzVdstXyiTqTvfqk9SDHpKNjxuom+DOlyEeQ4pzQ= 4 | github.com/MakeNowJust/heredoc v1.0.0/go.mod h1:mG5amYoWBHf8vpLOuehzbGGw0EHxpZZ6lCpQ4fNJ8LE= 5 | github.com/TwiN/gocache/v2 v2.4.0 h1:BZ/TqvhipDQE23MFFTjC0MiI1qZ7GEVtSdOFVVXyr18= 6 | github.com/TwiN/gocache/v2 v2.4.0/go.mod h1:Cl1c0qNlQlXzJhTpAARVqpQDSuGDM5RhtzPYAM1x17g= 7 | github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= 8 | github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= 9 | github.com/aws/aws-sdk-go v1.55.7 h1:UJrkFq7es5CShfBwlWAC8DA077vp8PyVbQd3lqLiztE= 10 | github.com/aws/aws-sdk-go v1.55.7/go.mod h1:eRwEWoyTWFMVYVQzKMNHWP5/RV4xIUGMQfXQHfHkpNU= 11 | github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= 12 | github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= 13 | github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= 14 | github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= 15 | github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= 16 | github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= 17 | github.com/chai2010/gettext-go v1.0.2 
h1:1Lwwip6Q2QGsAdl/ZKPCwTe9fe0CjlUbqj5bFNSjIRk= 18 | github.com/chai2010/gettext-go v1.0.2/go.mod h1:y+wnP2cHYaVj19NZhYKAwEMH2CI1gNHeQQ+5AjwawxA= 19 | github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= 20 | github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= 21 | github.com/creack/pty v1.1.18 h1:n56/Zwd5o6whRC5PMGretI4IdRLlmBXYNjScPaBgsbY= 22 | github.com/creack/pty v1.1.18/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4= 23 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 24 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 25 | github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= 26 | github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 27 | github.com/emicklei/go-restful/v3 v3.12.2 h1:DhwDP0vY3k8ZzE0RunuJy8GhNpPL6zqLkDf9B/a0/xU= 28 | github.com/emicklei/go-restful/v3 v3.12.2/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= 29 | github.com/exponent-io/jsonpath v0.0.0-20210407135951-1de76d718b3f h1:Wl78ApPPB2Wvf/TIe2xdyJxTlb6obmF18d8QdkxNDu4= 30 | github.com/exponent-io/jsonpath v0.0.0-20210407135951-1de76d718b3f/go.mod h1:OSYXu++VVOHnXeitef/D8n/6y4QV8uLHSFXX4NeXMGc= 31 | github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM= 32 | github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= 33 | github.com/go-errors/errors v1.4.2 h1:J6MZopCL4uSllY1OfXM374weqZFFItUbrImctkmUxIA= 34 | github.com/go-errors/errors v1.4.2/go.mod h1:sIVyrIiJhuEF+Pj9Ebtd6P/rEYROXFi3BopGUQ5a5Og= 35 | github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= 36 | github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= 37 | github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= 38 | github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= 39 | github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= 40 | github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= 41 | github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k= 42 | github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= 43 | github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= 44 | github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= 45 | github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= 46 | github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= 47 | github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= 48 | github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= 49 | github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= 50 | github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= 51 | github.com/google/gnostic-models v0.7.0 h1:qwTtogB15McXDaNqTZdzPJRHvaVJlAl+HVQnLmJEJxo= 52 | github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7OUGxBlw57miDrQ= 53 | github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= 54 | github.com/google/go-cmp 
v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= 55 | github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= 56 | github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db h1:097atOisP2aRj7vFgYQBbFN4U4JNXUNYpxael3UzMyo= 57 | github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144= 58 | github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= 59 | github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= 60 | github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 h1:JeSE6pjso5THxAzdVpqr6/geYxZytqFMBCOtn/ujyeo= 61 | github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674/go.mod h1:r4w70xmWCQKmi1ONH4KIaBptdivuRPyosB9RmPlGEwA= 62 | github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 h1:+ngKgrYPPJrOjhax5N+uePQ0Fh1Z7PheYoUI/0nzkPA= 63 | github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= 64 | github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= 65 | github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= 66 | github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= 67 | github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= 68 | github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8= 69 | github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U= 70 | github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= 71 | github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= 72 | github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= 73 | github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= 74 | github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= 75 | github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= 76 | github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= 77 | github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= 78 | github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= 79 | github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= 80 | github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= 81 | github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= 82 | github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= 83 | github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= 84 | github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= 85 | github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= 86 | github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= 87 | github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de h1:9TO3cAIGXtEhnIaL+V+BEER86oLrvS+kWobKpbJuye0= 88 | github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de/go.mod h1:zAbeS9B/r2mtpb6U+EI2rYA5OAXxsYw6wTamcNW+zcE= 89 | github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= 90 | github.com/mailru/easyjson 
v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= 91 | github.com/mitchellh/go-wordwrap v1.0.1 h1:TLuKupo69TCn6TQSyGxwI1EblZZEsQ0vMlAFQflz0v0= 92 | github.com/mitchellh/go-wordwrap v1.0.1/go.mod h1:R62XHJLzvMFRBbcrT7m7WgmE1eOyTSsCt+hzestvNj0= 93 | github.com/moby/spdystream v0.5.0 h1:7r0J1Si3QO/kjRitvSLVVFUjxMEb/YLj6S9FF62JBCU= 94 | github.com/moby/spdystream v0.5.0/go.mod h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI= 95 | github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0= 96 | github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y= 97 | github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= 98 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= 99 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= 100 | github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= 101 | github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFdJifH4BDsTlE89Zl93FEloxaWZfGcifgq8= 102 | github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= 103 | github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 h1:n6/2gBQ3RWajuToeY6ZtZTIKv2v7ThUy5KKusIT0yc0= 104 | github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00/go.mod h1:Pm3mSP3c5uWn86xMLZ5Sa7JB9GsEZySvHYXCTK4E9q4= 105 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= 106 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= 107 | github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= 108 | github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= 109 | github.com/onsi/ginkgo/v2 v2.21.0 h1:7rg/4f3rB88pb5obDgNZrNHrQ4e6WpjonchcpuBRnZM= 110 | github.com/onsi/ginkgo/v2 v2.21.0/go.mod h1:7Du3c42kxCUegi0IImZ1wUQzMBVecgIHjR1C+NkhLQo= 111 | github.com/onsi/gomega v1.35.1 h1:Cwbd75ZBPxFSuZ6T+rN/WCb/gOc6YgFBXLlZLhC7Ds4= 112 | github.com/onsi/gomega v1.35.1/go.mod h1:PvZbdDc8J6XJEpDK4HCuRBm8a6Fzp9/DmhC9C7yFlog= 113 | github.com/peterbourgon/diskv v2.0.1+incompatible h1:UBdAOUP5p4RWqPBg048CAvpKN+vxiaj6gdUUzhl4XmI= 114 | github.com/peterbourgon/diskv v2.0.1+incompatible/go.mod h1:uqqh8zWWbv1HBMNONnaR/tNboyR3/BZd58JJSHlUSCU= 115 | github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= 116 | github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 117 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 118 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 119 | github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= 120 | github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= 121 | github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= 122 | github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= 123 | github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= 124 | github.com/prometheus/common v0.66.1/go.mod 
h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= 125 | github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= 126 | github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= 127 | github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= 128 | github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= 129 | github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= 130 | github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= 131 | github.com/sergi/go-diff v1.2.0 h1:XU+rvMAioB0UC3q1MFrIQy4Vo5/4VsRDQQXHsEya6xQ= 132 | github.com/sergi/go-diff v1.2.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= 133 | github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo= 134 | github.com/spf13/cobra v1.9.1/go.mod h1:nDyEzZ8ogv936Cinf6g1RU9MRY64Ir93oCnqb9wxYW0= 135 | github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o= 136 | github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= 137 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 138 | github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= 139 | github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= 140 | github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= 141 | github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= 142 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= 143 | github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 144 | github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 145 | github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 146 | github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= 147 | github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= 148 | github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= 149 | github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= 150 | github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= 151 | github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= 152 | github.com/xlab/treeprint v1.2.0 h1:HzHnuAF1plUN2zGlAFHbSQP2qJ0ZAD3XF5XD7OesXRQ= 153 | github.com/xlab/treeprint v1.2.0/go.mod h1:gj5Gd3gPdKtR1ikdDK6fnFLdmIS0X30kTTuNd/WEJu0= 154 | github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= 155 | github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= 156 | go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= 157 | go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= 158 | go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= 159 | go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= 160 | go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= 161 | go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= 162 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 163 | 
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= 164 | golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= 165 | golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= 166 | golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= 167 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 168 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 169 | golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 170 | golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= 171 | golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= 172 | golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= 173 | golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= 174 | golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= 175 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 176 | golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 177 | golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 178 | golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= 179 | golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= 180 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 181 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 182 | golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 183 | golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 184 | golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= 185 | golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= 186 | golang.org/x/term v0.34.0 h1:O/2T7POpk0ZZ7MAzMeWFSg6S5IpWd/RXDlM9hgM3DR4= 187 | golang.org/x/term v0.34.0/go.mod h1:5jC53AEywhIVebHgPVeg0mj8OD3VO9OzclacVrqpaAw= 188 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 189 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 190 | golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng= 191 | golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU= 192 | golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY= 193 | golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= 194 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 195 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= 196 | golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= 197 | golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= 198 | golang.org/x/tools v0.35.0 h1:mBffYraMEf7aa0sB+NuKnuCy8qI/9Bughn8dC2Gu5r0= 199 | golang.org/x/tools 
v0.35.0/go.mod h1:NKdj5HkL/73byiZSJjqJgKn3ep7KjFkBOkR/Hps3VPw= 200 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 201 | golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 202 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 203 | golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 204 | google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= 205 | google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= 206 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 207 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= 208 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= 209 | gopkg.in/evanphx/json-patch.v4 v4.12.0 h1:n6jtcsulIzXPJaxegRbvFNNrZDjbij7ny3gmSPG+6V4= 210 | gopkg.in/evanphx/json-patch.v4 v4.12.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= 211 | gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= 212 | gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= 213 | gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10= 214 | gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 215 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 216 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 217 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 218 | k8s.io/api v0.34.3 h1:D12sTP257/jSH2vHV2EDYrb16bS7ULlHpdNdNhEw2S4= 219 | k8s.io/api v0.34.3/go.mod h1:PyVQBF886Q5RSQZOim7DybQjAbVs8g7gwJNhGtY5MBk= 220 | k8s.io/apimachinery v0.34.3 h1:/TB+SFEiQvN9HPldtlWOTp0hWbJ+fjU+wkxysf/aQnE= 221 | k8s.io/apimachinery v0.34.3/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw= 222 | k8s.io/cli-runtime v0.34.3 h1:YRyMhiwX0dT9lmG0AtZDaeG33Nkxgt9OlCTZhRXj9SI= 223 | k8s.io/cli-runtime v0.34.3/go.mod h1:GVwL1L5uaGEgM7eGeKjaTG2j3u134JgG4dAI6jQKhMc= 224 | k8s.io/client-go v0.34.3 h1:wtYtpzy/OPNYf7WyNBTj3iUA0XaBHVqhv4Iv3tbrF5A= 225 | k8s.io/client-go v0.34.3/go.mod h1:OxxeYagaP9Kdf78UrKLa3YZixMCfP6bgPwPwNBQBzpM= 226 | k8s.io/component-base v0.34.3 h1:zsEgw6ELqK0XncCQomgO9DpUIzlrYuZYA0Cgo+JWpVk= 227 | k8s.io/component-base v0.34.3/go.mod h1:5iIlD8wPfWE/xSHTRfbjuvUul2WZbI2nOUK65XL0E/c= 228 | k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= 229 | k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= 230 | k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b h1:MloQ9/bdJyIu9lb1PzujOPolHyvO06MXG5TUIj2mNAA= 231 | k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b/go.mod h1:UZ2yyWbFTpuhSbFhv24aGNOdoRdJZgsIObGBUaYVsts= 232 | k8s.io/kubectl v0.34.3 h1:vpM6//153gh5gvsYHXWHVJ4l4xmN5QFwTSmlfd8icm8= 233 | k8s.io/kubectl v0.34.3/go.mod h1:zZQHtIZoUqTP1bAnPzq/3W1jfc0NeOeunFgcswrfg1c= 234 | k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 h1:hwvWFiBzdWw1FhfY1FooPn3kzWuJ8tmbZBHi4zVsl1Y= 235 | k8s.io/utils v0.0.0-20250604170112-4c0f3b243397/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= 236 | sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 h1:gBQPwqORJ8d8/YNZWEjoZs7npUVDpVXUUOFfW6CgAqE= 237 | 
sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= 238 | sigs.k8s.io/kustomize/api v0.20.1 h1:iWP1Ydh3/lmldBnH/S5RXgT98vWYMaTUL1ADcr+Sv7I= 239 | sigs.k8s.io/kustomize/api v0.20.1/go.mod h1:t6hUFxO+Ph0VxIk1sKp1WS0dOjbPCtLJ4p8aADLwqjM= 240 | sigs.k8s.io/kustomize/kyaml v0.20.1 h1:PCMnA2mrVbRP3NIB6v9kYCAc38uvFLVs8j/CD567A78= 241 | sigs.k8s.io/kustomize/kyaml v0.20.1/go.mod h1:0EmkQHRUsJxY8Ug9Niig1pUMSCGHxQ5RklbpV/Ri6po= 242 | sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= 243 | sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= 244 | sigs.k8s.io/structured-merge-diff/v6 v6.3.0 h1:jTijUJbW353oVOd9oTlifJqOGEkUw2jB/fXCbTiQEco= 245 | sigs.k8s.io/structured-merge-diff/v6 v6.3.0/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE= 246 | sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= 247 | sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4= 248 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "log" 7 | "math/rand" 8 | "time" 9 | 10 | "github.com/TwiN/aws-eks-asg-rolling-update-handler/cloud" 11 | "github.com/TwiN/aws-eks-asg-rolling-update-handler/config" 12 | "github.com/TwiN/aws-eks-asg-rolling-update-handler/k8s" 13 | "github.com/TwiN/aws-eks-asg-rolling-update-handler/metrics" 14 | "github.com/aws/aws-sdk-go/aws" 15 | "github.com/aws/aws-sdk-go/service/autoscaling" 16 | "github.com/aws/aws-sdk-go/service/autoscaling/autoscalingiface" 17 | "github.com/aws/aws-sdk-go/service/ec2" 18 | "github.com/aws/aws-sdk-go/service/ec2/ec2iface" 19 | v1 "k8s.io/api/core/v1" 20 | ) 21 | 22 | const ( 23 | MaximumFailedExecutionBeforePanic = 10 // Maximum number of allowed failed executions before panicking 24 | 25 | MaximumAcceptableUpdatedNonReadyToUpdatedReadyNodesRatio = 0.11 // To help with larger clusters 26 | MaximumNumberOfUpdatedNonReadyNodes = 5 // To prevent too many non-ready nodes from being taken into account when calculating resources available in one node 27 | ) 28 | 29 | var ( 30 | ErrTimedOut = errors.New("execution timed out") 31 | 32 | executionFailedCounter = 0 33 | ) 34 | 35 | func init() { 36 | rand.Seed(time.Now().UnixNano()) 37 | } 38 | 39 | func main() { 40 | err := config.Initialize() 41 | if err != nil { 42 | log.Fatalf("Unable to initialize configuration: %s", err.Error()) 43 | } 44 | if config.Get().Metrics { 45 | go metrics.Server.Listen(config.Get().MetricsPort) 46 | } 47 | ec2Service, autoScalingService, err := cloud.GetServices(config.Get().AwsRegion) 48 | if err != nil { 49 | log.Fatalf("Unable to create AWS services: %s", err.Error()) 50 | } 51 | for { 52 | start := time.Now() 53 | if err := run(ec2Service, autoScalingService); err != nil { 54 | log.Printf("Error during execution: %s", err.Error()) 55 | metrics.Server.Errors.Inc() 56 | executionFailedCounter++ 57 | if executionFailedCounter > MaximumFailedExecutionBeforePanic { 58 | panic(fmt.Errorf("execution failed %d times: %v", executionFailedCounter, err)) 59 | } 60 | } else if executionFailedCounter > 0 { 61 | log.Printf("Execution was successful after %d failed attempts, resetting counter to 0", executionFailedCounter) 62 | executionFailedCounter = 0 63 | } 64 | log.Printf("Execution took %dms, sleeping for %s", time.Since(start).Milliseconds(), 
config.Get().ExecutionInterval) 65 | time.Sleep(config.Get().ExecutionInterval) 66 | } 67 | } 68 | 69 | func run(ec2Service ec2iface.EC2API, autoScalingService autoscalingiface.AutoScalingAPI) error { 70 | log.Println("Starting execution") 71 | cfg := config.Get() 72 | client, err := k8s.CreateClientSet() 73 | if err != nil { 74 | return errors.New("unable to create Kubernetes client: " + err.Error()) 75 | } 76 | kubernetesClient := k8s.NewClient(client) 77 | if cfg.Debug { 78 | log.Println("Created Kubernetes Client successfully") 79 | } 80 | 81 | var autoScalingGroups []*autoscaling.Group 82 | if len(cfg.AutodiscoveryTags) > 0 { 83 | autoScalingGroups, err = cloud.DescribeEnabledAutoScalingGroupsByTags(autoScalingService, cfg.AutodiscoveryTags) 84 | } else { 85 | autoScalingGroups, err = cloud.DescribeAutoScalingGroupsByNames(autoScalingService, cfg.AutoScalingGroupNames) 86 | } 87 | if err != nil { 88 | return errors.New("unable to describe AutoScalingGroups: " + err.Error()) 89 | } 90 | if cfg.Debug { 91 | log.Println("Described AutoScalingGroups successfully") 92 | } 93 | return HandleRollingUpgrade(kubernetesClient, ec2Service, autoScalingService, autoScalingGroups) 94 | } 95 | 96 | // HandleRollingUpgrade handles rolling upgrades. 97 | // 98 | // Returns an error if an execution lasts for longer than ExecutionTimeout 99 | func HandleRollingUpgrade(client k8s.ClientAPI, ec2Service ec2iface.EC2API, autoScalingService autoscalingiface.AutoScalingAPI, autoScalingGroups []*autoscaling.Group) error { 100 | metrics.Server.NodeGroups.WithLabelValues().Set(float64(len(autoScalingGroups))) 101 | timeout := make(chan bool, 1) 102 | result := make(chan bool, 1) 103 | go func() { 104 | time.Sleep(config.Get().ExecutionTimeout) 105 | timeout <- true 106 | }() 107 | go func() { 108 | result <- DoHandleRollingUpgrade(client, ec2Service, autoScalingService, autoScalingGroups) 109 | }() 110 | select { 111 | case <-timeout: 112 | return ErrTimedOut 113 | case <-result: 114 | return nil 115 | } 116 | } 117 | 118 | // DoHandleRollingUpgrade handles rolling upgrades by iterating over every single AutoScalingGroups' outdated 119 | // instances 120 | func DoHandleRollingUpgrade(client k8s.ClientAPI, ec2Service ec2iface.EC2API, autoScalingService autoscalingiface.AutoScalingAPI, autoScalingGroups []*autoscaling.Group) bool { 121 | for _, autoScalingGroup := range autoScalingGroups { 122 | outdatedInstances, updatedInstances, err := SeparateOutdatedFromUpdatedInstances(autoScalingGroup, ec2Service) 123 | if err != nil { 124 | metrics.Server.Errors.Inc() 125 | log.Printf("[%s] Skipping because unable to separate outdated instances from updated instances: %v", aws.StringValue(autoScalingGroup.AutoScalingGroupName), err.Error()) 126 | continue 127 | } 128 | metrics.Server.UpdatedNodes.WithLabelValues(aws.StringValue(autoScalingGroup.AutoScalingGroupName)).Set(float64(len(updatedInstances))) 129 | metrics.Server.OutdatedNodes.WithLabelValues(aws.StringValue(autoScalingGroup.AutoScalingGroupName)).Set(float64(len(outdatedInstances))) 130 | if config.Get().Debug { 131 | log.Printf("[%s] outdatedInstances: %v", aws.StringValue(autoScalingGroup.AutoScalingGroupName), outdatedInstances) 132 | log.Printf("[%s] updatedInstances: %v", aws.StringValue(autoScalingGroup.AutoScalingGroupName), updatedInstances) 133 | } 134 | // Get the updated and ready nodes from the list of updated instances 135 | // This will be used to determine if the desired number of updated instances need to scale up or not 136 | // We also use this 
to clean up, if necessary 137 | updatedReadyNodes, numberOfNonReadyUpdatedNodesOrInstances := getReadyNodesAndNumberOfNonReadyNodesOrInstances(client, updatedInstances, autoScalingGroup) 138 | if len(outdatedInstances) == 0 { 139 | log.Printf("[%s] All instances are up to date", aws.StringValue(autoScalingGroup.AutoScalingGroupName)) 140 | continue 141 | } else { 142 | log.Printf("[%s] outdated=%d; updated=%d; updatedAndReady=%d; asgCurrent=%d; asgDesired=%d; asgMax=%d", aws.StringValue(autoScalingGroup.AutoScalingGroupName), len(outdatedInstances), len(updatedInstances), len(updatedReadyNodes), len(autoScalingGroup.Instances), aws.Int64Value(autoScalingGroup.DesiredCapacity), aws.Int64Value(autoScalingGroup.MaxSize)) 143 | } 144 | if int64(len(autoScalingGroup.Instances)) < aws.Int64Value(autoScalingGroup.DesiredCapacity) { 145 | log.Printf("[%s] Skipping because ASG has a desired capacity of %d, but only has %d instances", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.Int64Value(autoScalingGroup.DesiredCapacity), len(autoScalingGroup.Instances)) 146 | continue 147 | } 148 | if !HasAcceptableNumberOfUpdatedNonReadyNodes(numberOfNonReadyUpdatedNodesOrInstances, len(updatedReadyNodes)) { 149 | log.Printf("[%s] ASG has too many non-ready updated nodes/instances (%d), waiting until they become ready", aws.StringValue(autoScalingGroup.AutoScalingGroupName), numberOfNonReadyUpdatedNodesOrInstances) 150 | continue 151 | } 152 | // Shuffle the outdated instances, so that we don't always try to terminate the same instance. 153 | // This is also useful if you want to have more than one aws-eks-asg-rolling-update-handler running 154 | rand.Shuffle(len(outdatedInstances), func(i, j int) { 155 | outdatedInstances[i], outdatedInstances[j] = outdatedInstances[j], outdatedInstances[i] 156 | }) 157 | for _, outdatedInstance := range outdatedInstances { 158 | node, err := client.GetNodeByAutoScalingInstance(outdatedInstance) 159 | if err != nil { 160 | log.Printf("[%s][%s] Skipping because unable to get outdated node from Kubernetes: %v", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId), err.Error()) 161 | continue 162 | } 163 | if config.Get().EagerCordoning { 164 | if !node.Spec.Unschedulable { 165 | // If EagerCordoning is enabled and the node is schedulable, we need to cordon it. 
166 | if err := client.Cordon(node.Name); err != nil { 167 | metrics.Server.Errors.Inc() 168 | log.Printf("[%s][%s] Skipping because ran into error while cordoning node: %v", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId), err.Error()) 169 | continue 170 | } 171 | } 172 | } 173 | minutesSinceStarted, minutesSinceDrained, minutesSinceTerminated := getRollingUpdateTimestampsFromNode(node) 174 | // Check if outdated nodes in k8s have been marked with annotation from aws-eks-asg-rolling-update-handler 175 | if minutesSinceStarted == -1 { 176 | log.Printf("[%s][%s] Starting node rollout process", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId)) 177 | // Annotate the node to persist the fact that the rolling update process has begun 178 | err := k8s.AnnotateNodeByAutoScalingInstance(client, outdatedInstance, k8s.AnnotationRollingUpdateStartedTimestamp, time.Now().Format(time.RFC3339)) 179 | if err != nil { 180 | log.Printf("[%s][%s] Skipping because unable to annotate node: %v", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId), err.Error()) 181 | continue 182 | } 183 | } else { 184 | log.Printf("[%s][%s] Node already started rollout process", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId)) 185 | // check if existing updatedInstances have the capacity to support what's inside this node 186 | hasEnoughResources := k8s.CheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode(client, node, updatedReadyNodes) 187 | if hasEnoughResources { 188 | log.Printf("[%s][%s] Updated nodes have enough resources available", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId)) 189 | if minutesSinceDrained == -1 { 190 | if config.Get().ExcludeFromExternalLoadBalancers { 191 | log.Printf("[%s][%s] Label node to exclude from external load balancers", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId)) 192 | k8s.LabelNodeByAutoScalingInstance(client, outdatedInstance, k8s.LabelExcludeFromExternalLoadBalancers, "true") 193 | } 194 | log.Printf("[%s][%s] Draining node", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId)) 195 | err := client.Drain(node.Name, config.Get().IgnoreDaemonSets, config.Get().DeleteEmptyDirData, config.Get().PodTerminationGracePeriod) 196 | if err != nil { 197 | metrics.Server.Errors.Inc() 198 | log.Printf("[%s][%s] Skipping because ran into error while draining node: %v", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId), err.Error()) 199 | continue 200 | } else { 201 | metrics.Server.DrainedNodes.WithLabelValues(aws.StringValue(autoScalingGroup.AutoScalingGroupName)).Inc() 202 | // Only annotate if no error was encountered 203 | _ = k8s.AnnotateNodeByAutoScalingInstance(client, outdatedInstance, k8s.AnnotationRollingUpdateDrainedTimestamp, time.Now().Format(time.RFC3339)) 204 | } 205 | } else { 206 | log.Printf("[%s][%s] Node has already been drained %d minutes ago, skipping", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId), minutesSinceDrained) 207 | } 208 | if minutesSinceTerminated == -1 { 209 | // Terminate node 210 | log.Printf("[%s][%s] Terminating node", aws.StringValue(autoScalingGroup.AutoScalingGroupName), 
aws.StringValue(outdatedInstance.InstanceId)) 211 | shouldDecrementDesiredCapacity := aws.Int64Value(autoScalingGroup.DesiredCapacity) != aws.Int64Value(autoScalingGroup.MinSize) 212 | err = cloud.TerminateEc2Instance(autoScalingService, outdatedInstance, shouldDecrementDesiredCapacity) 213 | if err != nil { 214 | metrics.Server.Errors.Inc() 215 | log.Printf("[%s][%s] Ran into error while terminating node: %v", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId), err.Error()) 216 | continue 217 | } else { 218 | metrics.Server.ScaledDownNodes.WithLabelValues(aws.StringValue(autoScalingGroup.AutoScalingGroupName)).Inc() 219 | // Only annotate if no error was encountered 220 | _ = k8s.AnnotateNodeByAutoScalingInstance(client, outdatedInstance, k8s.AnnotationRollingUpdateTerminatedTimestamp, time.Now().Format(time.RFC3339)) 221 | } 222 | } else { 223 | log.Printf("[%s][%s] Node is already in the process of being terminated since %d minutes ago, skipping", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId), minutesSinceTerminated) 224 | // TODO: check if minutesSinceTerminated > 10. If that happens, then there's clearly a problem, so we should do something about it 225 | // The node has already been terminated, there's nothing to do here, continue to the next one 226 | continue 227 | } 228 | // If this code is reached, it means that the current node has been successfully drained and 229 | // scheduled for termination. 230 | // As a result, we return here to make sure that multiple old instances didn't use the same updated 231 | // instances to calculate resources available 232 | log.Printf("[%s][%s] Node has been drained and scheduled for termination successfully", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId)) 233 | if config.Get().SlowMode { 234 | // If SlowMode is enabled, we'll return after draining a node and wait for the next execution 235 | return true 236 | } 237 | // Move on to the next ASG 238 | break 239 | } else { 240 | // Don't increase the ASG if the node has already been drained or scheduled for termination 241 | if minutesSinceDrained != -1 || minutesSinceTerminated != -1 { 242 | continue 243 | } 244 | log.Printf("[%s][%s] Updated nodes do not have enough resources available, increasing desired count by 1", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId)) 245 | err := cloud.IncrementAutoScalingGroupDesiredCount(autoScalingService, aws.StringValue(autoScalingGroup.AutoScalingGroupName)) 246 | if err != nil { 247 | log.Printf("[%s][%s] Unable to increase ASG desired size: %v", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId), err.Error()) 248 | log.Printf("[%s][%s] Skipping", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId)) 249 | continue 250 | } else { 251 | metrics.Server.ScaledUpNodes.WithLabelValues(aws.StringValue(autoScalingGroup.AutoScalingGroupName)).Inc() 252 | // ASG was scaled up already, stop iterating over outdated instances in current ASG so we can 253 | // move on to the next ASG 254 | break 255 | } 256 | } 257 | } 258 | } 259 | } 260 | return true 261 | } 262 | 263 | func getReadyNodesAndNumberOfNonReadyNodesOrInstances(client k8s.ClientAPI, updatedInstances []*autoscaling.Instance, autoScalingGroup *autoscaling.Group) ([]*v1.Node, int) { 264 | var 
updatedReadyNodes []*v1.Node 265 | numberOfNonReadyNodesOrInstances := 0 266 | for _, updatedInstance := range updatedInstances { 267 | if aws.StringValue(updatedInstance.LifecycleState) != "InService" { 268 | numberOfNonReadyNodesOrInstances++ 269 | log.Printf("[%s][%s] Skipping because instance is not in LifecycleState 'InService', but is in '%s' instead", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(updatedInstance.InstanceId), aws.StringValue(updatedInstance.LifecycleState)) 270 | continue 271 | } 272 | updatedNode, err := client.GetNodeByAutoScalingInstance(updatedInstance) 273 | if err != nil { 274 | numberOfNonReadyNodesOrInstances++ 275 | log.Printf("[%s][%s] Skipping because unable to get updated node from Kubernetes: %v", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(updatedInstance.InstanceId), err.Error()) 276 | continue 277 | } 278 | // Check if Kubelet is ready to accept pods on that node 279 | conditions := updatedNode.Status.Conditions 280 | if len(conditions) == 0 { 281 | log.Printf("[%s][%s] For some magical reason, %s doesn't have any conditions, therefore it is impossible to determine whether the node is ready to accept new pods or not", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(updatedInstance.InstanceId), updatedNode.Name) 282 | numberOfNonReadyNodesOrInstances++ 283 | } else if kubeletCondition := conditions[len(conditions)-1]; kubeletCondition.Type == v1.NodeReady { 284 | if kubeletCondition.Status == v1.ConditionTrue { 285 | updatedReadyNodes = append(updatedReadyNodes, updatedNode) 286 | } else { 287 | log.Printf("[%s][%s] Skipping because kubelet condition %s is reporting as %s", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(updatedInstance.InstanceId), kubeletCondition.Type, kubeletCondition.Status) 288 | numberOfNonReadyNodesOrInstances++ 289 | } 290 | } else { 291 | log.Printf("[%s][%s] Skipping because expected kubelet on node to have condition %s with value %s, but it didn't", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(updatedInstance.InstanceId), v1.NodeReady, v1.ConditionTrue) 292 | numberOfNonReadyNodesOrInstances++ 293 | } 294 | 295 | // Cleaning up 296 | // This is an edge case, but it may happen that an ASG's launch template is modified, creating a new 297 | // template version, but then that new template version is deleted before the node has been terminated. 298 | // To make it even more of an edge case, the draining function would've had to time out, meaning that 299 | // the termination would be skipped until the next run. 300 | // This would cause an instance to be considered as updated, even though it has been drained therefore 301 | // cordoned (NoSchedule). 
302 | if startedAtValue, ok := updatedNode.Annotations[k8s.AnnotationRollingUpdateStartedTimestamp]; ok { 303 | // An updated node should never have k8s.AnnotationRollingUpdateStartedTimestamp, so this indicates that 304 | // at one point, this node was considered old compared to the ASG's current LT/LC 305 | // First, check if there's a NoSchedule taint 306 | for i, taint := range updatedNode.Spec.Taints { 307 | if taint.Effect == v1.TaintEffectNoSchedule { 308 | // There's a taint, but we need to make sure it was added after the rolling update started 309 | startedAt, err := time.Parse(time.RFC3339, startedAtValue) 310 | // If the annotation can't be parsed OR the taint was added after the rolling update started, 311 | // we need to remove that taint 312 | if err != nil || taint.TimeAdded.Time.After(startedAt) { 313 | log.Printf("[%s] EDGE-0001: Attempting to remove taint from updated node %s", aws.StringValue(autoScalingGroup.AutoScalingGroupName), updatedNode.Name) 314 | // Remove the taint 315 | updatedNode.Spec.Taints = append(updatedNode.Spec.Taints[:i], updatedNode.Spec.Taints[i+1:]...) 316 | // Remove the annotation 317 | delete(updatedNode.Annotations, k8s.AnnotationRollingUpdateStartedTimestamp) 318 | // Update the node 319 | err = client.UpdateNode(updatedNode) 320 | if err != nil { 321 | log.Printf("[%s] EDGE-0001: Unable to update tainted node %s: %v", aws.StringValue(autoScalingGroup.AutoScalingGroupName), updatedNode.Name, err.Error()) 322 | } 323 | break 324 | } 325 | } 326 | } 327 | } 328 | } 329 | return updatedReadyNodes, numberOfNonReadyNodesOrInstances 330 | } 331 | 332 | func getRollingUpdateTimestampsFromNode(node *v1.Node) (minutesSinceStarted, minutesSinceDrained, minutesSinceTerminated int) { 333 | rollingUpdateStartedAt, ok := node.Annotations[k8s.AnnotationRollingUpdateStartedTimestamp] 334 | if ok { 335 | startedAt, err := time.Parse(time.RFC3339, rollingUpdateStartedAt) 336 | if err == nil { 337 | minutesSinceStarted = int(time.Since(startedAt).Minutes()) 338 | } 339 | } else { 340 | minutesSinceStarted = -1 341 | } 342 | drainedAtValue, ok := node.Annotations[k8s.AnnotationRollingUpdateDrainedTimestamp] 343 | if ok { 344 | drainedAt, err := time.Parse(time.RFC3339, drainedAtValue) 345 | if err == nil { 346 | minutesSinceDrained = int(time.Since(drainedAt).Minutes()) 347 | } 348 | } else { 349 | minutesSinceDrained = -1 350 | } 351 | terminatedAtValue, ok := node.Annotations[k8s.AnnotationRollingUpdateTerminatedTimestamp] 352 | if ok { 353 | terminatedAt, err := time.Parse(time.RFC3339, terminatedAtValue) 354 | if err == nil { 355 | minutesSinceTerminated = int(time.Since(terminatedAt).Minutes()) 356 | } 357 | } else { 358 | minutesSinceTerminated = -1 359 | } 360 | return 361 | } 362 | 363 | // SeparateOutdatedFromUpdatedInstances splits a list of instances into a list of outdated 364 | // instances and a list of updated instances. 
365 | func SeparateOutdatedFromUpdatedInstances(asg *autoscaling.Group, ec2Svc ec2iface.EC2API) ([]*autoscaling.Instance, []*autoscaling.Instance, error) { 366 | if config.Get().Debug { 367 | log.Printf("[%s] Separating outdated from updated instances", aws.StringValue(asg.AutoScalingGroupName)) 368 | } 369 | targetLaunchConfiguration := asg.LaunchConfigurationName 370 | targetLaunchTemplate := asg.LaunchTemplate 371 | var targetLaunchTemplateOverrides []*autoscaling.LaunchTemplateOverrides 372 | if targetLaunchTemplate == nil && asg.MixedInstancesPolicy != nil && asg.MixedInstancesPolicy.LaunchTemplate != nil { 373 | if config.Get().Debug { 374 | log.Printf("[%s] using mixed instances policy launch template", aws.StringValue(asg.AutoScalingGroupName)) 375 | } 376 | targetLaunchTemplate = asg.MixedInstancesPolicy.LaunchTemplate.LaunchTemplateSpecification 377 | targetLaunchTemplateOverrides = asg.MixedInstancesPolicy.LaunchTemplate.Overrides 378 | } 379 | if targetLaunchTemplate != nil { 380 | return SeparateOutdatedFromUpdatedInstancesUsingLaunchTemplate(aws.StringValue(asg.AutoScalingGroupName), targetLaunchTemplate, targetLaunchTemplateOverrides, asg.Instances, ec2Svc) 381 | } else if targetLaunchConfiguration != nil { 382 | return SeparateOutdatedFromUpdatedInstancesUsingLaunchConfiguration(targetLaunchConfiguration, asg.Instances) 383 | } 384 | return nil, nil, errors.New("AutoScalingGroup has neither launch template nor launch configuration") 385 | } 386 | 387 | // SeparateOutdatedFromUpdatedInstancesUsingLaunchTemplate separates a list of instances into a list of outdated 388 | // instances and a list of updated instances. 389 | func SeparateOutdatedFromUpdatedInstancesUsingLaunchTemplate(asgName string, targetLaunchTemplate *autoscaling.LaunchTemplateSpecification, overrides []*autoscaling.LaunchTemplateOverrides, instances []*autoscaling.Instance, ec2Svc ec2iface.EC2API) ([]*autoscaling.Instance, []*autoscaling.Instance, error) { 390 | var ( 391 | oldInstances []*autoscaling.Instance 392 | newInstances []*autoscaling.Instance 393 | targetTemplate *ec2.LaunchTemplate 394 | err error 395 | ) 396 | switch { 397 | case targetLaunchTemplate.LaunchTemplateId != nil && aws.StringValue(targetLaunchTemplate.LaunchTemplateId) != "": 398 | if targetTemplate, err = cloud.DescribeLaunchTemplateByID(ec2Svc, aws.StringValue(targetLaunchTemplate.LaunchTemplateId)); err != nil { 399 | return nil, nil, fmt.Errorf("error retrieving information about launch template %s: %v", aws.StringValue(targetLaunchTemplate.LaunchTemplateId), err) 400 | } 401 | case targetLaunchTemplate.LaunchTemplateName != nil && aws.StringValue(targetLaunchTemplate.LaunchTemplateName) != "": 402 | if targetTemplate, err = cloud.DescribeLaunchTemplateByName(ec2Svc, aws.StringValue(targetLaunchTemplate.LaunchTemplateName)); err != nil { 403 | return nil, nil, fmt.Errorf("error retrieving information about launch template name %s: %v", aws.StringValue(targetLaunchTemplate.LaunchTemplateName), err) 404 | } 405 | default: 406 | return nil, nil, fmt.Errorf("invalid launch template name") 407 | } 408 | // extra safety check 409 | if targetTemplate == nil { 410 | return nil, nil, fmt.Errorf("no template found") 411 | } 412 | // now we can loop through each node and compare 413 | for _, instance := range instances { 414 | if isInstanceTypePartOfLaunchTemplateOverrides(overrides, instance.InstanceType) { 415 | var ( 416 | overrideTargetTemplate *ec2.LaunchTemplate 417 | overrideTargetLaunchTemplate 
*autoscaling.LaunchTemplateSpecification 418 | ) 419 | for _, override := range overrides { 420 | if aws.StringValue(override.InstanceType) == aws.StringValue(instance.InstanceType) && override.LaunchTemplateSpecification != nil { 421 | if overrideTargetTemplate, err = cloud.DescribeLaunchTemplateByName(ec2Svc, aws.StringValue(override.LaunchTemplateSpecification.LaunchTemplateName)); err != nil { 422 | log.Printf("[%s][%s] Unable to retrieve information for launch template with name '%s': %v", asgName, aws.StringValue(instance.InstanceId), aws.StringValue(override.LaunchTemplateSpecification.LaunchTemplateName), err) 423 | } 424 | overrideTargetLaunchTemplate = override.LaunchTemplateSpecification 425 | } 426 | } 427 | if overrideTargetTemplate != nil && overrideTargetLaunchTemplate != nil { 428 | targetTemplate = overrideTargetTemplate 429 | targetLaunchTemplate = overrideTargetLaunchTemplate 430 | } 431 | } 432 | switch { 433 | case instance.LaunchTemplate == nil: 434 | fallthrough 435 | case aws.StringValue(instance.LaunchTemplate.LaunchTemplateName) != aws.StringValue(targetLaunchTemplate.LaunchTemplateName): 436 | fallthrough 437 | case aws.StringValue(instance.LaunchTemplate.LaunchTemplateId) != aws.StringValue(targetLaunchTemplate.LaunchTemplateId): 438 | fallthrough 439 | case !compareLaunchTemplateVersions(targetTemplate, targetLaunchTemplate, instance.LaunchTemplate): 440 | fallthrough 441 | case overrides != nil && len(overrides) > 0 && !isInstanceTypePartOfLaunchTemplateOverrides(overrides, instance.InstanceType): 442 | oldInstances = append(oldInstances, instance) 443 | default: 444 | newInstances = append(newInstances, instance) 445 | } 446 | } 447 | return oldInstances, newInstances, nil 448 | } 449 | 450 | func isInstanceTypePartOfLaunchTemplateOverrides(overrides []*autoscaling.LaunchTemplateOverrides, instanceType *string) bool { 451 | for _, override := range overrides { 452 | if aws.StringValue(override.InstanceType) == aws.StringValue(instanceType) { 453 | return true 454 | } 455 | } 456 | return false 457 | } 458 | 459 | // SeparateOutdatedFromUpdatedInstancesUsingLaunchConfiguration separates a list of instances into a list of outdated 460 | // instances and a list of updated instances. 
461 | func SeparateOutdatedFromUpdatedInstancesUsingLaunchConfiguration(targetLaunchConfigurationName *string, instances []*autoscaling.Instance) ([]*autoscaling.Instance, []*autoscaling.Instance, error) { 462 | var ( 463 | oldInstances []*autoscaling.Instance 464 | newInstances []*autoscaling.Instance 465 | ) 466 | for _, i := range instances { 467 | if i.LaunchConfigurationName != nil && *i.LaunchConfigurationName == *targetLaunchConfigurationName { 468 | newInstances = append(newInstances, i) 469 | } else { 470 | oldInstances = append(oldInstances, i) 471 | } 472 | } 473 | return oldInstances, newInstances, nil 474 | } 475 | 476 | // compareLaunchTemplateVersions compare two launch template versions and see if they match 477 | // can handle `$Latest` and `$Default` by resolving to the actual version in use 478 | func compareLaunchTemplateVersions(targetTemplate *ec2.LaunchTemplate, lt1, lt2 *autoscaling.LaunchTemplateSpecification) bool { 479 | // if both versions do not start with `$`, then just compare 480 | if lt1 == nil && lt2 == nil { 481 | return true 482 | } 483 | if (lt1 == nil && lt2 != nil) || (lt1 != nil && lt2 == nil) { 484 | return false 485 | } 486 | if lt1.Version == nil && lt2.Version == nil { 487 | return true 488 | } 489 | if (lt1.Version == nil && lt2.Version != nil) || (lt1.Version != nil && lt2.Version == nil) { 490 | return false 491 | } 492 | // if either version starts with `$`, then resolve to actual version from LaunchTemplate 493 | var lt1version, lt2version string 494 | switch aws.StringValue(lt1.Version) { 495 | case "$Default": 496 | lt1version = fmt.Sprintf("%d", aws.Int64Value(targetTemplate.DefaultVersionNumber)) 497 | case "$Latest": 498 | lt1version = fmt.Sprintf("%d", aws.Int64Value(targetTemplate.LatestVersionNumber)) 499 | default: 500 | lt1version = aws.StringValue(lt1.Version) 501 | } 502 | switch aws.StringValue(lt2.Version) { 503 | case "$Default": 504 | lt2version = fmt.Sprintf("%d", aws.Int64Value(targetTemplate.DefaultVersionNumber)) 505 | case "$Latest": 506 | lt2version = fmt.Sprintf("%d", aws.Int64Value(targetTemplate.LatestVersionNumber)) 507 | default: 508 | lt2version = aws.StringValue(lt2.Version) 509 | } 510 | return lt1version == lt2version 511 | } 512 | 513 | // HasAcceptableNumberOfUpdatedNonReadyNodes checks if there's a sufficient amount of updated 514 | // and ready nodes to move on to the next step (drain & terminate an outdated node) for a number of non-ready nodes. 515 | // 516 | // The logic behind this is that the more nodes are ready and updated, the higher the confidence we have that the 517 | // upgrade is going well, so we can ramp things up faster the deeper we are in the upgrade process. 
518 | func HasAcceptableNumberOfUpdatedNonReadyNodes(numberOfUpdatedNonReadyNodes, numberOfUpdatedReadyNodes int) bool { 519 | if numberOfUpdatedNonReadyNodes == 0 { 520 | return true // all updated nodes are ready, so we can proceed 521 | } 522 | if numberOfUpdatedReadyNodes == 0 { 523 | return false // there are no ready nodes AND there are non-ready nodes (we know this because of the previous check), so we cannot proceed 524 | } 525 | if numberOfUpdatedNonReadyNodes > MaximumNumberOfUpdatedNonReadyNodes { 526 | return false // there are too many non-ready nodes, so we cannot proceed 527 | } 528 | return float64(numberOfUpdatedNonReadyNodes)/float64(numberOfUpdatedReadyNodes) <= MaximumAcceptableUpdatedNonReadyToUpdatedReadyNodesRatio 529 | } 530 | -------------------------------------------------------------------------------- /main_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/TwiN/aws-eks-asg-rolling-update-handler/cloudtest" 7 | "github.com/TwiN/aws-eks-asg-rolling-update-handler/config" 8 | "github.com/TwiN/aws-eks-asg-rolling-update-handler/k8s" 9 | "github.com/TwiN/aws-eks-asg-rolling-update-handler/k8stest" 10 | "github.com/aws/aws-sdk-go/aws" 11 | "github.com/aws/aws-sdk-go/service/autoscaling" 12 | "github.com/aws/aws-sdk-go/service/ec2" 13 | v1 "k8s.io/api/core/v1" 14 | ) 15 | 16 | func TestSeparateOutdatedFromUpdatedInstancesUsingLaunchConfiguration_whenInstanceIsOutdated(t *testing.T) { 17 | instance := cloudtest.CreateTestAutoScalingInstance("instance", "v1", nil, "InService") 18 | outdated, updated, err := SeparateOutdatedFromUpdatedInstancesUsingLaunchConfiguration(aws.String("v2"), []*autoscaling.Instance{instance}) 19 | if err != nil { 20 | t.Fatal("Shouldn't have returned an error, but returned", err) 21 | } 22 | if len(outdated) != 1 || len(updated) != 0 { 23 | t.Error("Instance should've been outdated") 24 | } 25 | } 26 | 27 | func TestSeparateOutdatedFromUpdatedInstancesUsingLaunchConfiguration_whenInstanceIsUpdated(t *testing.T) { 28 | instance := cloudtest.CreateTestAutoScalingInstance("instance", "v1", nil, "InService") 29 | outdated, updated, err := SeparateOutdatedFromUpdatedInstancesUsingLaunchConfiguration(aws.String("v1"), []*autoscaling.Instance{instance}) 30 | if err != nil { 31 | t.Fatal("Shouldn't have returned an error, but returned", err) 32 | } 33 | if len(outdated) != 0 || len(updated) != 1 { 34 | t.Error("Instance should've been updated") 35 | } 36 | } 37 | 38 | func TestSeparateOutdatedFromUpdatedInstancesUsingLaunchConfiguration_whenOneInstanceIsUpdatedAndTwoInstancesAreOutdated(t *testing.T) { 39 | firstInstance := cloudtest.CreateTestAutoScalingInstance("old-1", "v1", nil, "InService") 40 | secondInstance := cloudtest.CreateTestAutoScalingInstance("old-2", "v1", nil, "InService") 41 | thirdInstance := cloudtest.CreateTestAutoScalingInstance("new", "v2", nil, "InService") 42 | outdated, updated, err := SeparateOutdatedFromUpdatedInstancesUsingLaunchConfiguration(aws.String("v2"), []*autoscaling.Instance{firstInstance, secondInstance, thirdInstance}) 43 | if err != nil { 44 | t.Fatal("Shouldn't have returned an error, but returned", err) 45 | } 46 | if len(outdated) != 2 { 47 | t.Error("2 instances should've been outdated") 48 | } 49 | if len(updated) != 1 { 50 | t.Error("1 instance should've been outdated") 51 | } 52 | } 53 | 54 | func TestSeparateOutdatedFromUpdatedInstancesUsingLaunchTemplate_whenInstanceIsOutdated(t *testing.T) { 55 
| outdatedLaunchTemplate := &autoscaling.LaunchTemplateSpecification{ 56 | LaunchTemplateId: aws.String("id"), 57 | LaunchTemplateName: aws.String("name"), 58 | Version: aws.String("v1"), 59 | } 60 | updatedLaunchTemplate := &autoscaling.LaunchTemplateSpecification{ 61 | LaunchTemplateId: aws.String("id"), 62 | LaunchTemplateName: aws.String("name"), 63 | Version: aws.String("v2"), 64 | } 65 | updatedEc2LaunchTemplate := &ec2.LaunchTemplate{ 66 | DefaultVersionNumber: aws.Int64(1), 67 | LatestVersionNumber: aws.Int64(10), 68 | LaunchTemplateId: updatedLaunchTemplate.LaunchTemplateId, 69 | LaunchTemplateName: updatedLaunchTemplate.LaunchTemplateName, 70 | } 71 | instance := cloudtest.CreateTestAutoScalingInstance("instance", "", outdatedLaunchTemplate, "InService") 72 | outdated, updated, err := SeparateOutdatedFromUpdatedInstancesUsingLaunchTemplate("test", updatedLaunchTemplate, nil, []*autoscaling.Instance{instance}, cloudtest.NewMockEC2Service([]*ec2.LaunchTemplate{updatedEc2LaunchTemplate})) 73 | if err != nil { 74 | t.Fatal("Shouldn't have returned an error, but returned:", err) 75 | } 76 | if len(outdated) != 1 || len(updated) != 0 { 77 | t.Error("Instance should've been outdated") 78 | } 79 | } 80 | 81 | func TestSeparateOutdatedFromUpdatedInstancesUsingLaunchTemplate_whenInstanceIsOutdatedDueToMixedInstancesPolicyInstanceTypeGettingRemoved(t *testing.T) { 82 | launchTemplate := &autoscaling.LaunchTemplateSpecification{ 83 | LaunchTemplateId: aws.String("id"), 84 | LaunchTemplateName: aws.String("name"), 85 | Version: aws.String("v1"), 86 | } 87 | updatedEc2LaunchTemplate := &ec2.LaunchTemplate{ 88 | DefaultVersionNumber: aws.Int64(1), 89 | LatestVersionNumber: aws.Int64(10), 90 | LaunchTemplateId: launchTemplate.LaunchTemplateId, 91 | LaunchTemplateName: launchTemplate.LaunchTemplateName, 92 | } 93 | instance := cloudtest.CreateTestAutoScalingInstance("instance", "", launchTemplate, "InService") 94 | instance.SetInstanceType("c5n.2xlarge") 95 | overrides := []*autoscaling.LaunchTemplateOverrides{ 96 | {InstanceType: aws.String("c5.2xlarge")}, 97 | {InstanceType: aws.String("c5d.2xlarge")}, 98 | } 99 | // Notice: The instance's instance type isn't part of the overrides. 
100 | outdated, updated, err := SeparateOutdatedFromUpdatedInstancesUsingLaunchTemplate("test", launchTemplate, overrides, []*autoscaling.Instance{instance}, cloudtest.NewMockEC2Service([]*ec2.LaunchTemplate{updatedEc2LaunchTemplate})) 101 | if err != nil { 102 | t.Fatal("Shouldn't have returned an error, but returned:", err) 103 | } 104 | if len(outdated) != 1 || len(updated) != 0 { 105 | t.Error("Instance should've been outdated") 106 | } 107 | } 108 | 109 | func TestSeparateOutdatedFromUpdatedInstancesUsingLaunchTemplate_whenInstanceIsUpdated(t *testing.T) { 110 | updatedLaunchTemplate := &autoscaling.LaunchTemplateSpecification{ 111 | LaunchTemplateId: aws.String("id"), 112 | LaunchTemplateName: aws.String("name"), 113 | Version: aws.String("v1"), 114 | } 115 | updatedEc2LaunchTemplate := &ec2.LaunchTemplate{ 116 | DefaultVersionNumber: aws.Int64(1), 117 | LatestVersionNumber: aws.Int64(10), 118 | LaunchTemplateId: updatedLaunchTemplate.LaunchTemplateId, 119 | LaunchTemplateName: updatedLaunchTemplate.LaunchTemplateName, 120 | } 121 | instance := cloudtest.CreateTestAutoScalingInstance("instance", "", updatedLaunchTemplate, "InService") 122 | outdated, updated, err := SeparateOutdatedFromUpdatedInstancesUsingLaunchTemplate("test", updatedLaunchTemplate, nil, []*autoscaling.Instance{instance}, cloudtest.NewMockEC2Service([]*ec2.LaunchTemplate{updatedEc2LaunchTemplate})) 123 | if err != nil { 124 | t.Fatal("Shouldn't have returned an error, but returned:", err) 125 | } 126 | if len(outdated) != 0 || len(updated) != 1 { 127 | t.Error("Instance should've been updated") 128 | } 129 | } 130 | 131 | func TestSeparateOutdatedFromUpdatedInstancesUsingLaunchTemplate_whenInstanceWithMixedInstancesPolicyIsUpdated(t *testing.T) { 132 | launchTemplate := &autoscaling.LaunchTemplateSpecification{ 133 | LaunchTemplateId: aws.String("id"), 134 | LaunchTemplateName: aws.String("name"), 135 | Version: aws.String("v1"), 136 | } 137 | updatedEc2LaunchTemplate := &ec2.LaunchTemplate{ 138 | DefaultVersionNumber: aws.Int64(1), 139 | LatestVersionNumber: aws.Int64(10), 140 | LaunchTemplateId: launchTemplate.LaunchTemplateId, 141 | LaunchTemplateName: launchTemplate.LaunchTemplateName, 142 | } 143 | instance := cloudtest.CreateTestAutoScalingInstance("instance", "", launchTemplate, "InService") 144 | instance.SetInstanceType("c5d.2xlarge") 145 | overrides := []*autoscaling.LaunchTemplateOverrides{ 146 | {InstanceType: aws.String("c5.2xlarge")}, 147 | {InstanceType: aws.String("c5d.2xlarge")}, 148 | } 149 | outdated, updated, err := SeparateOutdatedFromUpdatedInstancesUsingLaunchTemplate("test", launchTemplate, overrides, []*autoscaling.Instance{instance}, cloudtest.NewMockEC2Service([]*ec2.LaunchTemplate{updatedEc2LaunchTemplate})) 150 | if err != nil { 151 | t.Fatal("Shouldn't have returned an error, but returned:", err) 152 | } 153 | if len(outdated) != 0 || len(updated) != 1 { 154 | t.Error("Instance should've been updated") 155 | } 156 | } 157 | 158 | func TestSeparateOutdatedFromUpdatedInstancesUsingLaunchTemplate_whenInstanceWithMixedInstancesPolicyAndOverrideIsUpdated(t *testing.T) { 159 | launchTemplate := &autoscaling.LaunchTemplateSpecification{ 160 | LaunchTemplateId: aws.String("id"), 161 | LaunchTemplateName: aws.String("name"), 162 | Version: aws.String("v1"), 163 | } 164 | updatedEc2LaunchTemplate := &ec2.LaunchTemplate{ 165 | DefaultVersionNumber: aws.Int64(1), 166 | LatestVersionNumber: aws.Int64(10), 167 | LaunchTemplateId: launchTemplate.LaunchTemplateId, 168 | LaunchTemplateName: 
launchTemplate.LaunchTemplateName, 169 | } 170 | instance := cloudtest.CreateTestAutoScalingInstance("instance", "", launchTemplate, "InService") 171 | instance.SetInstanceType("c5d.2xlarge") 172 | instanceWithLaunchTemplateOverride := cloudtest.CreateTestAutoScalingInstance("instance", "", launchTemplate, "InService") 173 | instanceWithLaunchTemplateOverride.SetInstanceType("c5d.2xlarge") 174 | overrides := []*autoscaling.LaunchTemplateOverrides{ 175 | {InstanceType: aws.String("c5.2xlarge"), LaunchTemplateSpecification: launchTemplate}, 176 | {InstanceType: aws.String("c5d.2xlarge")}, 177 | } 178 | outdated, updated, err := SeparateOutdatedFromUpdatedInstancesUsingLaunchTemplate("test", launchTemplate, overrides, []*autoscaling.Instance{instance, instanceWithLaunchTemplateOverride}, cloudtest.NewMockEC2Service([]*ec2.LaunchTemplate{updatedEc2LaunchTemplate})) 179 | if err != nil { 180 | t.Fatal("Shouldn't have returned an error, but returned:", err) 181 | } 182 | if len(outdated) != 0 || len(updated) != 2 { 183 | t.Error("Instance should've been updated") 184 | } 185 | } 186 | 187 | func TestSeparateOutdatedFromUpdatedInstances_withLaunchConfigurationWhenOneInstanceIsUpdatedAndTwoInstancesAreOutdated(t *testing.T) { 188 | firstInstance := cloudtest.CreateTestAutoScalingInstance("old-1", "v1", nil, "InService") 189 | secondInstance := cloudtest.CreateTestAutoScalingInstance("old-2", "v1", nil, "InService") 190 | thirdInstance := cloudtest.CreateTestAutoScalingInstance("new", "v2", nil, "InService") 191 | 192 | asg := cloudtest.CreateTestAutoScalingGroup("asg", "v2", nil, []*autoscaling.Instance{firstInstance, secondInstance, thirdInstance}, false) 193 | 194 | outdated, updated, err := SeparateOutdatedFromUpdatedInstances(asg, nil) 195 | if err != nil { 196 | t.Fatal("Shouldn't have returned an error, but returned", err) 197 | } 198 | if len(outdated) != 2 { 199 | t.Error("2 instances should've been outdated") 200 | } 201 | if len(updated) != 1 { 202 | t.Error("1 instance should've been outdated") 203 | } 204 | } 205 | 206 | func TestHandleRollingUpgrade(t *testing.T) { 207 | oldInstance := cloudtest.CreateTestAutoScalingInstance("old-1", "v1", nil, "InService") 208 | asg := cloudtest.CreateTestAutoScalingGroup("asg", "v2", nil, []*autoscaling.Instance{oldInstance}, false) 209 | 210 | oldNode := k8stest.CreateTestNode("old-node-1", aws.StringValue(oldInstance.AvailabilityZone), aws.StringValue(oldInstance.InstanceId), "1000m", "1000Mi") 211 | oldNodePod := k8stest.CreateTestPod("old-pod-1", oldNode.Name, "100m", "100Mi", false, v1.PodRunning) 212 | 213 | mockClient := k8stest.NewMockClient([]v1.Node{oldNode}, []v1.Pod{oldNodePod}) 214 | mockEc2Service := cloudtest.NewMockEC2Service(nil) 215 | mockAutoScalingService := cloudtest.NewMockAutoScalingService([]*autoscaling.Group{asg}) 216 | 217 | // First run (Node rollout process gets marked as started) 218 | err := HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 219 | if err != nil { 220 | t.Error("unexpected error:", err) 221 | } 222 | if mockClient.Counter["UpdateNode"] != 1 { 223 | t.Error("Node should've been annotated, meaning that UpdateNode should've been called once") 224 | } 225 | oldNode = mockClient.Nodes[oldNode.Name] 226 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateStartedTimestamp]; !ok { 227 | t.Error("Node should've been annotated with", k8s.AnnotationRollingUpdateStartedTimestamp) 228 | } 229 | if _, ok := 
oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 230 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 231 | } 232 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateTerminatedTimestamp]; ok { 233 | t.Error("Node shouldn't have been terminated yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateTerminatedTimestamp) 234 | } 235 | 236 | // Second run (ASG's desired capacity gets increased) 237 | err = HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 238 | if err != nil { 239 | t.Error("unexpected error:", err) 240 | } 241 | if mockAutoScalingService.Counter["SetDesiredCapacity"] != 1 { 242 | t.Error("ASG should've been increased because there's no updated nodes yet") 243 | } 244 | asg = mockAutoScalingService.AutoScalingGroups[aws.StringValue(asg.AutoScalingGroupName)] 245 | if aws.Int64Value(asg.DesiredCapacity) != 2 { 246 | t.Error("The desired capacity of the ASG should've been increased to 2") 247 | } 248 | oldNode = mockClient.Nodes[oldNode.Name] 249 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 250 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 251 | } 252 | 253 | // Third run (Nothing changed) 254 | err = HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 255 | if err != nil { 256 | t.Error("unexpected error:", err) 257 | } 258 | if mockAutoScalingService.Counter["SetDesiredCapacity"] != 1 { 259 | t.Error("Desired capacity shouldn't have been updated") 260 | } 261 | asg = mockAutoScalingService.AutoScalingGroups[aws.StringValue(asg.AutoScalingGroupName)] 262 | if aws.Int64Value(asg.DesiredCapacity) != 2 { 263 | t.Error("The desired capacity of the ASG should've stayed at 2") 264 | } 265 | oldNode = mockClient.Nodes[oldNode.Name] 266 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 267 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 268 | } 269 | 270 | // Fourth run (new instance has been registered to ASG, but is pending) 271 | newInstance := cloudtest.CreateTestAutoScalingInstance("new-1", "v2", nil, "Pending") 272 | asg.Instances = append(asg.Instances, newInstance) 273 | err = HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 274 | if err != nil { 275 | t.Error("unexpected error:", err) 276 | } 277 | if mockAutoScalingService.Counter["SetDesiredCapacity"] != 1 { 278 | t.Error("Desired capacity shouldn't have been updated") 279 | } 280 | oldNode = mockClient.Nodes[oldNode.Name] 281 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 282 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 283 | } 284 | 285 | // Fifth run (new instance is now InService, but node has still not joined cluster (GetNodeByAutoScalingInstance should return not found)) 286 | newInstance.SetLifecycleState("InService") 287 | err = HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 288 | if err != nil { 289 | t.Error("unexpected error:", err) 290 | } 291 | oldNode = 
mockClient.Nodes[oldNode.Name] 292 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 293 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 294 | } 295 | 296 | // Sixth run (new instance has joined the cluster, but Kubelet isn't ready to accept pods yet) 297 | newNode := k8stest.CreateTestNode("new-node-1", aws.StringValue(newInstance.AvailabilityZone), aws.StringValue(newInstance.InstanceId), "1000m", "1000Mi") 298 | newNode.Status.Conditions = []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionFalse}} 299 | mockClient.Nodes[newNode.Name] = newNode 300 | err = HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 301 | if err != nil { 302 | t.Error("unexpected error:", err) 303 | } 304 | oldNode = mockClient.Nodes[oldNode.Name] 305 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 306 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 307 | } 308 | 309 | // Seventh run (Kubelet is ready to accept new pods. Old node gets drained and terminated) 310 | newNode = mockClient.Nodes[newNode.Name] 311 | newNode.Status.Conditions = []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionTrue}} 312 | mockClient.Nodes[newNode.Name] = newNode 313 | err = HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 314 | if err != nil { 315 | t.Error("unexpected error:", err) 316 | } 317 | oldNode = mockClient.Nodes[oldNode.Name] 318 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; !ok { 319 | t.Error("Node should've been drained") 320 | } 321 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateTerminatedTimestamp]; !ok { 322 | t.Error("Node should've been terminated") 323 | } 324 | } 325 | 326 | func TestHandleRollingUpgrade_withLaunchTemplate(t *testing.T) { 327 | oldLaunchTemplateSpecification := &autoscaling.LaunchTemplateSpecification{ 328 | LaunchTemplateId: aws.String("lt1"), 329 | LaunchTemplateName: aws.String("lt1"), 330 | Version: aws.String("1"), 331 | } 332 | newLaunchTemplateSpecification := &autoscaling.LaunchTemplateSpecification{ 333 | LaunchTemplateId: aws.String("lt1"), 334 | LaunchTemplateName: aws.String("lt1"), 335 | Version: aws.String("2"), 336 | } 337 | lt := &ec2.LaunchTemplate{ 338 | DefaultVersionNumber: aws.Int64(1), 339 | LatestVersionNumber: aws.Int64(1), 340 | LaunchTemplateId: aws.String("lt1"), 341 | LaunchTemplateName: aws.String("lt1"), 342 | } 343 | oldInstance := cloudtest.CreateTestAutoScalingInstance("old-1", "", oldLaunchTemplateSpecification, "InService") 344 | asg := cloudtest.CreateTestAutoScalingGroup("asg", "", newLaunchTemplateSpecification, []*autoscaling.Instance{oldInstance}, false) 345 | 346 | oldNode := k8stest.CreateTestNode("old-node-1", aws.StringValue(oldInstance.AvailabilityZone), aws.StringValue(oldInstance.InstanceId), "1000m", "1000Mi") 347 | oldNodePod := k8stest.CreateTestPod("old-pod-1", oldNode.Name, "100m", "100Mi", false, v1.PodRunning) 348 | 349 | mockClient := k8stest.NewMockClient([]v1.Node{oldNode}, []v1.Pod{oldNodePod}) 350 | mockEc2Service := cloudtest.NewMockEC2Service([]*ec2.LaunchTemplate{lt}) 351 | mockAutoScalingService := cloudtest.NewMockAutoScalingService([]*autoscaling.Group{asg}) 352 | 353 | // First run (Node rollout 
process gets marked as started) 354 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 355 | if mockClient.Counter["UpdateNode"] != 1 { 356 | t.Error("Node should've been annotated, meaning that UpdateNode should've been called once") 357 | } 358 | oldNode = mockClient.Nodes[oldNode.Name] 359 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateStartedTimestamp]; !ok { 360 | t.Error("Node should've been annotated with", k8s.AnnotationRollingUpdateStartedTimestamp) 361 | } 362 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 363 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 364 | } 365 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateTerminatedTimestamp]; ok { 366 | t.Error("Node shouldn't have been terminated yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateTerminatedTimestamp) 367 | } 368 | 369 | // Second run (ASG's desired capacity gets increased) 370 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 371 | if mockAutoScalingService.Counter["SetDesiredCapacity"] != 1 { 372 | t.Error("ASG should've been increased because there's no updated nodes yet") 373 | } 374 | asg = mockAutoScalingService.AutoScalingGroups[aws.StringValue(asg.AutoScalingGroupName)] 375 | if aws.Int64Value(asg.DesiredCapacity) != 2 { 376 | t.Error("The desired capacity of the ASG should've been increased to 2") 377 | } 378 | oldNode = mockClient.Nodes[oldNode.Name] 379 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 380 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 381 | } 382 | 383 | // Third run (Nothing changed) 384 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 385 | if mockAutoScalingService.Counter["SetDesiredCapacity"] != 1 { 386 | t.Error("Desired capacity shouldn't have been updated") 387 | } 388 | asg = mockAutoScalingService.AutoScalingGroups[aws.StringValue(asg.AutoScalingGroupName)] 389 | if aws.Int64Value(asg.DesiredCapacity) != 2 { 390 | t.Error("The desired capacity of the ASG should've stayed at 2") 391 | } 392 | oldNode = mockClient.Nodes[oldNode.Name] 393 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 394 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 395 | } 396 | 397 | // Fourth run (new instance has been registered to ASG, but is pending) 398 | newInstance := cloudtest.CreateTestAutoScalingInstance("new-1", "", newLaunchTemplateSpecification, "Pending") 399 | asg.Instances = append(asg.Instances, newInstance) 400 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 401 | if mockAutoScalingService.Counter["SetDesiredCapacity"] != 1 { 402 | t.Error("Desired capacity shouldn't have been updated") 403 | } 404 | oldNode = mockClient.Nodes[oldNode.Name] 405 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 406 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 407 | } 408 | 409 | // Fifth run (new instance is 
now InService, but node has still not joined cluster (GetNodeByAutoScalingInstance should return not found)) 410 | newInstance.SetLifecycleState("InService") 411 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 412 | oldNode = mockClient.Nodes[oldNode.Name] 413 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 414 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 415 | } 416 | 417 | // Sixth run (new instance has joined the cluster, but Kubelet isn't ready to accept pods yet) 418 | newNode := k8stest.CreateTestNode("new-node-1", aws.StringValue(newInstance.AvailabilityZone), aws.StringValue(newInstance.InstanceId), "1000m", "1000Mi") 419 | newNode.Status.Conditions = []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionFalse}} 420 | mockClient.Nodes[newNode.Name] = newNode 421 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 422 | oldNode = mockClient.Nodes[oldNode.Name] 423 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 424 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 425 | } 426 | 427 | // Seventh run (Kubelet is ready to accept new pods. Old node gets drained and terminated) 428 | newNode = mockClient.Nodes[newNode.Name] 429 | newNode.Status.Conditions = []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionTrue}} 430 | mockClient.Nodes[newNode.Name] = newNode 431 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 432 | oldNode = mockClient.Nodes[oldNode.Name] 433 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; !ok { 434 | t.Error("Node should've been drained") 435 | } 436 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateTerminatedTimestamp]; !ok { 437 | t.Error("Node should've been terminated") 438 | } 439 | } 440 | 441 | func TestHandleRollingUpgrade_withLaunchTemplateWhenLaunchTemplateDidNotUpdate(t *testing.T) { 442 | launchTemplateSpecification := &autoscaling.LaunchTemplateSpecification{ 443 | LaunchTemplateId: aws.String("lt1"), 444 | LaunchTemplateName: aws.String("lt1"), 445 | Version: aws.String("1"), 446 | } 447 | lt := &ec2.LaunchTemplate{ 448 | DefaultVersionNumber: aws.Int64(1), 449 | LatestVersionNumber: aws.Int64(1), 450 | LaunchTemplateId: aws.String("lt1"), 451 | LaunchTemplateName: aws.String("lt1"), 452 | } 453 | oldInstance := cloudtest.CreateTestAutoScalingInstance("old-1", "", launchTemplateSpecification, "InService") 454 | asg := cloudtest.CreateTestAutoScalingGroup("asg", "", launchTemplateSpecification, []*autoscaling.Instance{oldInstance}, false) 455 | 456 | oldNode := k8stest.CreateTestNode("old-node-1", aws.StringValue(oldInstance.AvailabilityZone), aws.StringValue(oldInstance.InstanceId), "1000m", "1000Mi") 457 | 458 | mockClient := k8stest.NewMockClient([]v1.Node{oldNode}, []v1.Pod{}) 459 | mockEc2Service := cloudtest.NewMockEC2Service([]*ec2.LaunchTemplate{lt}) 460 | mockAutoScalingService := cloudtest.NewMockAutoScalingService([]*autoscaling.Group{asg}) 461 | 462 | // First run (No changes, no updates) 463 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 464 | if mockClient.Counter["UpdateNode"] != 0 { 465 | 
t.Error("The LT hasn't been updated, therefore nothing should've changed") 466 | } 467 | } 468 | 469 | func TestHandleRollingUpgrade_withEnoughPodsToRequireTwoNewNodes(t *testing.T) { 470 | oldInstance := cloudtest.CreateTestAutoScalingInstance("old-1", "v1", nil, "InService") 471 | asg := cloudtest.CreateTestAutoScalingGroup("asg", "v2", nil, []*autoscaling.Instance{oldInstance}, false) 472 | 473 | oldNode := k8stest.CreateTestNode("old-node-1", aws.StringValue(oldInstance.AvailabilityZone), aws.StringValue(oldInstance.InstanceId), "1000m", "1000Mi") 474 | oldNodeFirstPod := k8stest.CreateTestPod("old-pod-1", oldNode.Name, "300m", "300Mi", false, v1.PodRunning) 475 | oldNodeSecondPod := k8stest.CreateTestPod("old-pod-2", oldNode.Name, "300m", "300Mi", false, v1.PodRunning) 476 | oldNodeThirdPod := k8stest.CreateTestPod("old-pod-3", oldNode.Name, "300m", "300Mi", false, v1.PodRunning) 477 | oldNodeFourthPod := k8stest.CreateTestPod("old-pod-4", oldNode.Name, "300m", "300Mi", false, v1.PodRunning) 478 | // This pod should be ignored, because the pod.Status.Phase is v1.PodFailed 479 | oldNodeFifthPod := k8stest.CreateTestPod("old-pod-5-evicted", oldNode.Name, "99999m", "99999Mi", false, v1.PodFailed) 480 | 481 | mockClient := k8stest.NewMockClient([]v1.Node{oldNode}, []v1.Pod{oldNodeFirstPod, oldNodeSecondPod, oldNodeThirdPod, oldNodeFourthPod, oldNodeFifthPod}) 482 | mockEc2Service := cloudtest.NewMockEC2Service(nil) 483 | mockAutoScalingService := cloudtest.NewMockAutoScalingService([]*autoscaling.Group{asg}) 484 | 485 | // First run (Node rollout process gets marked as started) 486 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 487 | if mockClient.Counter["UpdateNode"] != 1 { 488 | t.Error("Node should've been annotated, meaning that UpdateNode should've been called once") 489 | } 490 | oldNode = mockClient.Nodes[oldNode.Name] 491 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateStartedTimestamp]; !ok { 492 | t.Error("Node should've been annotated with", k8s.AnnotationRollingUpdateStartedTimestamp) 493 | } 494 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 495 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 496 | } 497 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateTerminatedTimestamp]; ok { 498 | t.Error("Node shouldn't have been terminated yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateTerminatedTimestamp) 499 | } 500 | 501 | // Second run (ASG's desired capacity gets increased) 502 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 503 | if mockAutoScalingService.Counter["SetDesiredCapacity"] != 1 { 504 | t.Error("ASG should've been increased because there's no updated nodes yet") 505 | } 506 | asg = mockAutoScalingService.AutoScalingGroups[aws.StringValue(asg.AutoScalingGroupName)] 507 | if aws.Int64Value(asg.DesiredCapacity) != 2 { 508 | t.Error("The desired capacity of the ASG should've been increased to 2") 509 | } 510 | oldNode = mockClient.Nodes[oldNode.Name] 511 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 512 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 513 | } 514 | 515 | // Third run (Nothing changed) 516 | 
HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 517 | if mockAutoScalingService.Counter["SetDesiredCapacity"] != 1 { 518 | t.Error("Desired capacity shouldn't have been updated") 519 | } 520 | asg = mockAutoScalingService.AutoScalingGroups[aws.StringValue(asg.AutoScalingGroupName)] 521 | if aws.Int64Value(asg.DesiredCapacity) != 2 { 522 | t.Error("The desired capacity of the ASG should've stayed at 2") 523 | } 524 | oldNode = mockClient.Nodes[oldNode.Name] 525 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 526 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 527 | } 528 | 529 | // Fourth run (new instance has been registered to ASG, but is pending) 530 | newInstance := cloudtest.CreateTestAutoScalingInstance("new-1", "v2", nil, "Pending") 531 | asg.Instances = append(asg.Instances, newInstance) 532 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 533 | if mockAutoScalingService.Counter["SetDesiredCapacity"] != 1 { 534 | t.Error("Desired capacity shouldn't have been updated") 535 | } 536 | oldNode = mockClient.Nodes[oldNode.Name] 537 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 538 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 539 | } 540 | 541 | // Fifth run (new instance is now InService, but node has still not joined cluster (GetNodeByAutoScalingInstance should return not found)) 542 | newInstance.SetLifecycleState("InService") 543 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 544 | oldNode = mockClient.Nodes[oldNode.Name] 545 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 546 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 547 | } 548 | 549 | // Sixth run (new instance has joined the cluster, but Kubelet isn't ready to accept pods yet) 550 | newNode := k8stest.CreateTestNode("new-node-1", aws.StringValue(newInstance.AvailabilityZone), aws.StringValue(newInstance.InstanceId), "1000m", "1000Mi") 551 | newNode.Status.Conditions = []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionFalse}} 552 | mockClient.Nodes[newNode.Name] = newNode 553 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 554 | oldNode = mockClient.Nodes[oldNode.Name] 555 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 556 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 557 | } 558 | 559 | // Seventh run (Kubelet is ready to accept new pods) 560 | newNode = mockClient.Nodes[newNode.Name] 561 | newNode.Status.Conditions = []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionTrue}} 562 | mockClient.Nodes[newNode.Name] = newNode 563 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 564 | oldNode = mockClient.Nodes[oldNode.Name] 565 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 566 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated 
with", k8s.AnnotationRollingUpdateDrainedTimestamp) 567 | } 568 | 569 | // Eight run (ASG's desired capacity gets increased) 570 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 571 | if mockAutoScalingService.Counter["SetDesiredCapacity"] != 2 { 572 | t.Error("ASG should've been increased again") 573 | } 574 | asg = mockAutoScalingService.AutoScalingGroups[aws.StringValue(asg.AutoScalingGroupName)] 575 | if aws.Int64Value(asg.DesiredCapacity) != 3 { 576 | t.Error("The desired capacity of the ASG should've been increased to 3") 577 | } 578 | oldNode = mockClient.Nodes[oldNode.Name] 579 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 580 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 581 | } 582 | 583 | // Ninth run (fast-forward new instance, node and kubelet ready to accept. Old node gets drained and terminated) 584 | newSecondInstance := cloudtest.CreateTestAutoScalingInstance("new-2", "v2", nil, "InService") 585 | asg.Instances = append(asg.Instances, newSecondInstance) 586 | newSecondNode := k8stest.CreateTestNode("new-node-2", aws.StringValue(newSecondInstance.AvailabilityZone), aws.StringValue(newSecondInstance.InstanceId), "1000m", "1000Mi") 587 | newSecondNode.Status.Conditions = []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionTrue}} 588 | mockClient.Nodes[newSecondNode.Name] = newSecondNode 589 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 590 | oldNode = mockClient.Nodes[oldNode.Name] 591 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; !ok { 592 | t.Error("Node should've been drained") 593 | } 594 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateTerminatedTimestamp]; !ok { 595 | t.Error("Node should've been terminated") 596 | } 597 | } 598 | 599 | // The mixed instance policy is not part of the launch template; it's part of the ASG itself. 600 | // This means that not only must we check the launch template version (it doesn't change in this test), but 601 | // we must also check if the instance's instance type is part of the MixedInstancesPolicy's instance types. 602 | // If it isn't, then it means the ASG has been modified, and the instance is old. 603 | func TestHandleRollingUpgrade_withMixedInstancePolicyWhenOneOfTheInstanceTypesOverrideChanges(t *testing.T) { 604 | launchTemplateSpecification := &autoscaling.LaunchTemplateSpecification{ 605 | LaunchTemplateId: aws.String("lt1"), 606 | LaunchTemplateName: aws.String("lt1"), 607 | Version: aws.String("1"), 608 | } 609 | lt := &ec2.LaunchTemplate{ 610 | DefaultVersionNumber: aws.Int64(1), 611 | LatestVersionNumber: aws.Int64(1), 612 | LaunchTemplateId: aws.String("lt1"), 613 | LaunchTemplateName: aws.String("lt1"), 614 | } 615 | oldInstance := cloudtest.CreateTestAutoScalingInstance("old-1", "", launchTemplateSpecification, "InService") 616 | // The LT has NOT changed, but we're setting withMixedInstancesPolicy to true 617 | asg := cloudtest.CreateTestAutoScalingGroup("asg", "", launchTemplateSpecification, []*autoscaling.Instance{oldInstance}, true) 618 | // We set the instance type to something isn't the default instance type, because the first one has the same value as the 619 | // Launch template version, meaning that modifying that one would likely trigger a new version to be created. 
620 | // What we're trying to test here is whether we're able to trigger a rolling update on an instance type that is no 621 | // longer part of the MixedInstancesPolicy overrides 622 | oldInstance.SetInstanceType(aws.StringValue(asg.MixedInstancesPolicy.LaunchTemplate.Overrides[1].InstanceType)) 623 | 624 | oldNode := k8stest.CreateTestNode("old-node-1", aws.StringValue(oldInstance.AvailabilityZone), aws.StringValue(oldInstance.InstanceId), "1000m", "1000Mi") 625 | 626 | mockClient := k8stest.NewMockClient([]v1.Node{oldNode}, []v1.Pod{}) 627 | mockEc2Service := cloudtest.NewMockEC2Service([]*ec2.LaunchTemplate{lt}) 628 | mockAutoScalingService := cloudtest.NewMockAutoScalingService([]*autoscaling.Group{asg}) 629 | 630 | // First run (Nothing changed) 631 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 632 | if mockClient.Counter["UpdateNode"] != 0 { 633 | t.Error("Nothing should've changed") 634 | } 635 | 636 | // Suddenly, the ASG's MixedInstancePolicy gets updated, and only the first instance type override is kept 637 | // The second instance type is the one that our old instance uses 638 | asg.MixedInstancesPolicy.SetLaunchTemplate(&autoscaling.LaunchTemplate{ 639 | LaunchTemplateSpecification: asg.MixedInstancesPolicy.LaunchTemplate.LaunchTemplateSpecification, 640 | Overrides: asg.MixedInstancesPolicy.LaunchTemplate.Overrides[0:1], 641 | }) 642 | 643 | // Second run 644 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 645 | if mockClient.Counter["UpdateNode"] != 1 { 646 | t.Error("The old instance's instance type is no longer part of the ASG's MixedInstancePolicy's LaunchTemplate overrides, therefore, it is outdated and should've been annotated") 647 | } 648 | } 649 | 650 | func TestHasAcceptableNumberOfUpdatedNonReadyNodes(t *testing.T) { 651 | // false: there's too many non-ready nodes 652 | // true: there's an acceptable amount of non-ready nodes given how many ready nodes there are 653 | if HasAcceptableNumberOfUpdatedNonReadyNodes(100, 0) { 654 | t.Error("100NR/0R ready should not be acceptable") 655 | } 656 | if HasAcceptableNumberOfUpdatedNonReadyNodes(50, 50) { 657 | t.Error("50NR/50R should not be acceptable") 658 | } 659 | if HasAcceptableNumberOfUpdatedNonReadyNodes(6, 10000) { 660 | t.Error("6NR/10000R should not be acceptable, because MaximumNumberOfUpdatedNonReadyNodes is set to", MaximumNumberOfUpdatedNonReadyNodes) 661 | } 662 | if !HasAcceptableNumberOfUpdatedNonReadyNodes(5, 10000) { 663 | t.Error("5NR/10000R should be acceptable") 664 | } 665 | if !HasAcceptableNumberOfUpdatedNonReadyNodes(4, 100) { 666 | t.Error("4NR/100R should be acceptable") 667 | } 668 | if !HasAcceptableNumberOfUpdatedNonReadyNodes(1, 99) { 669 | t.Error("1NR/99R should be acceptable") 670 | } 671 | if !HasAcceptableNumberOfUpdatedNonReadyNodes(0, 100) { 672 | t.Error("0NR/100R should be acceptable") 673 | } 674 | if !HasAcceptableNumberOfUpdatedNonReadyNodes(0, 1) { 675 | t.Error("0NR/1R should be acceptable") 676 | } 677 | if !HasAcceptableNumberOfUpdatedNonReadyNodes(0, 0) { 678 | t.Error("0NR/0R should be acceptable") 679 | } 680 | if !HasAcceptableNumberOfUpdatedNonReadyNodes(1, 11) { 681 | t.Error("1NR/11R should be acceptable") 682 | } 683 | } 684 | 685 | func TestHandleRollingUpgrade_withEagerCordoning(t *testing.T) { 686 | config.Set(nil, true, true, true, false) 687 | defer config.Set(nil, true, true, false, false) 688 | 689 | oldInstance1 := 
cloudtest.CreateTestAutoScalingInstance("old-1", "v1", nil, "InService") 690 | oldInstance2 := cloudtest.CreateTestAutoScalingInstance("old-2", "v1", nil, "InService") 691 | oldInstance3 := cloudtest.CreateTestAutoScalingInstance("old-3", "v1", nil, "InService") 692 | asg := cloudtest.CreateTestAutoScalingGroup("asg", "v2", nil, []*autoscaling.Instance{oldInstance1, oldInstance2, oldInstance3}, false) 693 | 694 | oldNode1 := k8stest.CreateTestNode("old-node-1", aws.StringValue(oldInstance1.AvailabilityZone), aws.StringValue(oldInstance1.InstanceId), "1000m", "1000Mi") 695 | oldNode2 := k8stest.CreateTestNode("old-node-2", aws.StringValue(oldInstance2.AvailabilityZone), aws.StringValue(oldInstance2.InstanceId), "1000m", "1000Mi") 696 | oldNode3 := k8stest.CreateTestNode("old-node-3", aws.StringValue(oldInstance3.AvailabilityZone), aws.StringValue(oldInstance3.InstanceId), "1000m", "1000Mi") 697 | oldNodePod1 := k8stest.CreateTestPod("old-pod-1", oldNode1.Name, "600m", "600Mi", false, v1.PodRunning) 698 | oldNodePod2 := k8stest.CreateTestPod("old-pod-2", oldNode2.Name, "600m", "600Mi", false, v1.PodRunning) 699 | oldNodePod3 := k8stest.CreateTestPod("old-pod-3", oldNode3.Name, "600m", "600Mi", false, v1.PodRunning) 700 | 701 | mockClient := k8stest.NewMockClient([]v1.Node{oldNode1, oldNode2, oldNode3}, []v1.Pod{oldNodePod1, oldNodePod2, oldNodePod3}) 702 | mockEc2Service := cloudtest.NewMockEC2Service(nil) 703 | mockAutoScalingService := cloudtest.NewMockAutoScalingService([]*autoscaling.Group{asg}) 704 | 705 | // First run (Node rollout process gets marked as started) 706 | err := HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 707 | if err != nil { 708 | t.Error("unexpected error:", err) 709 | } 710 | if mockClient.Counter["UpdateNode"] != 3 { 711 | t.Error("Nodes should've been annotated as started, meaning that UpdateNode should've been called three times") 712 | } 713 | // Make sure that all nodes were "eagerly cordoned" 714 | if mockClient.Counter["Cordon"] != 3 { 715 | t.Error("All nodes should've been eagerly cordoned, meaning that Cordon should've been called thrice, but was called", mockClient.Counter["Cordon"], "times") 716 | } 717 | } 718 | 719 | func TestHandleRollingUpgrade_withEagerCordoningDisabled(t *testing.T) { 720 | // explicitly setting this, but eager cordoning is disabled by default anyway 721 | config.Set(nil, true, true, false, true) 722 | defer config.Set(nil, true, true, true, false) 723 | 724 | oldInstance1 := cloudtest.CreateTestAutoScalingInstance("old-1", "v1", nil, "InService") 725 | oldInstance2 := cloudtest.CreateTestAutoScalingInstance("old-2", "v1", nil, "InService") 726 | asg := cloudtest.CreateTestAutoScalingGroup("asg", "v2", nil, []*autoscaling.Instance{oldInstance1, oldInstance2}, false) 727 | 728 | oldNode1 := k8stest.CreateTestNode("old-node-1", aws.StringValue(oldInstance1.AvailabilityZone), aws.StringValue(oldInstance1.InstanceId), "1000m", "1000Mi") 729 | oldNode2 := k8stest.CreateTestNode("old-node-2", aws.StringValue(oldInstance2.AvailabilityZone), aws.StringValue(oldInstance2.InstanceId), "1000m", "1000Mi") 730 | oldNodePod1 := k8stest.CreateTestPod("old-pod-1", oldNode1.Name, "600m", "600Mi", false, v1.PodRunning) 731 | oldNodePod2 := k8stest.CreateTestPod("old-pod-2", oldNode2.Name, "600m", "600Mi", false, v1.PodRunning) 732 | 733 | mockClient := k8stest.NewMockClient([]v1.Node{oldNode1, oldNode2}, []v1.Pod{oldNodePod1, oldNodePod2}) 734 | mockEc2Service := cloudtest.NewMockEC2Service(nil) 735 | 
mockAutoScalingService := cloudtest.NewMockAutoScalingService([]*autoscaling.Group{asg}) 736 | 737 | // First run (Node rollout process gets marked as started) 738 | err := HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 739 | if err != nil { 740 | t.Error("unexpected error:", err) 741 | } 742 | if mockClient.Counter["UpdateNode"] != 2 { 743 | t.Error("Nodes should've been annotated as started, meaning that UpdateNode should've been called twice") 744 | } 745 | // Make sure that all nodes were NOT "eagerly cordoned" 746 | if mockClient.Counter["Cordon"] != 0 { 747 | t.Error("Eager cordoning is not enabled, so no node should have been cordoned on the first execution") 748 | } 749 | } 750 | 751 | func TestHandleRollingUpgrade_withExcludeFromExternalLoadBalancers(t *testing.T) { 752 | config.Set(nil, true, true, false, true) 753 | defer config.Set(nil, true, true, false, false) 754 | 755 | oldInstance := cloudtest.CreateTestAutoScalingInstance("old-1", "v1", nil, "InService") 756 | asg := cloudtest.CreateTestAutoScalingGroup("asg", "v2", nil, []*autoscaling.Instance{oldInstance}, false) 757 | 758 | oldNode := k8stest.CreateTestNode("old-node-1", aws.StringValue(oldInstance.AvailabilityZone), aws.StringValue(oldInstance.InstanceId), "1000m", "1000Mi") 759 | oldNodePod := k8stest.CreateTestPod("old-pod-1", oldNode.Name, "100m", "100Mi", false, v1.PodRunning) 760 | 761 | mockClient := k8stest.NewMockClient([]v1.Node{oldNode}, []v1.Pod{oldNodePod}) 762 | mockEc2Service := cloudtest.NewMockEC2Service(nil) 763 | mockAutoScalingService := cloudtest.NewMockAutoScalingService([]*autoscaling.Group{asg}) 764 | 765 | // First run (Node rollout process gets marked as started) 766 | err := HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 767 | if err != nil { 768 | t.Error("unexpected error:", err) 769 | } 770 | if mockClient.Counter["UpdateNode"] != 1 { 771 | t.Error("Node should've been annotated, meaning that UpdateNode should've been called once") 772 | } 773 | 774 | // Second run (ASG's desired capacity gets increased) 775 | err = HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 776 | if err != nil { 777 | t.Error("unexpected error:", err) 778 | } 779 | if mockAutoScalingService.Counter["SetDesiredCapacity"] != 1 { 780 | t.Error("ASG should've been increased because there's no updated nodes yet") 781 | } 782 | asg = mockAutoScalingService.AutoScalingGroups[aws.StringValue(asg.AutoScalingGroupName)] 783 | if aws.Int64Value(asg.DesiredCapacity) != 2 { 784 | t.Error("The desired capacity of the ASG should've been increased to 2") 785 | } 786 | 787 | // Third run (Nothing changed) 788 | err = HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 789 | if err != nil { 790 | t.Error("unexpected error:", err) 791 | } 792 | if mockAutoScalingService.Counter["SetDesiredCapacity"] != 1 { 793 | t.Error("Desired capacity shouldn't have been updated") 794 | } 795 | asg = mockAutoScalingService.AutoScalingGroups[aws.StringValue(asg.AutoScalingGroupName)] 796 | if aws.Int64Value(asg.DesiredCapacity) != 2 { 797 | t.Error("The desired capacity of the ASG should've stayed at 2") 798 | } 799 | 800 | // Fourth run (new instance has been registered to ASG, but is pending) 801 | newInstance := cloudtest.CreateTestAutoScalingInstance("new-1", "v2", nil, "Pending") 802 | asg.Instances = append(asg.Instances, newInstance) 803 | 
err = HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 804 | if err != nil { 805 | t.Error("unexpected error:", err) 806 | } 807 | if mockAutoScalingService.Counter["SetDesiredCapacity"] != 1 { 808 | t.Error("Desired capacity shouldn't have been updated") 809 | } 810 | 811 | // Fifth run (new instance is now InService, but node has still not joined cluster (GetNodeByAutoScalingInstance should return not found)) 812 | newInstance.SetLifecycleState("InService") 813 | err = HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 814 | if err != nil { 815 | t.Error("unexpected error:", err) 816 | } 817 | 818 | // Sixth run (new instance has joined the cluster, but Kubelet isn't ready to accept pods yet) 819 | newNode := k8stest.CreateTestNode("new-node-1", aws.StringValue(newInstance.AvailabilityZone), aws.StringValue(newInstance.InstanceId), "1000m", "1000Mi") 820 | newNode.Status.Conditions = []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionFalse}} 821 | mockClient.Nodes[newNode.Name] = newNode 822 | err = HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 823 | if err != nil { 824 | t.Error("unexpected error:", err) 825 | } 826 | 827 | // Seventh run (Kubelet is ready to accept new pods. Old node gets drained and terminated) 828 | newNode = mockClient.Nodes[newNode.Name] 829 | newNode.Status.Conditions = []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionTrue}} 830 | mockClient.Nodes[newNode.Name] = newNode 831 | err = HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 832 | if err != nil { 833 | t.Error("unexpected error:", err) 834 | } 835 | oldNode = mockClient.Nodes[oldNode.Name] 836 | if _, ok := oldNode.GetLabels()[k8s.LabelExcludeFromExternalLoadBalancers]; !ok { 837 | t.Error("Node should've been labeled") 838 | } 839 | } 840 | --------------------------------------------------------------------------------
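The rolling-upgrade tests above all walk an outdated node through the same annotation lifecycle: the handler first records a "started" timestamp, later a "drained" timestamp once replacement capacity is ready, and finally a "terminated" timestamp. The following is a small illustrative sketch, not code from the handler, showing how that progression can be read back using the annotation constants exposed by the repository's k8s package; the helper name rolloutPhase and the returned phase strings are invented for this example.

package sketch

import (
	"github.com/TwiN/aws-eks-asg-rolling-update-handler/k8s"
	v1 "k8s.io/api/core/v1"
)

// rolloutPhase reports how far an outdated node has progressed through the rolling
// update, based solely on the three annotations asserted throughout the tests above.
// Illustrative helper only; it is not part of the handler.
func rolloutPhase(node *v1.Node) string {
	annotations := node.GetAnnotations()
	if _, ok := annotations[k8s.AnnotationRollingUpdateTerminatedTimestamp]; ok {
		return "terminated"
	}
	if _, ok := annotations[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok {
		return "drained"
	}
	if _, ok := annotations[k8s.AnnotationRollingUpdateStartedTimestamp]; ok {
		return "started"
	}
	return "not-started"
}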
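TestHasAcceptableNumberOfUpdatedNonReadyNodes pins the behaviour of HasAcceptableNumberOfUpdatedNonReadyNodes at the 5-versus-6 boundary. The sketch below is one implementation that satisfies every assertion in that test, under the assumption that the cap referenced by the test (MaximumNumberOfUpdatedNonReadyNodes) is 5; the handler's real function may additionally weigh the ready-node count, which this simplified version deliberately ignores.

package sketch

// maximumUpdatedNonReadyNodes assumes the cap is 5, inferred from the 5-vs-6 boundary
// asserted in the test; the real constant is defined in the handler's main package.
const maximumUpdatedNonReadyNodes = 5

// hasAcceptableNumberOfUpdatedNonReadyNodes caps the number of updated-but-not-ready
// nodes at a fixed maximum, regardless of how many nodes are ready (the second
// parameter is intentionally unused in this simplified sketch).
func hasAcceptableNumberOfUpdatedNonReadyNodes(updatedNonReadyNodes, _ int) bool {
	return updatedNonReadyNodes <= maximumUpdatedNonReadyNodes
}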
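The comment preceding TestHandleRollingUpgrade_withMixedInstancePolicyWhenOneOfTheInstanceTypesOverrideChanges explains that an instance can become outdated purely because its instance type was removed from the ASG's MixedInstancesPolicy overrides, even though the launch template version never changed. Below is a hedged sketch of that membership check, built on the same AWS SDK types the test manipulates; instanceTypeStillInOverrides is a hypothetical helper name, not the handler's actual function.

package sketch

import (
	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/service/autoscaling"
)

// instanceTypeStillInOverrides reports whether the instance's type is still listed in
// the ASG's MixedInstancesPolicy launch template overrides. If it is not, the instance
// would be considered outdated even though its launch template version still matches.
// Illustrative only; the handler's own detection logic may differ.
func instanceTypeStillInOverrides(instance *autoscaling.Instance, asg *autoscaling.Group) bool {
	if asg.MixedInstancesPolicy == nil || asg.MixedInstancesPolicy.LaunchTemplate == nil {
		// No mixed instances policy to compare against, so the type cannot be missing from it.
		return true
	}
	for _, override := range asg.MixedInstancesPolicy.LaunchTemplate.Overrides {
		if aws.StringValue(override.InstanceType) == aws.StringValue(instance.InstanceType) {
			return true
		}
	}
	return false
}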