├── .dockerignore ├── .gitignore ├── .github ├── dependabot.yml └── workflows │ ├── test.yml │ ├── publish-release.yml │ └── publish-latest.yml ├── Dockerfile ├── k8s ├── client_test.go ├── k8s.go ├── util.go ├── client.go └── util_test.go ├── metrics ├── metrics_test.go └── metrics.go ├── config ├── config_test.go └── config.go ├── cloud ├── aws_test.go └── aws.go ├── k8stest └── k8stest.go ├── go.mod ├── cloudtest └── cloudtest.go ├── LICENSE ├── README.md ├── go.sum ├── main.go └── main_test.go /.dockerignore: -------------------------------------------------------------------------------- 1 | .github 2 | .idea -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # IDE 2 | *.iml 3 | .idea 4 | .vscode 5 | 6 | # OS 7 | .DS_Store 8 | 9 | # JS 10 | node_modules 11 | 12 | # Go 13 | /vendor 14 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | labels: ["dependencies"] 6 | schedule: 7 | interval: "weekly" 8 | day: "friday" 9 | - package-ecosystem: "gomod" 10 | directory: "/" 11 | labels: ["dependencies"] 12 | schedule: 13 | interval: "weekly" 14 | day: "friday" 15 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: test 2 | on: 3 | pull_request: 4 | paths-ignore: 5 | - '*.md' 6 | push: 7 | branches: 8 | - master 9 | paths-ignore: 10 | - '*.md' 11 | jobs: 12 | test: 13 | name: test 14 | runs-on: ubuntu-latest 15 | timeout-minutes: 5 16 | steps: 17 | - uses: actions/setup-go@v5 18 | with: 19 | go-version: 1.23.5 20 | - uses: actions/checkout@v6 21 | - run: go build 22 | - run: go test ./... -cover 23 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Build the go application into a binary 2 | FROM golang:alpine as builder 3 | WORKDIR /app 4 | ADD . ./ 5 | RUN go mod tidy 6 | RUN CGO_ENABLED=0 GOOS=linux go build -buildvcs=false -a -installsuffix cgo -o aws-eks-asg-rolling-update-handler . 7 | RUN apk --update add ca-certificates 8 | 9 | # Run the binary on an empty container 10 | FROM scratch 11 | COPY --from=builder /app/aws-eks-asg-rolling-update-handler . 
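# A hedged local-build sketch (not taken from this repository's CI workflows; the image tag and the example
# values are illustrative, only the environment variable names come from config/config.go, and the handler
# still needs AWS credentials and cluster access at runtime to do anything useful):
#   docker build -t aws-eks-asg-rolling-update-handler:dev .
#   docker run --rm -e AUTO_SCALING_GROUP_NAMES=my-asg -e AWS_REGION=us-west-2 aws-eks-asg-rolling-update-handler:dev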
12 | COPY --from=builder /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt 13 | ENTRYPOINT ["/aws-eks-asg-rolling-update-handler"] 14 | -------------------------------------------------------------------------------- /k8s/client_test.go: -------------------------------------------------------------------------------- 1 | package k8s 2 | 3 | import ( 4 | "testing" 5 | 6 | v1 "k8s.io/api/core/v1" 7 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 8 | fakekubernetes "k8s.io/client-go/kubernetes/fake" 9 | ) 10 | 11 | func TestClient_Drain(t *testing.T) { 12 | fakeKubernetesClient := fakekubernetes.NewSimpleClientset(&v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "default"}}) 13 | kc := NewClient(fakeKubernetesClient) 14 | if err := kc.Cordon("default"); err != nil { 15 | t.Errorf("Unexpected error: %v", err) 16 | } 17 | if err := kc.Drain("default", true, true, -1); err != nil { 18 | t.Errorf("Unexpected error: %v", err) 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /.github/workflows/publish-release.yml: -------------------------------------------------------------------------------- 1 | name: publish-release 2 | on: 3 | release: 4 | types: [published] 5 | jobs: 6 | publish-release: 7 | name: publish-release 8 | runs-on: ubuntu-latest 9 | timeout-minutes: 60 10 | steps: 11 | - uses: actions/checkout@v6 12 | - name: Get image repository 13 | run: echo IMAGE_REPOSITORY=$(echo ${{ secrets.DOCKER_USERNAME }}/${{ github.event.repository.name }} | tr '[:upper:]' '[:lower:]') >> $GITHUB_ENV 14 | - name: Get the release 15 | run: echo RELEASE=${GITHUB_REF/refs\/tags\//} >> $GITHUB_ENV 16 | - name: Set up QEMU 17 | uses: docker/setup-qemu-action@v3 18 | - name: Set up Docker Buildx 19 | uses: docker/setup-buildx-action@v3 20 | - name: Login to Docker Registry 21 | uses: docker/login-action@v3 22 | with: 23 | username: ${{ secrets.DOCKER_USERNAME }} 24 | password: ${{ secrets.DOCKER_PASSWORD }} 25 | - name: Build and push docker image 26 | uses: docker/build-push-action@v6 27 | with: 28 | platforms: linux/amd64,linux/arm/v7,linux/arm64 29 | pull: true 30 | push: true 31 | tags: | 32 | ${{ env.IMAGE_REPOSITORY }}:${{ env.RELEASE }} 33 | ${{ env.IMAGE_REPOSITORY }}:stable 34 | ${{ env.IMAGE_REPOSITORY }}:latest 35 | -------------------------------------------------------------------------------- /k8s/k8s.go: -------------------------------------------------------------------------------- 1 | package k8s 2 | 3 | import ( 4 | "os" 5 | "path/filepath" 6 | 7 | "github.com/TwiN/aws-eks-asg-rolling-update-handler/config" 8 | "k8s.io/client-go/kubernetes" 9 | "k8s.io/client-go/rest" 10 | "k8s.io/client-go/tools/clientcmd" 11 | ) 12 | 13 | // CreateClientSet Creates a Kubernetes ClientSet for authenticating with a cluster 14 | // If the current environment is dev, use the user's kubeconfig 15 | // If it isn't, then it means that the application is inside the cluster, which means 16 | // we'll use the service account token 17 | func CreateClientSet() (*kubernetes.Clientset, error) { 18 | var cfg *rest.Config 19 | if config.Get().Environment == "dev" { 20 | var kubeConfig string 21 | if home := homeDir(); home != "" { 22 | kubeConfig = filepath.Join(home, ".kube", "config") 23 | } else { 24 | panic("Home directory not found") 25 | } 26 | // use the current context in kubeconfig 27 | clientConfig, err := clientcmd.BuildConfigFromFlags("", kubeConfig) 28 | if err != nil { 29 | return nil, err 30 | } 31 | cfg = clientConfig 32 | cfg.UserAgent = 
"aws-eks-asg-rolling-update-handler/1.0" 33 | } else { 34 | clientConfig, err := rest.InClusterConfig() 35 | if err != nil { 36 | return nil, err 37 | } 38 | cfg = clientConfig 39 | } 40 | return kubernetes.NewForConfig(cfg) 41 | } 42 | 43 | func homeDir() string { 44 | if home := os.Getenv("HOME"); home != "" { 45 | return home 46 | } 47 | return os.Getenv("USERPROFILE") // windows 48 | } 49 | -------------------------------------------------------------------------------- /.github/workflows/publish-latest.yml: -------------------------------------------------------------------------------- 1 | name: publish-latest 2 | on: 3 | workflow_run: 4 | workflows: [test] 5 | branches: [master] 6 | types: [completed] 7 | concurrency: 8 | group: ${{ github.event.workflow_run.head_repository.full_name }}::${{ github.event.workflow_run.head_branch }}::${{ github.workflow }} 9 | cancel-in-progress: true 10 | jobs: 11 | publish-latest: 12 | name: publish-latest 13 | runs-on: ubuntu-latest 14 | if: ${{ (github.event.workflow_run.conclusion == 'success') && (github.event.workflow_run.head_repository.full_name == github.repository) }} 15 | timeout-minutes: 60 16 | steps: 17 | - uses: actions/checkout@v6 18 | - name: Get image repository 19 | run: echo IMAGE_REPOSITORY=$(echo ${{ secrets.DOCKER_USERNAME }}/${{ github.event.repository.name }} | tr '[:upper:]' '[:lower:]') >> $GITHUB_ENV 20 | - name: Set up QEMU 21 | uses: docker/setup-qemu-action@v3 22 | - name: Set up Docker Buildx 23 | uses: docker/setup-buildx-action@v3 24 | - name: Login to Docker Registry 25 | uses: docker/login-action@v3 26 | with: 27 | username: ${{ secrets.DOCKER_USERNAME }} 28 | password: ${{ secrets.DOCKER_PASSWORD }} 29 | - name: Build and push docker image 30 | uses: docker/build-push-action@v6 31 | with: 32 | platforms: linux/amd64,linux/arm64 33 | pull: true 34 | push: true 35 | tags: | 36 | ${{ env.IMAGE_REPOSITORY }}:latest 37 | -------------------------------------------------------------------------------- /metrics/metrics_test.go: -------------------------------------------------------------------------------- 1 | package metrics 2 | 3 | import ( 4 | "bytes" 5 | "testing" 6 | 7 | "github.com/prometheus/client_golang/prometheus" 8 | "github.com/prometheus/client_golang/prometheus/testutil" 9 | ) 10 | 11 | func TestMetricServer(t *testing.T) { 12 | 13 | Server.NodeGroups.WithLabelValues().Set(5) 14 | Server.Errors.Add(2) 15 | Server.ScaledUpNodes.WithLabelValues("nodeg-1").Inc() 16 | Server.ScaledUpNodes.WithLabelValues("nodeg-2").Inc() 17 | Server.ScaledDownNodes.WithLabelValues("nodeg-1").Inc() 18 | Server.ScaledDownNodes.WithLabelValues("nodeg-2").Inc() 19 | Server.OutdatedNodes.WithLabelValues("nodeg-1").Set(1) 20 | Server.OutdatedNodes.WithLabelValues("nodeg-2").Set(1) 21 | Server.UpdatedNodes.WithLabelValues("nodeg-1").Set(1) 22 | Server.UpdatedNodes.WithLabelValues("nodeg-2").Set(1) 23 | Server.DrainedNodes.WithLabelValues("nodeg-1").Inc() 24 | Server.DrainedNodes.WithLabelValues("nodeg-2").Inc() 25 | 26 | err := testutil.GatherAndCompare(prometheus.Gatherers{Server.registry}, bytes.NewBufferString(` 27 | # HELP rolling_update_handler_drained_nodes_total The total number of drained nodes 28 | # TYPE rolling_update_handler_drained_nodes_total counter 29 | rolling_update_handler_drained_nodes_total{node_group="nodeg-1"} 1 30 | rolling_update_handler_drained_nodes_total{node_group="nodeg-2"} 1 31 | # HELP rolling_update_handler_errors The total number of errors 32 | # TYPE rolling_update_handler_errors counter 33 | 
rolling_update_handler_errors 2 34 | # HELP rolling_update_handler_node_groups The total number of node groups managed 35 | # TYPE rolling_update_handler_node_groups gauge 36 | rolling_update_handler_node_groups 5 37 | # HELP rolling_update_handler_outdated_nodes The number of outdated nodes 38 | # TYPE rolling_update_handler_outdated_nodes gauge 39 | rolling_update_handler_outdated_nodes{node_group="nodeg-1"} 1 40 | rolling_update_handler_outdated_nodes{node_group="nodeg-2"} 1 41 | # HELP rolling_update_handler_scaled_down_nodes The total number of nodes scaled down 42 | # TYPE rolling_update_handler_scaled_down_nodes counter 43 | rolling_update_handler_scaled_down_nodes{node_group="nodeg-1"} 1 44 | rolling_update_handler_scaled_down_nodes{node_group="nodeg-2"} 1 45 | # HELP rolling_update_handler_scaled_up_nodes The total number of nodes scaled up 46 | # TYPE rolling_update_handler_scaled_up_nodes counter 47 | rolling_update_handler_scaled_up_nodes{node_group="nodeg-1"} 1 48 | rolling_update_handler_scaled_up_nodes{node_group="nodeg-2"} 1 49 | # HELP rolling_update_handler_updated_nodes The number of updated nodes 50 | # TYPE rolling_update_handler_updated_nodes gauge 51 | rolling_update_handler_updated_nodes{node_group="nodeg-1"} 1 52 | rolling_update_handler_updated_nodes{node_group="nodeg-2"} 1 53 | `)) 54 | 55 | if err != nil { 56 | t.Errorf("Expected no errors but got: %v", err) 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /metrics/metrics.go: -------------------------------------------------------------------------------- 1 | package metrics 2 | 3 | import ( 4 | "net/http" 5 | "reflect" 6 | "strconv" 7 | 8 | "github.com/prometheus/client_golang/prometheus" 9 | "github.com/prometheus/client_golang/prometheus/promhttp" 10 | ) 11 | 12 | var ( 13 | namespace = "rolling_update_handler" 14 | Server *metricServer 15 | ) 16 | 17 | type metricServer struct { 18 | registry *prometheus.Registry 19 | 20 | NodeGroups *prometheus.GaugeVec 21 | OutdatedNodes *prometheus.GaugeVec 22 | UpdatedNodes *prometheus.GaugeVec 23 | ScaledUpNodes *prometheus.CounterVec 24 | ScaledDownNodes *prometheus.CounterVec 25 | DrainedNodes *prometheus.CounterVec 26 | Errors prometheus.Counter 27 | } 28 | 29 | func init() { 30 | Server = newMetricServer() 31 | } 32 | 33 | func newMetricServer() *metricServer { 34 | m := &metricServer{ 35 | registry: prometheus.NewPedanticRegistry(), 36 | NodeGroups: prometheus.NewGaugeVec(prometheus.GaugeOpts{ 37 | Namespace: namespace, 38 | Name: "node_groups", 39 | Help: "The total number of node groups managed"}, 40 | []string{}), 41 | OutdatedNodes: prometheus.NewGaugeVec(prometheus.GaugeOpts{ 42 | Namespace: namespace, 43 | Name: "outdated_nodes", 44 | Help: "The number of outdated nodes", 45 | }, []string{"node_group"}), 46 | UpdatedNodes: prometheus.NewGaugeVec(prometheus.GaugeOpts{ 47 | Namespace: namespace, 48 | Name: "updated_nodes", 49 | Help: "The number of updated nodes", 50 | }, []string{"node_group"}), 51 | ScaledUpNodes: prometheus.NewCounterVec(prometheus.CounterOpts{ 52 | Namespace: namespace, 53 | Name: "scaled_up_nodes", 54 | Help: "The total number of nodes scaled up", 55 | }, []string{"node_group"}), 56 | ScaledDownNodes: prometheus.NewCounterVec(prometheus.CounterOpts{ 57 | Namespace: namespace, 58 | Name: "scaled_down_nodes", 59 | Help: "The total number of nodes scaled down", 60 | }, []string{"node_group"}), 61 | DrainedNodes: prometheus.NewCounterVec(prometheus.CounterOpts{ 62 | Namespace: namespace, 63 | 
Name: "drained_nodes_total", 64 | Help: "The total number of drained nodes", 65 | }, []string{"node_group"}), 66 | Errors: prometheus.NewCounter(prometheus.CounterOpts{ 67 | Namespace: namespace, 68 | Name: "errors", 69 | Help: "The total number of errors", 70 | }), 71 | } 72 | m.register() 73 | return m 74 | } 75 | 76 | func (m *metricServer) register() { 77 | v := reflect.ValueOf(*m) 78 | for i := 0; i < v.NumField(); i++ { 79 | if v.Field(i).CanInterface() { 80 | if metric, ok := v.Field(i).Interface().(prometheus.Collector); ok { 81 | m.registry.MustRegister(metric) 82 | } 83 | } 84 | } 85 | } 86 | 87 | func (m *metricServer) Listen(port int) error { 88 | gatherers := prometheus.Gatherers{prometheus.DefaultGatherer, m.registry} 89 | http.Handle("/metrics", promhttp.HandlerFor(gatherers, promhttp.HandlerOpts{})) 90 | return http.ListenAndServe(":"+strconv.Itoa(port), nil) 91 | } 92 | -------------------------------------------------------------------------------- /config/config_test.go: -------------------------------------------------------------------------------- 1 | package config 2 | 3 | import ( 4 | "os" 5 | "reflect" 6 | "testing" 7 | ) 8 | 9 | func TestInitialize(t *testing.T) { 10 | _ = os.Setenv(EnvAutoScalingGroupNames, "asg-a,asg-b,asg-c") 11 | _ = os.Setenv(EnvIgnoreDaemonSets, "false") 12 | _ = os.Setenv(EnvDeleteLocalData, "false") 13 | _ = os.Setenv(EnvSlowMode, "true") 14 | defer os.Clearenv() 15 | _ = Initialize() 16 | config := Get() 17 | if len(config.AutoScalingGroupNames) != 3 { 18 | t.Error() 19 | } 20 | if config.IgnoreDaemonSets { 21 | t.Error("IgnoreDaemonSets should be false") 22 | } 23 | if config.DeleteEmptyDirData { 24 | t.Error("DeleteEmptyDirData should be false") 25 | } 26 | if !config.SlowMode { 27 | t.Error("SlowMode should be true") 28 | } 29 | } 30 | 31 | func TestInitialize_withDefaultNonRequiredValues(t *testing.T) { 32 | _ = os.Setenv(EnvAutoScalingGroupNames, "asg-a,asg-b,asg-c") 33 | defer os.Clearenv() 34 | _ = Initialize() 35 | config := Get() 36 | if len(config.AutoScalingGroupNames) != 3 { 37 | t.Error() 38 | } 39 | if !config.IgnoreDaemonSets { 40 | t.Error("should've defaulted to ignoring daemon sets") 41 | } 42 | if !config.DeleteEmptyDirData { 43 | t.Error("should've defaulted to deleting local data") 44 | } 45 | if config.SlowMode { 46 | t.Error("SlowMode should be false") 47 | } 48 | } 49 | 50 | func TestInitialize_withMissingRequiredValues(t *testing.T) { 51 | if err := Initialize(); err == nil { 52 | t.Error("expected error because required environment variables are missing") 53 | } 54 | } 55 | 56 | func TestSet(t *testing.T) { 57 | Set([]string{"asg-a", "asg-b", "asg-c"}, true, true, false, false) 58 | config := Get() 59 | if len(config.AutoScalingGroupNames) != 3 { 60 | t.Error() 61 | } 62 | if !config.IgnoreDaemonSets { 63 | t.Error() 64 | } 65 | if !config.DeleteEmptyDirData { 66 | t.Error() 67 | } 68 | } 69 | 70 | func TestInitialize_withClusterName(t *testing.T) { 71 | _ = os.Setenv(EnvClusterName, "foo") 72 | _ = os.Setenv(EnvAutodiscoveryTags, "foo=bar") 73 | _ = os.Setenv(EnvAutoScalingGroupNames, "foo,bar") 74 | defer os.Clearenv() 75 | _ = Initialize() 76 | config := Get() 77 | if config.AutodiscoveryTags != "k8s.io/cluster-autoscaler/foo=owned,k8s.io/cluster-autoscaler/enabled=true" { 78 | t.Error() 79 | } else if len(config.AutoScalingGroupNames) != 0 { 80 | t.Error() 81 | } 82 | } 83 | 84 | func TestInitialize_withAutodiscoveryTags(t *testing.T) { 85 | _ = os.Unsetenv(EnvClusterName) 86 | _ = 
os.Setenv(EnvAutodiscoveryTags, "foo=bar,foobar=true") 87 | _ = os.Setenv(EnvAutoScalingGroupNames, "foo,bar") 88 | defer os.Clearenv() 89 | _ = Initialize() 90 | config := Get() 91 | if config.AutodiscoveryTags != "foo=bar,foobar=true" { 92 | t.Error() 93 | } else if len(config.AutoScalingGroupNames) != 0 { 94 | t.Error() 95 | } 96 | } 97 | 98 | func TestInitialize_withAutoScalingGroupNames(t *testing.T) { 99 | _ = os.Unsetenv(EnvClusterName) 100 | _ = os.Unsetenv(EnvAutodiscoveryTags) 101 | _ = os.Setenv(EnvAutoScalingGroupNames, "foo,bar") 102 | defer os.Clearenv() 103 | _ = Initialize() 104 | config := Get() 105 | if !reflect.DeepEqual(config.AutoScalingGroupNames, []string{"foo", "bar"}) { 106 | t.Error() 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /cloud/aws_test.go: -------------------------------------------------------------------------------- 1 | package cloud_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/TwiN/aws-eks-asg-rolling-update-handler/cloud" 7 | "github.com/TwiN/aws-eks-asg-rolling-update-handler/cloudtest" 8 | "github.com/aws/aws-sdk-go/service/autoscaling" 9 | ) 10 | 11 | func TestDescribeEnabledAutoScalingGroupsByTags(t *testing.T) { 12 | type testCase struct { 13 | autoScalingGroups []struct { 14 | name string 15 | tags map[string]string 16 | } 17 | inputTags string 18 | name string 19 | outputNames []string 20 | } 21 | 22 | testCases := []testCase{ 23 | { 24 | name: "match foo but not bar", 25 | inputTags: "foo=bar", 26 | outputNames: []string{"foo"}, 27 | autoScalingGroups: []struct { 28 | name string 29 | tags map[string]string 30 | }{ 31 | { 32 | name: "bar", 33 | tags: map[string]string{ 34 | "bar": "foo", 35 | }, 36 | }, 37 | { 38 | name: "foo", 39 | tags: map[string]string{ 40 | "foo": "bar", 41 | }, 42 | }, 43 | }, 44 | }, 45 | { 46 | name: "match foo and bar", 47 | inputTags: "foo=bar", 48 | outputNames: []string{"foo", "bar"}, 49 | autoScalingGroups: []struct { 50 | name string 51 | tags map[string]string 52 | }{ 53 | { 54 | name: "bar", 55 | tags: map[string]string{ 56 | "foo": "bar", 57 | }, 58 | }, 59 | { 60 | name: "foo", 61 | tags: map[string]string{ 62 | "foo": "bar", 63 | }, 64 | }, 65 | }, 66 | }, 67 | { 68 | name: "match foo but not bar with multiple input tags", 69 | inputTags: "foo=bar,foobar=true", 70 | outputNames: []string{"foo"}, 71 | autoScalingGroups: []struct { 72 | name string 73 | tags map[string]string 74 | }{ 75 | { 76 | name: "bar", 77 | tags: map[string]string{ 78 | "bar": "foo", 79 | }, 80 | }, 81 | { 82 | name: "foo", 83 | tags: map[string]string{ 84 | "foo": "bar", 85 | "foobar": "true", 86 | }, 87 | }, 88 | }, 89 | }, 90 | { 91 | name: "match foo and bar with multiple input tags", 92 | inputTags: "foo=bar,foobar=true", 93 | outputNames: []string{"foo", "bar"}, 94 | autoScalingGroups: []struct { 95 | name string 96 | tags map[string]string 97 | }{ 98 | { 99 | name: "bar", 100 | tags: map[string]string{ 101 | "foo": "bar", 102 | "foobar": "true", 103 | }, 104 | }, 105 | { 106 | name: "foo", 107 | tags: map[string]string{ 108 | "foo": "bar", 109 | "foobar": "true", 110 | }, 111 | }, 112 | }, 113 | }, 114 | } 115 | 116 | for _, test := range testCases { 117 | autoScalingGroups := []*autoscaling.Group{} 118 | for i, asg := range test.autoScalingGroups { 119 | autoScalingGroup := autoscaling.Group{AutoScalingGroupName: &test.autoScalingGroups[i].name} 120 | for k, v := range asg.tags { 121 | key := k 122 | value := v 123 | autoScalingGroup.Tags = 
append(autoScalingGroup.Tags, &autoscaling.TagDescription{ 124 | Key: &key, 125 | Value: &value, 126 | }) 127 | } 128 | autoScalingGroups = append(autoScalingGroups, &autoScalingGroup) 129 | } 130 | svc := cloudtest.NewMockAutoScalingService(autoScalingGroups) 131 | output, err := cloud.DescribeEnabledAutoScalingGroupsByTags(svc, test.inputTags) 132 | if err != nil { 133 | t.Error(err) 134 | } 135 | 136 | outMap := map[string]bool{} 137 | for _, outputAutoScalingGroup := range output { 138 | outMap[*outputAutoScalingGroup.AutoScalingGroupName] = false 139 | } 140 | for _, name := range test.outputNames { 141 | if _, ok := outMap[name]; ok { 142 | outMap[name] = true 143 | } else { 144 | t.Errorf("in '%s', expected '%s' to be present in output: %v", test.name, name, output) 145 | } 146 | } 147 | for name, v := range outMap { 148 | if !v { 149 | t.Errorf("in '%s', not expected '%s' to be present in output: %v", test.name, name, output) 150 | } 151 | } 152 | } 153 | } 154 | -------------------------------------------------------------------------------- /k8stest/k8stest.go: -------------------------------------------------------------------------------- 1 | package k8stest 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | 7 | "github.com/aws/aws-sdk-go/aws" 8 | "github.com/aws/aws-sdk-go/service/autoscaling" 9 | v1 "k8s.io/api/core/v1" 10 | "k8s.io/apimachinery/pkg/api/resource" 11 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 12 | ) 13 | 14 | // TODO: replace this by Kubernetes' official fake client (k8s.io/client-go/kubernetes/fake) 15 | 16 | type MockClient struct { 17 | Counter map[string]int64 18 | Nodes map[string]v1.Node 19 | Pods map[string]v1.Pod 20 | } 21 | 22 | func NewMockClient(nodes []v1.Node, pods []v1.Pod) *MockClient { 23 | client := &MockClient{ 24 | Counter: make(map[string]int64), 25 | Nodes: make(map[string]v1.Node), 26 | Pods: make(map[string]v1.Pod), 27 | } 28 | for _, node := range nodes { 29 | client.Nodes[node.Name] = node 30 | } 31 | for _, pod := range pods { 32 | client.Pods[pod.Name] = pod 33 | } 34 | return client 35 | } 36 | 37 | func (mock *MockClient) GetNodes() ([]v1.Node, error) { 38 | mock.Counter["GetNodes"]++ 39 | var nodes []v1.Node 40 | for _, node := range mock.Nodes { 41 | nodes = append(nodes, node) 42 | } 43 | return nodes, nil 44 | } 45 | 46 | func (mock *MockClient) GetPodsInNode(node string) ([]v1.Pod, error) { 47 | mock.Counter["GetPodsInNode"]++ 48 | var pods []v1.Pod 49 | for _, pod := range mock.Pods { 50 | if pod.Spec.NodeName == node { 51 | pods = append(pods, pod) 52 | } 53 | } 54 | return pods, nil 55 | } 56 | 57 | func (mock *MockClient) GetNodeByAutoScalingInstance(instance *autoscaling.Instance) (*v1.Node, error) { 58 | mock.Counter["GetNodeByAutoScalingInstance"]++ 59 | nodes, _ := mock.GetNodes() 60 | return mock.FilterNodeByAutoScalingInstance(nodes, instance) 61 | } 62 | 63 | func (mock *MockClient) FilterNodeByAutoScalingInstance(nodes []v1.Node, instance *autoscaling.Instance) (*v1.Node, error) { 64 | mock.Counter["FilterNodeByAutoScalingInstance"]++ 65 | for _, node := range nodes { 66 | if node.Spec.ProviderID == fmt.Sprintf("aws:///%s/%s", aws.StringValue(instance.AvailabilityZone), aws.StringValue(instance.InstanceId)) { 67 | return &node, nil 68 | } 69 | } 70 | return nil, errors.New("not found") 71 | } 72 | 73 | func (mock *MockClient) UpdateNode(node *v1.Node) error { 74 | mock.Counter["UpdateNode"]++ 75 | mock.Nodes[node.Name] = *node 76 | return nil 77 | } 78 | 79 | func (mock *MockClient) Cordon(nodeName string) error { 80 
| mock.Counter["Cordon"]++ 81 | return nil 82 | } 83 | 84 | func (mock *MockClient) Drain(nodeName string, ignoreDaemonSets, deleteLocalData bool, podTerminationGracePeriod int) error { 85 | mock.Counter["Drain"]++ 86 | return nil 87 | } 88 | 89 | func CreateTestNode(name, availabilityZone, instanceId, allocatableCpu, allocatableMemory string) v1.Node { 90 | node := v1.Node{ 91 | Spec: v1.NodeSpec{ 92 | ProviderID: fmt.Sprintf("aws:///%s/%s", availabilityZone, instanceId), 93 | }, 94 | Status: v1.NodeStatus{ 95 | Allocatable: map[v1.ResourceName]resource.Quantity{ 96 | v1.ResourceCPU: resource.MustParse(allocatableCpu), 97 | v1.ResourceMemory: resource.MustParse(allocatableMemory), 98 | }, 99 | }, 100 | } 101 | node.SetName(name) 102 | node.SetAnnotations(make(map[string]string)) 103 | node.SetLabels(make(map[string]string)) 104 | return node 105 | } 106 | 107 | func CreateTestPod(name, nodeName, cpuRequest, cpuMemory string, isDaemonSet bool, podPhase v1.PodPhase) v1.Pod { 108 | pod := v1.Pod{ 109 | Spec: v1.PodSpec{ 110 | NodeName: nodeName, 111 | Containers: []v1.Container{{ 112 | Name: name, 113 | Resources: v1.ResourceRequirements{ 114 | Requests: v1.ResourceList{ 115 | v1.ResourceCPU: resource.MustParse(cpuRequest), 116 | v1.ResourceMemory: resource.MustParse(cpuMemory), 117 | }, 118 | }, 119 | }}, 120 | }, 121 | Status: v1.PodStatus{Phase: podPhase}, 122 | } 123 | pod.SetName(name) 124 | if isDaemonSet { 125 | pod.SetOwnerReferences([]metav1.OwnerReference{{Kind: "DaemonSet"}}) 126 | } else { 127 | pod.SetOwnerReferences([]metav1.OwnerReference{{Kind: "ReplicaSet"}}) 128 | } 129 | return pod 130 | } 131 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/TwiN/aws-eks-asg-rolling-update-handler 2 | 3 | go 1.24.4 4 | 5 | require ( 6 | github.com/TwiN/gocache/v2 v2.4.0 7 | github.com/aws/aws-sdk-go v1.55.7 8 | github.com/prometheus/client_golang v1.23.2 9 | k8s.io/api v0.34.3 10 | k8s.io/apimachinery v0.34.3 11 | k8s.io/client-go v0.34.3 12 | k8s.io/kubectl v0.34.3 13 | ) 14 | 15 | require ( 16 | github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 // indirect 17 | github.com/MakeNowJust/heredoc v1.0.0 // indirect 18 | github.com/beorn7/perks v1.0.1 // indirect 19 | github.com/blang/semver/v4 v4.0.0 // indirect 20 | github.com/cespare/xxhash/v2 v2.3.0 // indirect 21 | github.com/chai2010/gettext-go v1.0.2 // indirect 22 | github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect 23 | github.com/emicklei/go-restful/v3 v3.12.2 // indirect 24 | github.com/exponent-io/jsonpath v0.0.0-20210407135951-1de76d718b3f // indirect 25 | github.com/fxamacker/cbor/v2 v2.9.0 // indirect 26 | github.com/go-errors/errors v1.4.2 // indirect 27 | github.com/go-logr/logr v1.4.2 // indirect 28 | github.com/go-openapi/jsonpointer v0.21.0 // indirect 29 | github.com/go-openapi/jsonreference v0.20.2 // indirect 30 | github.com/go-openapi/swag v0.23.0 // indirect 31 | github.com/gogo/protobuf v1.3.2 // indirect 32 | github.com/google/btree v1.1.3 // indirect 33 | github.com/google/gnostic-models v0.7.0 // indirect 34 | github.com/google/uuid v1.6.0 // indirect 35 | github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect 36 | github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 // indirect 37 | github.com/inconshreveable/mousetrap v1.1.0 // indirect 38 | github.com/jmespath/go-jmespath v0.4.0 // indirect 39 | 
github.com/josharian/intern v1.0.0 // indirect 40 | github.com/json-iterator/go v1.1.12 // indirect 41 | github.com/kylelemons/godebug v1.1.0 // indirect 42 | github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de // indirect 43 | github.com/mailru/easyjson v0.7.7 // indirect 44 | github.com/mitchellh/go-wordwrap v1.0.1 // indirect 45 | github.com/moby/spdystream v0.5.0 // indirect 46 | github.com/moby/term v0.5.0 // indirect 47 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect 48 | github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect 49 | github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 // indirect 50 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect 51 | github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect 52 | github.com/peterbourgon/diskv v2.0.1+incompatible // indirect 53 | github.com/pkg/errors v0.9.1 // indirect 54 | github.com/prometheus/client_model v0.6.2 // indirect 55 | github.com/prometheus/common v0.66.1 // indirect 56 | github.com/prometheus/procfs v0.16.1 // indirect 57 | github.com/russross/blackfriday/v2 v2.1.0 // indirect 58 | github.com/spf13/cobra v1.9.1 // indirect 59 | github.com/spf13/pflag v1.0.6 // indirect 60 | github.com/x448/float16 v0.8.4 // indirect 61 | github.com/xlab/treeprint v1.2.0 // indirect 62 | go.yaml.in/yaml/v2 v2.4.2 // indirect 63 | go.yaml.in/yaml/v3 v3.0.4 // indirect 64 | golang.org/x/net v0.43.0 // indirect 65 | golang.org/x/oauth2 v0.30.0 // indirect 66 | golang.org/x/sync v0.16.0 // indirect 67 | golang.org/x/sys v0.35.0 // indirect 68 | golang.org/x/term v0.34.0 // indirect 69 | golang.org/x/text v0.28.0 // indirect 70 | golang.org/x/time v0.9.0 // indirect 71 | google.golang.org/protobuf v1.36.8 // indirect 72 | gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect 73 | gopkg.in/inf.v0 v0.9.1 // indirect 74 | gopkg.in/yaml.v3 v3.0.1 // indirect 75 | k8s.io/cli-runtime v0.34.3 // indirect 76 | k8s.io/component-base v0.34.3 // indirect 77 | k8s.io/klog/v2 v2.130.1 // indirect 78 | k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b // indirect 79 | k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 // indirect 80 | sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect 81 | sigs.k8s.io/kustomize/api v0.20.1 // indirect 82 | sigs.k8s.io/kustomize/kyaml v0.20.1 // indirect 83 | sigs.k8s.io/randfill v1.0.0 // indirect 84 | sigs.k8s.io/structured-merge-diff/v6 v6.3.0 // indirect 85 | sigs.k8s.io/yaml v1.6.0 // indirect 86 | ) 87 | -------------------------------------------------------------------------------- /k8s/util.go: -------------------------------------------------------------------------------- 1 | package k8s 2 | 3 | import ( 4 | "log" 5 | 6 | "github.com/aws/aws-sdk-go/service/autoscaling" 7 | v1 "k8s.io/api/core/v1" 8 | ) 9 | 10 | // CheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode calculates the resources available in the target nodes 11 | // and compares them with the resources that would be required if the old node were to be drained 12 | // 13 | // This is not foolproof: 2 targetNodes with 1G available in each would cause the assumption that you can fit 14 | // a 2G pod in the targetNodes when you obviously can't (you'd need 1 node with 2G available, not 2 with 1G) 15 | // That's alright, because the purpose is to provide a smooth rolling upgrade, not a flawless experience, and 16 | // while the latter is definitely possible, it would slow down the process by quite a 
bit. In a way, this is 17 | // the beauty of co-existing with the cluster autoscaler; an extra node will be spun up to handle the leftovers, 18 | // if any. 19 | func CheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode(client ClientAPI, oldNode *v1.Node, targetNodes []*v1.Node) bool { 20 | totalAvailableTargetCPU := int64(0) 21 | totalAvailableTargetMemory := int64(0) 22 | // Get resources available in target nodes 23 | for _, targetNode := range targetNodes { 24 | availableTargetCPU := targetNode.Status.Allocatable.Cpu().MilliValue() 25 | availableTargetMemory := targetNode.Status.Allocatable.Memory().MilliValue() 26 | podsInNode, err := client.GetPodsInNode(targetNode.Name) 27 | if err != nil { 28 | continue 29 | } 30 | for _, podInNode := range podsInNode { 31 | // Skip pods that have terminated (e.g. "Evicted" pods that haven't been cleaned up) 32 | if podInNode.Status.Phase == v1.PodFailed { 33 | continue 34 | } 35 | for _, container := range podInNode.Spec.Containers { 36 | if container.Resources.Requests.Cpu() != nil { 37 | // Subtract the cpu request of the pod from the node's total allocatable cpu 38 | availableTargetCPU -= container.Resources.Requests.Cpu().MilliValue() 39 | } 40 | if container.Resources.Requests.Memory() != nil { 41 | // Subtract the memory request of the pod from the node's total allocatable memory 42 | totalAvailableTargetMemory -= container.Resources.Requests.Memory().MilliValue() 43 | } 44 | } 45 | } 46 | totalAvailableTargetCPU += availableTargetCPU 47 | totalAvailableTargetMemory += availableTargetMemory 48 | } 49 | cpuNeeded := int64(0) 50 | memoryNeeded := int64(0) 51 | // Get resources requested in old node 52 | podsInNode, err := client.GetPodsInNode(oldNode.Name) 53 | if err != nil { 54 | log.Printf("Unable to determine resources needed for old node, assuming that enough resources are available") 55 | return true 56 | } 57 | for _, podInNode := range podsInNode { 58 | // Skip pods that have terminated (e.g. 
"Evicted" pods that haven't been cleaned up) 59 | if podInNode.Status.Phase == v1.PodFailed { 60 | continue 61 | } 62 | // Ignore DaemonSets in the old node, because these pods will also be present in the target nodes 63 | hasDaemonSetOwnerReference := false 64 | for _, owner := range podInNode.GetOwnerReferences() { 65 | if owner.Kind == "DaemonSet" { 66 | hasDaemonSetOwnerReference = true 67 | break 68 | } 69 | } 70 | if hasDaemonSetOwnerReference { 71 | continue 72 | } 73 | for _, container := range podInNode.Spec.Containers { 74 | if container.Resources.Requests.Cpu() != nil { 75 | // Subtract the cpu request of the pod from the node's total allocatable 76 | cpuNeeded += container.Resources.Requests.Cpu().MilliValue() 77 | } 78 | if container.Resources.Requests.Memory() != nil { 79 | // Subtract the cpu request of the pod from the node's total allocatable 80 | memoryNeeded += container.Resources.Requests.Memory().MilliValue() 81 | } 82 | } 83 | } 84 | leftOverCPU := totalAvailableTargetCPU - cpuNeeded 85 | leftOverMemory := totalAvailableTargetMemory - memoryNeeded 86 | return leftOverCPU >= 0 && leftOverMemory >= 0 87 | } 88 | 89 | // AnnotateNodeByAutoScalingInstance adds an annotation to the Kubernetes node represented by a given AWS instance 90 | func AnnotateNodeByAutoScalingInstance(client ClientAPI, instance *autoscaling.Instance, key, value string) error { 91 | node, err := client.GetNodeByAutoScalingInstance(instance) 92 | if err != nil { 93 | return err 94 | } 95 | annotations := node.GetAnnotations() 96 | if currentValue := annotations[key]; currentValue != value { 97 | annotations[key] = value 98 | node.SetAnnotations(annotations) 99 | err = client.UpdateNode(node) 100 | if err != nil { 101 | return err 102 | } 103 | } 104 | return nil 105 | } 106 | 107 | // LabelNodeByAutoScalingInstance adds a Label to the Kubernetes node represented by a given AWS instance 108 | func LabelNodeByAutoScalingInstance(client ClientAPI, instance *autoscaling.Instance, key, value string) error { 109 | node, err := client.GetNodeByAutoScalingInstance(instance) 110 | if err != nil { 111 | return err 112 | } 113 | labels := node.GetLabels() 114 | if currentValue := labels[key]; currentValue != value { 115 | labels[key] = value 116 | node.SetLabels(labels) 117 | err = client.UpdateNode(node) 118 | if err != nil { 119 | return err 120 | } 121 | } 122 | return nil 123 | } 124 | -------------------------------------------------------------------------------- /cloudtest/cloudtest.go: -------------------------------------------------------------------------------- 1 | package cloudtest 2 | 3 | import ( 4 | "errors" 5 | 6 | "github.com/aws/aws-sdk-go/aws" 7 | "github.com/aws/aws-sdk-go/service/autoscaling" 8 | "github.com/aws/aws-sdk-go/service/autoscaling/autoscalingiface" 9 | "github.com/aws/aws-sdk-go/service/ec2" 10 | "github.com/aws/aws-sdk-go/service/ec2/ec2iface" 11 | ) 12 | 13 | type MockEC2Service struct { 14 | ec2iface.EC2API 15 | 16 | Counter map[string]int64 17 | Templates []*ec2.LaunchTemplate 18 | } 19 | 20 | func NewMockEC2Service(templates []*ec2.LaunchTemplate) *MockEC2Service { 21 | return &MockEC2Service{ 22 | Counter: make(map[string]int64), 23 | Templates: templates, 24 | } 25 | } 26 | 27 | func (m *MockEC2Service) DescribeLaunchTemplates(_ *ec2.DescribeLaunchTemplatesInput) (*ec2.DescribeLaunchTemplatesOutput, error) { 28 | m.Counter["DescribeLaunchTemplates"]++ 29 | output := &ec2.DescribeLaunchTemplatesOutput{ 30 | LaunchTemplates: m.Templates, 31 | } 32 | return output, nil 33 | 
} 34 | 35 | func (m *MockEC2Service) DescribeLaunchTemplateByID(input *ec2.DescribeLaunchTemplatesInput) (*ec2.LaunchTemplate, error) { 36 | m.Counter["DescribeLaunchTemplateByID"]++ 37 | for _, template := range m.Templates { 38 | if template.LaunchTemplateId == input.LaunchTemplateIds[0] { 39 | return template, nil 40 | } 41 | if template.LaunchTemplateName == input.LaunchTemplateNames[0] { 42 | return template, nil 43 | } 44 | } 45 | return nil, errors.New("not found") 46 | } 47 | 48 | func CreateTestEc2Instance(id string) *ec2.Instance { 49 | instance := &ec2.Instance{ 50 | InstanceId: aws.String(id), 51 | } 52 | return instance 53 | } 54 | 55 | type MockAutoScalingService struct { 56 | autoscalingiface.AutoScalingAPI 57 | 58 | Counter map[string]int64 59 | AutoScalingGroups map[string]*autoscaling.Group 60 | } 61 | 62 | func NewMockAutoScalingService(autoScalingGroups []*autoscaling.Group) *MockAutoScalingService { 63 | service := &MockAutoScalingService{ 64 | Counter: make(map[string]int64), 65 | AutoScalingGroups: make(map[string]*autoscaling.Group), 66 | } 67 | for _, autoScalingGroup := range autoScalingGroups { 68 | service.AutoScalingGroups[aws.StringValue(autoScalingGroup.AutoScalingGroupName)] = autoScalingGroup 69 | } 70 | return service 71 | } 72 | 73 | func (m *MockAutoScalingService) TerminateInstanceInAutoScalingGroup(_ *autoscaling.TerminateInstanceInAutoScalingGroupInput) (*autoscaling.TerminateInstanceInAutoScalingGroupOutput, error) { 74 | m.Counter["TerminateInstanceInAutoScalingGroup"]++ 75 | return &autoscaling.TerminateInstanceInAutoScalingGroupOutput{}, nil 76 | } 77 | 78 | func (m *MockAutoScalingService) DescribeAutoScalingGroups(input *autoscaling.DescribeAutoScalingGroupsInput) (*autoscaling.DescribeAutoScalingGroupsOutput, error) { 79 | m.Counter["DescribeAutoScalingGroups"]++ 80 | var autoScalingGroups []*autoscaling.Group 81 | for _, autoScalingGroupName := range input.AutoScalingGroupNames { 82 | for _, autoScalingGroup := range m.AutoScalingGroups { 83 | if aws.StringValue(autoScalingGroupName) == aws.StringValue(autoScalingGroup.AutoScalingGroupName) { 84 | autoScalingGroups = append(autoScalingGroups, autoScalingGroup) 85 | } 86 | } 87 | } 88 | return &autoscaling.DescribeAutoScalingGroupsOutput{ 89 | AutoScalingGroups: autoScalingGroups, 90 | }, nil 91 | } 92 | 93 | func (m *MockAutoScalingService) DescribeAutoScalingGroupsPages(input *autoscaling.DescribeAutoScalingGroupsInput, f func(*autoscaling.DescribeAutoScalingGroupsOutput, bool) bool) error { 94 | idx := 0 95 | for _, asg := range m.AutoScalingGroups { 96 | x := &autoscaling.DescribeAutoScalingGroupsOutput{AutoScalingGroups: []*autoscaling.Group{asg}} 97 | idx++ 98 | f(x, idx == len(m.AutoScalingGroups)) 99 | } 100 | return nil 101 | } 102 | 103 | func (m *MockAutoScalingService) SetDesiredCapacity(input *autoscaling.SetDesiredCapacityInput) (*autoscaling.SetDesiredCapacityOutput, error) { 104 | m.Counter["SetDesiredCapacity"]++ 105 | m.AutoScalingGroups[aws.StringValue(input.AutoScalingGroupName)].SetDesiredCapacity(aws.Int64Value(input.DesiredCapacity)) 106 | return &autoscaling.SetDesiredCapacityOutput{}, nil 107 | } 108 | 109 | func (m *MockAutoScalingService) UpdateAutoScalingGroup(_ *autoscaling.UpdateAutoScalingGroupInput) (*autoscaling.UpdateAutoScalingGroupOutput, error) { 110 | m.Counter["UpdateAutoScalingGroup"]++ 111 | return &autoscaling.UpdateAutoScalingGroupOutput{}, nil 112 | } 113 | 114 | func CreateTestAutoScalingGroup(name, launchConfigurationName string, 
launchTemplateSpecification *autoscaling.LaunchTemplateSpecification, instances []*autoscaling.Instance, withMixedInstancesPolicy bool) *autoscaling.Group { 115 | asg := &autoscaling.Group{ 116 | AutoScalingGroupName: aws.String(name), 117 | Instances: instances, 118 | DesiredCapacity: aws.Int64(int64(len(instances))), 119 | MinSize: aws.Int64(0), 120 | MaxSize: aws.Int64(999), 121 | } 122 | if len(launchConfigurationName) != 0 { 123 | asg.SetLaunchConfigurationName(launchConfigurationName) 124 | } 125 | if withMixedInstancesPolicy { 126 | asg.SetMixedInstancesPolicy(&autoscaling.MixedInstancesPolicy{ 127 | LaunchTemplate: &autoscaling.LaunchTemplate{ 128 | LaunchTemplateSpecification: launchTemplateSpecification, 129 | Overrides: []*autoscaling.LaunchTemplateOverrides{ 130 | {InstanceType: aws.String("c5.2xlarge")}, 131 | {InstanceType: aws.String("c5n.2xlarge")}, 132 | {InstanceType: aws.String("c5d.2xlarge")}, 133 | }, 134 | }, 135 | }) 136 | } else { 137 | if launchTemplateSpecification != nil { 138 | asg.SetLaunchTemplate(launchTemplateSpecification) 139 | } 140 | } 141 | return asg 142 | } 143 | 144 | func CreateTestAutoScalingInstance(id, launchConfigurationName string, launchTemplateSpecification *autoscaling.LaunchTemplateSpecification, lifeCycleState string) *autoscaling.Instance { 145 | instance := &autoscaling.Instance{ 146 | LifecycleState: aws.String(lifeCycleState), 147 | InstanceId: aws.String(id), 148 | InstanceType: aws.String("c5.2xlarge"), 149 | } 150 | if len(launchConfigurationName) != 0 { 151 | instance.SetLaunchConfigurationName(launchConfigurationName) 152 | } 153 | if launchTemplateSpecification != nil { 154 | instance.SetLaunchTemplate(launchTemplateSpecification) 155 | } 156 | return instance 157 | } 158 | -------------------------------------------------------------------------------- /cloud/aws.go: -------------------------------------------------------------------------------- 1 | package cloud 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "strings" 7 | 8 | "github.com/aws/aws-sdk-go/aws" 9 | "github.com/aws/aws-sdk-go/aws/session" 10 | "github.com/aws/aws-sdk-go/service/autoscaling" 11 | "github.com/aws/aws-sdk-go/service/autoscaling/autoscalingiface" 12 | "github.com/aws/aws-sdk-go/service/ec2" 13 | "github.com/aws/aws-sdk-go/service/ec2/ec2iface" 14 | ) 15 | 16 | var ( 17 | ErrCannotIncreaseDesiredCountAboveMax = errors.New("cannot increase ASG desired size above max ASG size") 18 | ) 19 | 20 | // GetServices returns an instance of a EC2 client with a session as well as 21 | // an instance of an Autoscaling client with a session 22 | func GetServices(awsRegion string) (ec2iface.EC2API, autoscalingiface.AutoScalingAPI, error) { 23 | awsSession, err := session.NewSession(&aws.Config{Region: aws.String(awsRegion)}) 24 | if err != nil { 25 | return nil, nil, err 26 | } 27 | return ec2.New(awsSession), autoscaling.New(awsSession), nil 28 | } 29 | 30 | func DescribeAutoScalingGroupsByNames(svc autoscalingiface.AutoScalingAPI, names []string) ([]*autoscaling.Group, error) { 31 | input := &autoscaling.DescribeAutoScalingGroupsInput{ 32 | AutoScalingGroupNames: aws.StringSlice(names), 33 | MaxRecords: aws.Int64(100), 34 | } 35 | result, err := svc.DescribeAutoScalingGroups(input) 36 | if err != nil { 37 | return nil, err 38 | } 39 | return result.AutoScalingGroups, nil 40 | } 41 | 42 | func filterAutoScalingGroupsByTag(autoScalingGroups []*autoscaling.Group, filter func([]*autoscaling.TagDescription) bool) (ret []*autoscaling.Group) { 43 | for _, autoScalingGroup := 
range autoScalingGroups { 44 | if filter(autoScalingGroup.Tags) { 45 | ret = append(ret, autoScalingGroup) 46 | } 47 | } 48 | return 49 | } 50 | 51 | // DescribeEnabledAutoScalingGroupsByTags Gets AutoScalingGroups that match the given tags 52 | func DescribeEnabledAutoScalingGroupsByTags(svc autoscalingiface.AutoScalingAPI, autodiscoveryTags string) ([]*autoscaling.Group, error) { 53 | input := &autoscaling.DescribeAutoScalingGroupsInput{} 54 | var result []*autoscaling.Group 55 | err := svc.DescribeAutoScalingGroupsPages(input, func(page *autoscaling.DescribeAutoScalingGroupsOutput, lastPage bool) bool { 56 | tagFilter := func(tagDescriptions []*autoscaling.TagDescription) bool { 57 | var matches []bool 58 | for _, tag := range strings.Split(autodiscoveryTags, ",") { 59 | kv := strings.Split(tag, "=") 60 | match := false 61 | for _, tagDescription := range tagDescriptions { 62 | if aws.StringValue(tagDescription.Key) == kv[0] && aws.StringValue(tagDescription.Value) == kv[1] { 63 | match = true 64 | break 65 | } 66 | } 67 | matches = append(matches, match) 68 | } 69 | for _, match := range matches { 70 | if !match { 71 | return false 72 | } 73 | } 74 | return true 75 | } 76 | result = append(result, filterAutoScalingGroupsByTag(page.AutoScalingGroups, tagFilter)...) 77 | return !lastPage 78 | }) 79 | if err != nil { 80 | return nil, err 81 | } 82 | return result, nil 83 | } 84 | 85 | func DescribeLaunchTemplateByID(svc ec2iface.EC2API, id string) (*ec2.LaunchTemplate, error) { 86 | input := &ec2.DescribeLaunchTemplatesInput{ 87 | LaunchTemplateIds: []*string{ 88 | aws.String(id), 89 | }, 90 | } 91 | return DescribeLaunchTemplate(svc, input) 92 | } 93 | 94 | func DescribeLaunchTemplateByName(svc ec2iface.EC2API, name string) (*ec2.LaunchTemplate, error) { 95 | input := &ec2.DescribeLaunchTemplatesInput{ 96 | LaunchTemplateNames: []*string{ 97 | aws.String(name), 98 | }, 99 | } 100 | return DescribeLaunchTemplate(svc, input) 101 | } 102 | 103 | func DescribeLaunchTemplate(svc ec2iface.EC2API, input *ec2.DescribeLaunchTemplatesInput) (*ec2.LaunchTemplate, error) { 104 | templatesOutput, err := svc.DescribeLaunchTemplates(input) 105 | descriptiveMsg := fmt.Sprintf("%v / %v", aws.StringValueSlice(input.LaunchTemplateIds), aws.StringValueSlice(input.LaunchTemplateNames)) 106 | if err != nil { 107 | return nil, fmt.Errorf("unable to get description for Launch Templates %s: %v", descriptiveMsg, err) 108 | } 109 | if len(templatesOutput.LaunchTemplates) < 1 { 110 | return nil, nil 111 | } 112 | return templatesOutput.LaunchTemplates[0], nil 113 | } 114 | 115 | // IncrementAutoScalingGroupDesiredCount retrieves the latest definition of the ASG and increments its current 116 | // desired capacity by 1. The reason why we retrieve the ASG again even though we already have it is to avoid a 117 | // scenario in which the ASG had already been scaled up or down since the last time it was retrieved. 118 | // See https://github.com/TwiN/aws-eks-asg-rolling-update-handler/issues/129 for more information. 119 | func IncrementAutoScalingGroupDesiredCount(svc autoscalingiface.AutoScalingAPI, autoScalingGroupName string) error { 120 | latestASGs, err := DescribeAutoScalingGroupsByNames(svc, []string{autoScalingGroupName}) 121 | if err != nil { 122 | return fmt.Errorf("failed to retrieve latest asg with name '%s': %w", autoScalingGroupName, err) 123 | } 124 | if len(latestASGs) != 1 { 125 | // ASG names are unique per region and account, so if there isn't exactly one ASG, there's a problem. 
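// A worked example of the capacity guard a few lines below (the numbers are hypothetical): if the freshly
// fetched ASG reports DesiredCapacity=3 and MaxSize=3, newDesiredCapacity becomes 4, which exceeds MaxSize,
// so ErrCannotIncreaseDesiredCountAboveMax is returned without calling SetDesiredCapacity; with MaxSize=5,
// the same call would request a desired capacity of 4 with HonorCooldown set to true.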
126 | return errors.New("failed to retrieve latest asg with name: " + autoScalingGroupName) 127 | } 128 | asg := latestASGs[0] 129 | newDesiredCapacity := aws.Int64Value(asg.DesiredCapacity) + 1 130 | if newDesiredCapacity > aws.Int64Value(asg.MaxSize) { 131 | return ErrCannotIncreaseDesiredCountAboveMax 132 | } 133 | desiredInput := &autoscaling.SetDesiredCapacityInput{ 134 | AutoScalingGroupName: asg.AutoScalingGroupName, 135 | DesiredCapacity: aws.Int64(newDesiredCapacity), 136 | HonorCooldown: aws.Bool(true), 137 | } 138 | _, err = svc.SetDesiredCapacity(desiredInput) 139 | if err != nil { 140 | return fmt.Errorf("unable to increase ASG %s desired count to %d: %w", autoScalingGroupName, newDesiredCapacity, err) 141 | } 142 | return nil 143 | } 144 | 145 | func TerminateEc2Instance(svc autoscalingiface.AutoScalingAPI, instance *autoscaling.Instance, shouldDecrementDesiredCapacity bool) error { 146 | _, err := svc.TerminateInstanceInAutoScalingGroup(&autoscaling.TerminateInstanceInAutoScalingGroupInput{ 147 | InstanceId: instance.InstanceId, 148 | ShouldDecrementDesiredCapacity: aws.Bool(shouldDecrementDesiredCapacity), 149 | }) 150 | return err 151 | } 152 | -------------------------------------------------------------------------------- /k8s/client.go: -------------------------------------------------------------------------------- 1 | package k8s 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "log" 7 | "time" 8 | 9 | "github.com/TwiN/gocache/v2" 10 | "github.com/aws/aws-sdk-go/aws" 11 | "github.com/aws/aws-sdk-go/service/autoscaling" 12 | v1 "k8s.io/api/core/v1" 13 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 14 | "k8s.io/client-go/kubernetes" 15 | "k8s.io/kubectl/pkg/drain" 16 | ) 17 | 18 | const ( 19 | AnnotationRollingUpdateStartedTimestamp = "aws-eks-asg-rolling-update-handler.twin.sh/started-at" 20 | AnnotationRollingUpdateDrainedTimestamp = "aws-eks-asg-rolling-update-handler.twin.sh/drained-at" 21 | AnnotationRollingUpdateTerminatedTimestamp = "aws-eks-asg-rolling-update-handler.twin.sh/terminated-at" 22 | 23 | LabelExcludeFromExternalLoadBalancers = "node.kubernetes.io/exclude-from-external-load-balancers" 24 | 25 | nodesCacheKey = "nodes" 26 | ) 27 | 28 | var ( 29 | cache = gocache.NewCache().WithMaxSize(1000).WithEvictionPolicy(gocache.LeastRecentlyUsed) 30 | ) 31 | 32 | type ClientAPI interface { 33 | GetNodes() ([]v1.Node, error) 34 | GetPodsInNode(nodeName string) ([]v1.Pod, error) 35 | GetNodeByAutoScalingInstance(instance *autoscaling.Instance) (*v1.Node, error) 36 | FilterNodeByAutoScalingInstance(nodes []v1.Node, instance *autoscaling.Instance) (*v1.Node, error) 37 | UpdateNode(node *v1.Node) error 38 | Cordon(nodeName string) error 39 | Drain(nodeName string, ignoreDaemonSets, deleteEmptyDirData bool, podTerminationGracePeriod int) error 40 | } 41 | 42 | type Client struct { 43 | client kubernetes.Interface 44 | } 45 | 46 | // NewClient creates a new Client 47 | func NewClient(client kubernetes.Interface) *Client { 48 | return &Client{ 49 | client: client, 50 | } 51 | } 52 | 53 | // GetNodes retrieves all nodes from the cluster 54 | func (k *Client) GetNodes() ([]v1.Node, error) { 55 | nodes, exists := cache.Get(nodesCacheKey) 56 | if exists { 57 | if v1Nodes, ok := nodes.([]v1.Node); ok { 58 | // Return cached nodes 59 | return v1Nodes, nil 60 | } else { 61 | log.Println("[k8s.GetNodes] Failed to cast cached nodes to []v1.Node; retrieving nodes from API instead") 62 | cache.Delete(nodesCacheKey) 63 | } 64 | } 65 | nodeList, err := 
k.client.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{}) 66 | if err != nil { 67 | return nil, err 68 | } 69 | cache.SetWithTTL(nodesCacheKey, nodeList.Items, 10*time.Second) 70 | return nodeList.Items, nil 71 | } 72 | 73 | // GetPodsInNode retrieves all pods from a given node 74 | func (k *Client) GetPodsInNode(node string) ([]v1.Pod, error) { 75 | podList, err := k.client.CoreV1().Pods("").List(context.TODO(), metav1.ListOptions{ 76 | FieldSelector: "spec.nodeName=" + node, 77 | ResourceVersion: "0", 78 | }) 79 | if err != nil { 80 | return nil, err 81 | } 82 | return podList.Items, nil 83 | } 84 | 85 | // GetNodeByAutoScalingInstance gets the Kubernetes node matching an AWS AutoScaling instance 86 | // Because we cannot filter by spec.providerID, the entire list of nodes is fetched every time 87 | // this function is called 88 | func (k *Client) GetNodeByAutoScalingInstance(instance *autoscaling.Instance) (*v1.Node, error) { 89 | nodes, err := k.GetNodes() 90 | if err != nil { 91 | return nil, err 92 | } 93 | return k.FilterNodeByAutoScalingInstance(nodes, instance) 94 | } 95 | 96 | // FilterNodeByAutoScalingInstance extracts the Kubernetes node belonging to a given AWS instance from a list of nodes 97 | func (k *Client) FilterNodeByAutoScalingInstance(nodes []v1.Node, instance *autoscaling.Instance) (*v1.Node, error) { 98 | providerId := fmt.Sprintf("aws:///%s/%s", aws.StringValue(instance.AvailabilityZone), aws.StringValue(instance.InstanceId)) 99 | for _, node := range nodes { 100 | if node.Spec.ProviderID == providerId { 101 | return &node, nil 102 | } 103 | } 104 | return nil, fmt.Errorf("node with providerID \"%s\" not found", providerId) 105 | } 106 | 107 | // UpdateNode updates a node 108 | func (k *Client) UpdateNode(node *v1.Node) error { 109 | api := k.client.CoreV1().Nodes() 110 | _, err := api.Update(context.TODO(), node, metav1.UpdateOptions{}) 111 | return err 112 | } 113 | 114 | // Cordon disables scheduling new pods onto the given node 115 | func (k *Client) Cordon(nodeName string) error { 116 | node, err := k.client.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{}) 117 | if err != nil { 118 | return err 119 | } 120 | drainer := &drain.Helper{ 121 | Client: k.client, 122 | Ctx: context.TODO(), 123 | } 124 | if err := drain.RunCordonOrUncordon(drainer, node, true); err != nil { 125 | log.Printf("[%s][CORDONER] Failed to cordon node: %v", node.Name, err) 126 | return err 127 | } 128 | return nil 129 | } 130 | 131 | // Drain gracefully deletes all pods from a given node 132 | func (k *Client) Drain(nodeName string, ignoreDaemonSets, deleteEmptyDirData bool, podTerminationGracePeriod int) error { 133 | node, err := k.client.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{}) 134 | if err != nil { 135 | return err 136 | } 137 | drainer := &drain.Helper{ 138 | Client: k.client, 139 | Force: true, // Continue even if there are pods not managed by a ReplicationController, ReplicaSet, Job, DaemonSet or StatefulSet 140 | IgnoreAllDaemonSets: ignoreDaemonSets, 141 | DeleteEmptyDirData: deleteEmptyDirData, 142 | GracePeriodSeconds: podTerminationGracePeriod, 143 | Timeout: 5 * time.Minute, 144 | Ctx: context.TODO(), 145 | Out: drainLogger{NodeName: nodeName}, 146 | ErrOut: drainLogger{NodeName: nodeName}, 147 | OnPodDeletedOrEvicted: func(pod *v1.Pod, usingEviction bool) { 148 | log.Printf("[%s][DRAINER] evicted pod %s/%s", nodeName, pod.Namespace, pod.Name) 149 | }, 150 | } 151 | if !node.Spec.Unschedulable { 152 | // Cordon the node 
if it's not already unschedulable 153 | if err := drain.RunCordonOrUncordon(drainer, node, true); err != nil { 154 | log.Printf("[%s][DRAINER] Failed to cordon node: %v", node.Name, err) 155 | return err 156 | } 157 | } 158 | if err := drain.RunNodeDrain(drainer, node.Name); err != nil { 159 | log.Printf("[%s][DRAINER] Failed to drain node: %v", node.Name, err) 160 | return err 161 | } 162 | return nil 163 | } 164 | 165 | type drainLogger struct { 166 | NodeName string 167 | } 168 | 169 | func (l drainLogger) Write(p []byte) (n int, err error) { 170 | log.Printf("[%s][DRAINER] %s", l.NodeName, string(p)) 171 | return len(p), nil 172 | } 173 | -------------------------------------------------------------------------------- /k8s/util_test.go: -------------------------------------------------------------------------------- 1 | package k8s 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/TwiN/aws-eks-asg-rolling-update-handler/k8stest" 7 | "k8s.io/api/core/v1" 8 | ) 9 | 10 | func TestCheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode(t *testing.T) { 11 | // allocatable cpu & memory aren't used for the old node. 12 | // They're only used by the target nodes (newNode, in this case) to calculate if the leftover resources from moving 13 | // the pods from the old node to the new node are positive (if the leftover is negative, it means there's not enough 14 | // space in the target nodes) 15 | oldNode := k8stest.CreateTestNode("old-node", "us-west-2a", "i-034fa1dfbfd35f8bb", "0m", "0m") 16 | newNode := k8stest.CreateTestNode("new-node-1", "us-west-2b", "i-07550830aef9e4179", "1000m", "1000Mi") 17 | oldNodePod := k8stest.CreateTestPod("old-pod-1", oldNode.Name, "100m", "100Mi", false, v1.PodRunning) 18 | mockClient := k8stest.NewMockClient([]v1.Node{oldNode, newNode}, []v1.Pod{oldNodePod}) 19 | 20 | hasEnoughResources := CheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode(mockClient, &oldNode, []*v1.Node{&newNode}) 21 | if !hasEnoughResources { 22 | t.Error("should've had enough space in node") 23 | } 24 | if mockClient.Counter["GetPodsInNode"] != 2 { 25 | t.Error("GetPodInNode should've been called twice") 26 | } 27 | } 28 | 29 | func TestCheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode_whenNotEnoughSpaceInNewNodes(t *testing.T) { 30 | oldNode := k8stest.CreateTestNode("old-node", "us-west-2a", "i-034fa1dfbfd35f8bb", "0m", "0m") 31 | newNode := k8stest.CreateTestNode("new-node-1", "us-west-2c", "i-0b22d79604221412c", "1000m", "1000Mi") 32 | oldNodePod := k8stest.CreateTestPod("old-pod-1", oldNode.Name, "200m", "200Mi", false, v1.PodRunning) 33 | newNodePod := k8stest.CreateTestPod("new-pod-1", newNode.Name, "900m", "200Mi", false, v1.PodRunning) 34 | mockClient := k8stest.NewMockClient([]v1.Node{oldNode, newNode}, []v1.Pod{oldNodePod, newNodePod}) 35 | 36 | hasEnoughResources := CheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode(mockClient, &oldNode, []*v1.Node{&newNode}) 37 | if hasEnoughResources { 38 | t.Error("shouldn't have had enough space in node") 39 | } 40 | if mockClient.Counter["GetPodsInNode"] != 2 { 41 | t.Error("GetPodInNode should've been called twice") 42 | } 43 | } 44 | 45 | func TestCheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode_withMultiplePods(t *testing.T) { 46 | oldNode := k8stest.CreateTestNode("old-node", "us-west-2c", "i-0b22d79604221412c", "0m", "0m") 47 | newNode := k8stest.CreateTestNode("new-node-1", "us-west-2b", "i-07550830aef9e4179", "1000m", "1000Mi") 48 | oldNodeFirstPod := 
k8stest.CreateTestPod("old-pod-1", oldNode.Name, "300m", "0", false, v1.PodRunning) 49 | oldNodeSecondPod := k8stest.CreateTestPod("old-pod-2", oldNode.Name, "300m", "0", false, v1.PodRunning) 50 | oldNodeThirdPod := k8stest.CreateTestPod("old-pod-3", oldNode.Name, "300m", "0", false, v1.PodRunning) 51 | newNodePod := k8stest.CreateTestPod("new-pod-1", newNode.Name, "200m", "200Mi", false, v1.PodRunning) 52 | mockClient := k8stest.NewMockClient([]v1.Node{oldNode, newNode}, []v1.Pod{oldNodeFirstPod, oldNodeSecondPod, oldNodeThirdPod, newNodePod}) 53 | 54 | hasEnoughResources := CheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode(mockClient, &oldNode, []*v1.Node{&newNode}) 55 | if hasEnoughResources { 56 | t.Error("shouldn't have had enough space in node") 57 | } 58 | if mockClient.Counter["GetPodsInNode"] != 2 { 59 | t.Error("GetPodInNode should've been called twice") 60 | } 61 | } 62 | 63 | func TestCheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode_withMultipleTargetNodes(t *testing.T) { 64 | oldNode := k8stest.CreateTestNode("old-node", "us-west-2b", "i-07550830aef9e4179", "0m", "0m") 65 | firstNewNode := k8stest.CreateTestNode("new-node-1", "us-west-2a", "i-034fa1dfbfd35f8bb", "1000m", "1000Mi") 66 | secondNewNode := k8stest.CreateTestNode("new-node-2", "us-west-2b", "i-0918aff89347cef0c", "1000m", "1000Mi") 67 | oldNodeFirstPod := k8stest.CreateTestPod("old-node-pod-1", oldNode.Name, "500m", "0", false, v1.PodRunning) 68 | oldNodeSecondPod := k8stest.CreateTestPod("old-node-pod-2", oldNode.Name, "500m", "0", false, v1.PodRunning) 69 | oldNodeThirdPod := k8stest.CreateTestPod("old-node-pod-3", oldNode.Name, "500m", "0", false, v1.PodRunning) 70 | mockClient := k8stest.NewMockClient([]v1.Node{oldNode, firstNewNode, secondNewNode}, []v1.Pod{oldNodeFirstPod, oldNodeSecondPod, oldNodeThirdPod}) 71 | 72 | hasEnoughResources := CheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode(mockClient, &oldNode, []*v1.Node{&firstNewNode, &secondNewNode}) 73 | if !hasEnoughResources { 74 | t.Error("should've had enough space in node") 75 | } 76 | if mockClient.Counter["GetPodsInNode"] != 3 { 77 | t.Error("GetPodInNode should've been called thrice") 78 | } 79 | } 80 | 81 | func TestCheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode_withPodsSpreadAcrossMultipleTargetNodes(t *testing.T) { 82 | oldNode := k8stest.CreateTestNode("old-node", "us-west-2a", "i-034fa1dfbfd35f8bb", "0m", "0m") 83 | firstNewNode := k8stest.CreateTestNode("new-node-1", "us-west-2a", "i-07550830aef9e4179", "1000m", "1000Mi") 84 | secondNewNode := k8stest.CreateTestNode("new-node-2", "us-west-2a", "i-0147ad0816c210dae", "1000m", "1000Mi") 85 | firstNewNodePod := k8stest.CreateTestPod("new-node-1-pod-1", oldNode.Name, "0", "300Mi", false, v1.PodRunning) 86 | secondNewNodePod := k8stest.CreateTestPod("new-node-2-pod-1", oldNode.Name, "0", "300Mi", false, v1.PodRunning) 87 | oldNodeFirstPod := k8stest.CreateTestPod("old-node-pod-1", oldNode.Name, "0", "500Mi", false, v1.PodRunning) 88 | oldNodeSecondPod := k8stest.CreateTestPod("old-node-pod-2", oldNode.Name, "0", "500Mi", false, v1.PodRunning) 89 | oldNodeThirdPod := k8stest.CreateTestPod("old-node-pod-3", oldNode.Name, "0", "500Mi", false, v1.PodRunning) 90 | mockClient := k8stest.NewMockClient([]v1.Node{oldNode, firstNewNode, secondNewNode}, []v1.Pod{oldNodeFirstPod, oldNodeSecondPod, oldNodeThirdPod, firstNewNodePod, secondNewNodePod}) 91 | 92 | hasEnoughResources := 
CheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode(mockClient, &oldNode, []*v1.Node{&firstNewNode, &secondNewNode}) 93 | if hasEnoughResources { 94 | t.Error("shouldn't have had enough space in node") 95 | } 96 | if mockClient.Counter["GetPodsInNode"] != 3 { 97 | t.Error("GetPodInNode should've been called thrice") 98 | } 99 | } 100 | 101 | func TestCheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode_withNoTargetNodes(t *testing.T) { 102 | oldNode := k8stest.CreateTestNode("old-node", "us-west-2a", "i-034fa1dfbfd35f8bb", "0m", "0m") 103 | oldNodePod := k8stest.CreateTestPod("old-node-pod-1", oldNode.Name, "500Mi", "500Mi", false, v1.PodRunning) 104 | mockClient := k8stest.NewMockClient([]v1.Node{oldNode}, []v1.Pod{oldNodePod}) 105 | 106 | hasEnoughResources := CheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode(mockClient, &oldNode, []*v1.Node{}) 107 | if hasEnoughResources { 108 | t.Error("there's no target nodes; there definitely shouldn't have been enough space") 109 | } 110 | } 111 | 112 | func TestCheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode_withNoTargetNodesButOldNodeOnlyHasPodsFromDaemonSets(t *testing.T) { 113 | oldNode := k8stest.CreateTestNode("old-node", "us-west-2a", "i-034fa1dfbfd35f8bb", "0m", "0m") 114 | oldNodePod := k8stest.CreateTestPod("old-node-pod-1", oldNode.Name, "500Mi", "500Mi", true, v1.PodRunning) 115 | mockClient := k8stest.NewMockClient([]v1.Node{oldNode}, []v1.Pod{oldNodePod}) 116 | 117 | hasEnoughResources := CheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode(mockClient, &oldNode, []*v1.Node{}) 118 | if !hasEnoughResources { 119 | t.Error("there's no target nodes, but the only pods in the old node are from daemon sets") 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /config/config.go: -------------------------------------------------------------------------------- 1 | package config 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "os" 7 | "strconv" 8 | "strings" 9 | "time" 10 | ) 11 | 12 | var cfg *config 13 | 14 | const ( 15 | EnvEnvironment = "ENVIRONMENT" 16 | EnvDebug = "DEBUG" 17 | EnvIgnoreDaemonSets = "IGNORE_DAEMON_SETS" 18 | EnvDeleteLocalData = "DELETE_LOCAL_DATA" // Deprecated: in favor of DeleteEmptyDirData (DELETE_EMPTY_DIR_DATA) 19 | EnvDeleteEmptyDirData = "DELETE_EMPTY_DIR_DATA" 20 | EnvClusterName = "CLUSTER_NAME" 21 | EnvAutodiscoveryTags = "AUTODISCOVERY_TAGS" 22 | EnvAutoScalingGroupNames = "AUTO_SCALING_GROUP_NAMES" 23 | EnvAwsRegion = "AWS_REGION" 24 | EnvExecutionInterval = "EXECUTION_INTERVAL" 25 | EnvExecutionTimeout = "EXECUTION_TIMEOUT" 26 | EnvPodTerminationGracePeriod = "POD_TERMINATION_GRACE_PERIOD" 27 | EnvMetrics = "METRICS" 28 | EnvMetricsPort = "METRICS_PORT" 29 | EnvSlowMode = "SLOW_MODE" 30 | EnvEagerCordoning = "EAGER_CORDONING" 31 | EnvExcludeFromExternalLoadBalancers = "EXCLUDE_FROM_EXTERNAL_LOAD_BALANCERS" 32 | ) 33 | 34 | type config struct { 35 | Environment string // Optional 36 | Debug bool // Defaults to false 37 | AutoScalingGroupNames []string // Required if AutodiscoveryTags not provided 38 | AutodiscoveryTags string // Required if AutoScalingGroupNames not provided 39 | AwsRegion string // Defaults to us-west-2 40 | IgnoreDaemonSets bool // Defaults to true 41 | DeleteEmptyDirData bool // Defaults to true 42 | ExecutionInterval time.Duration // Defaults to 20s 43 | ExecutionTimeout time.Duration // Defaults to 900s 44 | PodTerminationGracePeriod int // Defaults to -1 45 | Metrics 
bool // Defaults to false 46 | MetricsPort int // Defaults to 8080 47 | SlowMode bool // Defaults to false 48 | EagerCordoning bool // Defaults to false 49 | ExcludeFromExternalLoadBalancers bool // Defaults to false 50 | } 51 | 52 | // Initialize is used to initialize the application's configuration 53 | func Initialize() error { 54 | cfg = &config{ 55 | Environment: strings.ToLower(os.Getenv(EnvEnvironment)), 56 | Debug: strings.ToLower(os.Getenv(EnvDebug)) == "true", 57 | SlowMode: strings.ToLower(os.Getenv(EnvSlowMode)) == "true", 58 | EagerCordoning: strings.ToLower(os.Getenv(EnvEagerCordoning)) == "true", 59 | ExcludeFromExternalLoadBalancers: strings.ToLower(os.Getenv(EnvExcludeFromExternalLoadBalancers)) == "true", 60 | } 61 | if clusterName := os.Getenv(EnvClusterName); len(clusterName) > 0 { 62 | // See "Prerequisites" in https://docs.aws.amazon.com/eks/latest/userguide/autoscaling.html 63 | cfg.AutodiscoveryTags = fmt.Sprintf("k8s.io/cluster-autoscaler/%s=owned,k8s.io/cluster-autoscaler/enabled=true", clusterName) 64 | } else if autodiscoveryTags := os.Getenv(EnvAutodiscoveryTags); len(autodiscoveryTags) > 0 { 65 | cfg.AutodiscoveryTags = autodiscoveryTags 66 | } else if autoScalingGroupNames := os.Getenv(EnvAutoScalingGroupNames); len(autoScalingGroupNames) > 0 { 67 | cfg.AutoScalingGroupNames = strings.Split(strings.TrimSpace(autoScalingGroupNames), ",") 68 | } else { 69 | return fmt.Errorf("environment variables '%s', '%s' or '%s' are not set", EnvAutoScalingGroupNames, EnvClusterName, EnvAutodiscoveryTags) 70 | } 71 | if ignoreDaemonSets := strings.ToLower(os.Getenv(EnvIgnoreDaemonSets)); len(ignoreDaemonSets) == 0 || ignoreDaemonSets == "true" { 72 | cfg.IgnoreDaemonSets = true 73 | } 74 | // if the deprecated EnvDeleteLocalData is set, we need to set EnvDeleteEmptyDirData to its value 75 | if deleteLocalData := strings.ToLower(os.Getenv(EnvDeleteLocalData)); len(deleteLocalData) > 0 { 76 | log.Println("NOTICE: Environment variable '" + EnvDeleteLocalData + "' has been deprecated in favor of '" + EnvDeleteEmptyDirData + "'.") 77 | log.Println("NOTICE: Make sure to update your configuration, as said deprecated environment variable will be removed in a future release.") 78 | if len(os.Getenv(EnvDeleteEmptyDirData)) == 0 { 79 | _ = os.Setenv(EnvDeleteEmptyDirData, deleteLocalData) 80 | } else { 81 | log.Println("WARNING: Both '" + EnvDeleteLocalData + "' and '" + EnvDeleteEmptyDirData + "' are set. 
The former is deprecated, and will be ignored.") 82 | } 83 | } 84 | if deleteEmptyDirData := strings.ToLower(os.Getenv(EnvDeleteEmptyDirData)); len(deleteEmptyDirData) == 0 || deleteEmptyDirData == "true" { 85 | cfg.DeleteEmptyDirData = true 86 | } 87 | if awsRegion := strings.ToLower(os.Getenv(EnvAwsRegion)); len(awsRegion) == 0 { 88 | log.Printf("Environment variable '%s' not specified, defaulting to us-west-2", EnvAwsRegion) 89 | cfg.AwsRegion = "us-west-2" 90 | } else { 91 | cfg.AwsRegion = awsRegion 92 | } 93 | if metricsPort := os.Getenv(EnvMetricsPort); len(metricsPort) == 0 { 94 | log.Printf("Environment variable '%s' not specified, defaulting to 8080", EnvMetricsPort) 95 | cfg.MetricsPort = 8080 96 | } else { 97 | port, err := strconv.Atoi(metricsPort) 98 | if err != nil { 99 | return fmt.Errorf("invalid value for '%s': %s", EnvMetricsPort, err) 100 | } 101 | cfg.MetricsPort = port 102 | } 103 | if metrics := strings.ToLower(os.Getenv(EnvMetrics)); len(metrics) != 0 { 104 | cfg.Metrics = true 105 | } 106 | if executionInterval := os.Getenv(EnvExecutionInterval); len(executionInterval) > 0 { 107 | if interval, err := strconv.Atoi(executionInterval); err != nil { 108 | return fmt.Errorf("environment variable '%s' must be an integer", EnvExecutionInterval) 109 | } else { 110 | cfg.ExecutionInterval = time.Second * time.Duration(interval) 111 | } 112 | } else { 113 | log.Printf("Environment variable '%s' not specified, defaulting to 20 seconds", EnvExecutionInterval) 114 | cfg.ExecutionInterval = time.Second * 20 115 | } 116 | if executionTimeout := os.Getenv(EnvExecutionTimeout); len(executionTimeout) > 0 { 117 | if timeout, err := strconv.Atoi(executionTimeout); err != nil { 118 | return fmt.Errorf("environment variable '%s' must be an integer", EnvExecutionTimeout) 119 | } else { 120 | cfg.ExecutionTimeout = time.Second * time.Duration(timeout) 121 | } 122 | } else { 123 | log.Printf("Environment variable '%s' not specified, defaulting to 900 seconds", EnvExecutionTimeout) 124 | cfg.ExecutionTimeout = time.Second * 900 125 | } 126 | if terminationGracePeriod := os.Getenv(EnvPodTerminationGracePeriod); len(terminationGracePeriod) > 0 { 127 | if gracePeriod, err := strconv.Atoi(terminationGracePeriod); err != nil { 128 | return fmt.Errorf("environment variable '%s' must be an integer", EnvPodTerminationGracePeriod) 129 | } else { 130 | cfg.PodTerminationGracePeriod = gracePeriod 131 | } 132 | } else { 133 | log.Printf("Environment variable '%s' not specified, defaulting to -1 (pod's terminationGracePeriodSeconds)", EnvPodTerminationGracePeriod) 134 | cfg.PodTerminationGracePeriod = -1 135 | } 136 | return nil 137 | } 138 | 139 | // Set sets the application's configuration and is intended to be used for testing purposes. 140 | // See Initialize() for production 141 | func Set(autoScalingGroupNames []string, ignoreDaemonSets, deleteEmptyDirData, eagerCordoning bool, excludeFromExternalLoadBalancers bool) { 142 | cfg = &config{ 143 | AutoScalingGroupNames: autoScalingGroupNames, 144 | IgnoreDaemonSets: ignoreDaemonSets, 145 | DeleteEmptyDirData: deleteEmptyDirData, 146 | EagerCordoning: eagerCordoning, 147 | ExcludeFromExternalLoadBalancers: excludeFromExternalLoadBalancers, 148 | ExecutionInterval: time.Second * 20, 149 | ExecutionTimeout: time.Second * 900, 150 | } 151 | } 152 | 153 | func Get() *config { 154 | if cfg == nil { 155 | log.Println("Config wasn't initialized prior to being called. 
Assuming this is a test.") 156 | Set(nil, true, true, false, false) 157 | } 158 | return cfg 159 | } 160 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # aws-eks-asg-rolling-update-handler 2 | 3 | ![test](https://github.com/TwiN/aws-eks-asg-rolling-update-handler/actions/workflows/test.yml/badge.svg) 4 | [![Go Report Card](https://goreportcard.com/badge/github.com/TwiN/aws-eks-asg-rolling-update-handler)](https://goreportcard.com/report/github.com/TwiN/aws-eks-asg-rolling-update-handler) 5 | [![Docker pulls](https://img.shields.io/docker/pulls/twinproduction/aws-eks-asg-rolling-update-handler.svg)](https://cloud.docker.com/repository/docker/twinproduction/aws-eks-asg-rolling-update-handler) 6 | 7 | This application handles rolling upgrades for AWS ASGs for EKS by replacing outdated nodes with new nodes. 8 | Outdated nodes are defined as nodes whose current configuration does not match their ASG's current launch 9 | template version or launch configuration. 10 | 11 | Inspired by aws-asg-roller, this application only has one purpose: Scale down outdated nodes gracefully. 12 | 13 | Unlike aws-asg-roller, it will not attempt to control the number of nodes at all; it will scale up enough new nodes 14 | to move the pods from the old nodes to the new nodes, and then evict the old nodes. 15 | 16 | It will not adjust the desired size back to its initial desired size like aws-asg-roller does; it simply leaves 17 | everything else up to cluster-autoscaler. 18 | 19 | Note that unlike other solutions, this application actually uses the resource requests to determine how many instances should 20 | be spun up before draining the old nodes. This is much better, because simply using the initial number of instances is 21 | completely useless in the event that the ASG's launch configuration/template update changes the instance type. 22 | 23 | 24 | ## Behavior 25 | 26 | On interval, this application: 27 | 1. Iterates over each ASG discovered by the `CLUSTER_NAME` or `AUTODISCOVERY_TAGS` environment variables, or the ones defined in the `AUTO_SCALING_GROUP_NAMES` environment variable, in that order. 28 | 2. Iterates over each instance of each ASG 29 | 3. Checks if there's any instance with an outdated launch template version 30 | 4.
**If ASG uses MixedInstancesPolicy**, checks if there are any instances with an instance type that isn't part of the list of instance type overrides 31 | 5. Checks if there's any instance with an outdated launch configuration 32 | 6. If any of the conditions defined in steps 3, 4 or 5 are met for any instance, begins the rolling update process for that instance 33 | 34 | The steps of each action are persisted directly on the old nodes via annotations (i.e. when the old node starts rolling out, gets drained, and gets scheduled for termination). 35 | Therefore, this application will not run into any issues if it is restarted, rescheduled or stopped at any point in time. 36 | 37 | 38 | **NOTE**: Ensure that your PodDisruptionBudgets - if you have any - are properly configured. This usually means having at least 1 allowed disruption at all times (i.e. at least `minAvailable: 1` with at least 2 replicas OR `maxUnavailable: 1`) 39 | 40 | 41 | ## Usage 42 | 43 | | Environment variable | Description | Required | Default | 44 | |:-------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------|:------------| 45 | | CLUSTER_NAME | Name of the EKS cluster, used in place of `AUTODISCOVERY_TAGS` and `AUTO_SCALING_GROUP_NAMES`. Checks for `k8s.io/cluster-autoscaler/<CLUSTER_NAME>: owned` and `k8s.io/cluster-autoscaler/enabled: true` tags on the ASG | yes | `""` | 46 | | AUTODISCOVERY_TAGS | Comma-separated key-value string with tags to autodiscover ASGs, used in place of `CLUSTER_NAME` and `AUTO_SCALING_GROUP_NAMES`. | yes | `""` | 47 | | AUTO_SCALING_GROUP_NAMES | Comma-separated list of ASGs; `CLUSTER_NAME` takes priority. | yes | `""` | 48 | | IGNORE_DAEMON_SETS | Whether to ignore DaemonSets when draining the nodes | no | `true` | 49 | | DELETE_EMPTY_DIR_DATA | Whether to delete emptyDir data when draining the nodes | no | `true` | 50 | | AWS_REGION | Self-explanatory | no | `us-west-2` | 51 | | ENVIRONMENT | If set to `dev`, will try to create the Kubernetes client using your local kubeconfig. Any other value will use the in-cluster configuration | no | `""` | 52 | | EXECUTION_INTERVAL | Duration to sleep between each execution in seconds | no | `20` | 53 | | EXECUTION_TIMEOUT | Maximum execution duration before timing out in seconds | no | `900` | 54 | | POD_TERMINATION_GRACE_PERIOD | How long to wait for a pod to terminate in seconds; 0 means "delete immediately"; set to a negative value to use the pod's terminationGracePeriodSeconds. | no | `-1` | 55 | | METRICS_PORT | Port to bind the metrics server to | no | `8080` | 56 | | METRICS | Expose metrics in Prometheus format at `:${METRICS_PORT}/metrics` | no | `""` | 57 | | SLOW_MODE | If enabled, every time a node is terminated during an execution, the current execution will stop rather than continuing to the next ASG | no | `false` | 58 | | EAGER_CORDONING | If enabled, all outdated nodes will get cordoned before any rolling update action. The default mode is to cordon a node just before draining it. See [#41](https://github.com/TwiN/aws-eks-asg-rolling-update-handler/issues/41) for possible consequences of enabling this. | no | `false` | 59 | | EXCLUDE_FROM_EXTERNAL_LOAD_BALANCERS | If enabled, the node label `node.kubernetes.io/exclude-from-external-load-balancers=true` will be added to nodes before draining.
See [#131](https://github.com/TwiN/aws-eks-asg-rolling-update-handler/pull/131) for more information | no | `false` | 60 | 61 | **NOTE:** Only one of `CLUSTER_NAME`, `AUTODISCOVERY_TAGS` or `AUTO_SCALING_GROUP_NAMES` must be set. 62 | 63 | 64 | ## Metrics 65 | 66 | | Metric name | Metric type | Labels | Description | 67 | |--------------------------------------------|-------------|--------------|---------------------------------------| 68 | | rolling_update_handler_node_groups | Gauge | | Node groups managed by the handler | 69 | | rolling_update_handler_outdated_nodes | Gauge | `node_group` | The number of outdated nodes | 70 | | rolling_update_handler_updated_nodes | Gauge | `node_group` | The number of updated nodes | 71 | | rolling_update_handler_scaled_up_nodes | Counter | `node_group` | The total number of nodes scaled up | 72 | | rolling_update_handler_scaled_down_nodes | Counter | `node_group` | The total number of nodes scaled down | 73 | | rolling_update_handler_drained_nodes_total | Counter | `node_group` | The total number of drained nodes | 74 | | rolling_update_handler_errors | Counter | | The total number of errors | 75 | 76 | 77 | ## Permissions 78 | 79 | To function properly, this application requires the following permissions on AWS: 80 | - autoscaling:DescribeAutoScalingGroups 81 | - autoscaling:DescribeAutoScalingInstances 82 | - autoscaling:DescribeLaunchConfigurations 83 | - autoscaling:SetDesiredCapacity 84 | - autoscaling:TerminateInstanceInAutoScalingGroup 85 | - autoscaling:UpdateAutoScalingGroup 86 | - ec2:DescribeLaunchTemplates 87 | - ec2:DescribeInstances 88 | 89 | 90 | ## Deploying on Kubernetes 91 | 92 | ```yaml 93 | apiVersion: v1 94 | kind: ServiceAccount 95 | metadata: 96 | name: aws-eks-asg-rolling-update-handler 97 | namespace: kube-system 98 | labels: 99 | app: aws-eks-asg-rolling-update-handler 100 | --- 101 | apiVersion: rbac.authorization.k8s.io/v1 102 | kind: ClusterRole 103 | metadata: 104 | name: aws-eks-asg-rolling-update-handler 105 | labels: 106 | app: aws-eks-asg-rolling-update-handler 107 | rules: 108 | - apiGroups: 109 | - "*" 110 | resources: 111 | - "*" 112 | verbs: 113 | - get 114 | - list 115 | - watch 116 | - apiGroups: 117 | - "*" 118 | resources: 119 | - nodes 120 | verbs: 121 | - get 122 | - list 123 | - watch 124 | - update 125 | - patch 126 | - apiGroups: 127 | - "*" 128 | resources: 129 | - pods/eviction 130 | verbs: 131 | - get 132 | - list 133 | - create 134 | - apiGroups: 135 | - "*" 136 | resources: 137 | - pods 138 | verbs: 139 | - get 140 | - list 141 | --- 142 | apiVersion: rbac.authorization.k8s.io/v1 143 | kind: ClusterRoleBinding 144 | metadata: 145 | name: aws-eks-asg-rolling-update-handler 146 | labels: 147 | app: aws-eks-asg-rolling-update-handler 148 | roleRef: 149 | kind: ClusterRole 150 | name: aws-eks-asg-rolling-update-handler 151 | apiGroup: rbac.authorization.k8s.io 152 | subjects: 153 | - kind: ServiceAccount 154 | name: aws-eks-asg-rolling-update-handler 155 | namespace: kube-system 156 | --- 157 | apiVersion: apps/v1 158 | kind: Deployment 159 | metadata: 160 | name: aws-eks-asg-rolling-update-handler 161 | namespace: kube-system 162 | labels: 163 | app: aws-eks-asg-rolling-update-handler 164 | spec: 165 | replicas: 1 166 | selector: 167 | matchLabels: 168 | app: aws-eks-asg-rolling-update-handler 169 | template: 170 | metadata: 171 | labels: 172 | app: aws-eks-asg-rolling-update-handler 173 | spec: 174 | automountServiceAccountToken: true 175 | serviceAccountName: aws-eks-asg-rolling-update-handler 176 | 
restartPolicy: Always 177 | dnsPolicy: Default 178 | containers: 179 | - name: aws-eks-asg-rolling-update-handler 180 | image: twinproduction/aws-eks-asg-rolling-update-handler 181 | imagePullPolicy: Always 182 | env: 183 | - name: AUTO_SCALING_GROUP_NAMES 184 | value: "asg-1,asg-2,asg-3" # REPLACE THESE VALUES FOR THE NAMES OF THE ASGs 185 | ``` 186 | 187 | 188 | ## Deploying with Helm 189 | 190 | For the chart associated to this project, see [TwiN/helm-charts](https://github.com/TwiN/helm-charts): 191 | ```sh 192 | helm repo add twin https://twin.github.io/helm-charts 193 | helm repo update 194 | helm install aws-eks-asg-rolling-update-handler twin/aws-eks-asg-rolling-update-handler 195 | ``` 196 | 197 | 198 | ## Developing 199 | 200 | To run the application locally, make sure your local kubeconfig file is configured properly (i.e. you can use kubectl). 201 | 202 | Once you've done that, set the local environment variable `ENVIRONMENT` to `dev` and `AUTO_SCALING_GROUP_NAMES` 203 | to a comma-separated list of auto scaling group names. 204 | 205 | Your local aws credentials must also be valid (i.e. you can use `awscli`) 206 | 207 | 208 | ## Special thanks 209 | 210 | I had originally worked on [deitch/aws-asg-roller](https://github.com/deitch/aws-asg-roller), but due to the numerous conflicts it had with cluster-autoscaler, 211 | I decided to make a project that heavily relies on cluster-autoscaler rather than simply coexist with it, with a much bigger emphasis on maintaining 212 | high availability during rolling upgrades. 213 | 214 | In any case, this project was inspired by aws-asg-roller and the code for comparing launch template versions also comes from there, hence why this special thanks section exists. 215 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25UVaW/CKtUDjefjrs0SPonmDGUVOYP0= 2 | github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= 3 | github.com/MakeNowJust/heredoc v1.0.0 h1:cXCdzVdstXyiTqTvfqk9SDHpKNjxuom+DOlyEeQ4pzQ= 4 | github.com/MakeNowJust/heredoc v1.0.0/go.mod h1:mG5amYoWBHf8vpLOuehzbGGw0EHxpZZ6lCpQ4fNJ8LE= 5 | github.com/TwiN/gocache/v2 v2.4.0 h1:BZ/TqvhipDQE23MFFTjC0MiI1qZ7GEVtSdOFVVXyr18= 6 | github.com/TwiN/gocache/v2 v2.4.0/go.mod h1:Cl1c0qNlQlXzJhTpAARVqpQDSuGDM5RhtzPYAM1x17g= 7 | github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= 8 | github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= 9 | github.com/aws/aws-sdk-go v1.55.7 h1:UJrkFq7es5CShfBwlWAC8DA077vp8PyVbQd3lqLiztE= 10 | github.com/aws/aws-sdk-go v1.55.7/go.mod h1:eRwEWoyTWFMVYVQzKMNHWP5/RV4xIUGMQfXQHfHkpNU= 11 | github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= 12 | github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= 13 | github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= 14 | github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= 15 | github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= 16 | github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= 17 | github.com/chai2010/gettext-go v1.0.2 
h1:1Lwwip6Q2QGsAdl/ZKPCwTe9fe0CjlUbqj5bFNSjIRk= 18 | github.com/chai2010/gettext-go v1.0.2/go.mod h1:y+wnP2cHYaVj19NZhYKAwEMH2CI1gNHeQQ+5AjwawxA= 19 | github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= 20 | github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= 21 | github.com/creack/pty v1.1.18 h1:n56/Zwd5o6whRC5PMGretI4IdRLlmBXYNjScPaBgsbY= 22 | github.com/creack/pty v1.1.18/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4= 23 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 24 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 25 | github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= 26 | github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 27 | github.com/emicklei/go-restful/v3 v3.12.2 h1:DhwDP0vY3k8ZzE0RunuJy8GhNpPL6zqLkDf9B/a0/xU= 28 | github.com/emicklei/go-restful/v3 v3.12.2/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= 29 | github.com/exponent-io/jsonpath v0.0.0-20210407135951-1de76d718b3f h1:Wl78ApPPB2Wvf/TIe2xdyJxTlb6obmF18d8QdkxNDu4= 30 | github.com/exponent-io/jsonpath v0.0.0-20210407135951-1de76d718b3f/go.mod h1:OSYXu++VVOHnXeitef/D8n/6y4QV8uLHSFXX4NeXMGc= 31 | github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM= 32 | github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= 33 | github.com/go-errors/errors v1.4.2 h1:J6MZopCL4uSllY1OfXM374weqZFFItUbrImctkmUxIA= 34 | github.com/go-errors/errors v1.4.2/go.mod h1:sIVyrIiJhuEF+Pj9Ebtd6P/rEYROXFi3BopGUQ5a5Og= 35 | github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= 36 | github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= 37 | github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= 38 | github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= 39 | github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= 40 | github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= 41 | github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k= 42 | github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= 43 | github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= 44 | github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= 45 | github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= 46 | github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= 47 | github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= 48 | github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= 49 | github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= 50 | github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= 51 | github.com/google/gnostic-models v0.7.0 h1:qwTtogB15McXDaNqTZdzPJRHvaVJlAl+HVQnLmJEJxo= 52 | github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7OUGxBlw57miDrQ= 53 | github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= 54 | github.com/google/go-cmp 
v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= 55 | github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= 56 | github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db h1:097atOisP2aRj7vFgYQBbFN4U4JNXUNYpxael3UzMyo= 57 | github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144= 58 | github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= 59 | github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= 60 | github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 h1:JeSE6pjso5THxAzdVpqr6/geYxZytqFMBCOtn/ujyeo= 61 | github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674/go.mod h1:r4w70xmWCQKmi1ONH4KIaBptdivuRPyosB9RmPlGEwA= 62 | github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 h1:+ngKgrYPPJrOjhax5N+uePQ0Fh1Z7PheYoUI/0nzkPA= 63 | github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= 64 | github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= 65 | github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= 66 | github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= 67 | github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= 68 | github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8= 69 | github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U= 70 | github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= 71 | github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= 72 | github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= 73 | github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= 74 | github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= 75 | github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= 76 | github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= 77 | github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= 78 | github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= 79 | github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= 80 | github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= 81 | github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= 82 | github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= 83 | github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= 84 | github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= 85 | github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= 86 | github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= 87 | github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de h1:9TO3cAIGXtEhnIaL+V+BEER86oLrvS+kWobKpbJuye0= 88 | github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de/go.mod h1:zAbeS9B/r2mtpb6U+EI2rYA5OAXxsYw6wTamcNW+zcE= 89 | github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= 90 | github.com/mailru/easyjson 
v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= 91 | github.com/mitchellh/go-wordwrap v1.0.1 h1:TLuKupo69TCn6TQSyGxwI1EblZZEsQ0vMlAFQflz0v0= 92 | github.com/mitchellh/go-wordwrap v1.0.1/go.mod h1:R62XHJLzvMFRBbcrT7m7WgmE1eOyTSsCt+hzestvNj0= 93 | github.com/moby/spdystream v0.5.0 h1:7r0J1Si3QO/kjRitvSLVVFUjxMEb/YLj6S9FF62JBCU= 94 | github.com/moby/spdystream v0.5.0/go.mod h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI= 95 | github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0= 96 | github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y= 97 | github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= 98 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= 99 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= 100 | github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= 101 | github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFdJifH4BDsTlE89Zl93FEloxaWZfGcifgq8= 102 | github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= 103 | github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 h1:n6/2gBQ3RWajuToeY6ZtZTIKv2v7ThUy5KKusIT0yc0= 104 | github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00/go.mod h1:Pm3mSP3c5uWn86xMLZ5Sa7JB9GsEZySvHYXCTK4E9q4= 105 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= 106 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= 107 | github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= 108 | github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= 109 | github.com/onsi/ginkgo/v2 v2.21.0 h1:7rg/4f3rB88pb5obDgNZrNHrQ4e6WpjonchcpuBRnZM= 110 | github.com/onsi/ginkgo/v2 v2.21.0/go.mod h1:7Du3c42kxCUegi0IImZ1wUQzMBVecgIHjR1C+NkhLQo= 111 | github.com/onsi/gomega v1.35.1 h1:Cwbd75ZBPxFSuZ6T+rN/WCb/gOc6YgFBXLlZLhC7Ds4= 112 | github.com/onsi/gomega v1.35.1/go.mod h1:PvZbdDc8J6XJEpDK4HCuRBm8a6Fzp9/DmhC9C7yFlog= 113 | github.com/peterbourgon/diskv v2.0.1+incompatible h1:UBdAOUP5p4RWqPBg048CAvpKN+vxiaj6gdUUzhl4XmI= 114 | github.com/peterbourgon/diskv v2.0.1+incompatible/go.mod h1:uqqh8zWWbv1HBMNONnaR/tNboyR3/BZd58JJSHlUSCU= 115 | github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= 116 | github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 117 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 118 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 119 | github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= 120 | github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= 121 | github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= 122 | github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= 123 | github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= 124 | github.com/prometheus/common v0.66.1/go.mod 
h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= 125 | github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= 126 | github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= 127 | github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= 128 | github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= 129 | github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= 130 | github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= 131 | github.com/sergi/go-diff v1.2.0 h1:XU+rvMAioB0UC3q1MFrIQy4Vo5/4VsRDQQXHsEya6xQ= 132 | github.com/sergi/go-diff v1.2.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= 133 | github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo= 134 | github.com/spf13/cobra v1.9.1/go.mod h1:nDyEzZ8ogv936Cinf6g1RU9MRY64Ir93oCnqb9wxYW0= 135 | github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o= 136 | github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= 137 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 138 | github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= 139 | github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= 140 | github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= 141 | github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= 142 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= 143 | github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 144 | github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 145 | github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 146 | github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= 147 | github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= 148 | github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= 149 | github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= 150 | github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= 151 | github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= 152 | github.com/xlab/treeprint v1.2.0 h1:HzHnuAF1plUN2zGlAFHbSQP2qJ0ZAD3XF5XD7OesXRQ= 153 | github.com/xlab/treeprint v1.2.0/go.mod h1:gj5Gd3gPdKtR1ikdDK6fnFLdmIS0X30kTTuNd/WEJu0= 154 | github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= 155 | github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= 156 | go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= 157 | go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= 158 | go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= 159 | go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= 160 | go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= 161 | go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= 162 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 163 | 
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= 164 | golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= 165 | golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= 166 | golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= 167 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 168 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 169 | golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 170 | golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= 171 | golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= 172 | golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= 173 | golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= 174 | golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= 175 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 176 | golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 177 | golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 178 | golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= 179 | golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= 180 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 181 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 182 | golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 183 | golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 184 | golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= 185 | golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= 186 | golang.org/x/term v0.34.0 h1:O/2T7POpk0ZZ7MAzMeWFSg6S5IpWd/RXDlM9hgM3DR4= 187 | golang.org/x/term v0.34.0/go.mod h1:5jC53AEywhIVebHgPVeg0mj8OD3VO9OzclacVrqpaAw= 188 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 189 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 190 | golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng= 191 | golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU= 192 | golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY= 193 | golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= 194 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 195 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= 196 | golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= 197 | golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= 198 | golang.org/x/tools v0.35.0 h1:mBffYraMEf7aa0sB+NuKnuCy8qI/9Bughn8dC2Gu5r0= 199 | golang.org/x/tools 
v0.35.0/go.mod h1:NKdj5HkL/73byiZSJjqJgKn3ep7KjFkBOkR/Hps3VPw= 200 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 201 | golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 202 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 203 | golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 204 | google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= 205 | google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= 206 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 207 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= 208 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= 209 | gopkg.in/evanphx/json-patch.v4 v4.12.0 h1:n6jtcsulIzXPJaxegRbvFNNrZDjbij7ny3gmSPG+6V4= 210 | gopkg.in/evanphx/json-patch.v4 v4.12.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= 211 | gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= 212 | gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= 213 | gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10= 214 | gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 215 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 216 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 217 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 218 | k8s.io/api v0.34.3 h1:D12sTP257/jSH2vHV2EDYrb16bS7ULlHpdNdNhEw2S4= 219 | k8s.io/api v0.34.3/go.mod h1:PyVQBF886Q5RSQZOim7DybQjAbVs8g7gwJNhGtY5MBk= 220 | k8s.io/apimachinery v0.34.3 h1:/TB+SFEiQvN9HPldtlWOTp0hWbJ+fjU+wkxysf/aQnE= 221 | k8s.io/apimachinery v0.34.3/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw= 222 | k8s.io/cli-runtime v0.34.3 h1:YRyMhiwX0dT9lmG0AtZDaeG33Nkxgt9OlCTZhRXj9SI= 223 | k8s.io/cli-runtime v0.34.3/go.mod h1:GVwL1L5uaGEgM7eGeKjaTG2j3u134JgG4dAI6jQKhMc= 224 | k8s.io/client-go v0.34.3 h1:wtYtpzy/OPNYf7WyNBTj3iUA0XaBHVqhv4Iv3tbrF5A= 225 | k8s.io/client-go v0.34.3/go.mod h1:OxxeYagaP9Kdf78UrKLa3YZixMCfP6bgPwPwNBQBzpM= 226 | k8s.io/component-base v0.34.3 h1:zsEgw6ELqK0XncCQomgO9DpUIzlrYuZYA0Cgo+JWpVk= 227 | k8s.io/component-base v0.34.3/go.mod h1:5iIlD8wPfWE/xSHTRfbjuvUul2WZbI2nOUK65XL0E/c= 228 | k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= 229 | k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= 230 | k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b h1:MloQ9/bdJyIu9lb1PzujOPolHyvO06MXG5TUIj2mNAA= 231 | k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b/go.mod h1:UZ2yyWbFTpuhSbFhv24aGNOdoRdJZgsIObGBUaYVsts= 232 | k8s.io/kubectl v0.34.3 h1:vpM6//153gh5gvsYHXWHVJ4l4xmN5QFwTSmlfd8icm8= 233 | k8s.io/kubectl v0.34.3/go.mod h1:zZQHtIZoUqTP1bAnPzq/3W1jfc0NeOeunFgcswrfg1c= 234 | k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 h1:hwvWFiBzdWw1FhfY1FooPn3kzWuJ8tmbZBHi4zVsl1Y= 235 | k8s.io/utils v0.0.0-20250604170112-4c0f3b243397/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= 236 | sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 h1:gBQPwqORJ8d8/YNZWEjoZs7npUVDpVXUUOFfW6CgAqE= 237 | 
sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= 238 | sigs.k8s.io/kustomize/api v0.20.1 h1:iWP1Ydh3/lmldBnH/S5RXgT98vWYMaTUL1ADcr+Sv7I= 239 | sigs.k8s.io/kustomize/api v0.20.1/go.mod h1:t6hUFxO+Ph0VxIk1sKp1WS0dOjbPCtLJ4p8aADLwqjM= 240 | sigs.k8s.io/kustomize/kyaml v0.20.1 h1:PCMnA2mrVbRP3NIB6v9kYCAc38uvFLVs8j/CD567A78= 241 | sigs.k8s.io/kustomize/kyaml v0.20.1/go.mod h1:0EmkQHRUsJxY8Ug9Niig1pUMSCGHxQ5RklbpV/Ri6po= 242 | sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= 243 | sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= 244 | sigs.k8s.io/structured-merge-diff/v6 v6.3.0 h1:jTijUJbW353oVOd9oTlifJqOGEkUw2jB/fXCbTiQEco= 245 | sigs.k8s.io/structured-merge-diff/v6 v6.3.0/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE= 246 | sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= 247 | sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4= 248 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "log" 7 | "math/rand" 8 | "time" 9 | 10 | "github.com/TwiN/aws-eks-asg-rolling-update-handler/cloud" 11 | "github.com/TwiN/aws-eks-asg-rolling-update-handler/config" 12 | "github.com/TwiN/aws-eks-asg-rolling-update-handler/k8s" 13 | "github.com/TwiN/aws-eks-asg-rolling-update-handler/metrics" 14 | "github.com/aws/aws-sdk-go/aws" 15 | "github.com/aws/aws-sdk-go/service/autoscaling" 16 | "github.com/aws/aws-sdk-go/service/autoscaling/autoscalingiface" 17 | "github.com/aws/aws-sdk-go/service/ec2" 18 | "github.com/aws/aws-sdk-go/service/ec2/ec2iface" 19 | v1 "k8s.io/api/core/v1" 20 | ) 21 | 22 | const ( 23 | MaximumFailedExecutionBeforePanic = 10 // Maximum number of allowed failed executions before panicking 24 | 25 | MaximumAcceptableUpdatedNonReadyToUpdatedReadyNodesRatio = 0.11 // To help with larger clusters 26 | MaximumNumberOfUpdatedNonReadyNodes = 5 // To prevent too many non-ready nodes from being taken into account when calculating resources available in one node 27 | ) 28 | 29 | var ( 30 | ErrTimedOut = errors.New("execution timed out") 31 | 32 | executionFailedCounter = 0 33 | ) 34 | 35 | func init() { 36 | rand.Seed(time.Now().UnixNano()) 37 | } 38 | 39 | func main() { 40 | err := config.Initialize() 41 | if err != nil { 42 | log.Fatalf("Unable to initialize configuration: %s", err.Error()) 43 | } 44 | if config.Get().Metrics { 45 | go metrics.Server.Listen(config.Get().MetricsPort) 46 | } 47 | ec2Service, autoScalingService, err := cloud.GetServices(config.Get().AwsRegion) 48 | if err != nil { 49 | log.Fatalf("Unable to create AWS services: %s", err.Error()) 50 | } 51 | for { 52 | start := time.Now() 53 | if err := run(ec2Service, autoScalingService); err != nil { 54 | log.Printf("Error during execution: %s", err.Error()) 55 | metrics.Server.Errors.Inc() 56 | executionFailedCounter++ 57 | if executionFailedCounter > MaximumFailedExecutionBeforePanic { 58 | panic(fmt.Errorf("execution failed %d times: %v", executionFailedCounter, err)) 59 | } 60 | } else if executionFailedCounter > 0 { 61 | log.Printf("Execution was successful after %d failed attempts, resetting counter to 0", executionFailedCounter) 62 | executionFailedCounter = 0 63 | } 64 | log.Printf("Execution took %dms, sleeping for %s", time.Since(start).Milliseconds(), 
config.Get().ExecutionInterval) 65 | time.Sleep(config.Get().ExecutionInterval) 66 | } 67 | } 68 | 69 | func run(ec2Service ec2iface.EC2API, autoScalingService autoscalingiface.AutoScalingAPI) error { 70 | log.Println("Starting execution") 71 | cfg := config.Get() 72 | client, err := k8s.CreateClientSet() 73 | if err != nil { 74 | return errors.New("unable to create Kubernetes client: " + err.Error()) 75 | } 76 | kubernetesClient := k8s.NewClient(client) 77 | if cfg.Debug { 78 | log.Println("Created Kubernetes Client successfully") 79 | } 80 | 81 | var autoScalingGroups []*autoscaling.Group 82 | if len(cfg.AutodiscoveryTags) > 0 { 83 | autoScalingGroups, err = cloud.DescribeEnabledAutoScalingGroupsByTags(autoScalingService, cfg.AutodiscoveryTags) 84 | } else { 85 | autoScalingGroups, err = cloud.DescribeAutoScalingGroupsByNames(autoScalingService, cfg.AutoScalingGroupNames) 86 | } 87 | if err != nil { 88 | return errors.New("unable to describe AutoScalingGroups: " + err.Error()) 89 | } 90 | if cfg.Debug { 91 | log.Println("Described AutoScalingGroups successfully") 92 | } 93 | return HandleRollingUpgrade(kubernetesClient, ec2Service, autoScalingService, autoScalingGroups) 94 | } 95 | 96 | // HandleRollingUpgrade handles rolling upgrades. 97 | // 98 | // Returns an error if an execution lasts for longer than ExecutionTimeout 99 | func HandleRollingUpgrade(client k8s.ClientAPI, ec2Service ec2iface.EC2API, autoScalingService autoscalingiface.AutoScalingAPI, autoScalingGroups []*autoscaling.Group) error { 100 | metrics.Server.NodeGroups.WithLabelValues().Set(float64(len(autoScalingGroups))) 101 | timeout := make(chan bool, 1) 102 | result := make(chan bool, 1) 103 | go func() { 104 | time.Sleep(config.Get().ExecutionTimeout) 105 | timeout <- true 106 | }() 107 | go func() { 108 | result <- DoHandleRollingUpgrade(client, ec2Service, autoScalingService, autoScalingGroups) 109 | }() 110 | select { 111 | case <-timeout: 112 | return ErrTimedOut 113 | case <-result: 114 | return nil 115 | } 116 | } 117 | 118 | // DoHandleRollingUpgrade handles rolling upgrades by iterating over every single AutoScalingGroups' outdated 119 | // instances 120 | func DoHandleRollingUpgrade(client k8s.ClientAPI, ec2Service ec2iface.EC2API, autoScalingService autoscalingiface.AutoScalingAPI, autoScalingGroups []*autoscaling.Group) bool { 121 | for _, autoScalingGroup := range autoScalingGroups { 122 | outdatedInstances, updatedInstances, err := SeparateOutdatedFromUpdatedInstances(autoScalingGroup, ec2Service) 123 | if err != nil { 124 | metrics.Server.Errors.Inc() 125 | log.Printf("[%s] Skipping because unable to separate outdated instances from updated instances: %v", aws.StringValue(autoScalingGroup.AutoScalingGroupName), err.Error()) 126 | continue 127 | } 128 | metrics.Server.UpdatedNodes.WithLabelValues(aws.StringValue(autoScalingGroup.AutoScalingGroupName)).Set(float64(len(updatedInstances))) 129 | metrics.Server.OutdatedNodes.WithLabelValues(aws.StringValue(autoScalingGroup.AutoScalingGroupName)).Set(float64(len(outdatedInstances))) 130 | if config.Get().Debug { 131 | log.Printf("[%s] outdatedInstances: %v", aws.StringValue(autoScalingGroup.AutoScalingGroupName), outdatedInstances) 132 | log.Printf("[%s] updatedInstances: %v", aws.StringValue(autoScalingGroup.AutoScalingGroupName), updatedInstances) 133 | } 134 | // Get the updated and ready nodes from the list of updated instances 135 | // This will be used to determine if the desired number of updated instances need to scale up or not 136 | // We also use this 
to clean up, if necessary 137 | updatedReadyNodes, numberOfNonReadyUpdatedNodesOrInstances := getReadyNodesAndNumberOfNonReadyNodesOrInstances(client, updatedInstances, autoScalingGroup) 138 | if len(outdatedInstances) == 0 { 139 | log.Printf("[%s] All instances are up to date", aws.StringValue(autoScalingGroup.AutoScalingGroupName)) 140 | continue 141 | } else { 142 | log.Printf("[%s] outdated=%d; updated=%d; updatedAndReady=%d; asgCurrent=%d; asgDesired=%d; asgMax=%d", aws.StringValue(autoScalingGroup.AutoScalingGroupName), len(outdatedInstances), len(updatedInstances), len(updatedReadyNodes), len(autoScalingGroup.Instances), aws.Int64Value(autoScalingGroup.DesiredCapacity), aws.Int64Value(autoScalingGroup.MaxSize)) 143 | } 144 | if int64(len(autoScalingGroup.Instances)) < aws.Int64Value(autoScalingGroup.DesiredCapacity) { 145 | log.Printf("[%s] Skipping because ASG has a desired capacity of %d, but only has %d instances", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.Int64Value(autoScalingGroup.DesiredCapacity), len(autoScalingGroup.Instances)) 146 | continue 147 | } 148 | if !HasAcceptableNumberOfUpdatedNonReadyNodes(numberOfNonReadyUpdatedNodesOrInstances, len(updatedReadyNodes)) { 149 | log.Printf("[%s] ASG has too many non-ready updated nodes/instances (%d), waiting until they become ready", aws.StringValue(autoScalingGroup.AutoScalingGroupName), numberOfNonReadyUpdatedNodesOrInstances) 150 | continue 151 | } 152 | // Shuffle the outdated instances, so that we don't always try to terminate the same instance. 153 | // This is also useful if you want to have more than one aws-eks-asg-rolling-update-handler running 154 | rand.Shuffle(len(outdatedInstances), func(i, j int) { 155 | outdatedInstances[i], outdatedInstances[j] = outdatedInstances[j], outdatedInstances[i] 156 | }) 157 | for _, outdatedInstance := range outdatedInstances { 158 | node, err := client.GetNodeByAutoScalingInstance(outdatedInstance) 159 | if err != nil { 160 | log.Printf("[%s][%s] Skipping because unable to get outdated node from Kubernetes: %v", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId), err.Error()) 161 | continue 162 | } 163 | if config.Get().EagerCordoning { 164 | if !node.Spec.Unschedulable { 165 | // If EagerCordoning is enabled and the node is schedulable, we need to cordon it. 
166 | if err := client.Cordon(node.Name); err != nil { 167 | metrics.Server.Errors.Inc() 168 | log.Printf("[%s][%s] Skipping because ran into error while cordoning node: %v", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId), err.Error()) 169 | continue 170 | } 171 | } 172 | } 173 | minutesSinceStarted, minutesSinceDrained, minutesSinceTerminated := getRollingUpdateTimestampsFromNode(node) 174 | // Check if outdated nodes in k8s have been marked with annotation from aws-eks-asg-rolling-update-handler 175 | if minutesSinceStarted == -1 { 176 | log.Printf("[%s][%s] Starting node rollout process", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId)) 177 | // Annotate the node to persist the fact that the rolling update process has begun 178 | err := k8s.AnnotateNodeByAutoScalingInstance(client, outdatedInstance, k8s.AnnotationRollingUpdateStartedTimestamp, time.Now().Format(time.RFC3339)) 179 | if err != nil { 180 | log.Printf("[%s][%s] Skipping because unable to annotate node: %v", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId), err.Error()) 181 | continue 182 | } 183 | } else { 184 | log.Printf("[%s][%s] Node already started rollout process", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId)) 185 | // check if existing updatedInstances have the capacity to support what's inside this node 186 | hasEnoughResources := k8s.CheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode(client, node, updatedReadyNodes) 187 | if hasEnoughResources { 188 | log.Printf("[%s][%s] Updated nodes have enough resources available", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId)) 189 | if minutesSinceDrained == -1 { 190 | if config.Get().ExcludeFromExternalLoadBalancers { 191 | log.Printf("[%s][%s] Label node to exclude from external load balancers", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId)) 192 | k8s.LabelNodeByAutoScalingInstance(client, outdatedInstance, k8s.LabelExcludeFromExternalLoadBalancers, "true") 193 | } 194 | log.Printf("[%s][%s] Draining node", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId)) 195 | err := client.Drain(node.Name, config.Get().IgnoreDaemonSets, config.Get().DeleteEmptyDirData, config.Get().PodTerminationGracePeriod) 196 | if err != nil { 197 | metrics.Server.Errors.Inc() 198 | log.Printf("[%s][%s] Skipping because ran into error while draining node: %v", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId), err.Error()) 199 | continue 200 | } else { 201 | metrics.Server.DrainedNodes.WithLabelValues(aws.StringValue(autoScalingGroup.AutoScalingGroupName)).Inc() 202 | // Only annotate if no error was encountered 203 | _ = k8s.AnnotateNodeByAutoScalingInstance(client, outdatedInstance, k8s.AnnotationRollingUpdateDrainedTimestamp, time.Now().Format(time.RFC3339)) 204 | } 205 | } else { 206 | log.Printf("[%s][%s] Node has already been drained %d minutes ago, skipping", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId), minutesSinceDrained) 207 | } 208 | if minutesSinceTerminated == -1 { 209 | // Terminate node 210 | log.Printf("[%s][%s] Terminating node", aws.StringValue(autoScalingGroup.AutoScalingGroupName), 
aws.StringValue(outdatedInstance.InstanceId)) 211 | shouldDecrementDesiredCapacity := aws.Int64Value(autoScalingGroup.DesiredCapacity) != aws.Int64Value(autoScalingGroup.MinSize) 212 | err = cloud.TerminateEc2Instance(autoScalingService, outdatedInstance, shouldDecrementDesiredCapacity) 213 | if err != nil { 214 | metrics.Server.Errors.Inc() 215 | log.Printf("[%s][%s] Ran into error while terminating node: %v", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId), err.Error()) 216 | continue 217 | } else { 218 | metrics.Server.ScaledDownNodes.WithLabelValues(aws.StringValue(autoScalingGroup.AutoScalingGroupName)).Inc() 219 | // Only annotate if no error was encountered 220 | _ = k8s.AnnotateNodeByAutoScalingInstance(client, outdatedInstance, k8s.AnnotationRollingUpdateTerminatedTimestamp, time.Now().Format(time.RFC3339)) 221 | } 222 | } else { 223 | log.Printf("[%s][%s] Node is already in the process of being terminated since %d minutes ago, skipping", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId), minutesSinceTerminated) 224 | // TODO: check if minutesSinceTerminated > 10. If that happens, then there's clearly a problem, so we should do something about it 225 | // The node has already been terminated, there's nothing to do here, continue to the next one 226 | continue 227 | } 228 | // If this code is reached, it means that the current node has been successfully drained and 229 | // scheduled for termination. 230 | // As a result, we return here to make sure that multiple old instances didn't use the same updated 231 | // instances to calculate resources available 232 | log.Printf("[%s][%s] Node has been drained and scheduled for termination successfully", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId)) 233 | if config.Get().SlowMode { 234 | // If SlowMode is enabled, we'll return after draining a node and wait for the next execution 235 | return true 236 | } 237 | // Move on to the next ASG 238 | break 239 | } else { 240 | // Don't increase the ASG if the node has already been drained or scheduled for termination 241 | if minutesSinceDrained != -1 || minutesSinceTerminated != -1 { 242 | continue 243 | } 244 | log.Printf("[%s][%s] Updated nodes do not have enough resources available, increasing desired count by 1", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId)) 245 | err := cloud.IncrementAutoScalingGroupDesiredCount(autoScalingService, aws.StringValue(autoScalingGroup.AutoScalingGroupName)) 246 | if err != nil { 247 | log.Printf("[%s][%s] Unable to increase ASG desired size: %v", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId), err.Error()) 248 | log.Printf("[%s][%s] Skipping", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId)) 249 | continue 250 | } else { 251 | metrics.Server.ScaledUpNodes.WithLabelValues(aws.StringValue(autoScalingGroup.AutoScalingGroupName)).Inc() 252 | // ASG was scaled up already, stop iterating over outdated instances in current ASG so we can 253 | // move on to the next ASG 254 | break 255 | } 256 | } 257 | } 258 | } 259 | } 260 | return true 261 | } 262 | 263 | func getReadyNodesAndNumberOfNonReadyNodesOrInstances(client k8s.ClientAPI, updatedInstances []*autoscaling.Instance, autoScalingGroup *autoscaling.Group) ([]*v1.Node, int) { 264 | var 
updatedReadyNodes []*v1.Node 265 | numberOfNonReadyNodesOrInstances := 0 266 | for _, updatedInstance := range updatedInstances { 267 | if aws.StringValue(updatedInstance.LifecycleState) != "InService" { 268 | numberOfNonReadyNodesOrInstances++ 269 | log.Printf("[%s][%s] Skipping because instance is not in LifecycleState 'InService', but is in '%s' instead", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(updatedInstance.InstanceId), aws.StringValue(updatedInstance.LifecycleState)) 270 | continue 271 | } 272 | updatedNode, err := client.GetNodeByAutoScalingInstance(updatedInstance) 273 | if err != nil { 274 | numberOfNonReadyNodesOrInstances++ 275 | log.Printf("[%s][%s] Skipping because unable to get updated node from Kubernetes: %v", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(updatedInstance.InstanceId), err.Error()) 276 | continue 277 | } 278 | // Check if Kubelet is ready to accept pods on that node 279 | conditions := updatedNode.Status.Conditions 280 | if len(conditions) == 0 { 281 | log.Printf("[%s][%s] For some magical reason, %s doesn't have any conditions, therefore it is impossible to determine whether the node is ready to accept new pods or not", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(updatedInstance.InstanceId), updatedNode.Name) 282 | numberOfNonReadyNodesOrInstances++ 283 | } else if kubeletCondition := conditions[len(conditions)-1]; kubeletCondition.Type == v1.NodeReady { 284 | if kubeletCondition.Status == v1.ConditionTrue { 285 | updatedReadyNodes = append(updatedReadyNodes, updatedNode) 286 | } else { 287 | log.Printf("[%s][%s] Skipping because kubelet condition %s is reporting as %s", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(updatedInstance.InstanceId), kubeletCondition.Type, kubeletCondition.Status) 288 | numberOfNonReadyNodesOrInstances++ 289 | } 290 | } else { 291 | log.Printf("[%s][%s] Skipping because expected kubelet on node to have condition %s with value %s, but it didn't", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(updatedInstance.InstanceId), v1.NodeReady, v1.ConditionTrue) 292 | numberOfNonReadyNodesOrInstances++ 293 | } 294 | 295 | // Cleaning up 296 | // This is an edge case, but it may happen that an ASG's launch template is modified, creating a new 297 | // template version, but then that new template version is deleted before the node has been terminated. 298 | // To make it even more of an edge case, the draining function would've had to time out, meaning that 299 | // the termination would be skipped until the next run. 300 | // This would cause an instance to be considered as updated, even though it has been drained therefore 301 | // cordoned (NoSchedule). 
302 | if startedAtValue, ok := updatedNode.Annotations[k8s.AnnotationRollingUpdateStartedTimestamp]; ok { 303 | // An updated node should never have k8s.AnnotationRollingUpdateStartedTimestamp, so this indicates that 304 | // at one point, this node was considered old compared to the ASG's current LT/LC 305 | // First, check if there's a NoSchedule taint 306 | for i, taint := range updatedNode.Spec.Taints { 307 | if taint.Effect == v1.TaintEffectNoSchedule { 308 | // There's a taint, but we need to make sure it was added after the rolling update started 309 | startedAt, err := time.Parse(time.RFC3339, startedAtValue) 310 | // If the annotation can't be parsed OR the taint was added after the rolling update started, 311 | // we need to remove that taint 312 | if err != nil || taint.TimeAdded.Time.After(startedAt) { 313 | log.Printf("[%s] EDGE-0001: Attempting to remove taint from updated node %s", aws.StringValue(autoScalingGroup.AutoScalingGroupName), updatedNode.Name) 314 | // Remove the taint 315 | updatedNode.Spec.Taints = append(updatedNode.Spec.Taints[:i], updatedNode.Spec.Taints[i+1:]...) 316 | // Remove the annotation 317 | delete(updatedNode.Annotations, k8s.AnnotationRollingUpdateStartedTimestamp) 318 | // Update the node 319 | err = client.UpdateNode(updatedNode) 320 | if err != nil { 321 | log.Printf("[%s] EDGE-0001: Unable to update tainted node %s: %v", aws.StringValue(autoScalingGroup.AutoScalingGroupName), updatedNode.Name, err.Error()) 322 | } 323 | break 324 | } 325 | } 326 | } 327 | } 328 | } 329 | return updatedReadyNodes, numberOfNonReadyNodesOrInstances 330 | } 331 | 332 | func getRollingUpdateTimestampsFromNode(node *v1.Node) (minutesSinceStarted, minutesSinceDrained, minutesSinceTerminated int) { 333 | rollingUpdateStartedAt, ok := node.Annotations[k8s.AnnotationRollingUpdateStartedTimestamp] 334 | if ok { 335 | startedAt, err := time.Parse(time.RFC3339, rollingUpdateStartedAt) 336 | if err == nil { 337 | minutesSinceStarted = int(time.Since(startedAt).Minutes()) 338 | } 339 | } else { 340 | minutesSinceStarted = -1 341 | } 342 | drainedAtValue, ok := node.Annotations[k8s.AnnotationRollingUpdateDrainedTimestamp] 343 | if ok { 344 | drainedAt, err := time.Parse(time.RFC3339, drainedAtValue) 345 | if err == nil { 346 | minutesSinceDrained = int(time.Since(drainedAt).Minutes()) 347 | } 348 | } else { 349 | minutesSinceDrained = -1 350 | } 351 | terminatedAtValue, ok := node.Annotations[k8s.AnnotationRollingUpdateTerminatedTimestamp] 352 | if ok { 353 | terminatedAt, err := time.Parse(time.RFC3339, terminatedAtValue) 354 | if err == nil { 355 | minutesSinceTerminated = int(time.Since(terminatedAt).Minutes()) 356 | } 357 | } else { 358 | minutesSinceTerminated = -1 359 | } 360 | return 361 | } 362 | 363 | // SeparateOutdatedFromUpdatedInstances splits a list of instances into a list of outdated 364 | // instances and a list of updated instances. 
365 | func SeparateOutdatedFromUpdatedInstances(asg *autoscaling.Group, ec2Svc ec2iface.EC2API) ([]*autoscaling.Instance, []*autoscaling.Instance, error) { 366 | if config.Get().Debug { 367 | log.Printf("[%s] Separating outdated from updated instances", aws.StringValue(asg.AutoScalingGroupName)) 368 | } 369 | targetLaunchConfiguration := asg.LaunchConfigurationName 370 | targetLaunchTemplate := asg.LaunchTemplate 371 | var targetLaunchTemplateOverrides []*autoscaling.LaunchTemplateOverrides 372 | if targetLaunchTemplate == nil && asg.MixedInstancesPolicy != nil && asg.MixedInstancesPolicy.LaunchTemplate != nil { 373 | if config.Get().Debug { 374 | log.Printf("[%s] using mixed instances policy launch template", aws.StringValue(asg.AutoScalingGroupName)) 375 | } 376 | targetLaunchTemplate = asg.MixedInstancesPolicy.LaunchTemplate.LaunchTemplateSpecification 377 | targetLaunchTemplateOverrides = asg.MixedInstancesPolicy.LaunchTemplate.Overrides 378 | } 379 | if targetLaunchTemplate != nil { 380 | return SeparateOutdatedFromUpdatedInstancesUsingLaunchTemplate(aws.StringValue(asg.AutoScalingGroupName), targetLaunchTemplate, targetLaunchTemplateOverrides, asg.Instances, ec2Svc) 381 | } else if targetLaunchConfiguration != nil { 382 | return SeparateOutdatedFromUpdatedInstancesUsingLaunchConfiguration(targetLaunchConfiguration, asg.Instances) 383 | } 384 | return nil, nil, errors.New("AutoScalingGroup has neither launch template nor launch configuration") 385 | } 386 | 387 | // SeparateOutdatedFromUpdatedInstancesUsingLaunchTemplate separates a list of instances into a list of outdated 388 | // instances and a list of updated instances. 389 | func SeparateOutdatedFromUpdatedInstancesUsingLaunchTemplate(asgName string, targetLaunchTemplate *autoscaling.LaunchTemplateSpecification, overrides []*autoscaling.LaunchTemplateOverrides, instances []*autoscaling.Instance, ec2Svc ec2iface.EC2API) ([]*autoscaling.Instance, []*autoscaling.Instance, error) { 390 | var ( 391 | oldInstances []*autoscaling.Instance 392 | newInstances []*autoscaling.Instance 393 | targetTemplate *ec2.LaunchTemplate 394 | err error 395 | ) 396 | switch { 397 | case targetLaunchTemplate.LaunchTemplateId != nil && aws.StringValue(targetLaunchTemplate.LaunchTemplateId) != "": 398 | if targetTemplate, err = cloud.DescribeLaunchTemplateByID(ec2Svc, aws.StringValue(targetLaunchTemplate.LaunchTemplateId)); err != nil { 399 | return nil, nil, fmt.Errorf("error retrieving information about launch template %s: %v", aws.StringValue(targetLaunchTemplate.LaunchTemplateId), err) 400 | } 401 | case targetLaunchTemplate.LaunchTemplateName != nil && aws.StringValue(targetLaunchTemplate.LaunchTemplateName) != "": 402 | if targetTemplate, err = cloud.DescribeLaunchTemplateByName(ec2Svc, aws.StringValue(targetLaunchTemplate.LaunchTemplateName)); err != nil { 403 | return nil, nil, fmt.Errorf("error retrieving information about launch template name %s: %v", aws.StringValue(targetLaunchTemplate.LaunchTemplateName), err) 404 | } 405 | default: 406 | return nil, nil, fmt.Errorf("invalid launch template name") 407 | } 408 | // extra safety check 409 | if targetTemplate == nil { 410 | return nil, nil, fmt.Errorf("no template found") 411 | } 412 | // now we can loop through each node and compare 413 | for _, instance := range instances { 414 | if isInstanceTypePartOfLaunchTemplateOverrides(overrides, instance.InstanceType) { 415 | var ( 416 | overrideTargetTemplate *ec2.LaunchTemplate 417 | overrideTargetLaunchTemplate 
*autoscaling.LaunchTemplateSpecification 418 | ) 419 | for _, override := range overrides { 420 | if aws.StringValue(override.InstanceType) == aws.StringValue(instance.InstanceType) && override.LaunchTemplateSpecification != nil { 421 | if overrideTargetTemplate, err = cloud.DescribeLaunchTemplateByName(ec2Svc, aws.StringValue(override.LaunchTemplateSpecification.LaunchTemplateName)); err != nil { 422 | log.Printf("[%s][%s] Unable to retrieve information for launch template with name '%s': %v", asgName, aws.StringValue(instance.InstanceId), aws.StringValue(override.LaunchTemplateSpecification.LaunchTemplateName), err) 423 | } 424 | overrideTargetLaunchTemplate = override.LaunchTemplateSpecification 425 | } 426 | } 427 | if overrideTargetTemplate != nil && overrideTargetLaunchTemplate != nil { 428 | targetTemplate = overrideTargetTemplate 429 | targetLaunchTemplate = overrideTargetLaunchTemplate 430 | } 431 | } 432 | switch { 433 | case instance.LaunchTemplate == nil: 434 | fallthrough 435 | case aws.StringValue(instance.LaunchTemplate.LaunchTemplateName) != aws.StringValue(targetLaunchTemplate.LaunchTemplateName): 436 | fallthrough 437 | case aws.StringValue(instance.LaunchTemplate.LaunchTemplateId) != aws.StringValue(targetLaunchTemplate.LaunchTemplateId): 438 | fallthrough 439 | case !compareLaunchTemplateVersions(targetTemplate, targetLaunchTemplate, instance.LaunchTemplate): 440 | fallthrough 441 | case overrides != nil && len(overrides) > 0 && !isInstanceTypePartOfLaunchTemplateOverrides(overrides, instance.InstanceType): 442 | oldInstances = append(oldInstances, instance) 443 | default: 444 | newInstances = append(newInstances, instance) 445 | } 446 | } 447 | return oldInstances, newInstances, nil 448 | } 449 | 450 | func isInstanceTypePartOfLaunchTemplateOverrides(overrides []*autoscaling.LaunchTemplateOverrides, instanceType *string) bool { 451 | for _, override := range overrides { 452 | if aws.StringValue(override.InstanceType) == aws.StringValue(instanceType) { 453 | return true 454 | } 455 | } 456 | return false 457 | } 458 | 459 | // SeparateOutdatedFromUpdatedInstancesUsingLaunchConfiguration separates a list of instances into a list of outdated 460 | // instances and a list of updated instances. 
461 | func SeparateOutdatedFromUpdatedInstancesUsingLaunchConfiguration(targetLaunchConfigurationName *string, instances []*autoscaling.Instance) ([]*autoscaling.Instance, []*autoscaling.Instance, error) { 462 | var ( 463 | oldInstances []*autoscaling.Instance 464 | newInstances []*autoscaling.Instance 465 | ) 466 | for _, i := range instances { 467 | if i.LaunchConfigurationName != nil && *i.LaunchConfigurationName == *targetLaunchConfigurationName { 468 | newInstances = append(newInstances, i) 469 | } else { 470 | oldInstances = append(oldInstances, i) 471 | } 472 | } 473 | return oldInstances, newInstances, nil 474 | } 475 | 476 | // compareLaunchTemplateVersions compare two launch template versions and see if they match 477 | // can handle `$Latest` and `$Default` by resolving to the actual version in use 478 | func compareLaunchTemplateVersions(targetTemplate *ec2.LaunchTemplate, lt1, lt2 *autoscaling.LaunchTemplateSpecification) bool { 479 | // if both versions do not start with `$`, then just compare 480 | if lt1 == nil && lt2 == nil { 481 | return true 482 | } 483 | if (lt1 == nil && lt2 != nil) || (lt1 != nil && lt2 == nil) { 484 | return false 485 | } 486 | if lt1.Version == nil && lt2.Version == nil { 487 | return true 488 | } 489 | if (lt1.Version == nil && lt2.Version != nil) || (lt1.Version != nil && lt2.Version == nil) { 490 | return false 491 | } 492 | // if either version starts with `$`, then resolve to actual version from LaunchTemplate 493 | var lt1version, lt2version string 494 | switch aws.StringValue(lt1.Version) { 495 | case "$Default": 496 | lt1version = fmt.Sprintf("%d", aws.Int64Value(targetTemplate.DefaultVersionNumber)) 497 | case "$Latest": 498 | lt1version = fmt.Sprintf("%d", aws.Int64Value(targetTemplate.LatestVersionNumber)) 499 | default: 500 | lt1version = aws.StringValue(lt1.Version) 501 | } 502 | switch aws.StringValue(lt2.Version) { 503 | case "$Default": 504 | lt2version = fmt.Sprintf("%d", aws.Int64Value(targetTemplate.DefaultVersionNumber)) 505 | case "$Latest": 506 | lt2version = fmt.Sprintf("%d", aws.Int64Value(targetTemplate.LatestVersionNumber)) 507 | default: 508 | lt2version = aws.StringValue(lt2.Version) 509 | } 510 | return lt1version == lt2version 511 | } 512 | 513 | // HasAcceptableNumberOfUpdatedNonReadyNodes checks if there's a sufficient amount of updated 514 | // and ready nodes to move on to the next step (drain & terminate an outdated node) for a number of non-ready nodes. 515 | // 516 | // The logic behind this is that the more nodes are ready and updated, the higher the confidence we have that the 517 | // upgrade is going well, so we can ramp things up faster the deeper we are in the upgrade process. 
518 | func HasAcceptableNumberOfUpdatedNonReadyNodes(numberOfUpdatedNonReadyNodes, numberOfUpdatedReadyNodes int) bool { 519 | if numberOfUpdatedNonReadyNodes == 0 { 520 | return true // all updated nodes are ready, so we can proceed 521 | } 522 | if numberOfUpdatedReadyNodes == 0 { 523 | return false // there are no ready nodes AND there are non-ready nodes (we know this because of the previous check), so we cannot proceed 524 | } 525 | if numberOfUpdatedNonReadyNodes > MaximumNumberOfUpdatedNonReadyNodes { 526 | return false // there are too many non-ready nodes, so we cannot proceed 527 | } 528 | return float64(numberOfUpdatedNonReadyNodes)/float64(numberOfUpdatedReadyNodes) <= MaximumAcceptableUpdatedNonReadyToUpdatedReadyNodesRatio 529 | } 530 | -------------------------------------------------------------------------------- /main_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/TwiN/aws-eks-asg-rolling-update-handler/cloudtest" 7 | "github.com/TwiN/aws-eks-asg-rolling-update-handler/config" 8 | "github.com/TwiN/aws-eks-asg-rolling-update-handler/k8s" 9 | "github.com/TwiN/aws-eks-asg-rolling-update-handler/k8stest" 10 | "github.com/aws/aws-sdk-go/aws" 11 | "github.com/aws/aws-sdk-go/service/autoscaling" 12 | "github.com/aws/aws-sdk-go/service/ec2" 13 | v1 "k8s.io/api/core/v1" 14 | ) 15 | 16 | func TestSeparateOutdatedFromUpdatedInstancesUsingLaunchConfiguration_whenInstanceIsOutdated(t *testing.T) { 17 | instance := cloudtest.CreateTestAutoScalingInstance("instance", "v1", nil, "InService") 18 | outdated, updated, err := SeparateOutdatedFromUpdatedInstancesUsingLaunchConfiguration(aws.String("v2"), []*autoscaling.Instance{instance}) 19 | if err != nil { 20 | t.Fatal("Shouldn't have returned an error, but returned", err) 21 | } 22 | if len(outdated) != 1 || len(updated) != 0 { 23 | t.Error("Instance should've been outdated") 24 | } 25 | } 26 | 27 | func TestSeparateOutdatedFromUpdatedInstancesUsingLaunchConfiguration_whenInstanceIsUpdated(t *testing.T) { 28 | instance := cloudtest.CreateTestAutoScalingInstance("instance", "v1", nil, "InService") 29 | outdated, updated, err := SeparateOutdatedFromUpdatedInstancesUsingLaunchConfiguration(aws.String("v1"), []*autoscaling.Instance{instance}) 30 | if err != nil { 31 | t.Fatal("Shouldn't have returned an error, but returned", err) 32 | } 33 | if len(outdated) != 0 || len(updated) != 1 { 34 | t.Error("Instance should've been updated") 35 | } 36 | } 37 | 38 | func TestSeparateOutdatedFromUpdatedInstancesUsingLaunchConfiguration_whenOneInstanceIsUpdatedAndTwoInstancesAreOutdated(t *testing.T) { 39 | firstInstance := cloudtest.CreateTestAutoScalingInstance("old-1", "v1", nil, "InService") 40 | secondInstance := cloudtest.CreateTestAutoScalingInstance("old-2", "v1", nil, "InService") 41 | thirdInstance := cloudtest.CreateTestAutoScalingInstance("new", "v2", nil, "InService") 42 | outdated, updated, err := SeparateOutdatedFromUpdatedInstancesUsingLaunchConfiguration(aws.String("v2"), []*autoscaling.Instance{firstInstance, secondInstance, thirdInstance}) 43 | if err != nil { 44 | t.Fatal("Shouldn't have returned an error, but returned", err) 45 | } 46 | if len(outdated) != 2 { 47 | t.Error("2 instances should've been outdated") 48 | } 49 | if len(updated) != 1 { 50 | t.Error("1 instance should've been outdated") 51 | } 52 | } 53 | 54 | func TestSeparateOutdatedFromUpdatedInstancesUsingLaunchTemplate_whenInstanceIsOutdated(t *testing.T) { 55 
| outdatedLaunchTemplate := &autoscaling.LaunchTemplateSpecification{ 56 | LaunchTemplateId: aws.String("id"), 57 | LaunchTemplateName: aws.String("name"), 58 | Version: aws.String("v1"), 59 | } 60 | updatedLaunchTemplate := &autoscaling.LaunchTemplateSpecification{ 61 | LaunchTemplateId: aws.String("id"), 62 | LaunchTemplateName: aws.String("name"), 63 | Version: aws.String("v2"), 64 | } 65 | updatedEc2LaunchTemplate := &ec2.LaunchTemplate{ 66 | DefaultVersionNumber: aws.Int64(1), 67 | LatestVersionNumber: aws.Int64(10), 68 | LaunchTemplateId: updatedLaunchTemplate.LaunchTemplateId, 69 | LaunchTemplateName: updatedLaunchTemplate.LaunchTemplateName, 70 | } 71 | instance := cloudtest.CreateTestAutoScalingInstance("instance", "", outdatedLaunchTemplate, "InService") 72 | outdated, updated, err := SeparateOutdatedFromUpdatedInstancesUsingLaunchTemplate("test", updatedLaunchTemplate, nil, []*autoscaling.Instance{instance}, cloudtest.NewMockEC2Service([]*ec2.LaunchTemplate{updatedEc2LaunchTemplate})) 73 | if err != nil { 74 | t.Fatal("Shouldn't have returned an error, but returned:", err) 75 | } 76 | if len(outdated) != 1 || len(updated) != 0 { 77 | t.Error("Instance should've been outdated") 78 | } 79 | } 80 | 81 | func TestSeparateOutdatedFromUpdatedInstancesUsingLaunchTemplate_whenInstanceIsOutdatedDueToMixedInstancesPolicyInstanceTypeGettingRemoved(t *testing.T) { 82 | launchTemplate := &autoscaling.LaunchTemplateSpecification{ 83 | LaunchTemplateId: aws.String("id"), 84 | LaunchTemplateName: aws.String("name"), 85 | Version: aws.String("v1"), 86 | } 87 | updatedEc2LaunchTemplate := &ec2.LaunchTemplate{ 88 | DefaultVersionNumber: aws.Int64(1), 89 | LatestVersionNumber: aws.Int64(10), 90 | LaunchTemplateId: launchTemplate.LaunchTemplateId, 91 | LaunchTemplateName: launchTemplate.LaunchTemplateName, 92 | } 93 | instance := cloudtest.CreateTestAutoScalingInstance("instance", "", launchTemplate, "InService") 94 | instance.SetInstanceType("c5n.2xlarge") 95 | overrides := []*autoscaling.LaunchTemplateOverrides{ 96 | {InstanceType: aws.String("c5.2xlarge")}, 97 | {InstanceType: aws.String("c5d.2xlarge")}, 98 | } 99 | // Notice: The instance's instance type isn't part of the overrides. 
100 | outdated, updated, err := SeparateOutdatedFromUpdatedInstancesUsingLaunchTemplate("test", launchTemplate, overrides, []*autoscaling.Instance{instance}, cloudtest.NewMockEC2Service([]*ec2.LaunchTemplate{updatedEc2LaunchTemplate})) 101 | if err != nil { 102 | t.Fatal("Shouldn't have returned an error, but returned:", err) 103 | } 104 | if len(outdated) != 1 || len(updated) != 0 { 105 | t.Error("Instance should've been outdated") 106 | } 107 | } 108 | 109 | func TestSeparateOutdatedFromUpdatedInstancesUsingLaunchTemplate_whenInstanceIsUpdated(t *testing.T) { 110 | updatedLaunchTemplate := &autoscaling.LaunchTemplateSpecification{ 111 | LaunchTemplateId: aws.String("id"), 112 | LaunchTemplateName: aws.String("name"), 113 | Version: aws.String("v1"), 114 | } 115 | updatedEc2LaunchTemplate := &ec2.LaunchTemplate{ 116 | DefaultVersionNumber: aws.Int64(1), 117 | LatestVersionNumber: aws.Int64(10), 118 | LaunchTemplateId: updatedLaunchTemplate.LaunchTemplateId, 119 | LaunchTemplateName: updatedLaunchTemplate.LaunchTemplateName, 120 | } 121 | instance := cloudtest.CreateTestAutoScalingInstance("instance", "", updatedLaunchTemplate, "InService") 122 | outdated, updated, err := SeparateOutdatedFromUpdatedInstancesUsingLaunchTemplate("test", updatedLaunchTemplate, nil, []*autoscaling.Instance{instance}, cloudtest.NewMockEC2Service([]*ec2.LaunchTemplate{updatedEc2LaunchTemplate})) 123 | if err != nil { 124 | t.Fatal("Shouldn't have returned an error, but returned:", err) 125 | } 126 | if len(outdated) != 0 || len(updated) != 1 { 127 | t.Error("Instance should've been updated") 128 | } 129 | } 130 | 131 | func TestSeparateOutdatedFromUpdatedInstancesUsingLaunchTemplate_whenInstanceWithMixedInstancesPolicyIsUpdated(t *testing.T) { 132 | launchTemplate := &autoscaling.LaunchTemplateSpecification{ 133 | LaunchTemplateId: aws.String("id"), 134 | LaunchTemplateName: aws.String("name"), 135 | Version: aws.String("v1"), 136 | } 137 | updatedEc2LaunchTemplate := &ec2.LaunchTemplate{ 138 | DefaultVersionNumber: aws.Int64(1), 139 | LatestVersionNumber: aws.Int64(10), 140 | LaunchTemplateId: launchTemplate.LaunchTemplateId, 141 | LaunchTemplateName: launchTemplate.LaunchTemplateName, 142 | } 143 | instance := cloudtest.CreateTestAutoScalingInstance("instance", "", launchTemplate, "InService") 144 | instance.SetInstanceType("c5d.2xlarge") 145 | overrides := []*autoscaling.LaunchTemplateOverrides{ 146 | {InstanceType: aws.String("c5.2xlarge")}, 147 | {InstanceType: aws.String("c5d.2xlarge")}, 148 | } 149 | outdated, updated, err := SeparateOutdatedFromUpdatedInstancesUsingLaunchTemplate("test", launchTemplate, overrides, []*autoscaling.Instance{instance}, cloudtest.NewMockEC2Service([]*ec2.LaunchTemplate{updatedEc2LaunchTemplate})) 150 | if err != nil { 151 | t.Fatal("Shouldn't have returned an error, but returned:", err) 152 | } 153 | if len(outdated) != 0 || len(updated) != 1 { 154 | t.Error("Instance should've been updated") 155 | } 156 | } 157 | 158 | func TestSeparateOutdatedFromUpdatedInstancesUsingLaunchTemplate_whenInstanceWithMixedInstancesPolicyAndOverrideIsUpdated(t *testing.T) { 159 | launchTemplate := &autoscaling.LaunchTemplateSpecification{ 160 | LaunchTemplateId: aws.String("id"), 161 | LaunchTemplateName: aws.String("name"), 162 | Version: aws.String("v1"), 163 | } 164 | updatedEc2LaunchTemplate := &ec2.LaunchTemplate{ 165 | DefaultVersionNumber: aws.Int64(1), 166 | LatestVersionNumber: aws.Int64(10), 167 | LaunchTemplateId: launchTemplate.LaunchTemplateId, 168 | LaunchTemplateName: 
launchTemplate.LaunchTemplateName, 169 | } 170 | instance := cloudtest.CreateTestAutoScalingInstance("instance", "", launchTemplate, "InService") 171 | instance.SetInstanceType("c5d.2xlarge") 172 | instanceWithLaunchTemplateOverride := cloudtest.CreateTestAutoScalingInstance("instance", "", launchTemplate, "InService") 173 | instanceWithLaunchTemplateOverride.SetInstanceType("c5d.2xlarge") 174 | overrides := []*autoscaling.LaunchTemplateOverrides{ 175 | {InstanceType: aws.String("c5.2xlarge"), LaunchTemplateSpecification: launchTemplate}, 176 | {InstanceType: aws.String("c5d.2xlarge")}, 177 | } 178 | outdated, updated, err := SeparateOutdatedFromUpdatedInstancesUsingLaunchTemplate("test", launchTemplate, overrides, []*autoscaling.Instance{instance, instanceWithLaunchTemplateOverride}, cloudtest.NewMockEC2Service([]*ec2.LaunchTemplate{updatedEc2LaunchTemplate})) 179 | if err != nil { 180 | t.Fatal("Shouldn't have returned an error, but returned:", err) 181 | } 182 | if len(outdated) != 0 || len(updated) != 2 { 183 | t.Error("Instance should've been updated") 184 | } 185 | } 186 | 187 | func TestSeparateOutdatedFromUpdatedInstances_withLaunchConfigurationWhenOneInstanceIsUpdatedAndTwoInstancesAreOutdated(t *testing.T) { 188 | firstInstance := cloudtest.CreateTestAutoScalingInstance("old-1", "v1", nil, "InService") 189 | secondInstance := cloudtest.CreateTestAutoScalingInstance("old-2", "v1", nil, "InService") 190 | thirdInstance := cloudtest.CreateTestAutoScalingInstance("new", "v2", nil, "InService") 191 | 192 | asg := cloudtest.CreateTestAutoScalingGroup("asg", "v2", nil, []*autoscaling.Instance{firstInstance, secondInstance, thirdInstance}, false) 193 | 194 | outdated, updated, err := SeparateOutdatedFromUpdatedInstances(asg, nil) 195 | if err != nil { 196 | t.Fatal("Shouldn't have returned an error, but returned", err) 197 | } 198 | if len(outdated) != 2 { 199 | t.Error("2 instances should've been outdated") 200 | } 201 | if len(updated) != 1 { 202 | t.Error("1 instance should've been outdated") 203 | } 204 | } 205 | 206 | func TestHandleRollingUpgrade(t *testing.T) { 207 | oldInstance := cloudtest.CreateTestAutoScalingInstance("old-1", "v1", nil, "InService") 208 | asg := cloudtest.CreateTestAutoScalingGroup("asg", "v2", nil, []*autoscaling.Instance{oldInstance}, false) 209 | 210 | oldNode := k8stest.CreateTestNode("old-node-1", aws.StringValue(oldInstance.AvailabilityZone), aws.StringValue(oldInstance.InstanceId), "1000m", "1000Mi") 211 | oldNodePod := k8stest.CreateTestPod("old-pod-1", oldNode.Name, "100m", "100Mi", false, v1.PodRunning) 212 | 213 | mockClient := k8stest.NewMockClient([]v1.Node{oldNode}, []v1.Pod{oldNodePod}) 214 | mockEc2Service := cloudtest.NewMockEC2Service(nil) 215 | mockAutoScalingService := cloudtest.NewMockAutoScalingService([]*autoscaling.Group{asg}) 216 | 217 | // First run (Node rollout process gets marked as started) 218 | err := HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 219 | if err != nil { 220 | t.Error("unexpected error:", err) 221 | } 222 | if mockClient.Counter["UpdateNode"] != 1 { 223 | t.Error("Node should've been annotated, meaning that UpdateNode should've been called once") 224 | } 225 | oldNode = mockClient.Nodes[oldNode.Name] 226 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateStartedTimestamp]; !ok { 227 | t.Error("Node should've been annotated with", k8s.AnnotationRollingUpdateStartedTimestamp) 228 | } 229 | if _, ok := 
oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 230 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 231 | } 232 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateTerminatedTimestamp]; ok { 233 | t.Error("Node shouldn't have been terminated yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateTerminatedTimestamp) 234 | } 235 | 236 | // Second run (ASG's desired capacity gets increased) 237 | err = HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 238 | if err != nil { 239 | t.Error("unexpected error:", err) 240 | } 241 | if mockAutoScalingService.Counter["SetDesiredCapacity"] != 1 { 242 | t.Error("ASG should've been increased because there's no updated nodes yet") 243 | } 244 | asg = mockAutoScalingService.AutoScalingGroups[aws.StringValue(asg.AutoScalingGroupName)] 245 | if aws.Int64Value(asg.DesiredCapacity) != 2 { 246 | t.Error("The desired capacity of the ASG should've been increased to 2") 247 | } 248 | oldNode = mockClient.Nodes[oldNode.Name] 249 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 250 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 251 | } 252 | 253 | // Third run (Nothing changed) 254 | err = HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 255 | if err != nil { 256 | t.Error("unexpected error:", err) 257 | } 258 | if mockAutoScalingService.Counter["SetDesiredCapacity"] != 1 { 259 | t.Error("Desired capacity shouldn't have been updated") 260 | } 261 | asg = mockAutoScalingService.AutoScalingGroups[aws.StringValue(asg.AutoScalingGroupName)] 262 | if aws.Int64Value(asg.DesiredCapacity) != 2 { 263 | t.Error("The desired capacity of the ASG should've stayed at 2") 264 | } 265 | oldNode = mockClient.Nodes[oldNode.Name] 266 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 267 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 268 | } 269 | 270 | // Fourth run (new instance has been registered to ASG, but is pending) 271 | newInstance := cloudtest.CreateTestAutoScalingInstance("new-1", "v2", nil, "Pending") 272 | asg.Instances = append(asg.Instances, newInstance) 273 | err = HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 274 | if err != nil { 275 | t.Error("unexpected error:", err) 276 | } 277 | if mockAutoScalingService.Counter["SetDesiredCapacity"] != 1 { 278 | t.Error("Desired capacity shouldn't have been updated") 279 | } 280 | oldNode = mockClient.Nodes[oldNode.Name] 281 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 282 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 283 | } 284 | 285 | // Fifth run (new instance is now InService, but node has still not joined cluster (GetNodeByAutoScalingInstance should return not found)) 286 | newInstance.SetLifecycleState("InService") 287 | err = HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 288 | if err != nil { 289 | t.Error("unexpected error:", err) 290 | } 291 | oldNode = 
mockClient.Nodes[oldNode.Name] 292 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 293 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 294 | } 295 | 296 | // Sixth run (new instance has joined the cluster, but Kubelet isn't ready to accept pods yet) 297 | newNode := k8stest.CreateTestNode("new-node-1", aws.StringValue(newInstance.AvailabilityZone), aws.StringValue(newInstance.InstanceId), "1000m", "1000Mi") 298 | newNode.Status.Conditions = []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionFalse}} 299 | mockClient.Nodes[newNode.Name] = newNode 300 | err = HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 301 | if err != nil { 302 | t.Error("unexpected error:", err) 303 | } 304 | oldNode = mockClient.Nodes[oldNode.Name] 305 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 306 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 307 | } 308 | 309 | // Seventh run (Kubelet is ready to accept new pods. Old node gets drained and terminated) 310 | newNode = mockClient.Nodes[newNode.Name] 311 | newNode.Status.Conditions = []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionTrue}} 312 | mockClient.Nodes[newNode.Name] = newNode 313 | err = HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 314 | if err != nil { 315 | t.Error("unexpected error:", err) 316 | } 317 | oldNode = mockClient.Nodes[oldNode.Name] 318 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; !ok { 319 | t.Error("Node should've been drained") 320 | } 321 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateTerminatedTimestamp]; !ok { 322 | t.Error("Node should've been terminated") 323 | } 324 | } 325 | 326 | func TestHandleRollingUpgrade_withLaunchTemplate(t *testing.T) { 327 | oldLaunchTemplateSpecification := &autoscaling.LaunchTemplateSpecification{ 328 | LaunchTemplateId: aws.String("lt1"), 329 | LaunchTemplateName: aws.String("lt1"), 330 | Version: aws.String("1"), 331 | } 332 | newLaunchTemplateSpecification := &autoscaling.LaunchTemplateSpecification{ 333 | LaunchTemplateId: aws.String("lt1"), 334 | LaunchTemplateName: aws.String("lt1"), 335 | Version: aws.String("2"), 336 | } 337 | lt := &ec2.LaunchTemplate{ 338 | DefaultVersionNumber: aws.Int64(1), 339 | LatestVersionNumber: aws.Int64(1), 340 | LaunchTemplateId: aws.String("lt1"), 341 | LaunchTemplateName: aws.String("lt1"), 342 | } 343 | oldInstance := cloudtest.CreateTestAutoScalingInstance("old-1", "", oldLaunchTemplateSpecification, "InService") 344 | asg := cloudtest.CreateTestAutoScalingGroup("asg", "", newLaunchTemplateSpecification, []*autoscaling.Instance{oldInstance}, false) 345 | 346 | oldNode := k8stest.CreateTestNode("old-node-1", aws.StringValue(oldInstance.AvailabilityZone), aws.StringValue(oldInstance.InstanceId), "1000m", "1000Mi") 347 | oldNodePod := k8stest.CreateTestPod("old-pod-1", oldNode.Name, "100m", "100Mi", false, v1.PodRunning) 348 | 349 | mockClient := k8stest.NewMockClient([]v1.Node{oldNode}, []v1.Pod{oldNodePod}) 350 | mockEc2Service := cloudtest.NewMockEC2Service([]*ec2.LaunchTemplate{lt}) 351 | mockAutoScalingService := cloudtest.NewMockAutoScalingService([]*autoscaling.Group{asg}) 352 | 353 | // First run (Node rollout 
process gets marked as started) 354 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 355 | if mockClient.Counter["UpdateNode"] != 1 { 356 | t.Error("Node should've been annotated, meaning that UpdateNode should've been called once") 357 | } 358 | oldNode = mockClient.Nodes[oldNode.Name] 359 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateStartedTimestamp]; !ok { 360 | t.Error("Node should've been annotated with", k8s.AnnotationRollingUpdateStartedTimestamp) 361 | } 362 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 363 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 364 | } 365 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateTerminatedTimestamp]; ok { 366 | t.Error("Node shouldn't have been terminated yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateTerminatedTimestamp) 367 | } 368 | 369 | // Second run (ASG's desired capacity gets increased) 370 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 371 | if mockAutoScalingService.Counter["SetDesiredCapacity"] != 1 { 372 | t.Error("ASG should've been increased because there's no updated nodes yet") 373 | } 374 | asg = mockAutoScalingService.AutoScalingGroups[aws.StringValue(asg.AutoScalingGroupName)] 375 | if aws.Int64Value(asg.DesiredCapacity) != 2 { 376 | t.Error("The desired capacity of the ASG should've been increased to 2") 377 | } 378 | oldNode = mockClient.Nodes[oldNode.Name] 379 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 380 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 381 | } 382 | 383 | // Third run (Nothing changed) 384 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 385 | if mockAutoScalingService.Counter["SetDesiredCapacity"] != 1 { 386 | t.Error("Desired capacity shouldn't have been updated") 387 | } 388 | asg = mockAutoScalingService.AutoScalingGroups[aws.StringValue(asg.AutoScalingGroupName)] 389 | if aws.Int64Value(asg.DesiredCapacity) != 2 { 390 | t.Error("The desired capacity of the ASG should've stayed at 2") 391 | } 392 | oldNode = mockClient.Nodes[oldNode.Name] 393 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 394 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 395 | } 396 | 397 | // Fourth run (new instance has been registered to ASG, but is pending) 398 | newInstance := cloudtest.CreateTestAutoScalingInstance("new-1", "", newLaunchTemplateSpecification, "Pending") 399 | asg.Instances = append(asg.Instances, newInstance) 400 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 401 | if mockAutoScalingService.Counter["SetDesiredCapacity"] != 1 { 402 | t.Error("Desired capacity shouldn't have been updated") 403 | } 404 | oldNode = mockClient.Nodes[oldNode.Name] 405 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 406 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 407 | } 408 | 409 | // Fifth run (new instance is 
now InService, but node has still not joined cluster (GetNodeByAutoScalingInstance should return not found)) 410 | newInstance.SetLifecycleState("InService") 411 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 412 | oldNode = mockClient.Nodes[oldNode.Name] 413 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 414 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 415 | } 416 | 417 | // Sixth run (new instance has joined the cluster, but Kubelet isn't ready to accept pods yet) 418 | newNode := k8stest.CreateTestNode("new-node-1", aws.StringValue(newInstance.AvailabilityZone), aws.StringValue(newInstance.InstanceId), "1000m", "1000Mi") 419 | newNode.Status.Conditions = []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionFalse}} 420 | mockClient.Nodes[newNode.Name] = newNode 421 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 422 | oldNode = mockClient.Nodes[oldNode.Name] 423 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 424 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 425 | } 426 | 427 | // Seventh run (Kubelet is ready to accept new pods. Old node gets drained and terminated) 428 | newNode = mockClient.Nodes[newNode.Name] 429 | newNode.Status.Conditions = []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionTrue}} 430 | mockClient.Nodes[newNode.Name] = newNode 431 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 432 | oldNode = mockClient.Nodes[oldNode.Name] 433 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; !ok { 434 | t.Error("Node should've been drained") 435 | } 436 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateTerminatedTimestamp]; !ok { 437 | t.Error("Node should've been terminated") 438 | } 439 | } 440 | 441 | func TestHandleRollingUpgrade_withLaunchTemplateWhenLaunchTemplateDidNotUpdate(t *testing.T) { 442 | launchTemplateSpecification := &autoscaling.LaunchTemplateSpecification{ 443 | LaunchTemplateId: aws.String("lt1"), 444 | LaunchTemplateName: aws.String("lt1"), 445 | Version: aws.String("1"), 446 | } 447 | lt := &ec2.LaunchTemplate{ 448 | DefaultVersionNumber: aws.Int64(1), 449 | LatestVersionNumber: aws.Int64(1), 450 | LaunchTemplateId: aws.String("lt1"), 451 | LaunchTemplateName: aws.String("lt1"), 452 | } 453 | oldInstance := cloudtest.CreateTestAutoScalingInstance("old-1", "", launchTemplateSpecification, "InService") 454 | asg := cloudtest.CreateTestAutoScalingGroup("asg", "", launchTemplateSpecification, []*autoscaling.Instance{oldInstance}, false) 455 | 456 | oldNode := k8stest.CreateTestNode("old-node-1", aws.StringValue(oldInstance.AvailabilityZone), aws.StringValue(oldInstance.InstanceId), "1000m", "1000Mi") 457 | 458 | mockClient := k8stest.NewMockClient([]v1.Node{oldNode}, []v1.Pod{}) 459 | mockEc2Service := cloudtest.NewMockEC2Service([]*ec2.LaunchTemplate{lt}) 460 | mockAutoScalingService := cloudtest.NewMockAutoScalingService([]*autoscaling.Group{asg}) 461 | 462 | // First run (No changes, no updates) 463 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 464 | if mockClient.Counter["UpdateNode"] != 0 { 465 | 
t.Error("The LT hasn't been updated, therefore nothing should've changed") 466 | } 467 | } 468 | 469 | func TestHandleRollingUpgrade_withEnoughPodsToRequireTwoNewNodes(t *testing.T) { 470 | oldInstance := cloudtest.CreateTestAutoScalingInstance("old-1", "v1", nil, "InService") 471 | asg := cloudtest.CreateTestAutoScalingGroup("asg", "v2", nil, []*autoscaling.Instance{oldInstance}, false) 472 | 473 | oldNode := k8stest.CreateTestNode("old-node-1", aws.StringValue(oldInstance.AvailabilityZone), aws.StringValue(oldInstance.InstanceId), "1000m", "1000Mi") 474 | oldNodeFirstPod := k8stest.CreateTestPod("old-pod-1", oldNode.Name, "300m", "300Mi", false, v1.PodRunning) 475 | oldNodeSecondPod := k8stest.CreateTestPod("old-pod-2", oldNode.Name, "300m", "300Mi", false, v1.PodRunning) 476 | oldNodeThirdPod := k8stest.CreateTestPod("old-pod-3", oldNode.Name, "300m", "300Mi", false, v1.PodRunning) 477 | oldNodeFourthPod := k8stest.CreateTestPod("old-pod-4", oldNode.Name, "300m", "300Mi", false, v1.PodRunning) 478 | // This pod should be ignored, because the pod.Status.Phase is v1.PodFailed 479 | oldNodeFifthPod := k8stest.CreateTestPod("old-pod-5-evicted", oldNode.Name, "99999m", "99999Mi", false, v1.PodFailed) 480 | 481 | mockClient := k8stest.NewMockClient([]v1.Node{oldNode}, []v1.Pod{oldNodeFirstPod, oldNodeSecondPod, oldNodeThirdPod, oldNodeFourthPod, oldNodeFifthPod}) 482 | mockEc2Service := cloudtest.NewMockEC2Service(nil) 483 | mockAutoScalingService := cloudtest.NewMockAutoScalingService([]*autoscaling.Group{asg}) 484 | 485 | // First run (Node rollout process gets marked as started) 486 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 487 | if mockClient.Counter["UpdateNode"] != 1 { 488 | t.Error("Node should've been annotated, meaning that UpdateNode should've been called once") 489 | } 490 | oldNode = mockClient.Nodes[oldNode.Name] 491 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateStartedTimestamp]; !ok { 492 | t.Error("Node should've been annotated with", k8s.AnnotationRollingUpdateStartedTimestamp) 493 | } 494 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 495 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 496 | } 497 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateTerminatedTimestamp]; ok { 498 | t.Error("Node shouldn't have been terminated yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateTerminatedTimestamp) 499 | } 500 | 501 | // Second run (ASG's desired capacity gets increased) 502 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 503 | if mockAutoScalingService.Counter["SetDesiredCapacity"] != 1 { 504 | t.Error("ASG should've been increased because there's no updated nodes yet") 505 | } 506 | asg = mockAutoScalingService.AutoScalingGroups[aws.StringValue(asg.AutoScalingGroupName)] 507 | if aws.Int64Value(asg.DesiredCapacity) != 2 { 508 | t.Error("The desired capacity of the ASG should've been increased to 2") 509 | } 510 | oldNode = mockClient.Nodes[oldNode.Name] 511 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 512 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 513 | } 514 | 515 | // Third run (Nothing changed) 516 | 
HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 517 | if mockAutoScalingService.Counter["SetDesiredCapacity"] != 1 { 518 | t.Error("Desired capacity shouldn't have been updated") 519 | } 520 | asg = mockAutoScalingService.AutoScalingGroups[aws.StringValue(asg.AutoScalingGroupName)] 521 | if aws.Int64Value(asg.DesiredCapacity) != 2 { 522 | t.Error("The desired capacity of the ASG should've stayed at 2") 523 | } 524 | oldNode = mockClient.Nodes[oldNode.Name] 525 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 526 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 527 | } 528 | 529 | // Fourth run (new instance has been registered to ASG, but is pending) 530 | newInstance := cloudtest.CreateTestAutoScalingInstance("new-1", "v2", nil, "Pending") 531 | asg.Instances = append(asg.Instances, newInstance) 532 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 533 | if mockAutoScalingService.Counter["SetDesiredCapacity"] != 1 { 534 | t.Error("Desired capacity shouldn't have been updated") 535 | } 536 | oldNode = mockClient.Nodes[oldNode.Name] 537 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 538 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 539 | } 540 | 541 | // Fifth run (new instance is now InService, but node has still not joined cluster (GetNodeByAutoScalingInstance should return not found)) 542 | newInstance.SetLifecycleState("InService") 543 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 544 | oldNode = mockClient.Nodes[oldNode.Name] 545 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 546 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 547 | } 548 | 549 | // Sixth run (new instance has joined the cluster, but Kubelet isn't ready to accept pods yet) 550 | newNode := k8stest.CreateTestNode("new-node-1", aws.StringValue(newInstance.AvailabilityZone), aws.StringValue(newInstance.InstanceId), "1000m", "1000Mi") 551 | newNode.Status.Conditions = []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionFalse}} 552 | mockClient.Nodes[newNode.Name] = newNode 553 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 554 | oldNode = mockClient.Nodes[oldNode.Name] 555 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 556 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 557 | } 558 | 559 | // Seventh run (Kubelet is ready to accept new pods) 560 | newNode = mockClient.Nodes[newNode.Name] 561 | newNode.Status.Conditions = []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionTrue}} 562 | mockClient.Nodes[newNode.Name] = newNode 563 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 564 | oldNode = mockClient.Nodes[oldNode.Name] 565 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 566 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated 
with", k8s.AnnotationRollingUpdateDrainedTimestamp) 567 | } 568 | 569 | // Eight run (ASG's desired capacity gets increased) 570 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 571 | if mockAutoScalingService.Counter["SetDesiredCapacity"] != 2 { 572 | t.Error("ASG should've been increased again") 573 | } 574 | asg = mockAutoScalingService.AutoScalingGroups[aws.StringValue(asg.AutoScalingGroupName)] 575 | if aws.Int64Value(asg.DesiredCapacity) != 3 { 576 | t.Error("The desired capacity of the ASG should've been increased to 3") 577 | } 578 | oldNode = mockClient.Nodes[oldNode.Name] 579 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok { 580 | t.Error("Node shouldn't have been drained yet, therefore shouldn't have been annotated with", k8s.AnnotationRollingUpdateDrainedTimestamp) 581 | } 582 | 583 | // Ninth run (fast-forward new instance, node and kubelet ready to accept. Old node gets drained and terminated) 584 | newSecondInstance := cloudtest.CreateTestAutoScalingInstance("new-2", "v2", nil, "InService") 585 | asg.Instances = append(asg.Instances, newSecondInstance) 586 | newSecondNode := k8stest.CreateTestNode("new-node-2", aws.StringValue(newSecondInstance.AvailabilityZone), aws.StringValue(newSecondInstance.InstanceId), "1000m", "1000Mi") 587 | newSecondNode.Status.Conditions = []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionTrue}} 588 | mockClient.Nodes[newSecondNode.Name] = newSecondNode 589 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 590 | oldNode = mockClient.Nodes[oldNode.Name] 591 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateDrainedTimestamp]; !ok { 592 | t.Error("Node should've been drained") 593 | } 594 | if _, ok := oldNode.GetAnnotations()[k8s.AnnotationRollingUpdateTerminatedTimestamp]; !ok { 595 | t.Error("Node should've been terminated") 596 | } 597 | } 598 | 599 | // The mixed instance policy is not part of the launch template; it's part of the ASG itself. 600 | // This means that not only must we check the launch template version (it doesn't change in this test), but 601 | // we must also check if the instance's instance type is part of the MixedInstancesPolicy's instance types. 602 | // If it isn't, then it means the ASG has been modified, and the instance is old. 603 | func TestHandleRollingUpgrade_withMixedInstancePolicyWhenOneOfTheInstanceTypesOverrideChanges(t *testing.T) { 604 | launchTemplateSpecification := &autoscaling.LaunchTemplateSpecification{ 605 | LaunchTemplateId: aws.String("lt1"), 606 | LaunchTemplateName: aws.String("lt1"), 607 | Version: aws.String("1"), 608 | } 609 | lt := &ec2.LaunchTemplate{ 610 | DefaultVersionNumber: aws.Int64(1), 611 | LatestVersionNumber: aws.Int64(1), 612 | LaunchTemplateId: aws.String("lt1"), 613 | LaunchTemplateName: aws.String("lt1"), 614 | } 615 | oldInstance := cloudtest.CreateTestAutoScalingInstance("old-1", "", launchTemplateSpecification, "InService") 616 | // The LT has NOT changed, but we're setting withMixedInstancesPolicy to true 617 | asg := cloudtest.CreateTestAutoScalingGroup("asg", "", launchTemplateSpecification, []*autoscaling.Instance{oldInstance}, true) 618 | // We set the instance type to something isn't the default instance type, because the first one has the same value as the 619 | // Launch template version, meaning that modifying that one would likely trigger a new version to be created. 
620 | // What we're trying to test here is whether we're able to trigger a rolling update on an instance type that is no 621 | // longer part of the MixedInstancesPolicy overrides 622 | oldInstance.SetInstanceType(aws.StringValue(asg.MixedInstancesPolicy.LaunchTemplate.Overrides[1].InstanceType)) 623 | 624 | oldNode := k8stest.CreateTestNode("old-node-1", aws.StringValue(oldInstance.AvailabilityZone), aws.StringValue(oldInstance.InstanceId), "1000m", "1000Mi") 625 | 626 | mockClient := k8stest.NewMockClient([]v1.Node{oldNode}, []v1.Pod{}) 627 | mockEc2Service := cloudtest.NewMockEC2Service([]*ec2.LaunchTemplate{lt}) 628 | mockAutoScalingService := cloudtest.NewMockAutoScalingService([]*autoscaling.Group{asg}) 629 | 630 | // First run (Nothing changed) 631 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 632 | if mockClient.Counter["UpdateNode"] != 0 { 633 | t.Error("Nothing should've changed") 634 | } 635 | 636 | // Suddenly, the ASG's MixedInstancePolicy gets updated, and only the first instance type override is kept 637 | // The second instance type is the one that our old instance uses 638 | asg.MixedInstancesPolicy.SetLaunchTemplate(&autoscaling.LaunchTemplate{ 639 | LaunchTemplateSpecification: asg.MixedInstancesPolicy.LaunchTemplate.LaunchTemplateSpecification, 640 | Overrides: asg.MixedInstancesPolicy.LaunchTemplate.Overrides[0:1], 641 | }) 642 | 643 | // Second run 644 | HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 645 | if mockClient.Counter["UpdateNode"] != 1 { 646 | t.Error("The old instance's instance type is no longer part of the ASG's MixedInstancePolicy's LaunchTemplate overrides, therefore, it is outdated and should've been annotated") 647 | } 648 | } 649 | 650 | func TestHasAcceptableNumberOfUpdatedNonReadyNodes(t *testing.T) { 651 | // false: there's too many non-ready nodes 652 | // true: there's an acceptable amount of non-ready nodes given how many ready nodes there are 653 | if HasAcceptableNumberOfUpdatedNonReadyNodes(100, 0) { 654 | t.Error("100NR/0R ready should not be acceptable") 655 | } 656 | if HasAcceptableNumberOfUpdatedNonReadyNodes(50, 50) { 657 | t.Error("50NR/50R should not be acceptable") 658 | } 659 | if HasAcceptableNumberOfUpdatedNonReadyNodes(6, 10000) { 660 | t.Error("6NR/10000R should not be acceptable, because MaximumNumberOfUpdatedNonReadyNodes is set to", MaximumNumberOfUpdatedNonReadyNodes) 661 | } 662 | if !HasAcceptableNumberOfUpdatedNonReadyNodes(5, 10000) { 663 | t.Error("5NR/10000R should be acceptable") 664 | } 665 | if !HasAcceptableNumberOfUpdatedNonReadyNodes(4, 100) { 666 | t.Error("4NR/100R should be acceptable") 667 | } 668 | if !HasAcceptableNumberOfUpdatedNonReadyNodes(1, 99) { 669 | t.Error("1NR/99R should be acceptable") 670 | } 671 | if !HasAcceptableNumberOfUpdatedNonReadyNodes(0, 100) { 672 | t.Error("0NR/100R should be acceptable") 673 | } 674 | if !HasAcceptableNumberOfUpdatedNonReadyNodes(0, 1) { 675 | t.Error("0NR/1R should be acceptable") 676 | } 677 | if !HasAcceptableNumberOfUpdatedNonReadyNodes(0, 0) { 678 | t.Error("0NR/0R should be acceptable") 679 | } 680 | if !HasAcceptableNumberOfUpdatedNonReadyNodes(1, 11) { 681 | t.Error("1NR/11R should be acceptable") 682 | } 683 | } 684 | 685 | func TestHandleRollingUpgrade_withEagerCordoning(t *testing.T) { 686 | config.Set(nil, true, true, true, false) 687 | defer config.Set(nil, true, true, false, false) 688 | 689 | oldInstance1 := 
cloudtest.CreateTestAutoScalingInstance("old-1", "v1", nil, "InService") 690 | oldInstance2 := cloudtest.CreateTestAutoScalingInstance("old-2", "v1", nil, "InService") 691 | oldInstance3 := cloudtest.CreateTestAutoScalingInstance("old-3", "v1", nil, "InService") 692 | asg := cloudtest.CreateTestAutoScalingGroup("asg", "v2", nil, []*autoscaling.Instance{oldInstance1, oldInstance2, oldInstance3}, false) 693 | 694 | oldNode1 := k8stest.CreateTestNode("old-node-1", aws.StringValue(oldInstance1.AvailabilityZone), aws.StringValue(oldInstance1.InstanceId), "1000m", "1000Mi") 695 | oldNode2 := k8stest.CreateTestNode("old-node-2", aws.StringValue(oldInstance2.AvailabilityZone), aws.StringValue(oldInstance2.InstanceId), "1000m", "1000Mi") 696 | oldNode3 := k8stest.CreateTestNode("old-node-3", aws.StringValue(oldInstance3.AvailabilityZone), aws.StringValue(oldInstance3.InstanceId), "1000m", "1000Mi") 697 | oldNodePod1 := k8stest.CreateTestPod("old-pod-1", oldNode1.Name, "600m", "600Mi", false, v1.PodRunning) 698 | oldNodePod2 := k8stest.CreateTestPod("old-pod-2", oldNode2.Name, "600m", "600Mi", false, v1.PodRunning) 699 | oldNodePod3 := k8stest.CreateTestPod("old-pod-3", oldNode3.Name, "600m", "600Mi", false, v1.PodRunning) 700 | 701 | mockClient := k8stest.NewMockClient([]v1.Node{oldNode1, oldNode2, oldNode3}, []v1.Pod{oldNodePod1, oldNodePod2, oldNodePod3}) 702 | mockEc2Service := cloudtest.NewMockEC2Service(nil) 703 | mockAutoScalingService := cloudtest.NewMockAutoScalingService([]*autoscaling.Group{asg}) 704 | 705 | // First run (Node rollout process gets marked as started) 706 | err := HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 707 | if err != nil { 708 | t.Error("unexpected error:", err) 709 | } 710 | if mockClient.Counter["UpdateNode"] != 3 { 711 | t.Error("Nodes should've been annotated as started, meaning that UpdateNode should've been called three times") 712 | } 713 | // Make sure that all nodes were "eagerly cordoned" 714 | if mockClient.Counter["Cordon"] != 3 { 715 | t.Error("All nodes should've been eagerly cordoned, meaning that Cordon should've been called thrice, but was called", mockClient.Counter["Cordon"], "times") 716 | } 717 | } 718 | 719 | func TestHandleRollingUpgrade_withEagerCordoningDisabled(t *testing.T) { 720 | // explicitly setting this, but eager cordoning is disabled by default anyway 721 | config.Set(nil, true, true, false, true) 722 | defer config.Set(nil, true, true, true, false) 723 | 724 | oldInstance1 := cloudtest.CreateTestAutoScalingInstance("old-1", "v1", nil, "InService") 725 | oldInstance2 := cloudtest.CreateTestAutoScalingInstance("old-2", "v1", nil, "InService") 726 | asg := cloudtest.CreateTestAutoScalingGroup("asg", "v2", nil, []*autoscaling.Instance{oldInstance1, oldInstance2}, false) 727 | 728 | oldNode1 := k8stest.CreateTestNode("old-node-1", aws.StringValue(oldInstance1.AvailabilityZone), aws.StringValue(oldInstance1.InstanceId), "1000m", "1000Mi") 729 | oldNode2 := k8stest.CreateTestNode("old-node-2", aws.StringValue(oldInstance2.AvailabilityZone), aws.StringValue(oldInstance2.InstanceId), "1000m", "1000Mi") 730 | oldNodePod1 := k8stest.CreateTestPod("old-pod-1", oldNode1.Name, "600m", "600Mi", false, v1.PodRunning) 731 | oldNodePod2 := k8stest.CreateTestPod("old-pod-2", oldNode2.Name, "600m", "600Mi", false, v1.PodRunning) 732 | 733 | mockClient := k8stest.NewMockClient([]v1.Node{oldNode1, oldNode2}, []v1.Pod{oldNodePod1, oldNodePod2}) 734 | mockEc2Service := cloudtest.NewMockEC2Service(nil) 735 | 
mockAutoScalingService := cloudtest.NewMockAutoScalingService([]*autoscaling.Group{asg}) 736 | 737 | // First run (Node rollout process gets marked as started) 738 | err := HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 739 | if err != nil { 740 | t.Error("unexpected error:", err) 741 | } 742 | if mockClient.Counter["UpdateNode"] != 2 { 743 | t.Error("Nodes should've been annotated as started, meaning that UpdateNode should've been called twice") 744 | } 745 | // Make sure that all nodes were NOT "eagerly cordoned" 746 | if mockClient.Counter["Cordon"] != 0 { 747 | t.Error("Eager cordoning is not enabled, so no node should have been cordoned on the first execution") 748 | } 749 | } 750 | 751 | func TestHandleRollingUpgrade_withExcludeFromExternalLoadBalancers(t *testing.T) { 752 | config.Set(nil, true, true, false, true) 753 | defer config.Set(nil, true, true, false, false) 754 | 755 | oldInstance := cloudtest.CreateTestAutoScalingInstance("old-1", "v1", nil, "InService") 756 | asg := cloudtest.CreateTestAutoScalingGroup("asg", "v2", nil, []*autoscaling.Instance{oldInstance}, false) 757 | 758 | oldNode := k8stest.CreateTestNode("old-node-1", aws.StringValue(oldInstance.AvailabilityZone), aws.StringValue(oldInstance.InstanceId), "1000m", "1000Mi") 759 | oldNodePod := k8stest.CreateTestPod("old-pod-1", oldNode.Name, "100m", "100Mi", false, v1.PodRunning) 760 | 761 | mockClient := k8stest.NewMockClient([]v1.Node{oldNode}, []v1.Pod{oldNodePod}) 762 | mockEc2Service := cloudtest.NewMockEC2Service(nil) 763 | mockAutoScalingService := cloudtest.NewMockAutoScalingService([]*autoscaling.Group{asg}) 764 | 765 | // First run (Node rollout process gets marked as started) 766 | err := HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 767 | if err != nil { 768 | t.Error("unexpected error:", err) 769 | } 770 | if mockClient.Counter["UpdateNode"] != 1 { 771 | t.Error("Node should've been annotated, meaning that UpdateNode should've been called once") 772 | } 773 | 774 | // Second run (ASG's desired capacity gets increased) 775 | err = HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 776 | if err != nil { 777 | t.Error("unexpected error:", err) 778 | } 779 | if mockAutoScalingService.Counter["SetDesiredCapacity"] != 1 { 780 | t.Error("ASG should've been increased because there's no updated nodes yet") 781 | } 782 | asg = mockAutoScalingService.AutoScalingGroups[aws.StringValue(asg.AutoScalingGroupName)] 783 | if aws.Int64Value(asg.DesiredCapacity) != 2 { 784 | t.Error("The desired capacity of the ASG should've been increased to 2") 785 | } 786 | 787 | // Third run (Nothing changed) 788 | err = HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 789 | if err != nil { 790 | t.Error("unexpected error:", err) 791 | } 792 | if mockAutoScalingService.Counter["SetDesiredCapacity"] != 1 { 793 | t.Error("Desired capacity shouldn't have been updated") 794 | } 795 | asg = mockAutoScalingService.AutoScalingGroups[aws.StringValue(asg.AutoScalingGroupName)] 796 | if aws.Int64Value(asg.DesiredCapacity) != 2 { 797 | t.Error("The desired capacity of the ASG should've stayed at 2") 798 | } 799 | 800 | // Fourth run (new instance has been registered to ASG, but is pending) 801 | newInstance := cloudtest.CreateTestAutoScalingInstance("new-1", "v2", nil, "Pending") 802 | asg.Instances = append(asg.Instances, newInstance) 803 | 
err = HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 804 | if err != nil { 805 | t.Error("unexpected error:", err) 806 | } 807 | if mockAutoScalingService.Counter["SetDesiredCapacity"] != 1 { 808 | t.Error("Desired capacity shouldn't have been updated") 809 | } 810 | 811 | // Fifth run (new instance is now InService, but node has still not joined cluster (GetNodeByAutoScalingInstance should return not found)) 812 | newInstance.SetLifecycleState("InService") 813 | err = HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 814 | if err != nil { 815 | t.Error("unexpected error:", err) 816 | } 817 | 818 | // Sixth run (new instance has joined the cluster, but Kubelet isn't ready to accept pods yet) 819 | newNode := k8stest.CreateTestNode("new-node-1", aws.StringValue(newInstance.AvailabilityZone), aws.StringValue(newInstance.InstanceId), "1000m", "1000Mi") 820 | newNode.Status.Conditions = []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionFalse}} 821 | mockClient.Nodes[newNode.Name] = newNode 822 | err = HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 823 | if err != nil { 824 | t.Error("unexpected error:", err) 825 | } 826 | 827 | // Seventh run (Kubelet is ready to accept new pods. Old node gets drained and terminated) 828 | newNode = mockClient.Nodes[newNode.Name] 829 | newNode.Status.Conditions = []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionTrue}} 830 | mockClient.Nodes[newNode.Name] = newNode 831 | err = HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg}) 832 | if err != nil { 833 | t.Error("unexpected error:", err) 834 | } 835 | oldNode = mockClient.Nodes[oldNode.Name] 836 | if _, ok := oldNode.GetLabels()[k8s.LabelExcludeFromExternalLoadBalancers]; !ok { 837 | t.Error("Node should've been labeled") 838 | } 839 | } 840 | --------------------------------------------------------------------------------
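The rolling-upgrade tests above all walk an outdated node through the same annotation lifecycle: the handler first records a "started" timestamp, later a "drained" timestamp once replacement capacity is ready, and finally a "terminated" timestamp. The following is a small illustrative sketch, not code from the handler, showing how that progression can be read back using the annotation constants exposed by the repository's k8s package; the helper name rolloutPhase and the returned phase strings are invented for this example.

package sketch

import (
	"github.com/TwiN/aws-eks-asg-rolling-update-handler/k8s"
	v1 "k8s.io/api/core/v1"
)

// rolloutPhase reports how far an outdated node has progressed through the rolling
// update, based solely on the three annotations asserted throughout the tests above.
// Illustrative helper only; it is not part of the handler.
func rolloutPhase(node *v1.Node) string {
	annotations := node.GetAnnotations()
	if _, ok := annotations[k8s.AnnotationRollingUpdateTerminatedTimestamp]; ok {
		return "terminated"
	}
	if _, ok := annotations[k8s.AnnotationRollingUpdateDrainedTimestamp]; ok {
		return "drained"
	}
	if _, ok := annotations[k8s.AnnotationRollingUpdateStartedTimestamp]; ok {
		return "started"
	}
	return "not-started"
}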
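TestHasAcceptableNumberOfUpdatedNonReadyNodes pins the behaviour of HasAcceptableNumberOfUpdatedNonReadyNodes at the 5-versus-6 boundary. The sketch below is one implementation that satisfies every assertion in that test, under the assumption that the cap referenced by the test (MaximumNumberOfUpdatedNonReadyNodes) is 5; the handler's real function may additionally weigh the ready-node count, which this simplified version deliberately ignores.

package sketch

// maximumUpdatedNonReadyNodes assumes the cap is 5, inferred from the 5-vs-6 boundary
// asserted in the test; the real constant is defined in the handler's main package.
const maximumUpdatedNonReadyNodes = 5

// hasAcceptableNumberOfUpdatedNonReadyNodes caps the number of updated-but-not-ready
// nodes at a fixed maximum, regardless of how many nodes are ready (the second
// parameter is intentionally unused in this simplified sketch).
func hasAcceptableNumberOfUpdatedNonReadyNodes(updatedNonReadyNodes, _ int) bool {
	return updatedNonReadyNodes <= maximumUpdatedNonReadyNodes
}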
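The comment preceding TestHandleRollingUpgrade_withMixedInstancePolicyWhenOneOfTheInstanceTypesOverrideChanges explains that an instance can become outdated purely because its instance type was removed from the ASG's MixedInstancesPolicy overrides, even though the launch template version never changed. Below is a hedged sketch of that membership check, built on the same AWS SDK types the test manipulates; instanceTypeStillInOverrides is a hypothetical helper name, not the handler's actual function.

package sketch

import (
	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/service/autoscaling"
)

// instanceTypeStillInOverrides reports whether the instance's type is still listed in
// the ASG's MixedInstancesPolicy launch template overrides. If it is not, the instance
// would be considered outdated even though its launch template version still matches.
// Illustrative only; the handler's own detection logic may differ.
func instanceTypeStillInOverrides(instance *autoscaling.Instance, asg *autoscaling.Group) bool {
	if asg.MixedInstancesPolicy == nil || asg.MixedInstancesPolicy.LaunchTemplate == nil {
		// No mixed instances policy to compare against, so the type cannot be missing from it.
		return true
	}
	for _, override := range asg.MixedInstancesPolicy.LaunchTemplate.Overrides {
		if aws.StringValue(override.InstanceType) == aws.StringValue(instance.InstanceType) {
			return true
		}
	}
	return false
}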