├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.yml │ ├── config.yml │ ├── documentation_improvements.yml │ ├── feature_request.yml │ └── maintenance.yml ├── PULL_REQUEST_TEMPLATE.md ├── actions │ ├── delete-cluster │ │ └── action.yaml │ └── dep-setup │ │ └── action.yml ├── scripts │ ├── cluster_creation_1.22-us-east-1.yaml │ ├── cluster_creation_1.22-us-east-2.yaml │ ├── cluster_creation_1.23-us-east-2.yaml │ └── cluster_creation_1.24-us-east-1.yaml └── workflows │ ├── close-stale-issues.yaml │ ├── e2e-test-template.yaml │ ├── e2e-test.yaml │ ├── pr-title.yaml │ ├── publish.yaml │ └── validate.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── eksupgrade ├── __init__.py ├── __main__.py ├── cli.py ├── exceptions.py ├── models │ ├── __init__.py │ ├── base.py │ └── eks.py ├── src │ ├── S3Files │ │ ├── coredns.json │ │ ├── kube-proxy-configmap.json │ │ ├── kube-proxy.json │ │ ├── version_dict.json │ │ └── vpc-cni.json │ ├── __init__.py │ ├── boto_aws.py │ ├── eks_get_image_type.py │ ├── k8s_client.py │ ├── latest_ami.py │ └── self_managed.py ├── starter.py └── utils.py ├── poetry.lock ├── pyproject.toml └── tests ├── __init__.py ├── conftest.py ├── test_cli.py ├── test_eks_get_image_type.py ├── test_k8s_client.py ├── test_models_base.py ├── test_models_eks.py └── test_utils.py /.github/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- 1 | name: Bug report 2 | description: Report a reproducible bug to help us improve 3 | title: "Bug: TITLE" 4 | labels: ["bug", "triage"] 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | Thank you for submitting a bug report. Please add as much information as possible to help us reproduce, and remove any potential sensitive data. 
10 | - type: textarea 11 | id: expected_behaviour 12 | attributes: 13 | label: Expected Behaviour 14 | description: Please share details on the behaviour you expected 15 | validations: 16 | required: true 17 | - type: textarea 18 | id: current_behaviour 19 | attributes: 20 | label: Current Behaviour 21 | description: Please share details on the current issue 22 | validations: 23 | required: true 24 | - type: textarea 25 | id: code_snippet 26 | attributes: 27 | label: Code snippet 28 | description: Please share a code snippet to help us reproduce the issue 29 | render: python3 30 | validations: 31 | required: true 32 | - type: textarea 33 | id: solution 34 | attributes: 35 | label: Possible Solution 36 | description: If known, please suggest a potential resolution 37 | validations: 38 | required: false 39 | - type: textarea 40 | id: steps 41 | attributes: 42 | label: Steps to Reproduce 43 | description: Please share how we might be able to reproduce this issue 44 | validations: 45 | required: true 46 | - type: input 47 | id: version 48 | attributes: 49 | label: Amazon EKS upgrade version 50 | placeholder: "latest" 51 | value: latest 52 | validations: 53 | required: true 54 | - type: dropdown 55 | id: runtime 56 | attributes: 57 | label: Python runtime version 58 | options: 59 | - "3.8" 60 | - "3.9" 61 | - "3.10" 62 | - "3.11" 63 | validations: 64 | required: true 65 | - type: dropdown 66 | id: packaging 67 | attributes: 68 | label: Packaging format used 69 | options: 70 | - Git clone 71 | - PyPI 72 | multiple: true 73 | validations: 74 | required: true 75 | - type: textarea 76 | id: logs 77 | attributes: 78 | label: Debugging logs 79 | description: If available, please share debugging logs 80 | render: python3 81 | validations: 82 | required: false 83 | - type: markdown 84 | attributes: 85 | value: | 86 | --- 87 | 88 | **Disclaimer**: We value your time and bandwidth. As such, any pull requests created on non-triaged issues might not be successful. 89 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: Ask a question 4 | url: https://github.com/aws-samples/eks-cluster-upgrade/discussions/new 5 | about: Ask a general question about Amazon EKS cluster upgrade 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation_improvements.yml: -------------------------------------------------------------------------------- 1 | name: Documentation improvements 2 | description: Suggest a documentation update to improve everyone's experience 3 | title: "Docs: TITLE" 4 | labels: ["documentation", "triage"] 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | Thank you for helping us improve everyone's experience. We review documentation updates on a case-by-case basis. 10 | - type: textarea 11 | id: search_area 12 | attributes: 13 | label: What were you searching in the docs? 14 | description: Please help us understand how you looked for information that was either unclear or not available 15 | validations: 16 | required: true 17 | - type: input 18 | id: area 19 | attributes: 20 | label: Is this related to an existing documentation section? 21 | description: Please share a link, if applicable 22 | validations: 23 | required: false 24 | - type: textarea 25 | id: idea 26 | attributes: 27 | label: How can we improve?
28 | description: Please share your thoughts on how we can improve this experience 29 | validations: 30 | required: true 31 | - type: textarea 32 | id: suggestion 33 | attributes: 34 | label: Got a suggestion in mind? 35 | description: Please suggest a proposed update 36 | validations: 37 | required: false 38 | - type: checkboxes 39 | id: acknowledgment 40 | attributes: 41 | label: Acknowledgment 42 | options: 43 | - label: I understand the final update might differ from my proposed suggestion, or may be declined. 44 | required: true 45 | - type: markdown 46 | attributes: 47 | value: | 48 | --- 49 | 50 | **Disclaimer**: We value your time and bandwidth. As such, any pull requests created on non-triaged issues might not be successful. 51 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yml: -------------------------------------------------------------------------------- 1 | name: Feature request 2 | description: Suggest an idea 3 | title: "Feature request: TITLE" 4 | labels: ["feature-request", "triage"] 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | Thank you for taking the time to suggest an idea for the project. 10 | 11 | *Future readers*: Please react with 👍 and your use case to help us understand customer demand. 12 | - type: textarea 13 | id: problem 14 | attributes: 15 | label: Use case 16 | description: Please help us understand your use case or the problem you're facing 17 | validations: 18 | required: true 19 | - type: textarea 20 | id: suggestion 21 | attributes: 22 | label: Solution/User Experience 23 | description: Please share what a good solution to this use case would look like 24 | validations: 25 | required: true 26 | - type: textarea 27 | id: alternatives 28 | attributes: 29 | label: Alternative solutions 30 | description: Please describe alternative solutions to this use case, if any 31 | render: Markdown 32 | validations: 33 | required: false 34 | - type: markdown 35 | attributes: 36 | value: | 37 | --- 38 | 39 | **Disclaimer**: We value your time and bandwidth. As such, any pull requests created on non-triaged issues might not be successful. 40 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/maintenance.yml: -------------------------------------------------------------------------------- 1 | name: Maintenance 2 | description: Suggest an activity to help address tech debt, governance, and anything internal 3 | title: "Maintenance: TITLE" 4 | labels: ["internal", "triage"] 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | Thank you for taking the time to help us improve operational excellence. 10 | 11 | *Future readers*: Please react with 👍 and your use case to help us understand customer demand. 12 | - type: textarea 13 | id: activity 14 | attributes: 15 | label: Summary 16 | description: Please provide an overview in one or two paragraphs 17 | validations: 18 | required: true 19 | - type: textarea 20 | id: importance 21 | attributes: 22 | label: Why is this needed? 23 | description: Please help us understand the value so we can prioritize it accordingly 24 | validations: 25 | required: true 26 | - type: dropdown 27 | id: area 28 | attributes: 29 | label: Which area does this relate to?
30 | multiple: true 31 | options: 32 | - Automation 33 | - Dependencies 34 | - Governance 35 | - Tests 36 | - Other 37 | - type: textarea 38 | id: suggestion 39 | attributes: 40 | label: Solution 41 | description: If available, please share what a good solution would look like 42 | validations: 43 | required: false 44 | - type: markdown 45 | attributes: 46 | value: | 47 | --- 48 | 49 | **Disclaimer**: We value your time and bandwidth. As such, any pull requests created on non-triaged issues might not be successful. 50 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Summary 4 | 5 | 6 | Resolves: # 7 | 8 | ### Changes 9 | 10 | > Please provide a summary of what's being changed 11 | 12 | ### User experience 13 | 14 | > Please share what the user experience looks like before and after this change 15 | 16 | ## Checklist 17 | 18 | If any items don't apply to your change, please leave them unchecked. 19 | 20 | - [ ] I have performed a self-review of this change 21 | - [ ] Changes have been tested 22 | - [ ] Changes are documented 23 | 24 | ## Acknowledgment 25 | 26 | By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice. 27 | 28 | **Disclaimer**: We value your time and bandwidth. As such, any pull requests created on non-triaged issues might not be successful. 29 | -------------------------------------------------------------------------------- /.github/actions/delete-cluster/action.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Delete EKS Cluster 3 | description: 'Re-usable action to delete the active test clusters' 4 | 5 | inputs: 6 | eks_version: 7 | description: 'The EKS version of the cluster to delete' 8 | required: true 9 | iam_role: 10 | description: 'IAM role to be assumed by GitHub actions' 11 | required: true 12 | region: 13 | description: 'AWS Region to be used' 14 | required: true 15 | 16 | 17 | runs: 18 | using: composite 19 | 20 | steps: 21 | - name: Configure AWS Credentials 22 | uses: aws-actions/configure-aws-credentials@v2 23 | with: 24 | role-to-assume: ${{ inputs.iam_role }} 25 | aws-region: ${{ inputs.region }} 26 | role-duration-seconds: 3600 27 | role-session-name: GithubActions-Session 28 | - name: setup eksctl 29 | shell: bash 30 | run: | 31 | curl --silent --location "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp 32 | sudo mv /tmp/eksctl /usr/local/bin 33 | eksctl version 34 | - name: Destroy the cluster 35 | shell: bash 36 | id: destroy_cluster 37 | run: | 38 | cluster_version='${{ inputs.eks_version }}' 39 | echo "Destroying the temporary cluster eksup-cluster-${cluster_version/./-}-'${{inputs.region}}'" 40 | eksctl delete cluster -f .github/scripts/cluster_creation_${{inputs.eks_version}}-${{inputs.region}}.yaml 41 | -------------------------------------------------------------------------------- /.github/actions/dep-setup/action.yml: -------------------------------------------------------------------------------- 1 | name: Dependency Setup 2 | description: 'Action to setup the runtime environment for CI jobs.'
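# This composite action sets up the requested Python version, restores or
# installs Poetry plus the poethepoet plugin, validates pyproject.toml via
# `poetry check`, and then restores or installs the project dependencies into
# an in-project virtualenv, as the steps below show.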
3 | 4 | inputs: 5 | python-version: 6 | description: 'The Python version to be used during setup' 7 | required: true 8 | 9 | runs: 10 | using: "composite" 11 | steps: 12 | - name: Setup Python 13 | uses: actions/setup-python@v4 14 | with: 15 | python-version: '${{ inputs.python-version }}' 16 | 17 | - name: Cache Poetry 18 | id: cache-poetry 19 | uses: actions/cache@v3 20 | with: 21 | path: ${{github.workspace}}/.poetry 22 | key: poetry-self-${{ hashFiles('.github/workflows/*.yml') }} 23 | restore-keys: poetry-self- 24 | 25 | - name: Install Poetry 26 | if: steps.cache-poetry.outputs.cache-hit != 'true' 27 | shell: bash 28 | run: | 29 | export POETRY_HOME=${{github.workspace}}/.poetry 30 | curl -sSL https://install.python-poetry.org -o install-poetry.py 31 | python install-poetry.py --preview 32 | rm install-poetry.py 33 | 34 | - name: Add Poetry to $PATH 35 | shell: bash 36 | run: echo "${{github.workspace}}/.poetry/bin" >> $GITHUB_PATH 37 | 38 | - name: Add poethepoet plugin 39 | shell: bash 40 | run: poetry self add 'poethepoet[poetry_plugin]' 41 | 42 | - name: Poetry Version 43 | shell: bash 44 | run: poetry --version 45 | 46 | - name: Check pyproject.toml validity 47 | shell: bash 48 | run: poetry check --no-interaction 49 | 50 | - name: Cache Dependencies 51 | id: cache-deps 52 | uses: actions/cache@v3 53 | with: 54 | path: ${{github.workspace}}/.venv 55 | key: poetry-deps-${{ hashFiles('**/poetry.lock') }} 56 | restore-keys: poetry-deps- 57 | 58 | - name: Install Deps 59 | if: steps.cache-deps.outputs.cache-hit != 'true' 60 | shell: bash 61 | run: | 62 | poetry config virtualenvs.in-project true 63 | poetry install --no-interaction 64 | -------------------------------------------------------------------------------- /.github/scripts/cluster_creation_1.22-us-east-1.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: eksctl.io/v1alpha5 3 | kind: ClusterConfig 4 | 5 | metadata: 6 | name: eksup-cluster-1-22-us-east-1 7 | region: us-east-1 8 | version: "1.22" 9 | managedNodeGroups: 10 | - name: eksup-managed-ng 11 | instanceType: t2.large 12 | minSize: 2 13 | maxSize: 2 14 | desiredCapacity: 2 15 | iam: 16 | withAddonPolicies: 17 | externalDNS: true 18 | certManager: true 19 | addons: 20 | - name: vpc-cni 21 | - name: coredns 22 | - name: kube-proxy 23 | availabilityZones: ['us-east-1a','us-east-1b'] -------------------------------------------------------------------------------- /.github/scripts/cluster_creation_1.22-us-east-2.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: eksctl.io/v1alpha5 3 | kind: ClusterConfig 4 | 5 | metadata: 6 | name: eksup-cluster-1-22-us-east-2 7 | region: us-east-2 8 | version: "1.22" 9 | 10 | managedNodeGroups: 11 | - name: linux-ng 12 | instanceType: t2.large 13 | minSize: 2 14 | maxSize: 2 15 | - name: windows-managed-ng 16 | amiFamily: WindowsServer2019FullContainer 17 | minSize: 2 18 | maxSize: 2 19 | addons: 20 | - name: vpc-cni 21 | - name: coredns 22 | - name: kube-proxy 23 | availabilityZones: ['us-east-2a','us-east-2b'] 24 | -------------------------------------------------------------------------------- /.github/scripts/cluster_creation_1.23-us-east-2.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: eksctl.io/v1alpha5 3 | kind: ClusterConfig 4 | 5 | metadata: 6 | name: eksup-cluster-1-23-us-east-2 7 | region: us-east-2 8 | version: "1.23" 9 | managedNodeGroups: 10 | - name:
eksup-managed-ng 11 | instanceType: t2.large 12 | minSize: 2 13 | maxSize: 2 14 | desiredCapacity: 2 15 | iam: 16 | withAddonPolicies: 17 | externalDNS: true 18 | certManager: true 19 | addons: 20 | - name: vpc-cni 21 | - name: coredns 22 | - name: kube-proxy 23 | availabilityZones: ['us-east-2a','us-east-2b'] -------------------------------------------------------------------------------- /.github/scripts/cluster_creation_1.24-us-east-1.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: eksctl.io/v1alpha5 3 | kind: ClusterConfig 4 | 5 | metadata: 6 | name: eksup-cluster-1-24-us-east-1 7 | region: us-east-1 8 | version: "1.24" 9 | managedNodeGroups: 10 | - name: eksup-managed-ng 11 | instanceType: t2.large 12 | minSize: 2 13 | maxSize: 2 14 | desiredCapacity: 2 15 | iam: 16 | withAddonPolicies: 17 | externalDNS: true 18 | certManager: true 19 | addons: 20 | - name: vpc-cni 21 | - name: coredns 22 | - name: kube-proxy 23 | availabilityZones: ['us-east-1a','us-east-1b'] -------------------------------------------------------------------------------- /.github/workflows/close-stale-issues.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | name: "Close Stale Issues" 3 | 4 | on: 5 | workflow_dispatch: 6 | schedule: 7 | - cron: "0 0 * * *" 8 | 9 | jobs: 10 | issue-cleanup: 11 | permissions: 12 | issues: write 13 | contents: read 14 | pull-requests: write 15 | runs-on: ubuntu-latest 16 | name: Stale issue 17 | steps: 18 | - uses: aws-actions/stale-issue-cleanup@v6 19 | with: 20 | issue-types: issues 21 | ancient-issue-message: This issue has not received any attention in 30 days. If you want to keep this issue open, please leave a comment below and auto-close will be canceled. 22 | stale-issue-message: This issue has not received a response in a while. If you want to keep this issue open, please leave a comment below and auto-close will be canceled. 23 | stale-issue-label: closing-soon 24 | exempt-issue-labels: no-autoclose, feature-request 25 | response-requested-label: response-requested 26 | # Don't set closed-for-staleness label to skip closing very old issues regardless of label 27 | closed-for-staleness-label: closed-for-staleness 28 | # Issue timing 29 | days-before-stale: 7 30 | days-before-close: 14 31 | days-before-ancient: 30 32 | # If you don't want to mark an issue as being ancient based on a threshold of "upvotes", you can set this here. 33 | # An "upvote" is the total number of +1, heart, hooray, and rocket reactions on an issue. 
34 | minimum-upvotes-to-exempt: 5 35 | loglevel: DEBUG 36 | dry-run: false 37 | repo-token: ${{ secrets.GITHUB_TOKEN }} -------------------------------------------------------------------------------- /.github/workflows/e2e-test-template.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Cluster Upgrade tests - workflow 3 | 4 | on: 5 | workflow_call: 6 | inputs: 7 | eksupgrade_version: 8 | type: string 9 | default: 'latest' 10 | trigger_tests: 11 | type: string 12 | default: 'true' 13 | trigger_destroy: 14 | type: string 15 | default: 'true' 16 | current_version: 17 | type: string 18 | required: true 19 | cluster_suffix: 20 | type: string 21 | required: true 22 | target_version: 23 | type: string 24 | required: true 25 | target_region: 26 | type: string 27 | required: true 28 | secrets: 29 | git_role: 30 | required: true 31 | 32 | jobs: 33 | create-cluster: 34 | name: Cluster creation 35 | timeout-minutes: 120 36 | runs-on: ubuntu-latest 37 | permissions: 38 | id-token: write 39 | contents: read 40 | environment: dev 41 | steps: 42 | - name: Checkout 43 | uses: actions/checkout@v3 44 | - name: Configure AWS Credentials 45 | uses: aws-actions/configure-aws-credentials@v2 46 | with: 47 | role-to-assume: ${{ secrets.git_role }} 48 | aws-region: ${{inputs.target_region}} 49 | role-duration-seconds: 3600 50 | role-session-name: GithubActions-Session 51 | - name: setup eksctl 52 | run: | 53 | curl --silent --location "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp 54 | sudo mv /tmp/eksctl /usr/local/bin 55 | eksctl version 56 | - name: create eks cluster 57 | id: create_cluster 58 | run: | 59 | echo "Creating the cluster eksup-cluster-'${{inputs.cluster_suffix}}'-'${{inputs.target_region}}'" 60 | eksctl create cluster -f .github/scripts/cluster_creation_${{inputs.current_version}}-${{inputs.target_region}}.yaml 61 | 62 | 63 | cluster-upgrade-check: 64 | name: Cluster Upgrade check 65 | timeout-minutes: 120 66 | runs-on: ubuntu-latest 67 | needs: create-cluster 68 | permissions: 69 | id-token: write 70 | contents: read 71 | environment: dev 72 | steps: 73 | - name: Checkout 74 | uses: actions/checkout@v3 75 | - uses: actions/setup-python@v4 76 | with: 77 | python-version: '3.10' 78 | - name: install latest utility version 79 | id: install_deps_latest 80 | if: ${{inputs.eksupgrade_version == 'latest' }} 81 | run: | 82 | python3 -m pip install eksupgrade 83 | - name: install utility version from a ref 84 | id: install_deps_from_ref 85 | if: ${{inputs.eksupgrade_version == 'main' }} 86 | run: | 87 | python3 -m pip install git+https://github.com/aws-samples/eks-cluster-upgrade.git@main 88 | - name: Configure AWS Credentials 89 | uses: aws-actions/configure-aws-credentials@v2 90 | with: 91 | role-to-assume: ${{ secrets.git_role }} 92 | aws-region: ${{inputs.target_region}} 93 | role-duration-seconds: 3600 94 | role-session-name: GithubActions-Session 95 | - name: Test no upgrade 96 | id: no_upgrade 97 | run: | 98 | echo "Test the cluster eksup-cluster-${{inputs.cluster_suffix}} with upgrade set to current version" 99 | eksupgrade eksup-cluster-${{inputs.cluster_suffix}}-${{inputs.target_region}} ${{inputs.current_version}} ${{inputs.target_region}} --no-interactive 100 | - name: Test standalone addon upgrade 101 | id: addon_upgrade 102 | run: | 103 | echo "Running upgrade addon checks on the cluster eksup-cluster-${{inputs.cluster_suffix}} with same control plane version " 104 | 
eksupgrade eksup-cluster-${{inputs.cluster_suffix}}-${{inputs.target_region}} ${{inputs.current_version}} ${{inputs.target_region}} --latest-addons --no-interactive 105 | - name: Configure AWS Credentials-2 # added to handle the Windows nodegroup updates, which take more time 106 | uses: aws-actions/configure-aws-credentials@v2 107 | with: 108 | role-to-assume: ${{ secrets.git_role }} 109 | aws-region: ${{inputs.target_region}} 110 | role-duration-seconds: 3600 111 | role-session-name: GithubActions-Session 112 | - name: Test cluster upgrade 113 | id: cluster_upgrade 114 | run: | 115 | echo "Upgrading the cluster eksup-cluster-${{inputs.cluster_suffix}}-${{inputs.target_region}} to version ${{inputs.target_version}} " 116 | eksupgrade eksup-cluster-${{inputs.cluster_suffix}}-${{inputs.target_region}} ${{inputs.target_version}} ${{inputs.target_region}} --no-interactive 117 | - name: Destroy the cluster 118 | if: ${{inputs.trigger_destroy == 'true'}} 119 | uses: './.github/actions/delete-cluster' 120 | with: 121 | eks_version: ${{inputs.current_version}} 122 | iam_role: ${{ secrets.git_role }} 123 | region: ${{inputs.target_region}} 124 | -------------------------------------------------------------------------------- /.github/workflows/e2e-test.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | name: E2E Cluster Upgrade 3 | 4 | on: 5 | workflow_dispatch: 6 | inputs: 7 | eksupgrade_version: 8 | type: choice 9 | options: ['latest', 'main'] 10 | description: Select the eksupgrade version you want to use 11 | default: 'latest' 12 | trigger_tests: 13 | type: choice 14 | options: ['true', 'false'] 15 | description: Select if you want to trigger cluster creation and tests 16 | default: 'true' 17 | trigger_destroy: 18 | type: choice 19 | options: ['true', 'false'] 20 | default: 'true' 21 | description: Destroy active test clusters?
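# Note: the `concurrency` group below serializes E2E runs; since the test
# clusters use fixed names per version/region (see .github/scripts/), overlapping
# runs against the same cluster would collide.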
22 | 23 | 24 | concurrency: upgrade-test 25 | 26 | jobs: 27 | cluster-upgrade-checks: 28 | if: ${{inputs.trigger_tests == 'true'}} 29 | permissions: 30 | id-token: write 31 | contents: read 32 | strategy: 33 | matrix: 34 | include: 35 | - current_version: 1.22 36 | cluster_suffix: 1-22 37 | target_version: 1.23 38 | target_region: "us-east-2" 39 | - current_version: 1.22 40 | cluster_suffix: 1-22 41 | target_version: 1.23 42 | target_region: "us-east-1" 43 | - current_version: 1.23 44 | cluster_suffix: 1-23 45 | target_version: 1.24 46 | target_region: "us-east-2" 47 | - current_version: 1.24 48 | cluster_suffix: 1-24 49 | target_version: 1.25 50 | target_region: "us-east-1" 51 | uses: ./.github/workflows/e2e-test-template.yaml 52 | with: 53 | eksupgrade_version: ${{github.event.inputs.eksupgrade_version}} 54 | trigger_tests: ${{github.event.inputs.trigger_tests}} 55 | trigger_destroy: ${{github.event.inputs.trigger_destroy}} 56 | current_version: ${{matrix.current_version}} 57 | cluster_suffix: ${{matrix.cluster_suffix}} 58 | target_version: ${{matrix.target_version}} 59 | target_region: ${{matrix.target_region}} 60 | 61 | secrets: 62 | git_role: ${{ secrets.ROLE_TO_ASSUME }} 63 | 64 | 65 | destroy-cluster: 66 | if: ${{inputs.trigger_destroy == 'true' && inputs.trigger_tests == 'false' }} 67 | name: Delete Active Clusters 68 | timeout-minutes: 120 69 | runs-on: ubuntu-latest 70 | permissions: 71 | id-token: write 72 | contents: read 73 | environment: dev 74 | strategy: 75 | matrix: 76 | include: 77 | - eks_version: 1.22 78 | target_region: "us-east-2" 79 | - eks_version: 1.22 80 | target_region: "us-east-1" 81 | - eks_version: 1.23 82 | target_region: "us-east-2" 83 | - eks_version: 1.24 84 | target_region: "us-east-1" 85 | steps: 86 | - name: Checkout 87 | uses: actions/checkout@v3 88 | - name: Delete cluster 89 | uses: './.github/actions/delete-cluster' 90 | with: 91 | eks_version: ${{matrix.eks_version}} 92 | iam_role: ${{ secrets.ROLE_TO_ASSUME }} 93 | region: ${{matrix.target_region}} -------------------------------------------------------------------------------- /.github/workflows/pr-title.yaml: -------------------------------------------------------------------------------- 1 | name: 'PR title' 2 | 3 | on: 4 | pull_request_target: 5 | types: 6 | - opened 7 | - edited 8 | - synchronize 9 | 10 | jobs: 11 | main: 12 | name: Validate PR title 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: amannn/action-semantic-pull-request@v5.0.2 16 | env: 17 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 18 | with: 19 | requireScope: false 20 | subjectPattern: ^[A-Z].+$ 21 | subjectPatternError: | 22 | The subject "{subject}" found in the pull request title "{title}" 23 | didn't match the configured pattern. Please ensure that the subject 24 | starts with an uppercase character. 
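# As we understand the action-semantic-pull-request options used here:
# `wip: true` reports a pending (rather than failing) status for
# work-in-progress PR titles, and `validateSingleCommit: false` validates only
# the PR title rather than the underlying commit messages.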
25 | wip: true 26 | validateSingleCommit: false 27 | -------------------------------------------------------------------------------- /.github/workflows/publish.yaml: -------------------------------------------------------------------------------- 1 | name: 'Publish Release' 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' 7 | 8 | jobs: 9 | publish: 10 | name: Publish Release 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout Source 14 | uses: actions/checkout@v3 15 | with: 16 | fetch-depth: 0 17 | 18 | - name: Setup Dependencies 19 | uses: './.github/actions/dep-setup' 20 | with: 21 | python-version: '3.10' 22 | 23 | - name: Run Safety Check 24 | run: poetry poe safety 25 | 26 | - name: Get Python Module Version 27 | run: | 28 | MODULE_VERSION=$(poetry version --short) 29 | echo "MODULE_VERSION=$MODULE_VERSION" >> $GITHUB_ENV 30 | 31 | - name: Verify Versions Match 32 | run: | 33 | TAG_VERSION=$(git describe HEAD --tags --abbrev=0) 34 | echo "Git Tag Version: $TAG_VERSION" 35 | echo "Python Module Version: $MODULE_VERSION" 36 | if [[ "$TAG_VERSION" != "$MODULE_VERSION" ]]; then exit 1; fi 37 | 38 | - name: Publish to PyPi 39 | run: poetry publish --build 40 | env: 41 | POETRY_PYPI_TOKEN_PYPI: ${{ secrets.POETRY_PYPI_TOKEN_PYPI }} 42 | 43 | - name: Release 44 | uses: softprops/action-gh-release@v1 45 | with: 46 | discussion_category_name: announcements 47 | generate_release_notes: true 48 | token: ${{ secrets.RELEASE_GITHUB_TOKEN }} 49 | files: | 50 | dist/eksupgrade-${{env.MODULE_VERSION}}-py3-none-any.whl 51 | dist/eksupgrade-${{env.MODULE_VERSION}}.tar.gz 52 | -------------------------------------------------------------------------------- /.github/workflows/validate.yaml: -------------------------------------------------------------------------------- 1 | name: 'Validation' 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | lint-style: 13 | name: Linting and Styling 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Checkout Source 17 | uses: actions/checkout@v3 18 | with: 19 | fetch-depth: 0 20 | 21 | - name: Setup Dependencies 22 | uses: './.github/actions/dep-setup' 23 | with: 24 | python-version: '3.10' 25 | 26 | - name: Run Styling Enforcement 27 | shell: bash 28 | run: poetry poe check 29 | 30 | # TODO: As soon as the repo is in a state to enable this, we'll do so. 
31 | # - name: Run Style Linting Enforcement 32 | # shell: bash 33 | # run: poetry poe lint 34 | 35 | unit-tests: 36 | name: Run Unit Tests 37 | strategy: 38 | matrix: 39 | version: ['3.8', '3.9', '3.10', '3.11'] 40 | os: [ubuntu-latest] 41 | runs-on: ${{ matrix.os }} 42 | steps: 43 | - name: Checkout Source 44 | uses: actions/checkout@v3 45 | with: 46 | fetch-depth: 0 47 | 48 | - name: Setup Dependencies 49 | uses: './.github/actions/dep-setup' 50 | with: 51 | python-version: '${{ matrix.version }}' 52 | 53 | - name: Run Tests 54 | shell: bash 55 | run: poetry poe test 56 | 57 | - name: Codecov 58 | uses: codecov/codecov-action@v3 59 | 60 | security: 61 | name: Run Security Checks 62 | runs-on: ubuntu-latest 63 | steps: 64 | - name: Checkout Source 65 | uses: actions/checkout@v3 66 | with: 67 | fetch-depth: 0 68 | 69 | - name: Setup Dependencies 70 | uses: './.github/actions/dep-setup' 71 | with: 72 | python-version: '3.10' 73 | 74 | - name: Run Security Checks 75 | shell: bash 76 | run: poetry poe safety 77 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | .DS_Store 3 | .idea 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # Unit test / coverage reports 33 | htmlcov/ 34 | .tox/ 35 | .nox/ 36 | .coverage 37 | .coverage.* 38 | .cache 39 | nosetests.xml 40 | coverage.xml 41 | *.cover 42 | *.py,cover 43 | .hypothesis/ 44 | .pytest_cache/ 45 | cover/ 46 | 47 | # IPython 48 | profile_default/ 49 | ipython_config.py 50 | 51 | # pdm 52 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 53 | #pdm.lock 54 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 55 | # in version control. 56 | # https://pdm.fming.dev/#use-with-ide 57 | .pdm.toml 58 | 59 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 60 | __pypackages__/ 61 | 62 | # Environments 63 | .env 64 | .venv 65 | env/ 66 | venv/ 67 | ENV/ 68 | env.bak/ 69 | venv.bak/ 70 | 71 | # mkdocs documentation 72 | /site 73 | 74 | # mypy 75 | .mypy_cache/ 76 | .dmypy.json 77 | dmypy.json 78 | 79 | # Pyre type checker 80 | .pyre/ 81 | 82 | # pytype static type analyzer 83 | .pytype/ 84 | 85 | # Cython debug symbols 86 | cython_debug/ 87 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | fail_fast: false 4 | minimum_pre_commit_version: "3.0.0" 5 | 6 | repos: 7 | - repo: https://github.com/pre-commit/pre-commit-hooks 8 | rev: f71fa2c1f9cf5cb705f73dffe4b21f7c61470ba9 # frozen: v4.4.0 9 | hooks: 10 | - id: check-added-large-files 11 | - id: check-case-conflict 12 | - id: check-merge-conflict 13 | - id: check-executables-have-shebangs 14 | - id: check-json 15 | - id: check-symlinks 16 | - id: check-vcs-permalinks 17 | - id: check-xml 18 | - id: check-yaml 19 | - id: mixed-line-ending 20 | - id: trailing-whitespace 21 | 22 | - repo: https://github.com/PyCQA/bandit 23 | rev: ca4faf2f82a7c68a088100f8ba2b8e56f9bdcfe3 # frozen: 1.7.5 24 | hooks: 25 | - id: bandit 26 | description: 'Bandit is a tool for finding common security issues in Python code' 27 | additional_dependencies: [ "bandit[toml]" ] 28 | args: ["-c", "pyproject.toml"] 29 | 30 | - repo: https://github.com/psf/black 31 | rev: bf7a16254ec96b084a6caf3d435ec18f0f245cc7 # frozen: 23.3.0 32 | hooks: 33 | - id: black 34 | language_version: python3.10 35 | 36 | - repo: https://github.com/pycqa/isort 37 | rev: dbf82f2dd09ae41d9355bcd7ab69187a19e6bf2f # frozen: 5.12.0 38 | hooks: 39 | - id: isort 40 | name: isort 41 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. 
Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute to. As our projects use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project, we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Amazon EKS Upgrade Utility 2 | 3 |

4 | Badges: Validation Status · E2E Cluster Upgrade · Coverage Status · PyPI · Downloads 9 |

10 | 11 | Amazon EKS cluster upgrade is a utility that automates the upgrade process for Amazon EKS clusters. 12 | 13 | 14 | ## Checks post v0.9.0 15 | 16 | The pre/post-flight checks have been removed in favor of guiding the user to evaluate their clusters with existing tools that handle this better, such as **[eksup](https://github.com/clowdhaus/eksup)**. The existing pre/post checks will be replaced with checks specific to the upgrade itself (based on the prior understanding that the cluster is eligible for such an upgrade). 17 | 18 | ### Cluster Upgrade 19 | 20 | 1. Control plane upgrade - This is handled entirely by AWS once the version upgrade has been requested. 21 | 2. Identification of managed and self-managed nodes - Worker nodes are identified as EKS-managed or self-managed so the appropriate upgrade path can be applied to each. 22 | 3. Managed node group update - Updates the managed node groups to the specified version. 23 | 4. Self-managed node group update 24 | - Launch new nodes with the upgraded version and wait until they reach Ready status before proceeding. 25 | - Mark the existing nodes as unschedulable. 26 | - If a pod disruption budget (PDB) is present, pods are evicted only when the user supplies the force eviction flag (`--force`); otherwise the flow continues without evicting them. 27 | 28 | ## Prerequisites 29 | 30 | Before running `eksupgrade`, you will need permissions for both AWS and the Kubernetes cluster itself. 31 | 32 | 1. Install `eksupgrade` locally: 33 | 34 | ```sh 35 | python -m pip install eksupgrade 36 | ``` 37 | 38 | 2. Ensure you have the necessary AWS permissions; an example policy of required permissions is listed below: 39 | 40 | ```json 41 | { 42 | "Version": "2012-10-17", 43 | "Statement": [ 44 | { 45 | "Sid": "iam", 46 | "Effect": "Allow", 47 | "Action": [ 48 | "iam:GetRole", 49 | "sts:GetAccessKeyInfo", 50 | "sts:GetCallerIdentity", 51 | "sts:GetSessionToken" 52 | ], 53 | "Resource": "*" 54 | }, 55 | { 56 | "Sid": "ec2", 57 | "Effect": "Allow", 58 | "Action": [ 59 | "autoscaling:CreateLaunchConfiguration", 60 | "autoscaling:Describe*", 61 | "autoscaling:SetDesiredCapacity", 62 | "autoscaling:TerminateInstanceInAutoScalingGroup", 63 | "autoscaling:UpdateAutoScalingGroup", 64 | "ec2:Describe*", 65 | "ssm:*" 66 | ], 67 | "Resource": "*" 68 | }, 69 | { 70 | "Sid": "eks", 71 | "Effect": "Allow", 72 | "Action": [ 73 | "eks:Describe*", 74 | "eks:List*", 75 | "eks:UpdateAddon", 76 | "eks:UpdateClusterVersion", 77 | "eks:UpdateNodegroupVersion" 78 | ], 79 | "Resource": "*" 80 | } 81 | ] 82 | } 83 | ``` 84 | 85 | 3. Update your local kubeconfig to authenticate to the cluster: 86 | 87 | ```sh 88 | aws eks update-kubeconfig --name <CLUSTER_NAME> --region <REGION> 89 | ``` 90 | 91 | ## Usage 92 | 93 | To view the arguments and options, run: 94 | 95 | ```sh 96 | eksupgrade --help 97 | ``` 98 | 99 | ```sh 100 | Usage: eksupgrade [OPTIONS] CLUSTER_NAME CLUSTER_VERSION REGION 101 | 102 | Run eksupgrade against a target cluster.
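# Example invocation (the cluster name and region below are hypothetical):
# upgrade "demo-cluster" to Kubernetes 1.25 in us-east-1, upgrading all
# nodegroups in parallel and skipping interactive prompts:
#
#   eksupgrade demo-cluster 1.25 us-east-1 --parallel --no-interactive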
103 | 104 | ╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ 105 | │ * cluster_name TEXT The name of the cluster to be upgraded [default: None] [required] │ 106 | │ * cluster_version TEXT The target Kubernetes version to upgrade the cluster to [default: None] [required] │ 107 | │ * region TEXT The AWS region where the target cluster resides [default: None] [required] │ 108 | ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ 109 | ╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ 110 | │ --max-retry INTEGER The most number of times to retry an upgrade [default: 2] │ 111 | │ --force --no-force Force the upgrade (e.g. pod eviction with PDB) [default: no-force] │ 112 | │ --preflight --no-preflight Run pre-upgrade checks without upgrade [default: no-preflight] │ 113 | │ --parallel --no-parallel Upgrade all nodegroups in parallel [default: no-parallel] │ 114 | │ --latest-addons --no-latest-addons Upgrade addons to the latest eligible version instead of default [default: no-latest-addons] │ 115 | │ --disable-checks --no-disable-checks Disable the pre-upgrade and post-upgrade checks during upgrade scenarios [default: no-disable-checks] │ 116 | │ --interactive --no-interactive If enabled, prompt the user for confirmations [default: interactive] │ 117 | │ --version Display the current eksupgrade version │ 118 | │ --install-completion Install completion for the current shell. │ 119 | │ --show-completion Show completion for the current shell, to copy it or customize the installation. │ 120 | │ --help Show this message and exit. │ 121 | ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ 122 | ``` 123 | ## Support & Feedback 124 | 125 | This project is maintained by AWS Solution Architects and Consultants. It is not part of an AWS service and support is provided best-effort by the maintainers. To post feedback, submit feature ideas, or report bugs, please use the [Issues section](https://github.com/aws-samples/eks-cluster-upgrade/issues) of this repo. If you are interested in contributing, please see the [Contribution guide](https://github.com/aws-samples/eks-cluster-upgrade/blob/main/CONTRIBUTING.md). 126 | 127 | ## Security 128 | 129 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. 130 | 131 | ## License 132 | 133 | This library is licensed under the MIT-0 License. See the [LICENSE](LICENSE) file. 134 | -------------------------------------------------------------------------------- /eksupgrade/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize the eksupgrade module. 2 | 3 | Attributes: 4 | __version__: The version of the eksupgrade module. 
5 | 6 | """ 7 | 8 | __version__: str = "0.9.0" 9 | -------------------------------------------------------------------------------- /eksupgrade/__main__.py: -------------------------------------------------------------------------------- 1 | """Handle the main entry logic.""" 2 | 3 | from .cli import app 4 | 5 | app(prog_name="eksupgrade") 6 | -------------------------------------------------------------------------------- /eksupgrade/cli.py: -------------------------------------------------------------------------------- 1 | """Handle CLI specific logic and module definitions.""" 2 | 3 | from __future__ import annotations 4 | 5 | from queue import Queue 6 | from typing import Optional 7 | 8 | import typer 9 | import urllib3 10 | from rich.console import Console 11 | from rich.table import Table 12 | 13 | from eksupgrade import __version__ 14 | from eksupgrade.utils import confirm, echo_error, echo_info, echo_warning, get_logger 15 | 16 | from .exceptions import ClusterInactiveException 17 | from .models.eks import Cluster 18 | from .src.k8s_client import cluster_auto_enable_disable, is_cluster_auto_scaler_present 19 | from .starter import StatsWorker, actual_update 20 | 21 | urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) 22 | 23 | logger = get_logger(__name__) 24 | app = typer.Typer(help="Automated Amazon EKS cluster upgrade CLI utility") 25 | console = Console() 26 | 27 | 28 | def version_callback(value: bool) -> None: 29 | """Handle the version callback.""" 30 | if value: 31 | typer.secho(f"eksupgrade version: {__version__}", fg=typer.colors.BRIGHT_BLUE, bold=True) 32 | raise typer.Exit() 33 | 34 | 35 | @app.command() 36 | def main( 37 | cluster_name: str = typer.Argument(..., help="The name of the cluster to be upgraded"), 38 | cluster_version: str = typer.Argument(..., help="The target Kubernetes version to upgrade the cluster to"), 39 | region: str = typer.Argument(..., help="The AWS region where the target cluster resides"), 40 | max_retry: int = typer.Option(default=2, help="The most number of times to retry an upgrade"), 41 | force: bool = typer.Option(default=False, help="Force the upgrade (e.g. pod eviction with PDB)"), 42 | preflight: bool = typer.Option(default=False, help="Run pre-upgrade checks without upgrade"), 43 | parallel: bool = typer.Option(default=False, help="Upgrade all nodegroups in parallel"), 44 | latest_addons: bool = typer.Option( 45 | default=False, help="Upgrade addons to the latest eligible version instead of default" 46 | ), 47 | disable_checks: bool = typer.Option( 48 | default=False, help="Disable the pre-upgrade and post-upgrade checks during upgrade scenarios" 49 | ), 50 | interactive: bool = typer.Option(default=True, help="If enabled, prompt the user for confirmations"), 51 | version: Optional[bool] = typer.Option( 52 | None, "--version", callback=version_callback, is_eager=True, help="Display the current eksupgrade version" 53 | ), 54 | ) -> None: 55 | """Run eksupgrade against a target cluster.""" 56 | queue = Queue() 57 | is_present: bool = False 58 | replicas_value: int = 0 59 | 60 | if disable_checks: 61 | echo_warning("--disable-checks is currently unused until the new validation workflows are implemented") 62 | 63 | if preflight: 64 | echo_warning( 65 | "--preflight is unused and will be removed in an upcoming release. 
" 66 | "Please use an EKS upgrade readiness assessment tool such as: github.com/clowdhaus/eksup" 67 | ) 68 | 69 | try: 70 | # Pull cluster details, populating the object for subsequent use throughout the upgrade. 71 | target_cluster: Cluster = Cluster.get( 72 | cluster_name=cluster_name, region=region, target_version=cluster_version, latest_addons=latest_addons 73 | ) 74 | echo_info( 75 | f"Upgrading cluster: {cluster_name} from version: {target_cluster.version} to {target_cluster.target_version}...", 76 | ) 77 | 78 | # Confirm whether or not to proceed following pre-flight checks. 79 | if interactive: 80 | confirm( 81 | f"Are you sure you want to proceed with the upgrade process against: {cluster_name}?", 82 | ) 83 | 84 | if not target_cluster.available: 85 | echo_error("The cluster is not active!") 86 | raise ClusterInactiveException("The cluster is not active") 87 | 88 | echo_info( 89 | f"The current version of the cluster was detected as: {target_cluster.version}", 90 | ) 91 | 92 | # Checking Cluster is Active or Not Before Making an Update 93 | if target_cluster.active: 94 | target_cluster.update_cluster(wait=True) 95 | else: 96 | echo_warning( 97 | f"The target EKS cluster: {target_cluster.name} isn't currently active - status: {target_cluster.status}", 98 | ) 99 | target_cluster.wait_for_active() 100 | 101 | echo_info("Found the following Managed Nodegroups") 102 | for _mng_nodegroup_name in target_cluster.nodegroup_names: 103 | echo_info(f"\t* {_mng_nodegroup_name}") 104 | 105 | managed_nodegroup_asgs: list[str] = [] 106 | for nodegroup in target_cluster.nodegroups: 107 | managed_nodegroup_asgs += nodegroup.autoscaling_group_names 108 | 109 | # removing self-managed from managed so that we don't update them again 110 | asg_list_self_managed = list(set(target_cluster.asg_names) - set(managed_nodegroup_asgs)) 111 | 112 | # addons update 113 | target_cluster.upgrade_addons(wait=True) 114 | 115 | # checking auto scaler present and the value associated from it 116 | is_present, replicas_value = is_cluster_auto_scaler_present(cluster_name=cluster_name, region=region) 117 | 118 | if is_present: 119 | cluster_auto_enable_disable( 120 | cluster_name=cluster_name, operation="pause", mx_val=replicas_value, region=region 121 | ) 122 | echo_info("Paused the Cluster AutoScaler") 123 | else: 124 | echo_info("No Cluster AutoScaler is Found") 125 | 126 | if parallel: 127 | for x in range(20): 128 | worker = StatsWorker(queue, x) 129 | worker.setDaemon(True) 130 | worker.start() 131 | 132 | if target_cluster.upgradable_managed_nodegroups: 133 | _mng_nodegroup_table = Table("Name", "Version") 134 | for item in target_cluster.upgradable_managed_nodegroups: 135 | _mng_nodegroup_table.add_row(item.name, item.version) 136 | echo_info("Outdated managed nodegroups:") 137 | console.print(_mng_nodegroup_table) 138 | else: 139 | echo_warning("No outdated managed nodegroups found!") 140 | 141 | target_cluster.upgrade_nodegroups(wait=not parallel) 142 | 143 | # TODO: Use custom_ami to update launch templates and re-roll self-managed nodes under ASGs. 
144 | echo_info("Found the following Self-managed Nodegroups:") 145 | for asg_iter in asg_list_self_managed: 146 | echo_info(f"\t* {asg_iter}") 147 | if parallel: 148 | queue.put([cluster_name, asg_iter, cluster_version, region, max_retry, force, "selfmanaged"]) 149 | else: 150 | actual_update(cluster_name, asg_iter, cluster_version, region, max_retry, force) 151 | 152 | if parallel: 153 | queue.join() 154 | 155 | if is_present: 156 | cluster_auto_enable_disable( 157 | cluster_name=cluster_name, operation="start", mx_val=replicas_value, region=region 158 | ) 159 | echo_info("Cluster Autoscaler is Enabled Again") 160 | echo_info(f"EKS Cluster {cluster_name} UPDATED TO {cluster_version}") 161 | except typer.Abort: 162 | echo_warning("Cluster upgrade aborted!") 163 | except Exception as error: 164 | if is_present: 165 | try: 166 | cluster_auto_enable_disable( 167 | cluster_name=cluster_name, operation="start", mx_val=replicas_value, region=region 168 | ) 169 | echo_info("Cluster Autoscaler is Enabled Again") 170 | except Exception as error2: 171 | echo_error( 172 | f"Autoenable failed and must be done manually! Error: {error2}", 173 | ) 174 | echo_error(f"Exception encountered! Error: {error}") 175 | 176 | 177 | if __name__ == "__main__": # pragma: no cover 178 | app(prog_name="eksupgrade") 179 | -------------------------------------------------------------------------------- /eksupgrade/exceptions.py: -------------------------------------------------------------------------------- 1 | """Define the eksupgrade module exceptions.""" 2 | 3 | 4 | class BaseEksUpgradeException(Exception): 5 | """Define the base module exception for eksupgrade.""" 6 | 7 | 8 | class EksException(BaseEksUpgradeException): 9 | """Define the EKS module exception for eksupgrade.""" 10 | 11 | 12 | class ClusterException(EksException): 13 | """Define the cluster module exception for eksupgrade.""" 14 | 15 | 16 | class ClusterInactiveException(ClusterException): 17 | """Define the exception to raise when a cluster is considered inactive (or doesn't exist).""" 18 | 19 | 20 | class EksUpgradeNotImplementedError(BaseEksUpgradeException, NotImplementedError): 21 | """Define the Not Implemented exception for eksupgrade.""" 22 | 23 | 24 | class InvalidUpgradeTargetVersion(BaseEksUpgradeException): 25 | """Define the exception to be raised when invalid target versions are provided.""" 26 | -------------------------------------------------------------------------------- /eksupgrade/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/eks-cluster-upgrade/d960eab8299b4b9c1e79e024907d4dfe35f83ad9/eksupgrade/models/__init__.py -------------------------------------------------------------------------------- /eksupgrade/models/base.py: -------------------------------------------------------------------------------- 1 | """Define the base models to be used across the EKS upgrade tool.""" 2 | 3 | from __future__ import annotations 4 | 5 | from abc import ABC 6 | from dataclasses import dataclass, field 7 | from functools import cached_property 8 | from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union 9 | 10 | import boto3 11 | 12 | from eksupgrade.utils import echo_info, echo_success, get_logger 13 | 14 | if TYPE_CHECKING: # pragma: no cover 15 | from mypy_boto3_autoscaling.client import AutoScalingClient 16 | from mypy_boto3_ec2.client import EC2Client 17 | from mypy_boto3_eks.client import EKSClient 18 | from
mypy_boto3_sts.client import STSClient 19 | else: 20 | AutoScalingClient = object 21 | EC2Client = object 22 | EKSClient = object 23 | STSClient = object 24 | 25 | logger = get_logger(__name__) 26 | 27 | 28 | @dataclass 29 | class BaseResource(ABC): 30 | """Define the base resource for the EKS cluster upgrade tool.""" 31 | 32 | def to_dict(self) -> Dict[str, Any]: 33 | """Return the dictionary representation of this object.""" 34 | return self.__dict__ 35 | 36 | def clear_cached_properties(self) -> None: 37 | """Clear all cached properties.""" 38 | cls = type(self) 39 | 40 | def get_cached_properties(instance_type) -> List[str]: 41 | """Get the list of properties matching the instance type.""" 42 | return [ 43 | attribute 44 | for attribute, _ in self.to_dict().items() 45 | if (instance := getattr(cls, attribute, None)) 46 | if isinstance(instance, instance_type) 47 | ] 48 | 49 | _cached_properties: List[str] = get_cached_properties(cached_property) 50 | 51 | for _cached_property in _cached_properties: 52 | echo_info(f"{self.__class__.__name__}: Clearing cached property: {_cached_property}") 53 | delattr(self, _cached_property) 54 | echo_success("Cached properties cleared!") 55 | 56 | 57 | @dataclass 58 | class AwsResource(BaseResource, ABC): 59 | """Define the abstract AWS base resource class.""" 60 | 61 | arn: str 62 | resource_id: str = "" 63 | tags: Dict[str, Union[str, bool]] = field(default_factory=lambda: ({})) 64 | errors: List[Dict[str, Any]] = field(default_factory=lambda: ([])) 65 | 66 | def _get_boto_client( 67 | self, service: Literal["autoscaling", "ec2", "eks", "sts"], **kwargs 68 | ) -> AutoScalingClient | EC2Client | EKSClient | STSClient: 69 | """Get a boto client.""" 70 | return boto3.client(service, **kwargs) 71 | 72 | @cached_property 73 | def sts_client(self) -> STSClient: 74 | """Get a boto STS client.""" 75 | boto_kwargs: Dict[str, Any] = {} 76 | region: Optional[str] = getattr(self, "region", "") 77 | 78 | if region: 79 | boto_kwargs["region_name"] = region 80 | 81 | return self._get_boto_client(service="sts", **boto_kwargs) 82 | 83 | 84 | @dataclass 85 | class AwsRegionResource(AwsResource, ABC): 86 | """Define the abstract AWS region specific base resource class.""" 87 | 88 | region: str = "" 89 | 90 | @cached_property 91 | def autoscaling_client(self) -> AutoScalingClient: 92 | """Get a boto autoscaling client.""" 93 | return self._get_boto_client(service="autoscaling", region_name=self.region) 94 | -------------------------------------------------------------------------------- /eksupgrade/models/eks.py: -------------------------------------------------------------------------------- 1 | """Define the models to be used across the EKS upgrade tool.""" 2 | 3 | from __future__ import annotations 4 | 5 | import base64 6 | import datetime 7 | import re 8 | import time 9 | from abc import ABC 10 | from dataclasses import dataclass, field 11 | from functools import cached_property 12 | from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union 13 | 14 | import boto3 15 | from kubernetes import client as k8s_client 16 | from kubernetes import config as k8s_config 17 | from packaging.version import Version 18 | from packaging.version import parse as parse_version 19 | 20 | from eksupgrade.utils import echo_error, echo_info, echo_success, echo_warning, get_logger 21 | 22 | from ..exceptions import InvalidUpgradeTargetVersion 23 | from .base import AwsRegionResource 24 | 25 | if TYPE_CHECKING: # pragma: no cover 26 | from mypy_boto3_autoscaling.type_defs import 
AutoScalingGroupsTypeTypeDef, AutoScalingGroupTypeDef 27 | from mypy_boto3_eks.client import EKSClient 28 | from mypy_boto3_eks.literals import ResolveConflictsType 29 | from mypy_boto3_eks.type_defs import ( 30 | AddonInfoTypeDef, 31 | AddonTypeDef, 32 | ClusterTypeDef, 33 | DescribeAddonResponseTypeDef, 34 | DescribeClusterResponseTypeDef, 35 | DescribeNodegroupResponseTypeDef, 36 | ListNodegroupsResponseTypeDef, 37 | NodegroupResourcesTypeDef, 38 | NodegroupTypeDef, 39 | UpdateAddonResponseTypeDef, 40 | UpdateClusterVersionResponseTypeDef, 41 | UpdateTypeDef, 42 | WaiterConfigTypeDef, 43 | ) 44 | else: 45 | AddonInfoTypeDef = object 46 | AddonTypeDef = object 47 | ClusterTypeDef = object 48 | DescribeAddonResponseTypeDef = object 49 | DescribeClusterResponseTypeDef = object 50 | EKSClient = object 51 | UpdateAddonResponseTypeDef = object 52 | ResolveConflictsType = object 53 | UpdateTypeDef = object 54 | UpdateClusterVersionResponseTypeDef = object 55 | DescribeNodegroupResponseTypeDef = object 56 | NodegroupTypeDef = object 57 | ListNodegroupsResponseTypeDef = object 58 | NodegroupResourcesTypeDef = object 59 | WaiterConfigTypeDef = object 60 | AutoScalingGroupsTypeTypeDef = object 61 | AutoScalingGroupTypeDef = object 62 | 63 | from eksupgrade.utils import get_logger 64 | 65 | logger = get_logger(__name__) 66 | 67 | TOKEN_PREFIX: str = "k8s-aws-v1" 68 | TOKEN_HEADER_KEY: str = "x-k8s-aws-id" 69 | 70 | 71 | def requires_cluster(function): 72 | """Decorate methods to require a cluster attribute.""" 73 | 74 | def wrapper(self, *args, **kwargs): 75 | if not self.cluster.name: 76 | echo_error( 77 | f"Unable to use method: {function.__name__} without the cluster attribute! Pass a cluster to this child object!", 78 | ) 79 | return None 80 | return function(self, *args, **kwargs) 81 | 82 | return wrapper 83 | 84 | 85 | @dataclass 86 | class EksResource(AwsRegionResource, ABC): 87 | """Define the abstract EKS base resource class.""" 88 | 89 | name: str = "" 90 | status: str = "" 91 | version: str = "" 92 | 93 | @cached_property 94 | def eks_client(self) -> EKSClient: 95 | """Get a boto EKS client.""" 96 | return self._get_boto_client(service="eks", region_name=self.region) 97 | 98 | @cached_property 99 | def core_api_client(self) -> Any: 100 | """Get a Kubernetes Core client.""" 101 | return k8s_client.CoreV1Api() 102 | 103 | @cached_property 104 | def apps_api_client(self) -> Any: 105 | """Get a Kubernetes Apps client.""" 106 | return k8s_client.AppsV1Api() 107 | 108 | 109 | @dataclass 110 | class AutoscalingGroup(AwsRegionResource): 111 | """Define the Autoscaling Group model.""" 112 | 113 | cluster: Cluster = field(default_factory=lambda: Cluster(arn="", version="1.24")) 114 | name: str = "" 115 | launch_configuration_name: str = "" 116 | launch_template: Dict[str, str] = field(default_factory=dict) 117 | mixed_instances_policy: Dict[str, Any] = field(default_factory=dict) 118 | min_size: int = 0 119 | max_size: int = 0 120 | desired_capacity: int = 0 121 | predicted_capacity: int = 0 122 | default_cooldown: int = 0 123 | availability_zones: List[str] = field(default_factory=list) 124 | load_balancer_names: List[str] = field(default_factory=list) 125 | target_group_arns: List[str] = field(default_factory=list) 126 | health_check_type: str = "" 127 | health_check_grace_period: int = 0 128 | instances: List[Dict[str, Any]] = field(default_factory=list) 129 | placement_group: str = "" 130 | created_time: datetime.datetime = datetime.datetime.now() 131 | suspended_processes: List[Dict[str, 
str]] = field(default_factory=list) 132 | vpc_zone_identifier: str = "" 133 | status: str = "" 134 | termination_policies: List[str] = field(default_factory=list) 135 | new_instances_protected_from_scale_in: bool = False 136 | service_linked_role_arn: str = "" 137 | max_instance_lifetime: int = 0 138 | capacity_rebalance: bool = False 139 | warm_pool_configuration: Dict[str, Any] = field(default_factory=dict) 140 | warm_pool_size: int = 0 141 | context: str = "" 142 | desired_capacity_type: str = "" 143 | default_instance_warmup: int = 0 144 | traffic_sources: List[Dict[str, str]] = field(default_factory=list) 145 | enabled_metrics: List[Dict[str, str]] = field(default_factory=list) 146 | asg_tags: List[Dict[str, str]] = field(default_factory=list) 147 | 148 | def __repr__(self) -> str: # pragma: no cover 149 | """Return the string representation of an Autoscaling Group.""" 150 | return f"<{self.__class__.__name__} - Name: {self.name} | Launch Config: {self.launch_configuration_name} | Last Status: {self.status}>" 151 | 152 | @property 153 | def asg_name(self) -> str: 154 | """Return the autoscaling group name.""" 155 | return self.name 156 | 157 | @classmethod 158 | def get( 159 | cls, 160 | cluster: Cluster, 161 | region: str, 162 | autoscaling_group_name: str = "", 163 | asg_data: Optional[AutoScalingGroupTypeDef] = None, 164 | ): 165 | """Get the autoscaling group details and build an AutoscalingGroup object.""" 166 | echo_info("Getting cluster autoscaling group details...") 167 | 168 | if not asg_data: 169 | response: AutoScalingGroupsTypeTypeDef = cluster.autoscaling_client.describe_auto_scaling_groups( 170 | AutoScalingGroupNames=[autoscaling_group_name], 171 | ) 172 | asg_data = response["AutoScalingGroups"][0] 173 | 174 | asg_name: str = autoscaling_group_name or asg_data.get("AutoScalingGroupName", "") 175 | echo_info( 176 | f"Autoscaling Group: {asg_name} - Cluster: {cluster.name}", 177 | ) 178 | instances = asg_data.get("Instances", []) 179 | unhealthy_instances = [ 180 | instance["InstanceId"] for instance in instances if instance["HealthStatus"] == "Unhealthy" 181 | ] 182 | healthy_instances: List[str] = [ 183 | instance["InstanceId"] for instance in instances if instance["HealthStatus"] == "Healthy" 184 | ] 185 | 186 | if unhealthy_instances: 187 | echo_warning("Unhealthy Instances:") 188 | for unhealthy_instance in unhealthy_instances: 189 | echo_warning(f"\t * {unhealthy_instance}") 190 | 191 | if healthy_instances: 192 | echo_info("Healthy Instances:") 193 | for healthy_instance in healthy_instances: 194 | echo_info(f"\t * {healthy_instance}") 195 | 196 | return cls( 197 | cluster=cluster, 198 | launch_configuration_name=asg_data.get("LaunchConfigurationName", ""), 199 | launch_template=asg_data.get("LaunchTemplate", {}), 200 | mixed_instances_policy=asg_data.get("MixedInstancesPolicy", {}), 201 | name=asg_data.get("AutoScalingGroupName", ""), 202 | status=asg_data.get("Status", ""), 203 | arn=asg_data.get("AutoScalingGroupARN", ""), 204 | min_size=asg_data.get("MinSize", 0), 205 | max_size=asg_data.get("MaxSize", 0), 206 | desired_capacity=asg_data.get("DesiredCapacity", 0), 207 | predicted_capacity=asg_data.get("PredictedCapacity", 0), 208 | default_cooldown=asg_data.get("DefaultCooldown", 0), 209 | created_time=asg_data.get("CreatedTime", datetime.datetime.now()), 210 | availability_zones=asg_data.get("AvailabilityZones", []), 211 | load_balancer_names=asg_data.get("LoadBalancerNames", []), 212 | target_group_arns=asg_data.get("TargetGroupARNs", []), 213 | 
health_check_type=asg_data.get("HealthCheckType", ""), 214 | instances=asg_data.get("Instances", []), 215 | health_check_grace_period=asg_data.get("HealthCheckGracePeriod", 0), 216 | suspended_processes=asg_data.get("SuspendedProcesses", []), 217 | placement_group=asg_data.get("PlacementGroup", ""), 218 | vpc_zone_identifier=asg_data.get("VPCZoneIdentifier", ""), 219 | enabled_metrics=asg_data.get("EnabledMetrics", []), 220 | termination_policies=asg_data.get("TerminationPolicies", []), 221 | asg_tags=asg_data.get("Tags", []), 222 | region=region, 223 | new_instances_protected_from_scale_in=asg_data.get("NewInstancesProtectedFromScaleIn", False), 224 | service_linked_role_arn=asg_data.get("ServiceLinkedRoleARN", ""), 225 | max_instance_lifetime=asg_data.get("MaxInstanceLifetime", 0), 226 | capacity_rebalance=asg_data.get("CapacityRebalance", False), 227 | warm_pool_size=asg_data.get("WarmPoolSize", 0), 228 | warm_pool_configuration=asg_data.get("WarmPoolConfiguration", {}), 229 | context=asg_data.get("Context", ""), 230 | desired_capacity_type=asg_data.get("DesiredCapacityType", ""), 231 | default_instance_warmup=asg_data.get("DefaultInstanceWarmup", 0), 232 | traffic_sources=asg_data.get("TrafficSources", []), 233 | ) 234 | 235 | 236 | @dataclass 237 | class ManagedNodeGroup(EksResource): 238 | """Define the EKS Managed Node Group model.""" 239 | 240 | cluster: Cluster = field(default_factory=lambda: Cluster(arn="", version="1.24")) 241 | remote_access: Dict[str, Any] = field(default_factory=dict) 242 | health: Dict[str, List[Any]] = field(default_factory=lambda: ({"issues": []})) 243 | labels: Dict[str, str] = field(default_factory=dict) 244 | update_config: Dict[str, int] = field(default_factory=dict) 245 | launch_template: Dict[str, Any] = field(default_factory=dict) 246 | scaling_config: Dict[str, int] = field(default_factory=dict) 247 | instance_types: List[str] = field(default_factory=list) 248 | subnets: List[str] = field(default_factory=list) 249 | autoscaling_groups: List[Dict[str, str]] = field(default_factory=list) 250 | taints: List[Dict[str, str]] = field(default_factory=list) 251 | created_at: datetime.datetime = datetime.datetime.now() 252 | modified_at: datetime.datetime = datetime.datetime.now() 253 | release_version: str = "" 254 | remote_access_sg: str = "" 255 | node_role: str = "" 256 | publisher: str = "" 257 | owner: str = "" 258 | product_id: str = "" 259 | disk_size: int = 0 260 | ami_type: str = "" 261 | capacity_type: str = "" 262 | 263 | def __repr__(self) -> str: # pragma: no cover 264 | """Return the string representation of an EKS Managed Node Group.""" 265 | return f"<{self.__class__.__name__} - Name: {self.name} | Version: {self.version} | Last Status: {self.status}>" 266 | 267 | def __post_init__(self) -> None: 268 | """Perform the post initialization steps.""" 269 | self.active_waiter = self.eks_client.get_waiter("nodegroup_active") 270 | 271 | @property 272 | def nodegroup_name(self) -> str: 273 | """Return the nodegroup name.""" 274 | return self.name 275 | 276 | @cached_property 277 | def autoscaling_group_names(self) -> List[str]: 278 | """Return the list of autoscaling group names.""" 279 | return [asg["name"] for asg in self.autoscaling_groups] 280 | 281 | @classmethod 282 | def get(cls, node_group: str, cluster: Cluster, region: str): 283 | """Get the cluster's managed nodegroup details and build a ManagedNodeGroup object.""" 284 | echo_info("Getting cluster managed nodegroup details...") 285 | response: DescribeNodegroupResponseTypeDef = 
cluster.eks_client.describe_nodegroup( 286 | nodegroupName=node_group, 287 | clusterName=cluster.name, 288 | ) 289 | nodegroup_data: NodegroupTypeDef = response["nodegroup"] 290 | version: str = nodegroup_data.get("version", "") 291 | release_version: str = nodegroup_data.get("releaseVersion", "") 292 | echo_info( 293 | f"Managed Node Group: {node_group} - Version: {version} - Release Version: {release_version} - Cluster: {cluster.name}", 294 | ) 295 | _resources: NodegroupResourcesTypeDef = nodegroup_data.get("resources", {}) 296 | 297 | return cls( 298 | cluster=cluster, 299 | version=version, 300 | release_version=release_version, 301 | arn=nodegroup_data.get("nodegroupArn", ""), 302 | name=nodegroup_data.get("nodegroupName", ""), 303 | status=nodegroup_data.get("status", ""), 304 | created_at=nodegroup_data.get("createdAt", datetime.datetime.now()), 305 | modified_at=nodegroup_data.get("modifiedAt", datetime.datetime.now()), 306 | tags=nodegroup_data.get("tags", {}), 307 | region=region, 308 | node_role=nodegroup_data.get("nodeRole", ""), 309 | capacity_type=nodegroup_data.get("capacityType", ""), 310 | ami_type=nodegroup_data.get("amiType", ""), 311 | instance_types=nodegroup_data.get("instanceTypes", []), 312 | subnets=nodegroup_data.get("subnets", []), 313 | disk_size=nodegroup_data.get("diskSize", 0), 314 | labels=nodegroup_data.get("labels", {}), 315 | taints=nodegroup_data.get("taints", []), 316 | remote_access_sg=_resources.get("remoteAccessSecurityGroup", ""), 317 | update_config=nodegroup_data.get("updateConfig", {}), 318 | launch_template=nodegroup_data.get("launchTemplate", {}), 319 | autoscaling_groups=_resources.get("autoScalingGroups", []), 320 | scaling_config=nodegroup_data.get("scalingConfig", {}), 321 | remote_access=nodegroup_data.get("remoteAccess", {}), 322 | health=nodegroup_data.get("health", {}), 323 | ) 324 | 325 | @property 326 | def needs_upgrade(self) -> bool: 327 | """Determine whether or not the managed nodegroup needs to be upgraded.""" 328 | if self.status in ["ACTIVE", "UPDATING"] and not self.version == self.cluster.target_version: 329 | echo_info( 330 | f"Managed Node Group: {self.name} requires upgrade from version: {self.version} to target version: {self.cluster.target_version}", 331 | ) 332 | return True 333 | return False 334 | 335 | @requires_cluster 336 | def update( 337 | self, 338 | version: str = "", 339 | release_version: str = "", 340 | force: bool = False, 341 | client_request_id: str = "", 342 | launch_template: Optional[Dict[str, Any]] = None, 343 | wait: bool = True, 344 | ) -> UpdateTypeDef: 345 | """Update the nodegroup to the target version.""" 346 | update_kwargs: Dict[str, Any] = {} 347 | 348 | if not launch_template: 349 | update_kwargs["version"] = version or self.cluster.target_version 350 | elif launch_template and not version: 351 | update_kwargs["launchTemplate"] = launch_template 352 | elif launch_template and (self.ami_type != "CUSTOM" and version): 353 | update_kwargs["launchTemplate"] = launch_template 354 | update_kwargs["version"] = version 355 | elif launch_template and (self.ami_type == "CUSTOM" and version): 356 | echo_error("Version and launch template provided to managed nodegroup update with custom AMI!") 357 | 358 | if release_version: 359 | update_kwargs["releaseVersion"] = release_version 360 | 361 | if client_request_id: 362 | update_kwargs["clientRequestToken"] = client_request_id 363 | 364 | version = version or self.cluster.target_version 365 | echo_info(f"Updating nodegroup: {self.name} from version: 
{self.version} to version: {version}") 366 | update_response = self.eks_client.update_nodegroup_version( 367 | clusterName=self.cluster.name, nodegroupName=self.name, force=force, **update_kwargs 368 | ) 369 | update_response_body: UpdateTypeDef = update_response["update"] 370 | _update_errors = update_response_body.get("errors", []) 371 | 372 | if _update_errors: 373 | echo_error( 374 | f"Errors encountered while attempting to update nodegroup: {self.name} - Errors: {_update_errors}", 375 | ) 376 | self.errors += _update_errors 377 | if wait: 378 | self.wait_for_active() 379 | 380 | return update_response_body 381 | 382 | def wait_for_active(self, delay: int = 35, initial_delay: int = 30, max_attempts: int = 160) -> None: 383 | """Wait for the nodegroup to become active.""" 384 | echo_info(f"Waiting for the Managed Node Group: {self.name} to become active...") 385 | time.sleep(initial_delay) 386 | waiter_config: WaiterConfigTypeDef = {"Delay": delay, "MaxAttempts": max_attempts} 387 | self.active_waiter.wait(clusterName=self.cluster.name, nodegroupName=self.name, WaiterConfig=waiter_config) 388 | echo_success(f"Managed Nodegroup: {self.name} now active!") 389 | 390 | 391 | @dataclass 392 | class ClusterAddon(EksResource): 393 | """Define the Kubernetes Cluster Addon model.""" 394 | 395 | health: Dict[str, List[Any]] = field(default_factory=lambda: ({"issues": []})) 396 | created_at: datetime.datetime = datetime.datetime.now() 397 | modified_at: datetime.datetime = datetime.datetime.now() 398 | service_account_role_arn: str = "" 399 | publisher: str = "" 400 | owner: str = "" 401 | product_id: str = "" 402 | product_url: str = "" 403 | configuration_values: str = "" 404 | cluster: Cluster = field(default_factory=lambda: Cluster(arn="", version="1.24")) 405 | 406 | def __repr__(self) -> str: # pragma: no cover 407 | """Return the string representation of a Cluster Addon.""" 408 | return f"<{self.__class__.__name__} - Name: {self.name} | Version: {self.version} | Last Status: {self.status}>" 409 | 410 | def __post_init__(self) -> None: 411 | """Perform the post initialization steps.""" 412 | self.active_waiter = self.eks_client.get_waiter("addon_active") 413 | 414 | @property 415 | def addon_name(self) -> str: 416 | """Return the addon name.""" 417 | return self.name 418 | 419 | @classmethod 420 | def get(cls, addon: str, cluster: Cluster, region: str): 421 | """Get the cluster addon details and build a ClusterAddon object.""" 422 | logger.debug("Getting cluster addon details...") 423 | response: DescribeAddonResponseTypeDef = cluster.eks_client.describe_addon( 424 | addonName=addon, 425 | clusterName=cluster.name, 426 | ) 427 | addon_data: AddonTypeDef = response["addon"] 428 | addon_version: str = addon_data.get("addonVersion", "") 429 | marketplace_data = addon_data.get("marketplaceInformation", {}) 430 | logger.debug("Addon: %s - Current Version: %s - Cluster: %s", addon, addon_version, cluster.name) 431 | 432 | return cls( 433 | arn=addon_data.get("addonArn", ""), 434 | name=addon_data.get("addonName", ""), 435 | version=addon_version, 436 | status=addon_data.get("status", ""), 437 | health=addon_data.get("health", {}), 438 | created_at=addon_data.get("createdAt", datetime.datetime.now()), 439 | modified_at=addon_data.get("modifiedAt", datetime.datetime.now()), 440 | tags=addon_data.get("tags", {}), 441 | region=region, 442 | service_account_role_arn=addon_data.get("serviceAccountRoleArn", ""), 443 | publisher=addon_data.get("publisher", ""), 444 | owner=addon_data.get("owner", ""), 445 | 
product_id=marketplace_data.get("productId", ""), 446 | product_url=marketplace_data.get("productUrl", ""), 447 | cluster=cluster, 448 | ) 449 | 450 | @cached_property 451 | def _addon_update_kwargs(self) -> dict[str, Any]: 452 | """Get kwargs for subsequent update to addon.""" 453 | kwargs: Dict[str, Any] = {} 454 | 455 | if self.service_account_role_arn: 456 | kwargs["serviceAccountRoleArn"] = self.service_account_role_arn 457 | if self.configuration_values: 458 | kwargs["configurationValues"] = self.configuration_values 459 | return kwargs 460 | 461 | @requires_cluster 462 | def update( 463 | self, 464 | version: str = "", 465 | resolve_conflicts: ResolveConflictsType = "OVERWRITE", 466 | wait: bool = False, 467 | ) -> list[UpdateTypeDef]: 468 | """Update the addon to the target version.""" 469 | responses: list[UpdateTypeDef] = [] 470 | if self.name == "vpc-cni": 471 | versions: list[str] = self.step_upgrade_versions 472 | wait = True 473 | else: 474 | versions = [version or self.target_version] 475 | 476 | for version in versions: 477 | echo_info(f"Updating addon: {self.name} from original version: {self.version} to version: {version}") 478 | update_response: UpdateAddonResponseTypeDef = self.eks_client.update_addon( 479 | clusterName=self.cluster.name, 480 | addonName=self.name, 481 | addonVersion=version, 482 | resolveConflicts=resolve_conflicts, 483 | **self._addon_update_kwargs, 484 | ) 485 | update_response_body: UpdateTypeDef = update_response["update"] 486 | _update_errors = update_response_body.get("errors", []) 487 | 488 | _update_id: str = update_response_body.get("id", "") 489 | _update_status: str = update_response_body.get("status", "") 490 | echo_info(f"Updating addon: {self.name} - ID: {_update_id} - Status: {_update_status}") 491 | responses.append(update_response_body) 492 | 493 | if _update_errors: 494 | echo_error( 495 | f"Errors encountered while attempting to update addon: {self.name} - Errors: {_update_errors}", 496 | ) 497 | self.errors += _update_errors 498 | elif wait: 499 | self.wait_for_active() 500 | return responses 501 | 502 | @cached_property 503 | def available_versions_data(self) -> AddonInfoTypeDef: 504 | """Get target addon versions.""" 505 | return next(item for item in self.cluster.available_addon_versions if item.get("addonName", "") == self.name) 506 | 507 | @cached_property 508 | def available_versions(self) -> list[str]: 509 | """Return the list of available versions.""" 510 | return [item.get("addonVersion", "") for item in self.available_versions_data.get("addonVersions", [])] 511 | 512 | @cached_property 513 | def default_version(self) -> str: 514 | """Get the EKS default version of the addon.""" 515 | return next( 516 | item.get("addonVersion", "") 517 | for item in self.available_versions_data.get("addonVersions", []) 518 | if item.get("compatibilities", [])[0].get("defaultVersion", False) is True 519 | ) 520 | 521 | @property 522 | def minors_to_target(self) -> list[int]: 523 | """Return the list of minor revisions between the current version and the upgrade target.""" 524 | return list(range(self.semantic_version.minor, self._target_version_semver.minor + 1)) 525 | 526 | @cached_property 527 | def sorted_versions(self) -> list[str]: 528 | """Return the available versions sorted with the latest first.""" 529 | return sorted(self.available_versions, reverse=True, key=parse_version) 530 | 531 | @cached_property 532 | def semantic_version(self) -> Version: 533 | """Return the current version without eks platform details in the string.""" 534 | return Version(re.sub(r"-eksbuild.*", "", self.version)) 535 | 536 
| @property 537 | def semantic_versions(self) -> list[Version]: 538 | """Return the list of semantic versions sorted with latest first.""" 539 | return [Version(re.sub(r"-eksbuild.*", "", version)) for version in self.sorted_versions] 540 | 541 | @property 542 | def step_upgrade_versions(self) -> list[str]: 543 | """Return the list of semantic versions to target for step upgrade by minor.""" 544 | versions: List[str] = [] 545 | for minor in self.minors_to_target: 546 | version: Optional[Version] = self.get_version_by_minor(minor) 547 | full_version: str = self.get_full_version_str(version) 548 | if full_version: 549 | versions.append(full_version) 550 | return versions 551 | 552 | @property 553 | def _target_version(self) -> str: 554 | """Return the target version.""" 555 | return self.latest_version if self.cluster.latest_addons else self.default_version 556 | 557 | @property 558 | def _target_version_semver(self) -> Version: 559 | """Return the target version as a semantic version.""" 560 | return Version(re.sub(r"-eksbuild.*", "", self._target_version)) 561 | 562 | @property 563 | def within_target_minor(self) -> bool: 564 | """Determine if the current version is within one minor of the target version.""" 565 | if self._target_version_semver.minor in (self.semantic_version.minor, self.semantic_version.minor + 1): 566 | return True 567 | return False 568 | 569 | def get_version_by_minor(self, minor: int) -> Optional[Version]: 570 | """Return the latest semantic version matching the input minor version.""" 571 | try: 572 | return [item for item in self.semantic_versions if item.minor == minor][0] 573 | except IndexError: 574 | return None 575 | 576 | def get_full_version_str(self, semantic_version: Optional[Version]) -> str: 577 | """Return the complete version string based on the semantic version.""" 578 | if not semantic_version: 579 | return "" 580 | return next(item for item in self.sorted_versions if item.startswith(f"v{str(semantic_version)}")) 581 | 582 | @property 583 | def next_minor_semver(self) -> Optional[Version]: 584 | """Return the next minor version's semantic version.""" 585 | return self.get_version_by_minor(minor=self.semantic_version.minor + 1) 586 | 587 | @property 588 | def next_minor(self) -> str: 589 | """Return the next minor's complete version string.""" 590 | if self.next_minor_semver: 591 | return self.get_full_version_str(self.next_minor_semver) 592 | return "" 593 | 594 | @property 595 | def latest_version(self) -> str: 596 | """Return the latest version.""" 597 | return self.sorted_versions[0] 598 | 599 | @property 600 | def target_version(self) -> str: 601 | """Return the target version.""" 602 | # If VPC CNI Add-on, use graduated upgrade by single minor version. 
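# Illustrative walkthrough (versions are hypothetical, not from the source): with
# vpc-cni at v1.10.4 and a default target of v1.12.2, within_target_minor is False
# and v1.10.4 < next_minor, so target_version returns the next minor (for example
# v1.11.5-eksbuild.1); update() then walks step_upgrade_versions, which covers the
# latest build of each minor from the current one up to the final target.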
603 | if ( 604 | self.name == "vpc-cni" 605 | and not self.within_target_minor 606 | and parse_version(self.version) < parse_version(self.next_minor) 607 | ): 608 | echo_info( 609 | f"vpc-cni will target version: {self.next_minor} instead of {self._target_version} because it's not within +1 of the current minor...", 610 | ) 611 | return self.next_minor 612 | return self._target_version 613 | 614 | @property 615 | def needs_upgrade(self) -> bool: 616 | """Determine whether or not this addon should be upgraded.""" 617 | return parse_version(self.version) < parse_version(self.target_version) 618 | 619 | def wait_for_active(self, delay: int = 35, initial_delay: int = 30, max_attempts: int = 160) -> None: 620 | """Wait for the addon to become active.""" 621 | echo_info(f"Waiting for the add-on: {self.name} to become active...") 622 | time.sleep(initial_delay) 623 | waiter_config: WaiterConfigTypeDef = {"Delay": delay, "MaxAttempts": max_attempts} 624 | self.active_waiter.wait(clusterName=self.cluster.name, addonName=self.name, WaiterConfig=waiter_config) 625 | echo_success(f"Add-on: {self.name} upgraded!") 626 | 627 | 628 | @dataclass 629 | class Cluster(EksResource): 630 | """Define the Kubernetes Cluster model. 631 | 632 | Attributes: 633 | certificate_authority_data: The certificate authority data. 634 | identity_oidc_issuer: The OIDC identity issuer. 635 | endpoint: The EKS endpoint. 636 | role_arn: The EKS cluster role ARN. 637 | platform_version: The EKS cluster platform version. 638 | secrets_key_arn: The EKS cluster's secrets key ARN. 639 | cluster_logging_enabled: Whether or not cluster logging is enabled. 640 | 641 | Properties: 642 | target_version: The target cluster version post upgrade. 643 | 644 | """ 645 | 646 | certificate_authority_data: str = "" 647 | identity_oidc_issuer: str = "" 648 | endpoint: str = "" 649 | role_arn: str = "" 650 | platform_version: str = "" 651 | secrets_key_arn: str = "" 652 | cluster_logging_enabled: bool = False 653 | target_version: str = "" 654 | latest_addons: bool = False 655 | 656 | def __repr__(self) -> str: # pragma: no cover 657 | """Return the string representation of a Cluster.""" 658 | return f"<{self.__class__.__name__} - Name: {self.name} | Version: {self.version} | Last Status: {self.status}>" 659 | 660 | def __post_init__(self) -> None: 661 | """Perform the post initialization steps.""" 662 | self._register_k8s_aws_id_handlers() 663 | self.load_config() 664 | self.target_version = self.target_version or f"{self._version_object.major}.{self._version_object.minor + 1}" 665 | self.active_waiter = self.eks_client.get_waiter("cluster_active") 666 | 667 | def _register_k8s_aws_id_handlers(self) -> None: 668 | """Register the kubernetes AWS ID header handlers.""" 669 | self.sts_client.meta.events.register( 670 | "provide-client-params.sts.GetCallerIdentity", 671 | self._retrieve_k8s_aws_id, 672 | ) 673 | self.sts_client.meta.events.register( 674 | "before-sign.sts.GetCallerIdentity", 675 | self._inject_k8s_aws_id_header, 676 | ) 677 | 678 | def _retrieve_k8s_aws_id(self, params, context, **_) -> None: 679 | """Retrieve the kubernetes AWS ID header for use in boto3 request headers.""" 680 | if TOKEN_HEADER_KEY in params: 681 | context[TOKEN_HEADER_KEY] = params.pop(TOKEN_HEADER_KEY) 682 | logger.debug("Retrieving cluster header %s: %s", TOKEN_HEADER_KEY, context[TOKEN_HEADER_KEY]) 683 | 684 | def _inject_k8s_aws_id_header(self, request, **_) -> None: 685 | """Inject the kubernetes AWS ID header into boto3 request headers.""" 686 | if TOKEN_HEADER_KEY in request.context: 687 | 
request.headers[TOKEN_HEADER_KEY] = request.context[TOKEN_HEADER_KEY] 688 | logger.debug("Patching boto3 STS calls with cluster headers: %s", request.headers) 689 | 690 | def _get_presigned_url(self, url_timeout: int = 60) -> str: 691 | """Get the pre-signed URL. 692 | 693 | Arguments: 694 | url_timeout: The number of seconds to lease the pre-signed URL for. 695 | Defaults to: 60. 696 | 697 | Returns: 698 | The pre-signed URL. 699 | """ 700 | logger.debug("Generating the pre-signed url for get-caller-identity...") 701 | return self.sts_client.generate_presigned_url( 702 | "get_caller_identity", 703 | Params={TOKEN_HEADER_KEY: self.cluster_identifier}, 704 | ExpiresIn=url_timeout, 705 | HttpMethod="GET", 706 | ) 707 | 708 | @cached_property 709 | def current_addons(self) -> List[str]: 710 | """Return a list of addon names currently installed in the cluster.""" 711 | echo_info(f"Getting the list of current cluster addons for cluster: {self.name}...") 712 | return self.eks_client.list_addons(clusterName=self.name).get("addons", []) 713 | 714 | @property 715 | def cluster_name(self) -> str: 716 | """Return the cluster name.""" 717 | return self.name 718 | 719 | @property 720 | def cluster_identifier(self) -> str: 721 | """Return the preferred identifier for the cluster. 722 | 723 | If the cluster is a local cluster deployed on AWS Outposts, the resource ID must be used. 724 | If not, use the cluster name. 725 | 726 | """ 727 | return self.resource_id or self.name 728 | 729 | @cached_property 730 | def addons(self) -> List[ClusterAddon]: 731 | """Get the list of current cluster addons. 732 | 733 | Returns: 734 | The list of `ClusterAddon` objects. 735 | 736 | """ 737 | echo_info("Fetching Cluster Addons...") 738 | return [ClusterAddon.get(addon, self, self.region) for addon in self.current_addons] 739 | 740 | @cached_property 741 | def needs_upgrade(self) -> bool: 742 | """Determine whether or not this cluster should be upgraded.""" 743 | return self._version_object < self._target_version_object 744 | 745 | @cached_property 746 | def upgradable_addons(self) -> List[ClusterAddon]: 747 | """Get a list of addons that require upgrade.""" 748 | return [addon for addon in self.addons if addon.needs_upgrade] 749 | 750 | @cached_property 751 | def upgradable_managed_nodegroups(self) -> List[ManagedNodeGroup]: 752 | """Get a list of managed nodegroups that require upgrade.""" 753 | return [nodegroup for nodegroup in self.nodegroups if nodegroup.needs_upgrade] 754 | 755 | @cached_property 756 | def _version_object(self) -> Version: 757 | """Return the Cluster.version as a Version object.""" 758 | return Version(self.version) 759 | 760 | @cached_property 761 | def _target_version_object(self) -> Version: 762 | """Return the Cluster.target_version as a Version object.""" 763 | return Version(self.target_version) 764 | 765 | def update_cluster(self, wait: bool = True) -> Optional[UpdateTypeDef]: 766 | """Upgrade the cluster itself.""" 767 | if self._version_object > self._target_version_object: 768 | echo_warning( 769 | f"Cluster: {self.name} version: {self.version} already greater than target version: {self.target_version}! Skipping cluster upgrade!", 770 | ) 771 | return None 772 | 773 | if self._version_object == self._target_version_object: 774 | echo_warning(f"Cluster: {self.name} already on version: {self.version}! 
Skipping cluster upgrade!") 775 | return None 776 | 777 | if self._target_version_object.minor > self._version_object.minor + 1: 778 | echo_error( 779 | f"Cluster: {self.name} can't be upgraded more than one minor at a time! Please adjust the target cluster version and try again!", 780 | ) 781 | raise InvalidUpgradeTargetVersion() 782 | 783 | echo_info(f"Upgrading cluster: {self.name} from version: {self.version} to version: {self.target_version}") 784 | update_response: UpdateClusterVersionResponseTypeDef = self.eks_client.update_cluster_version( 785 | name=self.name, version=self.target_version 786 | ) 787 | update_response_body: UpdateTypeDef = update_response["update"] 788 | _update_errors = update_response_body.get("errors", []) 789 | 790 | if _update_errors: 791 | echo_error( 792 | f"Errors encountered while attempting to update cluster: {self.name} - Errors: {_update_errors}", 793 | ) 794 | self.errors += _update_errors 795 | if wait: 796 | self.wait_for_active() 797 | return update_response_body 798 | 799 | def upgrade_addons(self, wait: bool = False) -> Dict[str, Any]: 800 | """Upgrade all cluster addons.""" 801 | echo_info("The add-ons update has been initiated...") 802 | upgrade_details: Dict[str, Any] = {} 803 | for addon in self.upgradable_addons: 804 | _update_responses: list[UpdateTypeDef] = addon.update(wait=wait) 805 | upgrade_details[addon.name] = _update_responses 806 | return upgrade_details 807 | 808 | def upgrade_nodegroups(self, wait: bool = False) -> Dict[str, Any]: 809 | """Upgrade all EKS managed nodegroups.""" 810 | upgrade_details: Dict[str, Any] = {} 811 | for nodegroup in self.upgradable_managed_nodegroups: 812 | _update_response: UpdateTypeDef = nodegroup.update(wait=wait) 813 | _update_id: str = _update_response.get("id", "") 814 | _update_status: str = _update_response.get("status", "") 815 | echo_info(f"Updating nodegroup: {nodegroup.name} - ID: {_update_id} - Status: {_update_status}") 816 | upgrade_details[nodegroup.name] = _update_response 817 | return upgrade_details 818 | 819 | def get_token(self) -> str: 820 | """Generate a presigned url token to pass to client. 821 | 822 | Returns: 823 | The pre-signed STS token for use in the Kubernetes configuration. 824 | 825 | """ 826 | logger.debug("Getting the pre-signed STS token...") 827 | url = self._get_presigned_url() 828 | suffix: str = base64.urlsafe_b64encode(url.encode("utf-8")).decode("utf-8").rstrip("=") 829 | token = f"{TOKEN_PREFIX}.{suffix}" 830 | return token 831 | 832 | @property 833 | def user_config(self) -> Dict[str, Union[str, List[Dict[str, Any]]]]: 834 | """Get a configuration for the Kubernetes client library. 835 | 836 | The caller's AWS identity is used to generate a pre-signed STS token for cluster authentication. 837 | 838 | Returns: 839 | The dictionary representation of Kubernetes configuration for the current cluster. 
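Example (illustrative usage; the cluster name and region are hypothetical):
>>> cluster = Cluster.get(cluster_name="my-cluster", region="us-east-1")
>>> cluster.load_config()  # load this user_config into the kubernetes client
>>> pods = cluster.core_api_client.list_namespaced_pod(namespace="default")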
840 | 841 | """ 842 | config_data: Dict[str, Union[str, List[Dict[str, Any]]]] = { 843 | "current-context": self.cluster_name, 844 | "contexts": [ 845 | { 846 | "name": self.cluster_name, 847 | "context": { 848 | "cluster": self.cluster_name, 849 | "user": self.arn, 850 | }, 851 | } 852 | ], 853 | "clusters": [ 854 | { 855 | "name": self.cluster_name, 856 | "cluster": { 857 | "certificate-authority-data": self.certificate_authority_data, 858 | "server": self.endpoint, 859 | }, 860 | } 861 | ], 862 | "users": [ 863 | { 864 | "name": self.arn, 865 | "user": { 866 | "token": self.get_token(), 867 | }, 868 | } 869 | ], 870 | } 871 | return config_data 872 | 873 | def load_config(self, user_config: Optional[Dict[str, Any]] = None) -> None: 874 | """Load the Kubernetes configuration. 875 | 876 | Arguments: 877 | user_config: The Kubernetes configuration to be used with the client. 878 | Defaults to: The current cluster's pre-populated configuration from `Cluster.user_config`. 879 | 880 | Returns: 881 | None. 882 | 883 | """ 884 | logger.debug("Loading Kubernetes config from user config dictionary...") 885 | user_config = user_config or self.user_config 886 | k8s_config.load_kube_config_from_dict(user_config) 887 | logger.debug("Loaded kubernetes config from user config!") 888 | 889 | @property 890 | def available(self) -> bool: 891 | """Whether or not the cluster exists and is active or updating.""" 892 | return self.status in ["ACTIVE", "UPDATING"] 893 | 894 | @property 895 | def active(self) -> bool: 896 | """Whether or not the cluster exists and is active.""" 897 | return self.status == "ACTIVE" 898 | 899 | @property 900 | def updating(self) -> bool: 901 | """Whether or not the cluster is currently updating.""" 902 | return self.status == "UPDATING" 903 | 904 | @cached_property 905 | def autoscaling_groups(self) -> List[AutoscalingGroup]: 906 | """Get the list of AutoScaling Groups (ASGs). 
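For example (the name is purely illustrative), a cluster named "prod" is matched by ASGs tagged with the key "kubernetes.io/cluster/prod".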
907 | 908 | We return the ASGs whose tag keys match the format 909 | "kubernetes.io/cluster/{cluster_name}", 910 | or an empty list if none are found. 911 | 912 | """ 913 | cluster_tag = f"kubernetes.io/cluster/{self.name}" 914 | response = self.autoscaling_client.describe_auto_scaling_groups( 915 | Filters=[{"Name": "tag-key", "Values": [cluster_tag]}] 916 | ).get("AutoScalingGroups", []) 917 | return [AutoscalingGroup.get(asg_data=asg, region=self.region, cluster=self) for asg in response] 918 | 919 | @cached_property 920 | def asg_names(self) -> List[str]: 921 | """Get the autoscaling group names.""" 922 | return [asg.name for asg in self.autoscaling_groups] 923 | 924 | @cached_property 925 | def available_addon_versions(self) -> List[AddonInfoTypeDef]: 926 | """Get the available addon versions for the associated Kubernetes version.""" 927 | addon_versions: List[AddonInfoTypeDef] = self.eks_client.describe_addon_versions( 928 | kubernetesVersion=self.target_version 929 | ).get("addons", []) 930 | return addon_versions 931 | 932 | @cached_property 933 | def nodegroup_names(self) -> List[str]: 934 | """Get the cluster's associated nodegroup names.""" 935 | response: ListNodegroupsResponseTypeDef = self.eks_client.list_nodegroups(clusterName=self.name, maxResults=100) 936 | return response["nodegroups"] 937 | 938 | @cached_property 939 | def nodegroups(self) -> List[ManagedNodeGroup]: 940 | """Get the cluster's associated nodegroups.""" 941 | return [ 942 | ManagedNodeGroup.get(node_group=nodegroup, cluster=self, region=self.region) 943 | for nodegroup in self.nodegroup_names 944 | ] 945 | 946 | @classmethod 947 | def get(cls, cluster_name: str, region: str, target_version: str = "", latest_addons: bool = False): 948 | """Get the cluster details and build a Cluster. 949 | 950 | Arguments: 951 | cluster_name: The name of the cluster. 952 | region: The AWS region where the cluster resides. 953 | target_version: The target cluster version of this upgrade. 954 | Defaults to: The current cluster version + 1 minor 955 | (e.g. current cluster: `1.24` will target version: `1.25`). 956 | latest_addons: Whether or not to target the latest versions of addons 957 | available versus the default versions. 958 | Defaults to: `False`. 959 | 960 | Returns: 961 | Cluster: The requested EKS cluster object. 962 | 963 | """ 964 | logger.debug("Getting cluster details...") 965 | eks_client: EKSClient = boto3.client("eks", region_name=region) 966 | 967 | response: DescribeClusterResponseTypeDef = eks_client.describe_cluster( 968 | name=cluster_name, 969 | ) 970 | cluster_data: ClusterTypeDef = response["cluster"] 971 | 972 | # If encryption config is present, use it to populate secrets ARN. 973 | try: 974 | _secrets_key_arn: str = cluster_data["encryptionConfig"][0]["provider"]["keyArn"] 975 | except (KeyError, IndexError): 976 | logger.debug("No secrets key ARN found for cluster... 
defaulting to empty string.") 977 | _secrets_key_arn = "" 978 | 979 | return cls( 980 | arn=cluster_data.get("arn", ""), 981 | name=cluster_data.get("name", ""), 982 | resource_id=cluster_data.get("id", ""), 983 | certificate_authority_data=cluster_data.get("certificateAuthority", {}).get("data", ""), 984 | endpoint=cluster_data.get("endpoint", ""), 985 | version=cluster_data.get("version", ""), 986 | status=cluster_data.get("status", ""), 987 | platform_version=cluster_data.get("platformVersion", ""), 988 | role_arn=cluster_data.get("roleArn", ""), 989 | identity_oidc_issuer=cluster_data.get("identity", {}).get("oidc", {}).get("issuer", ""), 990 | secrets_key_arn=_secrets_key_arn, 991 | region=region, 992 | target_version=target_version, 993 | latest_addons=latest_addons, 994 | ) 995 | 996 | def wait_for_active(self, delay: int = 35, initial_delay: int = 30, max_attempts: int = 160) -> None: 997 | """Wait for the cluster to become active.""" 998 | echo_info(f"Waiting for cluster: {self.name} to become active...") 999 | time.sleep(initial_delay) 1000 | waiter_config: WaiterConfigTypeDef = {"Delay": delay, "MaxAttempts": max_attempts} 1001 | self.active_waiter.wait(name=self.name, WaiterConfig=waiter_config) 1002 | echo_success(f"Cluster: {self.name} now active, control plane upgrade should be completed!") 1003 | -------------------------------------------------------------------------------- /eksupgrade/src/S3Files/coredns.json: -------------------------------------------------------------------------------- 1 | { 2 | "apiVersion": "apps/v1", 3 | "kind": "Deployment", 4 | "metadata": { 5 | "name": "coredns", 6 | "namespace": "kube-system", 7 | "labels": { 8 | "k8s-app": "kube-dns", 9 | "kubernetes.io/name": "CoreDNS", 10 | "eks.amazonaws.com/component": "coredns" 11 | } 12 | }, 13 | "spec": { 14 | "replicas": 2, 15 | "strategy": { 16 | "type": "RollingUpdate", 17 | "rollingUpdate": { 18 | "maxUnavailable": 1 19 | } 20 | }, 21 | "selector": { 22 | "matchLabels": { 23 | "k8s-app": "kube-dns", 24 | "eks.amazonaws.com/component": "coredns" 25 | } 26 | }, 27 | "template": { 28 | "metadata": { 29 | "labels": { 30 | "k8s-app": "kube-dns", 31 | "eks.amazonaws.com/component": "coredns" 32 | }, 33 | "annotations": { 34 | "eks.amazonaws.com/compute-type": "ec2" 35 | } 36 | }, 37 | "spec": { 38 | "serviceAccountName": "coredns", 39 | "priorityClassName": "system-cluster-critical", 40 | "affinity": { 41 | "nodeAffinity": { 42 | "requiredDuringSchedulingIgnoredDuringExecution": { 43 | "nodeSelectorTerms": [ 44 | { 45 | "matchExpressions": [ 46 | { 47 | "key": "beta.kubernetes.io/os", 48 | "operator": "In", 49 | "values": [ 50 | "linux" 51 | ] 52 | }, 53 | { 54 | "key": "beta.kubernetes.io/arch", 55 | "operator": "In", 56 | "values": [ 57 | "amd64", 58 | "arm64" 59 | ] 60 | } 61 | ] 62 | } 63 | ] 64 | } 65 | }, 66 | "podAntiAffinity": { 67 | "preferredDuringSchedulingIgnoredDuringExecution": [ 68 | { 69 | "podAffinityTerm": { 70 | "labelSelector": { 71 | "matchExpressions": [ 72 | { 73 | "key": "k8s-app", 74 | "operator": "In", 75 | "values": [ 76 | "kube-dns" 77 | ] 78 | } 79 | ] 80 | }, 81 | "topologyKey": "kubernetes.io/hostname" 82 | }, 83 | "weight": 100 84 | } 85 | ] 86 | } 87 | }, 88 | "tolerations": [ 89 | { 90 | "key": "node-role.kubernetes.io/master", 91 | "effect": "NoSchedule" 92 | }, 93 | { 94 | "key": "CriticalAddonsOnly", 95 | "operator": "Exists" 96 | } 97 | ], 98 | "containers": [ 99 | { 100 | "name": "coredns", 101 | "image": 
"602401143452.dkr.ecr.REGION.amazonaws.com/eks/coredns:v1.7.0-eksbuild.1", 102 | "imagePullPolicy": "IfNotPresent", 103 | "resources": { 104 | "limits": { 105 | "memory": "170Mi" 106 | }, 107 | "requests": { 108 | "cpu": "100m", 109 | "memory": "70Mi" 110 | } 111 | }, 112 | "args": [ 113 | "-conf", 114 | "/etc/coredns/Corefile" 115 | ], 116 | "volumeMounts": [ 117 | { 118 | "name": "config-volume", 119 | "mountPath": "/etc/coredns", 120 | "readOnly": true 121 | }, 122 | { 123 | "name": "tmp", 124 | "mountPath": "/tmp" 125 | } 126 | ], 127 | "ports": [ 128 | { 129 | "containerPort": 53, 130 | "name": "dns", 131 | "protocol": "UDP" 132 | }, 133 | { 134 | "containerPort": 53, 135 | "name": "dns-tcp", 136 | "protocol": "TCP" 137 | }, 138 | { 139 | "containerPort": 9153, 140 | "name": "metrics", 141 | "protocol": "TCP" 142 | } 143 | ], 144 | "livenessProbe": { 145 | "httpGet": { 146 | "path": "/health", 147 | "port": 8080, 148 | "scheme": "HTTP" 149 | }, 150 | "initialDelaySeconds": 60, 151 | "timeoutSeconds": 5, 152 | "successThreshold": 1, 153 | "failureThreshold": 5 154 | }, 155 | "readinessProbe": { 156 | "httpGet": { 157 | "path": "/health", 158 | "port": 8080, 159 | "scheme": "HTTP" 160 | } 161 | }, 162 | "securityContext": { 163 | "allowPrivilegeEscalation": false, 164 | "capabilities": { 165 | "add": [ 166 | "NET_BIND_SERVICE" 167 | ], 168 | "drop": [ 169 | "all" 170 | ] 171 | }, 172 | "readOnlyRootFilesystem": true 173 | } 174 | } 175 | ], 176 | "dnsPolicy": "Default", 177 | "volumes": [ 178 | { 179 | "name": "tmp", 180 | "emptyDir": {} 181 | }, 182 | { 183 | "name": "config-volume", 184 | "configMap": { 185 | "name": "coredns", 186 | "items": [ 187 | { 188 | "key": "Corefile", 189 | "path": "Corefile" 190 | } 191 | ] 192 | } 193 | } 194 | ] 195 | } 196 | } 197 | } 198 | } -------------------------------------------------------------------------------- /eksupgrade/src/S3Files/kube-proxy-configmap.json: -------------------------------------------------------------------------------- 1 | {"apiVersion": "v1", "kind": "ConfigMap", "metadata": {"name": "kube-proxy", "namespace": "kube-system", "labels": {"k8s-app": "kube-proxy", "eks.amazonaws.com/component": "kube-proxy"}}, "data": {"kubeconfig": "kind: Config\napiVersion: v1\nclusters:\n- cluster:\n certificate-authority: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt\n server: MASTER_ENDPOINT\n name: default\ncontexts:\n- context:\n cluster: default\n namespace: default\n user: default\n name: default\ncurrent-context: default\nusers:\n- name: default\n user:\n tokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token"}} -------------------------------------------------------------------------------- /eksupgrade/src/S3Files/kube-proxy.json: -------------------------------------------------------------------------------- 1 | { 2 | "apiVersion": "apps/v1", 3 | "kind": "DaemonSet", 4 | "metadata": { 5 | "labels": { 6 | "k8s-app": "kube-proxy", 7 | "eks.amazonaws.com/component": "kube-proxy" 8 | }, 9 | "name": "kube-proxy", 10 | "namespace": "kube-system" 11 | }, 12 | "spec": { 13 | "selector": { 14 | "matchLabels": { 15 | "k8s-app": "kube-proxy" 16 | } 17 | }, 18 | "updateStrategy": { 19 | "type": "RollingUpdate", 20 | "rollingUpdate": { 21 | "maxUnavailable": "10%" 22 | } 23 | }, 24 | "template": { 25 | "metadata": { 26 | "labels": { 27 | "k8s-app": "kube-proxy" 28 | } 29 | }, 30 | "spec": { 31 | "affinity": { 32 | "nodeAffinity": { 33 | "requiredDuringSchedulingIgnoredDuringExecution": { 34 | "nodeSelectorTerms": [ 35 | { 36 | 
"matchExpressions": [ 37 | { 38 | "key": "kubernetes.io/os", 39 | "operator": "In", 40 | "values": [ 41 | "linux" 42 | ] 43 | }, 44 | { 45 | "key": "kubernetes.io/arch", 46 | "operator": "In", 47 | "values": [ 48 | "amd64", 49 | "arm64" 50 | ] 51 | }, 52 | { 53 | "key": "eks.amazonaws.com/compute-type", 54 | "operator": "NotIn", 55 | "values": [ 56 | "fargate" 57 | ] 58 | } 59 | ] 60 | } 61 | ] 62 | } 63 | } 64 | }, 65 | "hostNetwork": true, 66 | "tolerations": [ 67 | { 68 | "operator": "Exists" 69 | } 70 | ], 71 | "priorityClassName": "system-node-critical", 72 | "containers": [ 73 | { 74 | "name": "kube-proxy", 75 | "image": "602401143452.dkr.ecr.REGION.amazonaws.com/eks/kube-proxy:v1.18.8-eksbuild.1", 76 | "resources": { 77 | "requests": { 78 | "cpu": "100m" 79 | } 80 | }, 81 | "command": [ 82 | "kube-proxy", 83 | "--v=2", 84 | "--config=/var/lib/kube-proxy-config/config" 85 | ], 86 | "securityContext": { 87 | "privileged": true 88 | }, 89 | "volumeMounts": [ 90 | { 91 | "mountPath": "/var/log", 92 | "name": "varlog", 93 | "readOnly": false 94 | }, 95 | { 96 | "mountPath": "/run/xtables.lock", 97 | "name": "xtables-lock", 98 | "readOnly": false 99 | }, 100 | { 101 | "mountPath": "/lib/modules", 102 | "name": "lib-modules", 103 | "readOnly": true 104 | }, 105 | { 106 | "name": "kubeconfig", 107 | "mountPath": "/var/lib/kube-proxy/" 108 | }, 109 | { 110 | "name": "config", 111 | "mountPath": "/var/lib/kube-proxy-config/" 112 | } 113 | ] 114 | } 115 | ], 116 | "volumes": [ 117 | { 118 | "name": "varlog", 119 | "hostPath": { 120 | "path": "/var/log" 121 | } 122 | }, 123 | { 124 | "name": "xtables-lock", 125 | "hostPath": { 126 | "path": "/run/xtables.lock", 127 | "type": "FileOrCreate" 128 | } 129 | }, 130 | { 131 | "name": "lib-modules", 132 | "hostPath": { 133 | "path": "/lib/modules" 134 | } 135 | }, 136 | { 137 | "name": "kubeconfig", 138 | "configMap": { 139 | "name": "kube-proxy" 140 | } 141 | }, 142 | { 143 | "name": "config", 144 | "configMap": { 145 | "name": "kube-proxy-config" 146 | } 147 | } 148 | ], 149 | "serviceAccountName": "kube-proxy" 150 | } 151 | } 152 | } 153 | } -------------------------------------------------------------------------------- /eksupgrade/src/S3Files/version_dict.json: -------------------------------------------------------------------------------- 1 | { 2 | "1.26": { 3 | "cluster-autoscaler": "1.26.1" 4 | }, 5 | "1.25": { 6 | "cluster-autoscaler": "1.25.0" 7 | }, 8 | "1.24": { 9 | "cluster-autoscaler": "1.24.0" 10 | }, 11 | "1.23": { 12 | "cluster-autoscaler": "1.23.0" 13 | }, 14 | "1.22": { 15 | "cluster-autoscaler": "1.22.2" 16 | }, 17 | "1.21": { 18 | "cluster-autoscaler": "1.21.2" 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /eksupgrade/src/S3Files/vpc-cni.json: -------------------------------------------------------------------------------- 1 | {"apiVersion": "apps/v1", "kind": "DaemonSet", "metadata": {"labels": {"k8s-app": "aws-node"}, "name": "aws-node", "namespace": "kube-system"}, "spec": {"selector": {"matchLabels": {"k8s-app": "aws-node"}}, "template": {"metadata": {"labels": {"k8s-app": "aws-node"}}, "spec": {"affinity": {"nodeAffinity": {"requiredDuringSchedulingIgnoredDuringExecution": {"nodeSelectorTerms": [{"matchExpressions": [{"key": "kubernetes.io/os", "operator": "In", "values": ["linux"]}, {"key": "kubernetes.io/arch", "operator": "In", "values": ["amd64", "arm64"]}, {"key": "eks.amazonaws.com/compute-type", "operator": "NotIn", "values": ["fargate"]}]}]}}}, "containers": 
[{"env": [{"name": "ADDITIONAL_ENI_TAGS", "value": "{}"}, {"name": "AWS_VPC_CNI_NODE_PORT_SUPPORT", "value": "true"}, {"name": "AWS_VPC_ENI_MTU", "value": "9001"}, {"name": "AWS_VPC_K8S_CNI_CONFIGURE_RPFILTER", "value": "false"}, {"name": "AWS_VPC_K8S_CNI_CUSTOM_NETWORK_CFG", "value": "false"}, {"name": "AWS_VPC_K8S_CNI_EXTERNALSNAT", "value": "false"}, {"name": "AWS_VPC_K8S_CNI_LOGLEVEL", "value": "DEBUG"}, {"name": "AWS_VPC_K8S_CNI_LOG_FILE", "value": "/host/var/log/aws-routed-eni/ipamd.log"}, {"name": "AWS_VPC_K8S_CNI_RANDOMIZESNAT", "value": "prng"}, {"name": "AWS_VPC_K8S_CNI_VETHPREFIX", "value": "eni"}, {"name": "AWS_VPC_K8S_PLUGIN_LOG_FILE", "value": "/var/log/aws-routed-eni/plugin.log"}, {"name": "AWS_VPC_K8S_PLUGIN_LOG_LEVEL", "value": "DEBUG"}, {"name": "DISABLE_INTROSPECTION", "value": "false"}, {"name": "DISABLE_METRICS", "value": "false"}, {"name": "ENABLE_POD_ENI", "value": "false"}, {"name": "MY_NODE_NAME", "valueFrom": {"fieldRef": {"fieldPath": "spec.nodeName"}}}, {"name": "WARM_ENI_TARGET", "value": "1"}], "image": "602401143452.dkr.ecr.REGION.amazonaws.com/amazon-k8s-cni:v1.7.5-eksbuild.1", "imagePullPolicy": "Always", "livenessProbe": {"exec": {"command": ["/app/grpc-health-probe", "-addr=:50051"]}, "initialDelaySeconds": 60}, "name": "aws-node", "ports": [{"containerPort": 61678, "name": "metrics"}], "readinessProbe": {"exec": {"command": ["/app/grpc-health-probe", "-addr=:50051"]}, "initialDelaySeconds": 1}, "resources": {"requests": {"cpu": "10m"}}, "securityContext": {"capabilities": {"add": ["NET_ADMIN"]}}, "volumeMounts": [{"mountPath": "/host/opt/cni/bin", "name": "cni-bin-dir"}, {"mountPath": "/host/etc/cni/net.d", "name": "cni-net-dir"}, {"mountPath": "/host/var/log/aws-routed-eni", "name": "log-dir"}, {"mountPath": "/var/run/aws-node", "name": "run-dir"}, {"mountPath": "/var/run/dockershim.sock", "name": "dockershim"}, {"mountPath": "/run/xtables.lock", "name": "xtables-lock"}]}], "hostNetwork": true, "initContainers": [{"env": [{"name": "DISABLE_TCP_EARLY_DEMUX", "value": "false"}], "image": "602401143452.dkr.ecr.REGION.amazonaws.com/amazon-k8s-cni-init:v1.7.5-eksbuild.1", "imagePullPolicy": "Always", "name": "aws-vpc-cni-init", "securityContext": {"privileged": true}, "volumeMounts": [{"mountPath": "/host/opt/cni/bin", "name": "cni-bin-dir"}]}], "priorityClassName": "system-node-critical", "serviceAccountName": "aws-node", "terminationGracePeriodSeconds": 10, "tolerations": [{"operator": "Exists"}], "volumes": [{"hostPath": {"path": "/opt/cni/bin"}, "name": "cni-bin-dir"}, {"hostPath": {"path": "/etc/cni/net.d"}, "name": "cni-net-dir"}, {"hostPath": {"path": "/var/run/dockershim.sock"}, "name": "dockershim"}, {"hostPath": {"path": "/run/xtables.lock"}, "name": "xtables-lock"}, {"hostPath": {"path": "/var/log/aws-routed-eni", "type": "DirectoryOrCreate"}, "name": "log-dir"}, {"hostPath": {"path": "/var/run/aws-node", "type": "DirectoryOrCreate"}, "name": "run-dir"}]}}, "updateStrategy": {"rollingUpdate": {"maxUnavailable": "10%"}, "type": "RollingUpdate"}}} -------------------------------------------------------------------------------- /eksupgrade/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/eks-cluster-upgrade/d960eab8299b4b9c1e79e024907d4dfe35f83ad9/eksupgrade/src/__init__.py -------------------------------------------------------------------------------- /eksupgrade/src/boto_aws.py: 
-------------------------------------------------------------------------------- 1 | """Define the EKS upgrade boto specific logic.""" 2 | 3 | from __future__ import annotations 4 | 5 | import datetime 6 | import time 7 | import uuid 8 | from typing import Any, Dict, List 9 | 10 | import boto3 11 | 12 | from eksupgrade.utils import echo_error, echo_info, echo_success, echo_warning, get_logger 13 | 14 | logger = get_logger(__name__) 15 | 16 | 17 | def status_of_cluster(cluster_name: str, region: str) -> List[str]: 18 | """Check the status and version of the cluster.""" 19 | eks_client = boto3.client("eks", region_name=region) 20 | try: 21 | response = eks_client.describe_cluster(name=cluster_name) 22 | return [response["cluster"]["status"], response["cluster"]["version"]] 23 | except Exception as e: 24 | echo_error(f"Exception encountered while attempting to get cluster status - Error: {e}") 25 | raise e 26 | 27 | 28 | def is_cluster_exists(cluster_name: str, region: str) -> str: 29 | """Check whether the cluster exists or not.""" 30 | try: 31 | response = status_of_cluster(cluster_name, region) 32 | return response[0] 33 | except Exception as e: 34 | echo_error(f"Exception encountered while checking if cluster exists. Error: {e}") 35 | raise e 36 | 37 | 38 | def get_latest_instance(asg_name: str, add_time: datetime.datetime, region: str) -> str: 39 | """Retrieve the most recently launched/launching instance. 40 | 41 | Note that this is not necessarily the same one that was launched by `add_node()`, 42 | but it is the closest approximation available. 43 | 44 | """ 45 | asg_client = boto3.client("autoscaling", region_name=region) 46 | ec2_client = boto3.client("ec2", region_name=region) 47 | instances = [] 48 | 49 | response = asg_client.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name]) 50 | time.sleep(20) 51 | instance_ids = [instance["InstanceId"] for instance in response["AutoScalingGroups"][0]["Instances"]] 52 | 53 | response = ec2_client.describe_instances(InstanceIds=instance_ids) 54 | for reservation in response["Reservations"]: 55 | for instance in reservation["Instances"]: 56 | instances.append(instance) 57 | 58 | instances_valid = [] 59 | instances_valid = [ 60 | instance 61 | for instance in instances 62 | if instance["State"]["Name"] in ["pending", "running"] and instance["LaunchTime"] > add_time 63 | ] 64 | 65 | latest_instance: Dict[str, Any] = {} 66 | try: 67 | time.sleep(10) 68 | latest_instance = sorted(instances_valid, key=lambda instance: instance["LaunchTime"])[-1] 69 | return latest_instance["InstanceId"] 70 | except Exception as e: 71 | echo_error(f"Exception encountered while sorting instances. 
Error: {e}") 72 | raise e 73 | 74 | 75 | def wait_for_ready(instanceid: str, region: str) -> bool: 76 | """Wait for the cluster to pass the status checks.""" 77 | ec2_client = boto3.client("ec2", region_name=region) 78 | echo_info(f"Instance {instanceid} waiting for the instance to pass the Health Checks") 79 | try: 80 | while ( 81 | ec2_client.describe_instance_status(InstanceIds=[instanceid])["InstanceStatuses"][0]["InstanceStatus"][ 82 | "Details" 83 | ][0]["Status"] 84 | != "passed" 85 | ): 86 | echo_info(f"Instance: {instanceid} waiting for the instance to pass the Health Checks") 87 | time.sleep(20) 88 | except Exception as e: 89 | echo_error(str(e)) 90 | raise Exception(f"{e}: Please rerun the Script the instance will be created") 91 | return True 92 | 93 | 94 | def check_asg_autoscaler(asg_name: str, region: str) -> bool: 95 | """Check whether the autoscaling is present or not.""" 96 | asg_client = boto3.client("autoscaling", region_name=region) 97 | response = asg_client.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name]) 98 | pat = "k8s.io/cluster-autoscaler/enabled" 99 | asg_list = [] 100 | for asg in response["AutoScalingGroups"][0]["Tags"]: 101 | if asg["Key"] == pat: 102 | asg_list.append(asg) 103 | return bool(asg_list) 104 | 105 | 106 | def enable_disable_autoscaler(asg_name: str, action: str, region: str) -> str: 107 | """Enable or disable the autoscaler depending on the provided action.""" 108 | asg_client = boto3.client("autoscaling", region_name=region) 109 | try: 110 | if action == "pause": 111 | asg_client.delete_tags( 112 | Tags=[ 113 | { 114 | "ResourceId": asg_name, 115 | "ResourceType": "auto-scaling-group", 116 | "Key": "k8s.io/cluster-autoscaler/enabled", 117 | }, 118 | ] 119 | ) 120 | return "done" 121 | if action == "start": 122 | asg_client.create_or_update_tags( 123 | Tags=[ 124 | { 125 | "ResourceId": asg_name, 126 | "ResourceType": "auto-scaling-group", 127 | "Key": "k8s.io/cluster-autoscaler/enabled", 128 | "Value": "true", 129 | "PropagateAtLaunch": False, 130 | }, 131 | ] 132 | ) 133 | return "done" 134 | echo_warning("Invalid action provided to enable_disable_autoscaler!") 135 | except Exception as e: 136 | echo_error( 137 | f"Exception encountered while attempting to {action} the autoscaler associated with ASG: {asg_name} - Error: {e}", 138 | ) 139 | raise Exception(e) 140 | finally: 141 | return "Something went Wrong auto scaling operation failed" 142 | 143 | 144 | def worker_terminate(instance_id: str, region: str) -> None: 145 | """Terminate instance and decreasing the desired capacity whit asg terminate instance.""" 146 | asg_client = boto3.client("autoscaling", region_name=region) 147 | 148 | try: 149 | asg_client.terminate_instance_in_auto_scaling_group(InstanceId=instance_id, ShouldDecrementDesiredCapacity=True) 150 | except Exception as e: 151 | echo_error(f"Exception encountered while attempting to terminate worker: {instance_id} - Error: {e}") 152 | raise e 153 | 154 | 155 | def add_node(asg_name: str, region: str) -> None: 156 | """Add node to particular ASG.""" 157 | asg_client = boto3.client("autoscaling", region_name=region) 158 | 159 | response = asg_client.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name]) 160 | try: 161 | old_capacity_mx = response["AutoScalingGroups"][0]["MaxSize"] 162 | old_capacity_des = response["AutoScalingGroups"][0]["DesiredCapacity"] 163 | except (KeyError, IndexError): 164 | echo_error(f"Exception encountered while getting old ASG capacity during add_node - ASG: {asg_name}") 165 | raise 
Exception("Error Index out of bound due to no max capacity field") 166 | 167 | if int(old_capacity_des) >= int(old_capacity_mx): 168 | asg_client.update_auto_scaling_group( 169 | AutoScalingGroupName=asg_name, MaxSize=(int(old_capacity_mx) + int(old_capacity_des)) 170 | ) 171 | 172 | old_capacity = response["AutoScalingGroups"][0]["DesiredCapacity"] 173 | new_capacity = old_capacity + 1 174 | 175 | try: 176 | asg_client.set_desired_capacity(AutoScalingGroupName=asg_name, DesiredCapacity=new_capacity) 177 | echo_info(f"New Node has been Added to {asg_name}") 178 | except Exception as e: 179 | echo_error(f"Exception encountered while attempting to add node to ASG: {asg_name} - Error: {e}") 180 | raise e 181 | 182 | 183 | def get_num_of_instances(asg_name: str, exclude_ids: List[str], region: str) -> int: 184 | """Count the number of instances.""" 185 | asg_client = boto3.client("autoscaling", region_name=region) 186 | ec2_client = boto3.client("ec2", region_name=region) 187 | instances = [] 188 | 189 | response = asg_client.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name]) 190 | instance_ids = [ 191 | instance["InstanceId"] 192 | for instance in response["AutoScalingGroups"][0]["Instances"] 193 | if instance["InstanceId"] not in exclude_ids 194 | ] 195 | response = ec2_client.describe_instances(InstanceIds=instance_ids) 196 | for reservation in response["Reservations"]: 197 | for instance in reservation["Instances"]: 198 | instances.append(instance) 199 | # getting the instance in running or pending state 200 | instances = [instance for instance in instances if instance["State"]["Name"] in ["running", "pending"]] 201 | 202 | return len(instances) 203 | 204 | 205 | def old_lt_scenarios(inst: Dict[str, Any], asg_lt_name: str, asg_lt_version: int) -> bool: 206 | """Get the old launch template based on launch template name and version 1!=2.""" 207 | lt_name = inst["LaunchTemplate"]["LaunchTemplateName"] 208 | lt_version = int(inst["LaunchTemplate"]["Version"]) 209 | return (lt_name != asg_lt_name) or (lt_version != int(asg_lt_version)) 210 | 211 | 212 | def get_old_lt(asg_name: str, region: str) -> List[str]: 213 | """Get the old launch template.""" 214 | asg_client = boto3.client("autoscaling", region_name=region) 215 | ec2_client = boto3.client("ec2", region_name=region) 216 | 217 | old_lt_instance_ids = [] 218 | instances = [] 219 | 220 | response = asg_client.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name]) 221 | asg_lt_name = "" 222 | # finding the launch type 223 | if "LaunchTemplate" in response["AutoScalingGroups"][0]: 224 | response["AutoScalingGroups"][0]["LaunchTemplate"]["LaunchTemplateId"] 225 | asg_lt_name = response["AutoScalingGroups"][0]["LaunchTemplate"]["LaunchTemplateName"] 226 | elif "MixedInstancesPolicy" in response["AutoScalingGroups"][0]: 227 | response["AutoScalingGroups"][0]["MixedInstancesPolicy"]["LaunchTemplate"]["LaunchTemplateSpecification"][ 228 | "LaunchTemplateId" 229 | ] 230 | asg_lt_name = response["AutoScalingGroups"][0]["MixedInstancesPolicy"]["LaunchTemplate"][ 231 | "LaunchTemplateSpecification" 232 | ]["LaunchTemplateName"] 233 | else: 234 | echo_error(f"Old Launch Template not found! 
250 | 
251 | 
252 | def old_launch_config_instances(asg_name: str, region: str) -> List[str]:
253 |     """Get the old launch configuration instance IDs."""
254 |     asg_client = boto3.client("autoscaling", region_name=region)
255 |     old_lc_ids = []
256 |     # describing the asg group
257 |     response = asg_client.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name])
258 |     instances = response["AutoScalingGroups"][0]["Instances"]
259 |     for inst in instances:
260 |         # flag the instance when its launch configuration no longer matches the ASG's
261 |         if inst.get("LaunchConfigurationName") != response["AutoScalingGroups"][0]["LaunchConfigurationName"]:
262 |             old_lc_ids.append(inst["InstanceId"])
263 |     return old_lc_ids
264 | 
265 | 
266 | def outdated_lt(asgs, region: str) -> List[str]:
267 |     """Get the instances running an outdated launch template or launch configuration."""
268 |     asg_client = boto3.client("autoscaling", region_name=region)
269 |     asg = asg_client.describe_auto_scaling_groups(AutoScalingGroupNames=[asgs])
270 |     asg_name = asg["AutoScalingGroups"][0]["AutoScalingGroupName"]
271 |     launch_type = ""
272 |     if "LaunchConfigurationName" in asg["AutoScalingGroups"][0]:
273 |         launch_type = "LaunchConfiguration"
274 |     elif "LaunchTemplate" in asg["AutoScalingGroups"][0]:
275 |         launch_type = "LaunchTemplate"
276 |     elif "MixedInstancesPolicy" in asg["AutoScalingGroups"][0]:
277 |         launch_type = "LaunchTemplate"
278 |     else:
279 |         return []
280 |     old_instances = []
281 | 
282 |     if launch_type == "LaunchConfiguration":
283 |         temp = old_launch_config_instances(asg_name, region)
284 |         if temp:
285 |             old_instances = temp
286 |             return old_instances
287 |         return []
288 | 
289 |     # checking against the launch template
290 |     if launch_type == "LaunchTemplate":
291 |         temp = get_old_lt(asg_name, region)
292 |         if temp:
293 |             old_instances = temp
294 |             return old_instances
295 |         return []
296 |     return []
297 | 
298 | 
299 | def add_autoscaling(asg_name: str, img_id: str, region: str) -> Dict[str, Any]:
300 |     """Add a new launch configuration to the ASG."""
301 |     asg_client = boto3.client("autoscaling", region_name=region)
302 |     timestamp = time.time()
303 |     timestamp_string = datetime.datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d %H-%M-%S")
304 |     response = asg_client.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name])
305 | 
306 |     source_instance_id = response.get("AutoScalingGroups")[0]["Instances"][0]["InstanceId"]
307 |     new_launch_config_name = f"LC {img_id} {timestamp_string} {str(uuid.uuid4())}"
308 | 
309 |     try:
310 |         asg_client.create_launch_configuration(
311 |             InstanceId=source_instance_id, LaunchConfigurationName=new_launch_config_name, ImageId=img_id
312 |         )
313 |         response = asg_client.update_auto_scaling_group(
314 |             AutoScalingGroupName=asg_name, LaunchConfigurationName=new_launch_config_name
315 |         )
316 |         echo_success("Updated to the latest launch configuration")
317 |     except Exception as e:
318 |         echo_error(
319 | 
f"Exception encountered while executing add_autoscaling with ASG: {asg_name} - Image ID: {img_id} - Region: {region} - Error: {e}", 320 | ) 321 | raise e 322 | return response 323 | 324 | 325 | def get_outdated_asg(asg_name: str, latest_img: str, region: str) -> bool: 326 | """Get the outdated autoscaling group.""" 327 | asg_client = boto3.client("autoscaling", region_name=region) 328 | ec2_client = boto3.client("ec2", region_name=region) 329 | response = asg_client.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name]) 330 | instance_ids = [instance["InstanceId"] for instance in response["AutoScalingGroups"][0]["Instances"]] 331 | old_ami_inst = [] 332 | # filtering old instance where the logic is used to check whether we should add new launch configuration or not 333 | inst_response = ec2_client.describe_instances(InstanceIds=instance_ids) 334 | for reservation in inst_response["Reservations"]: 335 | for instance in reservation["Instances"]: 336 | if instance["ImageId"] != latest_img: 337 | old_ami_inst.append(instance["InstanceId"]) 338 | instance_ids.sort() 339 | old_ami_inst.sort() 340 | if len(old_ami_inst) != len(instance_ids): 341 | return False 342 | 343 | for count, value in enumerate(old_ami_inst): 344 | if value != instance_ids[count]: 345 | return False 346 | return True 347 | -------------------------------------------------------------------------------- /eksupgrade/src/eks_get_image_type.py: -------------------------------------------------------------------------------- 1 | """Define the image type logic for EKS.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import Optional 6 | 7 | import boto3 8 | 9 | from eksupgrade.utils import echo_error, echo_warning, get_logger 10 | 11 | from .k8s_client import find_node 12 | 13 | logger = get_logger(__name__) 14 | 15 | 16 | def image_type(node_type: str, image_id: str, region: str) -> Optional[str]: 17 | """Return the image location.""" 18 | ec2_client = boto3.client("ec2", region_name=region) 19 | node_type = node_type.lower() 20 | filters = [ 21 | {"Name": "is-public", "Values": ["true"]}, 22 | ] 23 | 24 | if "amazon linux 2" in node_type: 25 | filters.append({"Name": "name", "Values": ["amazon-eks-node*"]}) 26 | elif "bottlerocket" in node_type: 27 | filters.append({"Name": "name", "Values": ["bottlerocket-aws-k8s-*"]}) 28 | elif "windows" in node_type: 29 | filters.append({"Name": "name", "Values": ["Windows_Server-*-English-*-EKS_Optimized*"]}) 30 | else: 31 | echo_warning(f"Node type: {node_type} is unsupported - Image ID: {image_id}") 32 | return None 33 | 34 | # describing image types 35 | images = ec2_client.describe_images(Filters=filters) 36 | images_list = [[item.get("ImageId"), item.get("Name")] for item in images.get("Images", [])] 37 | 38 | logger.debug("Images List: %s", images_list) 39 | 40 | for i in images_list: 41 | if image_id in i[0]: 42 | logger.debug("Found image ID: %s in list - returning image name: %s", image_id, i[1]) 43 | return i[1] 44 | return None 45 | 46 | 47 | def get_ami_name(cluster_name: str, asg_name: str, region: str): 48 | asg_client = boto3.client("autoscaling", region_name=region) 49 | ec2_client = boto3.client("ec2", region_name=region) 50 | response = asg_client.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name]) 51 | instance_ids = [instance["InstanceId"] for instance in response["AutoScalingGroups"][0]["Instances"]] 52 | if not instance_ids: 53 | echo_error(f"No instances found to determine AMI - cluster: {cluster_name} - ASG: {asg_name}") 54 | 
raise Exception("No Instances") 55 | 56 | response = ec2_client.describe_instances(InstanceIds=instance_ids) 57 | ans = [] 58 | for reservation in response["Reservations"]: 59 | for instance in reservation["Instances"]: 60 | image_id = instance["ImageId"] 61 | # getting the instance type as amz2 or windows or ubuntu 62 | node_type = find_node(cluster_name, instance["InstanceId"], "os_type", region) 63 | _image_type = image_type(node_type=node_type, image_id=image_id, region=region) 64 | logger.debug("_image_type: %s", _image_type) 65 | ans.append( 66 | [ 67 | node_type, 68 | _image_type, 69 | ] 70 | ) 71 | # custom logic to check whether the os_type is same if same returning and if not returning the least repeated name 72 | result = False 73 | if ans: 74 | result = all(elem[0] == ans[0][0] for _, elem in enumerate(ans)) 75 | if result: 76 | return ans[0] 77 | dd = {} 78 | ac = {} 79 | for d, ak in ans: 80 | dd[d] = dd.get(d, 0) + 1 81 | ac[d] = ac.get(d, ak) 82 | return min((ac.get(d, ""), d) for d in dd) 83 | -------------------------------------------------------------------------------- /eksupgrade/src/k8s_client.py: -------------------------------------------------------------------------------- 1 | """The EKS Upgrade kubernetes client module. 2 | 3 | Attributes: 4 | queue (queue.Queue): The queue used for executing jobs (status checks, etc). 5 | 6 | """ 7 | 8 | from __future__ import annotations 9 | 10 | import base64 11 | import queue 12 | import re 13 | import threading 14 | import time 15 | from typing import Any, Dict, List, Optional, Union 16 | 17 | try: 18 | from functools import cache 19 | except ImportError: 20 | from functools import lru_cache as cache 21 | 22 | try: 23 | from kubernetes.client.models.v1beta1_eviction import V1beta1Eviction as V1Eviction 24 | except ImportError: 25 | from kubernetes.client.models.v1_eviction import V1Eviction 26 | 27 | import boto3 28 | from botocore.signers import RequestSigner 29 | from kubernetes import client, watch 30 | from kubernetes.client.rest import ApiException 31 | 32 | from eksupgrade.utils import echo_error, echo_info, get_logger 33 | 34 | logger = get_logger(__name__) 35 | 36 | queue = queue.Queue() 37 | 38 | 39 | class StatsWorker(threading.Thread): 40 | def __init__(self, queue, id): 41 | threading.Thread.__init__(self) 42 | self.queue = queue 43 | self.id = id 44 | 45 | def run(self): 46 | while self.queue.not_empty: 47 | cluster_name, namespace, new_pod_name, _, region = self.queue.get() 48 | status = addon_status( 49 | cluster_name=cluster_name, 50 | new_pod_name=new_pod_name, 51 | region=region, 52 | namespace=namespace, 53 | ) 54 | # signals to queue job is done 55 | if not status: 56 | echo_error( 57 | f"Pod not started! 
Cluster: {cluster_name} - Namespace: {namespace} - New Pod: {new_pod_name}" 58 | ) 59 | raise Exception("Pod Not Started", new_pod_name) 60 | 61 | self.queue.task_done() 62 | 63 | 64 | def get_bearer_token(cluster_id: str, region: str) -> str: 65 | """Authenticate the session with sts token.""" 66 | sts_token_expiration_ttl: int = 60 67 | session = boto3.session.Session() 68 | 69 | sts_client = session.client("sts", region_name=region) 70 | service_id = sts_client.meta.service_model.service_id 71 | signer = RequestSigner(service_id, region, "sts", "v4", session.get_credentials(), session.events) 72 | 73 | params = { 74 | "method": "GET", 75 | "url": f"https://sts.{region}.amazonaws.com/?Action=GetCallerIdentity&Version=2011-06-15", 76 | "body": {}, 77 | "headers": {"x-k8s-aws-id": cluster_id}, 78 | "context": {}, 79 | } 80 | 81 | # Getting a presigned Url 82 | signed_url = signer.generate_presigned_url( 83 | params, region_name=region, expires_in=sts_token_expiration_ttl, operation_name="" 84 | ) 85 | base64_url = base64.urlsafe_b64encode(signed_url.encode("utf-8")).decode("utf-8") 86 | 87 | # remove any base64 encoding padding and returning the kubernetes token 88 | return "k8s-aws-v1." + re.sub(r"=*", "", base64_url) 89 | 90 | 91 | def loading_config(cluster_name: str, region: str) -> str: 92 | """loading kubeconfig with sts""" 93 | eks = boto3.client("eks", region_name=region) 94 | resp = eks.describe_cluster(name=cluster_name) 95 | configs = client.Configuration() 96 | configs.host = resp["cluster"]["endpoint"] 97 | configs.verify_ssl = False 98 | configs.debug = False 99 | configs.api_key = {"authorization": "Bearer " + get_bearer_token(cluster_name, region)} 100 | client.Configuration.set_default(configs) 101 | return "Initialized" 102 | 103 | 104 | def unschedule_old_nodes(cluster_name: str, node_name: str, region: str) -> None: 105 | """Unschedule the nodes to avoid new nodes being launched.""" 106 | loading_config(cluster_name, region) 107 | try: 108 | core_v1_api = client.CoreV1Api() 109 | # unscheduling the nodes 110 | body = {"spec": {"unschedulable": True}} 111 | core_v1_api.patch_node(node_name, body) 112 | except Exception as e: 113 | echo_error( 114 | f"Exception encountered while attempting to unschedule old nodes - cluster: {cluster_name} - node: {node_name}", 115 | ) 116 | raise e 117 | return 118 | 119 | 120 | def watcher(cluster_name: str, name: str, region: str) -> bool: 121 | """Watch whether the pod is deleted or not.""" 122 | loading_config(cluster_name, region) 123 | core_v1_api = client.CoreV1Api() 124 | _watcher = watch.Watch() 125 | 126 | try: 127 | for event in _watcher.stream(core_v1_api.list_pod_for_all_namespaces, timeout_seconds=30): 128 | echo_info(f"{event['type']} {event['object'].metadata.name}") 129 | 130 | if event["type"] == "DELETED" and event["object"].metadata.name == name: 131 | _watcher.stop() 132 | return True 133 | return False 134 | except Exception as e: 135 | echo_error( 136 | f"Exception encountered in watcher method against cluster: {cluster_name} name: {name} Error: {e}", 137 | ) 138 | raise e 139 | 140 | 141 | def drain_nodes(cluster_name, node_name, forced, region) -> Optional[str]: 142 | """Pod eviction using the eviction API.""" 143 | loading_config(cluster_name, region) 144 | core_v1_api = client.CoreV1Api() 145 | api_response = core_v1_api.list_pod_for_all_namespaces(watch=False, field_selector=f"spec.nodeName={node_name}") 146 | retry = 0 147 | 148 | if not api_response.items: 149 | return f"Empty Nothing to Drain {node_name}" 
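# Example (editorial sketch; the cluster, node name, and region are hypothetical):
# unschedule_old_nodes, drain_nodes, and delete_node in this module are designed
# to run in sequence during a node rotation, as starter.actual_update does:
#
#     unschedule_old_nodes("my-cluster", "ip-10-0-1-23.ec2.internal", "us-east-1")
#     drain_nodes("my-cluster", "ip-10-0-1-23.ec2.internal", forced=False, region="us-east-1")
#     delete_node("my-cluster", "ip-10-0-1-23.ec2.internal", "us-east-1")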
150 | 
151 |     for i in api_response.items:
152 |         if i.spec.node_name == node_name:
153 |             try:
154 |                 if forced:
155 |                     core_v1_api.delete_namespaced_pod(
156 |                         i.metadata.name, i.metadata.namespace, grace_period_seconds=0, body=client.V1DeleteOptions()
157 |                     )
158 |                 else:
159 |                     eviction_body = V1Eviction(
160 |                         metadata=client.V1ObjectMeta(name=i.metadata.name, namespace=i.metadata.namespace)
161 |                     )
162 |                     core_v1_api.create_namespaced_pod_eviction(
163 |                         name=i.metadata.name, namespace=i.metadata.namespace, body=eviction_body
164 |                     )
165 |                 # the watcher times out after 30 seconds; re-watch up to two more times before giving up
166 |                 while not watcher(cluster_name, i.metadata.name, region) and retry < 2:
167 |                     retry += 1
168 |                 if retry == 2:
169 |                     echo_error(
170 |                         f"Unable to confirm deletion of pod: {i.metadata.name} on node: {node_name} in cluster: {cluster_name}",
171 |                     )
172 |                     raise Exception("Error: not able to delete the pod " + i.metadata.name)
173 |             except Exception as e:
174 |                 echo_error(
175 |                     f"Exception encountered while attempting to drain nodes! Node: {node_name} Cluster: {cluster_name} - Error: {e}",
176 |                 )
177 |                 raise Exception("Unable to Delete the Node")
178 |     # every pod scheduled on the node has been processed
179 |     return None
180 | 
181 | 
182 | def delete_node(cluster_name: str, node_name: str, region: str) -> None:
183 |     """Delete the node from the cluster's node list; this doesn't terminate the underlying instance."""
184 |     try:
185 |         loading_config(cluster_name, region)
186 |         core_v1_api = client.CoreV1Api()
187 |         core_v1_api.delete_node(node_name)
188 |         return
189 |     except ApiException as e:
190 |         echo_error(
191 |             f"Exception encountered attempting to delete a node! Cluster: {cluster_name} - Node: {node_name} - Error: {e}",
192 |         )
193 |         raise e
194 | 
195 | 
196 | def find_node(cluster_name: str, instance_id: str, operation: str, region: str) -> str:
197 |     """Find the node by instance id."""
198 |     loading_config(cluster_name, region)
199 |     core_v1_api = client.CoreV1Api()
200 |     nodes: List[List[str]] = []
201 |     response = core_v1_api.list_node()
202 | 
203 |     if not response.items:
204 |         return "NAN"
205 | 
206 |     for node in response.items:
207 |         nodes.append(
208 |             [
209 |                 node.spec.provider_id.split("/")[-1],
210 |                 node.metadata.name,
211 |                 node.status.node_info.kube_proxy_version.split("-")[0],
212 |                 node.status.node_info.kubelet_version.split("-")[0],
213 |                 node.status.node_info.os_image,
214 |             ]
215 |         )
216 | 
217 |     if operation == "find":
218 |         for i in nodes:
219 |             if i[0] == instance_id:
220 |                 return i[1]
221 |         return "NAN"
222 | 
223 |     if operation == "os_type":
224 |         for i in nodes:
225 |             if i[0] == instance_id:
226 |                 echo_info(i[0])
227 |                 return i[-1]
228 |         return "NAN"
229 |     return "NAN"
230 | 
231 | 
232 | def addon_status(cluster_name: str, new_pod_name: str, region: str, namespace: str) -> bool:
233 |     """Get the status of an addon pod."""
234 |     loading_config(cluster_name, region)
235 |     core_v1_api = client.CoreV1Api()
236 |     tts = 100
237 |     now = time.time()
238 | 
239 |     while time.time() < now + tts:
240 |         response = core_v1_api.read_namespaced_pod_status(name=new_pod_name, namespace=namespace)
241 |         if response.status.container_statuses[0].ready and response.status.container_statuses[0].started:
242 |             return True
243 |         time.sleep(5)  # avoid hammering the API server while polling
244 |     return False
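# Example (editorial sketch; the cluster, pod, and namespace names are hypothetical):
# addon_status is the readiness check that StatsWorker polls for each new addon pod:
#
#     if addon_status("my-cluster", "coredns-6f4b5c9d8-abcde", "us-east-1", "kube-system"):
#         echo_info("The new addon pod is ready")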
245 | 
246 | def sort_pods(
247 |     cluster_name: str,
248 |     region: str,
249 |     original_name: str,
250 |     pod_name: str,
251 |     old_pods_names: List[str],
252 |     namespace: str,
253 |     count: int = 90,
254 | ) -> str:
255 |     """Find the newest pod matching the given label, retrying until it is a genuinely new pod."""
256 |     if not count:
257 |         echo_error(
258 |             f"Pod has no associated new pod! Cluster: {cluster_name} - Namespace: {namespace} - Pod Name: {pod_name}",
259 |         )
260 |         raise Exception("Pod has no associated new launch")
261 | 
262 |     pods_nodes = []
263 |     loading_config(cluster_name, region)
264 |     core_v1_api = client.CoreV1Api()
265 |     try:
266 |         if pod_name == "cluster-autoscaler":
267 |             pod_list = core_v1_api.list_namespaced_pod(namespace=namespace, label_selector=f"app={pod_name}")
268 |         else:
269 |             pod_list = core_v1_api.list_namespaced_pod(namespace=namespace, label_selector=f"k8s-app={pod_name}")
270 |     except Exception as e:
271 |         echo_error(
272 |             f"Exception encountered while attempting to get the pod list and sort_pods - cluster: {cluster_name}, error: {e}",
273 |         )
274 |         return "Not Found"
275 | 
276 |     echo_info(f"Total Pods With {pod_name} = {len(pod_list.items)}")
277 |     for i in pod_list.items:
278 |         pods_nodes.append([i.metadata.name, i.metadata.creation_timestamp])
279 | 
280 |     if pods_nodes:
281 |         new_pod_name = sorted(pods_nodes, key=lambda x: x[1])[-1][0]
282 |     else:
283 |         count -= 1
284 |         # recurse (and propagate the result) until a pod shows up or count reaches zero
285 |         return sort_pods(cluster_name, region, original_name, pod_name, old_pods_names, namespace, count)
286 | 
287 |     # retry until the newest pod is not one of the old pods
288 |     if original_name != new_pod_name and new_pod_name in old_pods_names:
289 |         count -= 1
290 |         return sort_pods(cluster_name, region, original_name, pod_name, old_pods_names, namespace, count)
291 |     return new_pod_name
292 | 
293 | 
294 | @cache
295 | def get_addon_details(cluster_name: str, addon: str, region: str) -> Dict[str, Any]:
296 |     """Get addon details which includes its current version."""
297 |     eks_client = boto3.client("eks", region_name=region)
298 |     addon_details: Dict[str, Any] = eks_client.describe_addon(clusterName=cluster_name, addonName=addon).get(
299 |         "addon", {}
300 |     )
301 |     return addon_details
302 | 
303 | 
304 | @cache
305 | def get_addon_update_kwargs(cluster_name: str, addon: str, region: str) -> Dict[str, Any]:
306 |     """Get kwargs for subsequent update to addon."""
307 |     addon_details: Dict[str, Any] = get_addon_details(cluster_name, addon, region)
308 |     kwargs: Dict[str, Any] = {}
309 |     iam_role_arn: Optional[str] = addon_details.get("serviceAccountRoleArn")
310 |     config_values: Optional[str] = addon_details.get("configurationValues")
311 | 
312 |     if iam_role_arn:
313 |         kwargs["serviceAccountRoleArn"] = iam_role_arn
314 |     if config_values:
315 |         kwargs["configurationValues"] = config_values
316 |     return kwargs
317 | 
318 | 
319 | @cache
320 | def get_addon_versions(version: str, region: str) -> List[Dict[str, Any]]:
321 |     """Get addon versions for the associated Kubernetes `version`."""
322 |     eks_client = boto3.client("eks", region_name=region)
323 |     addon_versions: List[Dict[str, Any]] = eks_client.describe_addon_versions(kubernetesVersion=version).get(
324 |         "addons", []
325 |     )
326 |     return addon_versions
327 | 
328 | 
329 | @cache
330 | def get_versions_by_addon(addon: str, version: str, region: str) -> Dict[str, Any]:
331 |     """Get target addon versions."""
332 |     addon_versions: List[Dict[str, Any]] = get_addon_versions(version, region)
333 |     return next(item for item in addon_versions if item["addonName"] == addon)
334 | 
335 | 
336 | @cache
337 | def get_default_version(addon: str, version: str, region: str) -> str:
338 |     """Get the EKS default version of the `addon`."""
339 |     addon_dict: Dict[str, Any] = get_versions_by_addon(addon, version, region)
340 |     return next(
341 |         item["addonVersion"]
342 | for item in addon_dict["addonVersions"] 343 | if item["compatibilities"][0]["defaultVersion"] is True 344 | ) 345 | 346 | 347 | def is_cluster_auto_scaler_present(cluster_name: str, region: str) -> List[Union[bool, int]]: 348 | """Determine whether or not cluster autoscaler is present.""" 349 | loading_config(cluster_name, region) 350 | apps_v1_api = client.AppsV1Api() 351 | res = apps_v1_api.list_deployment_for_all_namespaces() 352 | for res_i in res.items: 353 | if res_i.metadata.name == "cluster-autoscaler": 354 | return [True, res_i.spec.replicas] 355 | return [False, 0] 356 | 357 | 358 | def cluster_auto_enable_disable(cluster_name: str, operation: str, mx_val: int, region: str) -> None: 359 | """Enable or disable deployment in cluster.""" 360 | loading_config(cluster_name, region) 361 | api = client.AppsV1Api() 362 | if operation == "pause": 363 | body = {"spec": {"replicas": 0}} 364 | elif operation == "start": 365 | body = {"spec": {"replicas": mx_val}} 366 | else: 367 | echo_error("Operation must be either pause or start to auto_enable_disable!") 368 | raise NotImplementedError("Operation must be either pause or start!") 369 | 370 | try: 371 | api.patch_namespaced_deployment(name="cluster-autoscaler", namespace="kube-system", body=body) 372 | except Exception as e: 373 | echo_error(f"Exception encountered while running auto enable disable - Error: {e}") 374 | raise e 375 | -------------------------------------------------------------------------------- /eksupgrade/src/latest_ami.py: -------------------------------------------------------------------------------- 1 | """Define the AMI specific logic.""" 2 | 3 | from __future__ import annotations 4 | 5 | import boto3 6 | 7 | from eksupgrade.utils import echo_error, get_logger 8 | 9 | logger = get_logger(__name__) 10 | 11 | 12 | def get_latest_ami(cluster_version: str, instance_type: str, image_to_search: str, region: str) -> str: 13 | """Get the latest AMI.""" 14 | ssm = boto3.client("ssm", region_name=region) 15 | client = boto3.client("ec2", region_name=region) 16 | 17 | if "Amazon Linux 2" in instance_type: 18 | names = [f"/aws/service/eks/optimized-ami/{cluster_version}/amazon-linux-2/recommended/image_id"] 19 | elif "Windows" in instance_type: 20 | names = [f"/aws/service/ami-windows-latest/{image_to_search}-{cluster_version}/image_id"] 21 | elif "bottlerocket" in instance_type.lower(): 22 | names = [f"/aws/service/bottlerocket/aws-k8s-{cluster_version}/x86_64/latest/image_id"] 23 | elif "Ubuntu" in instance_type: 24 | filters = [ 25 | {"Name": "owner-id", "Values": ["099720109477"]}, 26 | {"Name": "name", "Values": [f"ubuntu-eks/k8s_{cluster_version}*"]}, 27 | {"Name": "is-public", "Values": ["true"]}, 28 | ] 29 | response = client.describe_images(Filters=filters) 30 | sorted_images = sorted(response["Images"], key=lambda x: x["CreationDate"], reverse=True) 31 | if sorted_images: 32 | return sorted_images[0].get("ImageId") 33 | raise Exception("Couldn't Find Latest Image Retry The Script") 34 | else: 35 | return "NAN" 36 | response = ssm.get_parameters(Names=names) 37 | if response.get("Parameters"): 38 | return response.get("Parameters")[0]["Value"] 39 | echo_error("Couldn't find the latest image - please retry the script!") 40 | raise Exception("Couldn't Find Latest Image Retry The Script") 41 | -------------------------------------------------------------------------------- /eksupgrade/src/self_managed.py: -------------------------------------------------------------------------------- 1 | """Define the self-managed 
node logic.""" 2 | 3 | from __future__ import annotations 4 | 5 | import time 6 | from typing import Any, Dict, List, Optional 7 | 8 | import boto3 9 | 10 | from eksupgrade.utils import echo_error, echo_info, get_logger 11 | 12 | from .latest_ami import get_latest_ami 13 | 14 | logger = get_logger(__name__) 15 | 16 | 17 | def status_of_cluster(cluster_name: str, region: str) -> List[str]: 18 | """Get the self-managed Cluster Status.""" 19 | client = boto3.client("eks", region_name=region) 20 | response = client.describe_cluster(name=cluster_name) 21 | status = response["cluster"]["status"] 22 | version = response["cluster"]["version"] 23 | echo_info(f"The Cluster Status = {status} and Version = {version}") 24 | return [status, version] 25 | 26 | 27 | def describe_node_groups(cluster_name: str, nodegroup: str, region: str) -> List[str]: 28 | """Get the description of the Node Group.""" 29 | client = boto3.client("eks", region_name=region) 30 | response = client.describe_nodegroup(clusterName=cluster_name, nodegroupName=nodegroup) 31 | status = response.get("nodegroup")["status"] 32 | version = response.get("nodegroup")["version"] 33 | echo_info(f"The NodeGroup = {nodegroup} Status = {status} and Version = {version}") 34 | return [status, version] 35 | 36 | 37 | def lt_id_func(cluster_name: str, nodegroup: str, version: str, region: str): 38 | """Get the launch template ID, AMI, and version information.""" 39 | client = boto3.client("eks", region_name=region) 40 | ec2 = boto3.client("ec2", region_name=region) 41 | res = client.describe_nodegroup(clusterName=cluster_name, nodegroupName=nodegroup) 42 | latest_ami: str = "" 43 | launch_template_id: str = "" 44 | version_no: str = "" 45 | ami_type: str = res["nodegroup"]["amiType"] 46 | launch_template: Optional[Dict[str, Any]] = res["nodegroup"].get("launchTemplate") 47 | 48 | if launch_template: 49 | launch_template_id = launch_template["id"] 50 | version_no = launch_template["version"] 51 | 52 | if ami_type == "CUSTOM": 53 | os_lt = ec2.describe_launch_template_versions(LaunchTemplateId=launch_template_id, Versions=[version_no]) 54 | current_ami = os_lt["LaunchTemplateVersions"][0]["LaunchTemplateData"]["ImageId"] 55 | os_type = ec2.describe_images(ImageIds=[current_ami])["Images"][0]["ImageLocation"] 56 | 57 | if isinstance(os_type, str) and "Windows_Server" in os_type: 58 | os_type = os_type[:46] 59 | 60 | latest_ami = get_latest_ami( 61 | cluster_version=version, instance_type=os_type, image_to_search=os_type, region=region 62 | ) 63 | 64 | return ami_type, launch_template_id, version_no, latest_ami 65 | 66 | 67 | def update_current_launch_template_ami(lt_id: str, latest_ami: str, region: str) -> None: 68 | """Update the current launch template's AMI.""" 69 | ec2 = boto3.client("ec2", region_name=region) 70 | ec2.create_launch_template_version( 71 | LaunchTemplateId=lt_id, 72 | SourceVersion="$Latest", 73 | VersionDescription="Latest-AMI", 74 | LaunchTemplateData={"ImageId": latest_ami}, 75 | ) 76 | echo_info(f"New launch template created with AMI {latest_ami}") 77 | 78 | 79 | def update_nodegroup(cluster_name: str, nodegroup: str, version: str, region: str) -> bool: 80 | """Update the Node group.""" 81 | client = boto3.client("eks", region_name=region) 82 | start = time.time() 83 | 84 | ami_type, lt_id, _, latest_ami = lt_id_func(cluster_name, nodegroup, version, region) 85 | if ami_type == "CUSTOM": 86 | update_current_launch_template_ami(lt_id, latest_ami, region) 87 | 88 | while True: 89 | try: 90 | if ( 91 | 
status_of_cluster(cluster_name, region)[0] == "ACTIVE" 92 | and describe_node_groups(cluster_name, nodegroup, region)[0] == "ACTIVE" 93 | and describe_node_groups(cluster_name, nodegroup, region)[1] != version 94 | ): 95 | if ami_type == "CUSTOM": 96 | client.update_nodegroup_version( 97 | clusterName=cluster_name, 98 | nodegroupName=nodegroup, 99 | launchTemplate={"version": "$Latest", "id": lt_id}, 100 | ) 101 | else: 102 | client.update_nodegroup_version( 103 | clusterName=cluster_name, 104 | nodegroupName=nodegroup, 105 | version=version, 106 | ) 107 | echo_info(f"Updating Node Group {nodegroup}") 108 | time.sleep(20) 109 | if describe_node_groups(cluster_name, nodegroup, region)[0] == "UPDATING": 110 | end = time.time() 111 | hours, rem = divmod(end - start, 3600) 112 | minutes, seconds = divmod(rem, 60) 113 | echo_info(f"The {nodegroup} NodeGroup is Still Updating {hours}:{minutes}:{seconds}") 114 | time.sleep(20) 115 | if describe_node_groups(cluster_name, nodegroup, region)[0] == "DEGRADED": 116 | raise Exception("NodeGroup has not started due to unavailability ") 117 | if ( 118 | describe_node_groups(cluster_name, nodegroup, region)[0] == "ACTIVE" 119 | and describe_node_groups(cluster_name, nodegroup, region)[1] == version 120 | ): 121 | end = time.time() 122 | hours, rem = divmod(end - start, 3600) 123 | minutes, seconds = divmod(rem, 60) 124 | echo_info( 125 | f"The Time Taken For the NodeGroup Upgrade {nodegroup} {hours}:{minutes}:{seconds}", 126 | ) 127 | return True 128 | except Exception as e: 129 | echo_error( 130 | f"Exception encountered while attempting to update nodegroup: {nodegroup} in cluster: {cluster_name} - {region}! Error: {e}", 131 | ) 132 | raise e 133 | -------------------------------------------------------------------------------- /eksupgrade/starter.py: -------------------------------------------------------------------------------- 1 | """Define the starter module.""" 2 | 3 | from __future__ import annotations 4 | 5 | import datetime 6 | import queue 7 | import threading 8 | import time 9 | 10 | from eksupgrade.utils import echo_error, echo_info, echo_success, get_logger 11 | 12 | from .src.boto_aws import ( 13 | add_autoscaling, 14 | add_node, 15 | get_latest_instance, 16 | get_num_of_instances, 17 | get_outdated_asg, 18 | outdated_lt, 19 | wait_for_ready, 20 | worker_terminate, 21 | ) 22 | from .src.eks_get_image_type import get_ami_name 23 | from .src.k8s_client import delete_node, drain_nodes, find_node, unschedule_old_nodes 24 | from .src.latest_ami import get_latest_ami 25 | from .src.self_managed import update_nodegroup 26 | 27 | logger = get_logger(__name__) 28 | 29 | queue = queue.Queue() 30 | 31 | 32 | class StatsWorker(threading.Thread): 33 | """Define the Stats worker for process and queue handling.""" 34 | 35 | def __init__(self, queue, id) -> None: 36 | """Initialize the stats worker.""" 37 | threading.Thread.__init__(self) 38 | self.queue = queue 39 | self.id = id 40 | 41 | def run(self) -> None: 42 | """Run the thread routine.""" 43 | while self.queue.not_empty: 44 | cluster_name, ng_name, to_update, region, max_retry, forced, typse = self.queue.get() 45 | if typse == "managed": 46 | echo_info(f"Updating node group: {ng_name} to version: {to_update}") 47 | update_nodegroup(cluster_name, ng_name, to_update, region) 48 | echo_success(f"Updated node group: {ng_name} to version: {to_update}") 49 | self.queue.task_done() 50 | elif typse == "selfmanaged": 51 | echo_info(f"Updating node group: {ng_name} to version: {to_update}") 52 | 
actual_update(
53 |                     cluster_name=cluster_name,
54 |                     asg_iter=ng_name,
55 |                     to_update=to_update,
56 |                     region=region,
57 |                     max_retry=max_retry,
58 |                     forced=forced,
59 |                 )
60 |                 echo_success(f"Updated node group: {ng_name} to version: {to_update}")
61 |                 self.queue.task_done()
62 | 
63 | 
64 | def actual_update(cluster_name, asg_iter, to_update, region, max_retry, forced):
65 |     """Perform the rolling update of a self-managed node group's instances."""
66 |     instance_type, image_to_search = get_ami_name(cluster_name, asg_iter, region)
67 |     echo_info(f"The image type detected = {instance_type}")
68 | 
69 |     if instance_type == "NAN":
70 |         return False
71 |     if isinstance(image_to_search, str) and "Windows_Server" in image_to_search:
72 |         image_to_search = image_to_search[:46]
73 |     latest_ami = get_latest_ami(to_update, instance_type, image_to_search, region)
74 |     echo_info(f"The latest AMI recommended = {latest_ami}")
75 | 
76 |     if get_outdated_asg(asg_iter, latest_ami, region):
77 |         add_autoscaling(asg_iter, latest_ami, region)
78 |         echo_info(f"New launch configuration added to = {asg_iter} with EKS AMI = {latest_ami}")
79 | 
80 |     outdated_instances = outdated_lt(asg_iter, region)
81 |     if not outdated_instances:
82 |         return True
83 | 
84 |     try:
85 |         terminated_ids = []
86 |         echo_info(f"The outdated instances found are = {outdated_instances}")
87 |         for instance in outdated_instances:
88 |             before_count = get_num_of_instances(asg_iter, terminated_ids, region)
89 |             echo_info(f"Total instance count = {before_count}")
90 |             add_time = datetime.datetime.now(datetime.timezone.utc)
91 | 
92 |             if abs(before_count - len(outdated_instances)) != len(outdated_instances):
93 |                 add_node(asg_iter, region)
94 |                 time.sleep(45)
95 |             latest_instance = get_latest_instance(asg_name=asg_iter, add_time=add_time, region=region)
96 |             echo_info(f"The instance created = {latest_instance}; waiting for it to be ready")
97 |             time.sleep(30)
98 |             wait_for_ready(latest_instance, region)
99 | 
100 |             old_pod_id = find_node(cluster_name=cluster_name, instance_id=instance, operation="find", region=region)
101 |             if old_pod_id != "NAN":
102 |                 retry = 0
103 |                 flag = 0
104 |                 while retry <= max_retry:
105 |                     # stop polling once the instance is registered as a node in the cluster
106 |                     if find_node(cluster_name=cluster_name, instance_id=instance, operation="find", region=region) != "NAN":
107 |                         flag = 1
108 |                         break
109 |                     retry += 1
110 |                     time.sleep(10)
111 | 
112 |                 if flag == 0:
113 |                     worker_terminate(instance, region=region)
114 |                     echo_error("The instance does not correspond to a node in this node group (404)")
115 |                     raise Exception("The instance does not correspond to a node in this node group (404)")
116 | 
117 |                 echo_info(f"Unscheduling the worker node = {old_pod_id}")
118 | 
119 |                 unschedule_old_nodes(cluster_name=cluster_name, node_name=old_pod_id, region=region)
120 |                 echo_info(f"The node: {old_pod_id} has been unscheduled! Worker node draining...")
121 |                 drain_nodes(cluster_name=cluster_name, node_name=old_pod_id, forced=forced, region=region)
122 |                 echo_info(f"The worker node has been drained! Deleting worker node = {old_pod_id}")
123 |                 delete_node(cluster_name=cluster_name, node_name=old_pod_id, region=region)
124 |                 echo_info(f"The worker node: {old_pod_id} has been deleted. Terminating worker node: {instance}...")
125 |                 worker_terminate(instance, region=region)
126 |                 terminated_ids.append(instance)
127 |                 echo_success(f"The worker node instance: {instance} has been terminated!")
128 |         return True
129 |     except Exception as e:
130 |         echo_error(f"Error encountered during actual update! Exception: {e}")
131 |         raise e
132 | 
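# Example (editorial sketch; the cluster, ASG, version, and region are hypothetical):
# a single self-managed node group rotation can be driven directly via actual_update:
#
#     actual_update("my-cluster", "my-asg", "1.24", "us-east-1", max_retry=2, forced=False)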
Exception: {e}") 131 | raise e 132 | -------------------------------------------------------------------------------- /eksupgrade/utils.py: -------------------------------------------------------------------------------- 1 | """Define module level utilities to be used across the EKS Upgrade package.""" 2 | 3 | import json 4 | import logging 5 | import pkgutil 6 | import sys 7 | 8 | import typer 9 | 10 | 11 | def get_package_asset(filename: str, base_path: str = "src/S3Files/") -> str: 12 | """Get the specified package asset data.""" 13 | return pkgutil.get_data(__package__, f"{base_path}/{filename}").decode("utf-8") 14 | 15 | 16 | def get_package_dict(filename: str, base_path: str = "src/S3Files/"): 17 | """Get the specified package asset data dictionary.""" 18 | _data = get_package_asset(filename, base_path) 19 | return json.loads(_data) 20 | 21 | 22 | def get_logger(logger_name): 23 | """Get a logger object with handler set to StreamHandler.""" 24 | logger = logging.getLogger(logger_name) 25 | console_handler = logging.StreamHandler(sys.stdout) 26 | log_formatter = logging.Formatter( 27 | "[%(levelname)s] : %(asctime)s : %(name)s.%(lineno)d : %(message)s", "%Y-%m-%d %H:%M:%S" 28 | ) 29 | console_handler.setFormatter(log_formatter) 30 | logger.addHandler(console_handler) 31 | logger.propagate = False 32 | return logger 33 | 34 | 35 | def confirm(message: str, abort: bool = True) -> bool: 36 | """Prompt the user with a confirmation dialog with the provided message. 37 | 38 | Raises: 39 | typer.Abort: The exception is raised when abort=True and confirmation fails. 40 | 41 | Returns: 42 | bool: Whether or not the prompt was confirmed. 43 | 44 | """ 45 | text = typer.style(message, fg=typer.colors.BRIGHT_BLUE, bold=True, bg=typer.colors.WHITE) 46 | return typer.confirm(text, abort=abort) 47 | 48 | 49 | def echo_deprecation(message: str) -> None: 50 | """Echo a message as a deprecation notice.""" 51 | typer.secho(message, fg=typer.colors.WHITE, bg=typer.colors.YELLOW, bold=True, blink=True) 52 | 53 | 54 | def echo_error(message: str) -> None: 55 | """Echo a message as an error.""" 56 | typer.secho(message, fg=typer.colors.WHITE, bg=typer.colors.RED, bold=True, blink=True, err=True) 57 | 58 | 59 | def echo_success(message: str) -> None: 60 | """Echo a message as an error.""" 61 | typer.secho(message, fg=typer.colors.WHITE, bg=typer.colors.GREEN, bold=True, blink=True) 62 | 63 | 64 | def echo_info(message: str) -> None: 65 | """Echo a message as an error.""" 66 | typer.secho(message, fg=typer.colors.BRIGHT_BLUE) 67 | 68 | 69 | def echo_warning(message: str) -> None: 70 | """Echo a message as an error.""" 71 | typer.secho(message, fg=typer.colors.BRIGHT_YELLOW, bold=True, blink=True) 72 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "eksupgrade" 3 | version = "0.9.0" 4 | description = "The Amazon EKS cluster upgrade utility" 5 | authors = ["EKS Upgrade Maintainers "] 6 | readme = "README.md" 7 | packages = [{include = "eksupgrade"}] 8 | license = "MIT-0" 9 | keywords = ["amazon", "aws", "aws-samples", "eks", "kubernetes", "upgrade", "eksupgrade"] 10 | classifiers = [ 11 | "Programming Language :: Python", 12 | "Programming Language :: Python :: 3", 13 | "Programming Language :: Python :: 3.8", 14 | "Programming Language :: Python :: 3.9", 15 | "Programming Language :: Python :: 3.10", 16 | "Programming Language :: Python :: 3.11", 17 | 
"License :: OSI Approved :: MIT No Attribution License (MIT-0)", 18 | ] 19 | include = [ 20 | "README.md", 21 | "LICENSE", 22 | ] 23 | 24 | [tool.poetry.scripts] 25 | eksupgrade = "eksupgrade.cli:app" 26 | 27 | [tool.bandit] 28 | exclude_dirs = ["tests"] 29 | 30 | # Styling and linting Configurations 31 | [tool.isort] 32 | profile = "black" 33 | line_length = 120 34 | 35 | [tool.black] 36 | line-length = 120 37 | target-version = ["py310"] 38 | 39 | [tool.ruff] 40 | line-length = 120 41 | target-version = "py310" 42 | 43 | [tool.poe.tasks] 44 | isort = "isort --profile=black ." 45 | black = "black ." 46 | check-black = {cmd = "black . --check --diff", help = "Check code for black styling"} 47 | check-isort = {cmd = "isort --check --profile=black .", help = "Check code for import styling"} 48 | check-docstrings = "pydocstyle -e ." 49 | check-ruff = "ruff check eksupgrade" 50 | check = ["check-isort", "check-black"] 51 | lint = ["check-docstrings", "check-ruff"] 52 | fix = ["isort", "black"] 53 | test = "pytest --cov=eksupgrade --cov-report=xml --cov-report=term" 54 | ruff = "ruff check --fix eksupgrade" 55 | safety = "safety check" 56 | bandit = "bandit -r eksupgrade" 57 | security = ["safety", "bandit"] 58 | # requires poethepoet outside of poetry. 59 | install = "poetry install" 60 | build = "poetry build" 61 | 62 | [tool.poetry.dependencies] 63 | python = "^3.8" 64 | boto3 = "^1.32" 65 | kubernetes = ">=23.0.0 <=29.0.0" 66 | packaging = ">=24.0,<25.0" 67 | typer = {extras = ["all"], version = "^0.9"} 68 | 69 | 70 | [tool.poetry.group.test.dependencies] 71 | pytest = "^7.4" 72 | pytest-cov = "^4.1" 73 | coverage = "^7.4" 74 | moto = {extras = ["autoscaling", "ec2", "eks", "ssm", "sts"], version = "^4.2"} 75 | 76 | 77 | [tool.poetry.group.dev.dependencies] 78 | isort = {extras = ["toml"], version = "^5.13"} 79 | black = ">=23.12,<25.0" 80 | pydocstyle = "^6.3" 81 | mypy = "^1.8" 82 | debugpy = "^1.8" 83 | ruff = "^0.1" 84 | 85 | 86 | [tool.poetry.group.security.dependencies] 87 | safety = "^3.0" 88 | bandit = {extras = ["toml"], version = "^1.7"} 89 | 90 | 91 | [tool.poetry.group.types.dependencies] 92 | types-pyyaml = "^6.0" 93 | boto3-stubs = {extras = ["autoscaling", "ec2", "eks", "ssm", "sts"], version = "^1.32"} 94 | 95 | [build-system] 96 | requires = ["poetry-core"] 97 | build-backend = "poetry.core.masonry.api" 98 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/eks-cluster-upgrade/d960eab8299b4b9c1e79e024907d4dfe35f83ad9/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """Define the pytest configuration for fixture reuse.""" 2 | 3 | import os 4 | from typing import Any, Generator 5 | 6 | import boto3 7 | import pytest 8 | import typer 9 | from moto import mock_ec2, mock_eks, mock_sts 10 | 11 | 12 | @pytest.fixture 13 | def aws_creds() -> None: 14 | """Mock the AWS credentials to use for testing.""" 15 | os.environ["AWS_ACCESS_KEY_ID"] = "testaccesskeyid" # nosec 16 | os.environ["AWS_SECRET_ACCESS_KEY"] = "testsecretaccesskey" # nosec 17 | os.environ["AWS_SECURITY_TOKEN"] = "testsecuritytoken" # nosec 18 | os.environ["AWS_SESSION_TOKEN"] = "testsessiontoken" # nosec 19 | 20 | 21 | @pytest.fixture 22 | def region() -> str: 23 | """Define the region 
fixture for reuse.""" 24 | return "us-east-1" 25 | 26 | 27 | @pytest.fixture 28 | def sts_client(aws_creds, region) -> Generator[Any, None, None]: 29 | """Mock the STS boto client.""" 30 | with mock_sts(): 31 | client = boto3.client("sts", region_name=region) 32 | yield client 33 | 34 | 35 | @pytest.fixture 36 | def ec2_client(aws_creds, region) -> Generator[Any, None, None]: 37 | """Mock the EKS boto client.""" 38 | with mock_ec2(): 39 | client = boto3.client("ec2", region_name=region) 40 | yield client 41 | 42 | 43 | @pytest.fixture 44 | def eks_client(aws_creds, region) -> Generator[Any, None, None]: 45 | """Mock the EKS boto client.""" 46 | with mock_eks(): 47 | client = boto3.client("eks", region_name=region) 48 | yield client 49 | 50 | 51 | @pytest.fixture 52 | def cluster_name() -> str: 53 | """Define the EKS cluster name to be used across test mocks.""" 54 | return "eks-test" 55 | 56 | 57 | @pytest.fixture 58 | def eks_cluster(eks_client, cluster_name): 59 | """Define the EKS cluster to be reused for mocked calls.""" 60 | eks_client.create_cluster( 61 | name=cluster_name, 62 | version="1.23", 63 | roleArn=f"arn:aws:iam::123456789012:role/{cluster_name}", 64 | resourcesVpcConfig={}, 65 | ) 66 | yield 67 | 68 | 69 | @pytest.fixture 70 | def app(): 71 | """Define the typer cli fixture.""" 72 | return typer.Typer() 73 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | """Test the functionality of the CLI module.""" 2 | 3 | from typer.testing import CliRunner 4 | 5 | from eksupgrade.cli import app 6 | 7 | runner = CliRunner() 8 | 9 | 10 | def test_entry_version_arg() -> None: 11 | """Test the entry method with version argument.""" 12 | result = runner.invoke(app, ["--version"]) 13 | assert result.exit_code == 0 14 | assert "eksupgrade version" in result.stdout 15 | 16 | 17 | def test_entry_no_arg() -> None: 18 | """Test the entry method with no arguments.""" 19 | result = runner.invoke(app, []) 20 | assert result.exit_code == 2 21 | assert "OPTIONS" in result.stdout 22 | -------------------------------------------------------------------------------- /tests/test_eks_get_image_type.py: -------------------------------------------------------------------------------- 1 | """Test EKS Upgrade get image type specific logic.""" 2 | 3 | from typing import Optional 4 | 5 | import pytest 6 | 7 | from eksupgrade.src.eks_get_image_type import image_type 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "node_type,image_id", 12 | [ 13 | ("windows server 2019 datacenter ", "ami-ekswin"), 14 | ("windows server 2022", "ami-ekswin"), 15 | ("amazon linux 2", "ami-ekslinux"), 16 | ], 17 | ) 18 | def test_image_type(ec2_client, region, node_type, image_id) -> None: 19 | """Test the image_type method.""" 20 | ami_id: Optional[str] = image_type(node_type=node_type, image_id=image_id, region=region) 21 | assert ami_id 22 | -------------------------------------------------------------------------------- /tests/test_k8s_client.py: -------------------------------------------------------------------------------- 1 | """Test EKS Upgrade k8s client specific logic.""" 2 | 3 | import pytest 4 | 5 | from eksupgrade.src.k8s_client import get_bearer_token, loading_config 6 | 7 | 8 | def test_get_bearer_token(sts_client, eks_cluster, cluster_name, region) -> None: 9 | """Test the get_bearer_token method.""" 10 | token = get_bearer_token(cluster_id=cluster_name, region=region) 11 | assert 
token.startswith("k8s-aws-v1.") 12 | 13 | 14 | def test_loading_config(eks_client, eks_cluster, cluster_name, region) -> None: 15 | """Test the loading_config method.""" 16 | result = loading_config(cluster_name, region=region) 17 | assert result == "Initialized" 18 | -------------------------------------------------------------------------------- /tests/test_models_base.py: -------------------------------------------------------------------------------- 1 | """Test the base model logic.""" 2 | 3 | from eksupgrade.models.base import AwsRegionResource, AwsResource, BaseResource 4 | 5 | 6 | def test_base_resource() -> None: 7 | """Test the base resource.""" 8 | base_resource = BaseResource() 9 | base_dict = base_resource.to_dict() 10 | assert not base_dict 11 | assert isinstance(base_dict, dict) 12 | assert len(base_dict.keys()) == 0 13 | 14 | 15 | def test_aws_resource_no_optional() -> None: 16 | """Test the AWS resource without optional arguments.""" 17 | aws_resource = AwsResource(arn="abc") 18 | aws_dict = aws_resource.to_dict() 19 | assert isinstance(aws_dict, dict) 20 | assert aws_dict["arn"] == "abc" 21 | assert not aws_dict["resource_id"] 22 | assert not aws_dict["tags"] 23 | assert len(aws_dict.keys()) == 4 24 | 25 | 26 | def test_aws_resource_optional() -> None: 27 | """Test the AWS resource with optional arguments.""" 28 | aws_resource = AwsResource(arn="abc", resource_id="123", tags={"Name": "123"}) 29 | aws_dict = aws_resource.to_dict() 30 | assert isinstance(aws_dict, dict) 31 | assert aws_dict["arn"] == "abc" 32 | assert aws_dict["resource_id"] == "123" 33 | assert aws_dict["tags"]["Name"] == "123" 34 | assert len(aws_dict.keys()) == 4 35 | 36 | 37 | def test_aws_region_resource_no_optional() -> None: 38 | """Test the AWS region resource without optional arguments.""" 39 | aws_region_resource = AwsRegionResource(arn="abc") 40 | aws_dict = aws_region_resource.to_dict() 41 | assert isinstance(aws_dict, dict) 42 | assert aws_dict["arn"] == "abc" 43 | assert not aws_dict["resource_id"] 44 | assert not aws_dict["tags"] 45 | assert not aws_dict["region"] 46 | assert len(aws_dict.keys()) == 5 47 | 48 | 49 | def test_aws_region_resource_optional() -> None: 50 | """Test the AWS region resource with optional arguments.""" 51 | aws_region_resource = AwsRegionResource(arn="abc", resource_id="123", tags={"Name": "123"}, region="us-east-1") 52 | aws_dict = aws_region_resource.to_dict() 53 | assert isinstance(aws_dict, dict) 54 | assert aws_dict["arn"] == "abc" 55 | assert aws_dict["resource_id"] == "123" 56 | assert aws_dict["tags"]["Name"] == "123" 57 | assert aws_dict["region"] == "us-east-1" 58 | assert len(aws_dict.keys()) == 5 59 | 60 | 61 | def test_sts_client_region(sts_client) -> None: 62 | """Test the STS client on AwsResource.""" 63 | aws_resource = AwsRegionResource(arn="abc", resource_id="123", tags={"Name": "123"}, region="us-east-1") 64 | assert aws_resource.sts_client.meta.region_name == "us-east-1" 65 | -------------------------------------------------------------------------------- /tests/test_models_eks.py: -------------------------------------------------------------------------------- 1 | """Test the EKS model logic.""" 2 | 3 | from kubernetes.client.api.core_v1_api import CoreV1Api 4 | from kubernetes.client.api_client import ApiClient 5 | 6 | from eksupgrade.models.eks import Cluster, ClusterAddon, requires_cluster 7 | 8 | 9 | def test_cluster_resource(eks_client, eks_cluster, cluster_name, region) -> None: 10 | """Test the cluster resource.""" 11 | cluster_resource = 
Cluster.get(cluster_name, region) 12 | cluster_dict = cluster_resource.to_dict() 13 | assert cluster_dict 14 | assert isinstance(cluster_dict, dict) 15 | assert cluster_dict["version"] == "1.23" 16 | assert len(cluster_dict.keys()) == 20 17 | assert cluster_resource.name == cluster_resource.cluster_name 18 | 19 | 20 | def test_cluster_resource_eks_client(eks_client, eks_cluster, cluster_name, region) -> None: 21 | """Test the cluster resource.""" 22 | cluster_resource = Cluster.get(cluster_name, region) 23 | 24 | assert cluster_resource.eks_client 25 | assert cluster_resource.eks_client.meta.region_name == "us-east-1" 26 | 27 | 28 | def test_cluster_resource_core_client(eks_client, eks_cluster, cluster_name, region) -> None: 29 | """Test the cluster resource.""" 30 | cluster_resource = Cluster.get(cluster_name, region) 31 | assert isinstance(cluster_resource.core_api_client, CoreV1Api) 32 | assert isinstance(cluster_resource.core_api_client.api_client, ApiClient) 33 | 34 | 35 | def test_cluster_addon_resource(eks_client, eks_cluster, cluster_name, region) -> None: 36 | """Test the cluster addon resource.""" 37 | cluster_resource = Cluster.get(cluster_name, region) 38 | addon_resource = ClusterAddon( 39 | arn="abc", name="coredns", cluster=cluster_resource, region=region, owner="amazon", publisher="amazon" 40 | ) 41 | addon_dict = addon_resource.to_dict() 42 | assert isinstance(addon_dict, dict) 43 | assert addon_dict["arn"] == "abc" 44 | assert addon_resource.name == "coredns" 45 | assert not addon_dict["resource_id"] 46 | assert not addon_dict["tags"] 47 | assert len(addon_dict.keys()) == 20 48 | assert addon_resource.name == addon_resource.addon_name 49 | assert not addon_resource._addon_update_kwargs 50 | assert isinstance(addon_resource._addon_update_kwargs, dict) 51 | 52 | 53 | def test_cluster_addon_resource_update_kwargs(eks_client, eks_cluster, cluster_name, region) -> None: 54 | """Test the cluster addon resource.""" 55 | cluster_resource = Cluster.get(cluster_name, region) 56 | addon_resource = ClusterAddon( 57 | arn="abc", name="coredns", cluster=cluster_resource, region=region, owner="amazon", publisher="amazon" 58 | ) 59 | addon_resource.service_account_role_arn = "123" 60 | addon_resource.configuration_values = "123" 61 | assert addon_resource._addon_update_kwargs 62 | assert isinstance(addon_resource._addon_update_kwargs, dict) 63 | assert "serviceAccountRoleArn" in addon_resource._addon_update_kwargs.keys() 64 | assert "configurationValues" in addon_resource._addon_update_kwargs.keys() 65 | 66 | 67 | # def test_cluster_requires_cluster_decorator(eks_client, eks_cluster, cluster_name, region) -> None: 68 | # """Test the cluster addon resource.""" 69 | 70 | # @requires_cluster 71 | # def decorator_test(addon): 72 | # return addon 73 | 74 | # # Validate without populated cluster. 75 | # cluster_resource = Cluster(arn="123", version="1.24", target_version="1.25") 76 | # addon_resource = ClusterAddon( 77 | # arn="abc", name="coredns", cluster=cluster_resource, region=region, owner="amazon", publisher="amazon" 78 | # ) 79 | # assert not addon_resource.cluster.name 80 | # assert decorator_test(addon_resource) is None 81 | 82 | # # Validate with populated cluster. 
83 | # addon_resource.cluster = Cluster.get(cluster_name, region) 84 | # assert addon_resource.cluster.name 85 | # assert decorator_test(addon_resource) 86 | 87 | 88 | # def test_cluster_addon_resource_no_cluster(eks_client, eks_cluster, cluster_name, region) -> None: 89 | # """Test the cluster addon resource.""" 90 | # cluster_resource = Cluster(arn="123", version="1.24", target_version="1.25") 91 | # addon_resource = ClusterAddon( 92 | # arn="abc", name="coredns", cluster=cluster_resource, region=region, owner="amazon", publisher="amazon" 93 | # ) 94 | # addon_dict = addon_resource.to_dict() 95 | # assert isinstance(addon_dict, dict) 96 | # assert addon_dict["arn"] == "abc" 97 | # assert addon_resource.name == "coredns" 98 | # assert not addon_dict["resource_id"] 99 | # assert not addon_dict["tags"] 100 | # assert len(addon_dict.keys()) == 17 101 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | """Test the util logic.""" 2 | 3 | from typer.testing import CliRunner 4 | 5 | from eksupgrade.utils import ( 6 | confirm, 7 | echo_deprecation, 8 | echo_error, 9 | echo_info, 10 | echo_success, 11 | echo_warning, 12 | get_package_asset, 13 | get_package_dict, 14 | ) 15 | 16 | runner = CliRunner() 17 | 18 | 19 | def test_get_package_asset() -> None: 20 | """Test the get package asset method.""" 21 | data = get_package_asset("version_dict.json") 22 | assert data.startswith("{") 23 | assert data.endswith("\n") 24 | 25 | 26 | def test_get_package_asset_nondefault() -> None: 27 | """Test the get package asset method.""" 28 | data = get_package_asset("__init__.py", base_path="") 29 | assert "__version__" in data 30 | 31 | 32 | def test_get_package_dict() -> None: 33 | """Test the get package dict method.""" 34 | data = get_package_dict("version_dict.json") 35 | assert data["1.26"]["cluster-autoscaler"] 36 | 37 | 38 | def test_echo_deprecation(app) -> None: 39 | """Test the echo deprecation method.""" 40 | app.command()(echo_deprecation) 41 | result = runner.invoke(app, ["this is a deprecation"]) 42 | assert "this is a deprecation" in result.stdout 43 | assert result.exit_code == 0 44 | 45 | 46 | def test_echo_error(app) -> None: 47 | """Test the echo error method.""" 48 | app.command()(echo_error) 49 | result = runner.invoke(app, ["this is a error"]) 50 | assert "this is a error" in result.stdout 51 | assert result.exit_code == 0 52 | 53 | 54 | def test_echo_info(app) -> None: 55 | """Test the echo info method.""" 56 | app.command()(echo_info) 57 | result = runner.invoke(app, ["this is a info"]) 58 | assert "this is a info" in result.stdout 59 | assert result.exit_code == 0 60 | 61 | 62 | def test_echo_success(app) -> None: 63 | """Test the echo success method.""" 64 | app.command()(echo_success) 65 | result = runner.invoke(app, ["this is a success"]) 66 | assert "this is a success" in result.stdout 67 | assert result.exit_code == 0 68 | 69 | 70 | def test_echo_warning(app) -> None: 71 | """Test the echo warning method.""" 72 | app.command()(echo_warning) 73 | result = runner.invoke(app, ["this is a warning"]) 74 | assert "this is a warning" in result.stdout 75 | assert result.exit_code == 0 76 | 77 | 78 | def test_confirm_yes(app) -> None: 79 | """Test the confirm method with input y for yes.""" 80 | app.command()(confirm) 81 | result = runner.invoke(app, ["this is a confirmation prompt"], input="y\n") 82 | assert "this is a confirmation prompt" in result.stdout 
83 | assert result.exit_code == 0 84 | 85 | 86 | def test_confirm_no(app) -> None: 87 | """Test the confirm method with input n for no.""" 88 | app.command()(confirm) 89 | result = runner.invoke(app, ["this is a confirmation prompt"], input="n\n") 90 | assert "this is a confirmation prompt" in result.stdout 91 | assert result.exit_code == 1 92 | 93 | 94 | def test_confirm_no_without_abort(app) -> None: 95 | """Test the confirm method with input n for no and abort disabled.""" 96 | app.command()(confirm) 97 | result = runner.invoke(app, ["this is a confirmation prompt", "--no-abort"], input="n\n") 98 | assert "this is a confirmation prompt" in result.stdout 99 | assert result.exit_code == 0 100 | --------------------------------------------------------------------------------
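# Example (editorial sketch): the console script declared in pyproject.toml
# ("eksupgrade = eksupgrade.cli:app") can also be exercised programmatically,
# mirroring tests/test_cli.py, e.g. as a quick smoke test:
from typer.testing import CliRunner

from eksupgrade.cli import app

result = CliRunner().invoke(app, ["--version"])
assert result.exit_code == 0 and "eksupgrade version" in result.stdout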