├── .gitignore
├── CODE_OF_CONDUCT.md
├── cluster
│   └── terraform
│       ├── variables.tf
│       ├── versions.tf
│       ├── outputs.tf
│       ├── karpenter.tf
│       └── main.tf
├── blueprints
│   ├── overprovision
│   │   ├── workload.yaml
│   │   ├── dummy-workload.yaml
│   │   └── README.md
│   ├── userdata
│   │   ├── workload.yaml
│   │   ├── userdata.yaml
│   │   └── README.md
│   ├── multi-ebs
│   │   ├── workload.yaml
│   │   ├── multi-ebs.yaml
│   │   └── README.md
│   ├── custom-ami
│   │   ├── workload.yaml
│   │   ├── custom-ami.yaml
│   │   └── README.md
│   ├── saving-plans
│   │   ├── workload.yaml
│   │   ├── savings-plans.yaml
│   │   └── README.md
│   ├── stateful
│   │   ├── storage.yaml
│   │   ├── workload.yaml
│   │   └── README.md
│   ├── update-nodes-with-drift
│   │   ├── workload.yaml
│   │   ├── latest-current-ami.yaml
│   │   └── README.md
│   ├── graviton
│   │   ├── workload-graviton.yaml
│   │   ├── workload-flexible.yaml
│   │   └── README.md
│   ├── od-spot-split
│   │   ├── workload.yaml
│   │   ├── od-spot.yaml
│   │   └── README.md
│   ├── disruption-budgets
│   │   ├── workload.yaml
│   │   ├── disruption-budgets.yaml
│   │   └── README.md
│   ├── ha-az-nodes
│   │   ├── workload.yaml
│   │   └── README.md
│   ├── batch-jobs
│   │   ├── workloads-evicted.yaml
│   │   ├── workloads-not-evicted.yaml
│   │   └── README.md
│   ├── soci-snapshotter
│   │   ├── workload.yaml
│   │   ├── soci-snapshotter.yaml
│   │   └── README.md
│   └── nvidia-gpu-workload
│       └── README.md
├── LICENSE
├── CONTRIBUTING.md
└── README.md

/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .terraform*
3 | terraform.tfstate*
4 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | ## Code of Conduct
2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
4 | opensource-codeofconduct@amazon.com with any additional questions or comments.
5 | -------------------------------------------------------------------------------- /cluster/terraform/variables.tf: -------------------------------------------------------------------------------- 1 | ## NOTE: It's going to use your AWS_REGION or AWS_DEFAULT_REGION environment variable, 2 | ## but you can define which on to use in terraform.tfvars file as well, or pass it as an argument 3 | ## in the CLI like this "terraform apply -var 'region=eu-west-1'" 4 | variable "region" { 5 | description = "Region to deploy the resources" 6 | type = string 7 | } 8 | -------------------------------------------------------------------------------- /cluster/terraform/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 1.3.2" 3 | 4 | required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | version = "~> 5.95" 8 | } 9 | kubernetes = { 10 | source = "hashicorp/kubernetes" 11 | version = ">= 2.30" 12 | } 13 | helm = { 14 | source = "hashicorp/helm" 15 | version = "~> 2.17" 16 | } 17 | kubectl = { 18 | source = "alekc/kubectl" 19 | version = ">= 2.1" 20 | } 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /blueprints/overprovision/workload.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: workload 5 | spec: 6 | replicas: 10 7 | selector: 8 | matchLabels: 9 | app: workload 10 | template: 11 | metadata: 12 | labels: 13 | app: workload 14 | spec: 15 | nodeSelector: 16 | intent: apps 17 | containers: 18 | - name: workload 19 | image: public.ecr.aws/eks-distro/kubernetes/pause:v1.33.0-eks-1-33-4 20 | resources: 21 | requests: 22 | cpu: 512m 23 | memory: 512Mi 24 | -------------------------------------------------------------------------------- /blueprints/userdata/workload.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: userdata 5 | spec: 6 | replicas: 3 7 | selector: 8 | matchLabels: 9 | app: userdata 10 | template: 11 | metadata: 12 | labels: 13 | app: userdata 14 | spec: 15 | nodeSelector: 16 | intent: userdata 17 | containers: 18 | - name: userdata 19 | image: public.ecr.aws/eks-distro/kubernetes/pause:v1.33.0-eks-1-33-4 20 | resources: 21 | requests: 22 | cpu: 512m 23 | memory: 512Mi 24 | -------------------------------------------------------------------------------- /blueprints/multi-ebs/workload.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: multi-ebs 5 | spec: 6 | replicas: 3 7 | selector: 8 | matchLabels: 9 | app: multi-ebs 10 | template: 11 | metadata: 12 | labels: 13 | app: multi-ebs 14 | spec: 15 | nodeSelector: 16 | intent: multi-ebs 17 | containers: 18 | - name: multi-ebs 19 | image: public.ecr.aws/eks-distro/kubernetes/pause:v1.33.0-eks-1-33-4 20 | resources: 21 | requests: 22 | cpu: 512m 23 | memory: 512Mi 24 | -------------------------------------------------------------------------------- /blueprints/custom-ami/workload.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: custom-ami 5 | spec: 6 | replicas: 3 7 | selector: 8 | matchLabels: 9 | app: custom-ami 10 | template: 11 | metadata: 12 | labels: 13 | app: custom-ami 14 | spec: 
15 | nodeSelector: 16 | intent: custom-ami 17 | containers: 18 | - name: custom-ami 19 | image: public.ecr.aws/eks-distro/kubernetes/pause:v1.33.0-eks-1-33-4 20 | resources: 21 | requests: 22 | cpu: 512m 23 | memory: 512Mi 24 | -------------------------------------------------------------------------------- /blueprints/saving-plans/workload.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: reserved-first 5 | spec: 6 | replicas: 20 7 | selector: 8 | matchLabels: 9 | app: reserved-first 10 | template: 11 | metadata: 12 | labels: 13 | app: reserved-first 14 | spec: 15 | nodeSelector: 16 | intent: apps 17 | containers: 18 | - name: reserved-first 19 | image: public.ecr.aws/eks-distro/kubernetes/pause:v1.33.0-eks-1-33-4 20 | resources: 21 | requests: 22 | cpu: 950m 23 | memory: 512Mi 24 | -------------------------------------------------------------------------------- /blueprints/stateful/storage.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: storage.k8s.io/v1 2 | kind: StorageClass 3 | metadata: 4 | name: storage-gp3 5 | provisioner: ebs.csi.aws.com 6 | parameters: 7 | type: gp3 8 | volumeBindingMode: WaitForFirstConsumer 9 | allowedTopologies: 10 | - matchLabelExpressions: 11 | - key: topology.ebs.csi.aws.com/zone 12 | values: ["<>"] 13 | --- 14 | apiVersion: v1 15 | kind: PersistentVolumeClaim 16 | metadata: 17 | name: ebs-claim 18 | spec: 19 | accessModes: 20 | - ReadWriteOnce 21 | storageClassName: storage-gp3 22 | resources: 23 | requests: 24 | storage: 4Gi 25 | -------------------------------------------------------------------------------- /blueprints/update-nodes-with-drift/workload.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: latest-current-ami 5 | spec: 6 | replicas: 3 7 | selector: 8 | matchLabels: 9 | app: latest-current-ami 10 | template: 11 | metadata: 12 | labels: 13 | app: latest-current-ami 14 | spec: 15 | nodeSelector: 16 | intent: latest-current-ami 17 | containers: 18 | - name: latest-current-ami 19 | image: public.ecr.aws/eks-distro/kubernetes/pause:v1.33.0-eks-1-33-4 20 | resources: 21 | requests: 22 | cpu: 512m 23 | memory: 512Mi 24 | -------------------------------------------------------------------------------- /blueprints/graviton/workload-graviton.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: workload-graviton 5 | spec: 6 | replicas: 5 7 | selector: 8 | matchLabels: 9 | app: workload-graviton 10 | template: 11 | metadata: 12 | labels: 13 | app: workload-graviton 14 | spec: 15 | nodeSelector: 16 | intent: apps 17 | kubernetes.io/arch: arm64 18 | containers: 19 | - name: workload-flexible 20 | image: public.ecr.aws/eks-distro/kubernetes/pause:v1.33.0-eks-1-33-4 21 | imagePullPolicy: Always 22 | resources: 23 | requests: 24 | cpu: 512m 25 | memory: 512Mi 26 | -------------------------------------------------------------------------------- /blueprints/graviton/workload-flexible.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: workload-flexible 5 | spec: 6 | replicas: 5 7 | selector: 8 | matchLabels: 9 | app: workload-flexible 10 | template: 11 | metadata: 12 | labels: 13 | app: 
workload-flexible 14 | spec: 15 | nodeSelector: 16 | intent: apps 17 | karpenter.sh/capacity-type: on-demand 18 | containers: 19 | - name: workload-flexible 20 | image: public.ecr.aws/eks-distro/kubernetes/pause:v1.33.0-eks-1-33-4 21 | imagePullPolicy: Always 22 | resources: 23 | requests: 24 | cpu: 512m 25 | memory: 512Mi 26 | -------------------------------------------------------------------------------- /cluster/terraform/outputs.tf: -------------------------------------------------------------------------------- 1 | output "configure_kubectl" { 2 | description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" 3 | value = "aws eks --region ${var.region} update-kubeconfig --name ${module.eks.cluster_name}" 4 | } 5 | 6 | output "cluster_name" { 7 | description = "Cluster name of the EKS cluster" 8 | value = module.eks.cluster_name 9 | } 10 | 11 | output "vpc_id" { 12 | description = "VPC ID that the EKS cluster is using" 13 | value = module.vpc.vpc_id 14 | } 15 | 16 | output "node_instance_role_name" { 17 | description = "IAM Role name that each Karpenter node will use" 18 | value = local.name 19 | } 20 | -------------------------------------------------------------------------------- /blueprints/overprovision/dummy-workload.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: scheduling.k8s.io/v1 2 | kind: PriorityClass 3 | metadata: 4 | name: overprovisioning 5 | value: -10 6 | globalDefault: false 7 | description: "Priority class used by overprovisioning." 8 | --- 9 | apiVersion: apps/v1 10 | kind: Deployment 11 | metadata: 12 | name: dummy-workload 13 | spec: 14 | replicas: 10 15 | selector: 16 | matchLabels: 17 | app: workload 18 | template: 19 | metadata: 20 | labels: 21 | app: workload 22 | spec: 23 | nodeSelector: 24 | intent: apps 25 | containers: 26 | - name: workload 27 | image: public.ecr.aws/eks-distro/kubernetes/pause:v1.33.0-eks-1-33-4 28 | resources: 29 | requests: 30 | cpu: 512m 31 | memory: 512Mi 32 | priorityClassName: overprovisioning 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT No Attribution 2 | 3 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 13 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 14 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 15 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 16 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
17 | -------------------------------------------------------------------------------- /blueprints/stateful/workload.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: stateful 5 | spec: 6 | replicas: 3 7 | selector: 8 | matchLabels: 9 | app: stateful 10 | template: 11 | metadata: 12 | labels: 13 | app: stateful 14 | spec: 15 | terminationGracePeriodSeconds: 0 16 | nodeSelector: 17 | intent: apps 18 | containers: 19 | - name: stateful 20 | image: public.ecr.aws/docker/library/centos:centos7.9.2009 21 | command: ["/bin/sh"] 22 | args: ["-c", "while true; do echo Writing content every three minutes! Printing a random number: $(( $RANDOM % 1000 + 1 ))>> /data/out.txt; sleep 180; done"] 23 | volumeMounts: 24 | - name: persistent-storage 25 | mountPath: /data 26 | resources: 27 | requests: 28 | cpu: 1 29 | volumes: 30 | - name: persistent-storage 31 | persistentVolumeClaim: 32 | claimName: ebs-claim 33 | -------------------------------------------------------------------------------- /blueprints/od-spot-split/workload.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: workload-split 5 | spec: 6 | replicas: 10 7 | selector: 8 | matchLabels: 9 | app: workload-split 10 | template: 11 | metadata: 12 | labels: 13 | app: workload-split 14 | spec: 15 | nodeSelector: 16 | intent: apps 17 | tolerations: 18 | - key: "intent" 19 | operator: "Equal" 20 | value: "workload-split" 21 | effect: "NoSchedule" 22 | containers: 23 | - name: workload-split 24 | image: public.ecr.aws/eks-distro/kubernetes/pause:v1.33.0-eks-1-33-4 25 | imagePullPolicy: Always 26 | resources: 27 | requests: 28 | cpu: 512m 29 | memory: 512Mi 30 | topologySpreadConstraints: 31 | - labelSelector: 32 | matchLabels: 33 | app: workload-split 34 | maxSkew: 1 35 | topologyKey: capacity-spread 36 | whenUnsatisfiable: DoNotSchedule 37 | -------------------------------------------------------------------------------- /blueprints/disruption-budgets/workload.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: disruption-budget 5 | spec: 6 | replicas: 30 7 | selector: 8 | matchLabels: 9 | intent: disruption-budget 10 | template: 11 | metadata: 12 | labels: 13 | intent: disruption-budget 14 | spec: 15 | nodeSelector: 16 | intent: disruption-budget 17 | containers: 18 | - name: disruption-budget 19 | image: public.ecr.aws/eks-distro/kubernetes/pause:v1.33.0-eks-1-33-4 20 | imagePullPolicy: Always 21 | resources: 22 | requests: 23 | cpu: 512m 24 | memory: 512Mi 25 | topologySpreadConstraints: 26 | - labelSelector: 27 | matchLabels: 28 | intent: disruption-budget 29 | maxSkew: 1 30 | topologyKey: kubernetes.io/hostname 31 | whenUnsatisfiable: ScheduleAnyway 32 | - labelSelector: 33 | matchLabels: 34 | intent: disruption-budget 35 | maxSkew: 1 36 | topologyKey: topology.kubernetes.io/zone 37 | whenUnsatisfiable: ScheduleAnyway 38 | -------------------------------------------------------------------------------- /blueprints/ha-az-nodes/workload.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: workload-multi-az-nodes 5 | spec: 6 | replicas: 30 7 | selector: 8 | matchLabels: 9 | app: workload-multi-az-nodes 10 | template: 11 | metadata: 12 | 
labels: 13 | app: workload-multi-az-nodes 14 | spec: 15 | nodeSelector: 16 | intent: apps 17 | containers: 18 | - name: workload-multi-az-nodes 19 | image: public.ecr.aws/eks-distro/kubernetes/pause:v1.33.0-eks-1-33-4 20 | imagePullPolicy: Always 21 | resources: 22 | requests: 23 | cpu: 512m 24 | memory: 512Mi 25 | topologySpreadConstraints: 26 | - labelSelector: 27 | matchLabels: 28 | app: workload-multi-az-nodes 29 | maxSkew: 1 30 | topologyKey: kubernetes.io/hostname 31 | whenUnsatisfiable: ScheduleAnyway 32 | - labelSelector: 33 | matchLabels: 34 | app: workload-multi-az-nodes 35 | maxSkew: 1 36 | topologyKey: topology.kubernetes.io/zone 37 | whenUnsatisfiable: ScheduleAnyway 38 | -------------------------------------------------------------------------------- /blueprints/saving-plans/savings-plans.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: karpenter.sh/v1 2 | kind: NodePool 3 | metadata: 4 | name: savings-plans 5 | spec: 6 | disruption: 7 | consolidationPolicy: WhenEmptyOrUnderutilized 8 | consolidateAfter: 1m 9 | limits: 10 | cpu: "20" # For example: Limit to launch up to 5 c4.xlarge instances 11 | template: 12 | metadata: 13 | labels: 14 | intent: apps 15 | spec: 16 | expireAfter: 168h0m0s 17 | nodeClassRef: 18 | group: karpenter.k8s.aws 19 | name: default 20 | kind: EC2NodeClass 21 | requirements: 22 | - key: karpenter.k8s.aws/instance-family 23 | operator: In 24 | values: 25 | - c4 26 | # Alternatively, you can configure fixed instance types 27 | # - key: "node.kubernetes.io/instance-type" 28 | # operator: In 29 | # values: ["c4.xlarge"] # 4 vCPUs 30 | - key: kubernetes.io/os 31 | operator: In 32 | values: 33 | - linux 34 | - key: kubernetes.io/arch 35 | operator: In 36 | values: 37 | - amd64 38 | - key: karpenter.sh/capacity-type 39 | operator: In 40 | values: 41 | - on-demand 42 | weight: 100 43 | -------------------------------------------------------------------------------- /blueprints/disruption-budgets/disruption-budgets.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: karpenter.sh/v1 2 | kind: NodePool 3 | metadata: 4 | name: disruption-budget 5 | spec: 6 | limits: 7 | cpu: 100 8 | memory: 100Gi 9 | template: 10 | metadata: 11 | labels: 12 | intent: disruption-budget 13 | spec: 14 | nodeClassRef: 15 | group: karpenter.k8s.aws 16 | name: disruption-budget 17 | kind: EC2NodeClass 18 | requirements: 19 | - key: karpenter.sh/capacity-type 20 | operator: In 21 | values: ["on-demand"] 22 | - key: karpenter.k8s.aws/instance-category 23 | operator: In 24 | values: ["c","m","r"] 25 | - key: karpenter.k8s.aws/instance-size 26 | operator: NotIn 27 | values: ["nano","micro","small","medium"] 28 | - key: karpenter.k8s.aws/instance-hypervisor 29 | operator: In 30 | values: ["nitro"] 31 | expireAfter: 720h 32 | disruption: 33 | consolidationPolicy: WhenEmptyOrUnderutilized 34 | consolidateAfter: 1m 35 | budgets: 36 | - nodes: "0" 37 | schedule: "0 0 * * *" 38 | duration: 24h 39 | --- 40 | apiVersion: karpenter.k8s.aws/v1 41 | kind: EC2NodeClass 42 | metadata: 43 | name: disruption-budget 44 | spec: 45 | amiSelectorTerms: 46 | - alias: bottlerocket@latest 47 | role: "<>" 48 | securityGroupSelectorTerms: 49 | - tags: 50 | karpenter.sh/discovery: <> 51 | subnetSelectorTerms: 52 | - tags: 53 | karpenter.sh/discovery: <> 54 | -------------------------------------------------------------------------------- /blueprints/custom-ami/custom-ami.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: karpenter.k8s.aws/v1 2 | kind: EC2NodeClass 3 | metadata: 4 | name: custom-ami-template 5 | spec: 6 | amiFamily: AL2023 7 | amiSelectorTerms: 8 | - name: '*amazon-eks-node-al2023*' 9 | role: "<>" 10 | securityGroupSelectorTerms: 11 | - tags: 12 | karpenter.sh/discovery: <> 13 | subnetSelectorTerms: 14 | - tags: 15 | karpenter.sh/discovery: <> 16 | --- 17 | apiVersion: karpenter.sh/v1 18 | kind: NodePool 19 | metadata: 20 | name: custom-ami 21 | spec: 22 | disruption: 23 | consolidationPolicy: WhenEmptyOrUnderutilized 24 | consolidateAfter: 1m 25 | limits: 26 | cpu: 1k 27 | memory: 500Gi 28 | template: 29 | metadata: 30 | labels: 31 | intent: custom-ami 32 | spec: 33 | nodeClassRef: 34 | group: karpenter.k8s.aws 35 | name: custom-ami-template 36 | kind: EC2NodeClass 37 | requirements: 38 | - key: karpenter.k8s.aws/instance-hypervisor 39 | operator: NotIn 40 | values: 41 | - "" 42 | - key: karpenter.sh/capacity-type 43 | operator: In 44 | values: 45 | - spot 46 | - on-demand 47 | - key: kubernetes.io/os 48 | operator: In 49 | values: 50 | - linux 51 | - key: kubernetes.io/arch 52 | operator: In 53 | values: 54 | - amd64 55 | - key: karpenter.k8s.aws/instance-category 56 | operator: In 57 | values: 58 | - c 59 | - m 60 | - r 61 | - key: karpenter.k8s.aws/instance-generation 62 | operator: Gt 63 | values: 64 | - "2" 65 | -------------------------------------------------------------------------------- /blueprints/update-nodes-with-drift/latest-current-ami.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: karpenter.k8s.aws/v1 2 | kind: EC2NodeClass 3 | metadata: 4 | name: latest-current-ami 5 | spec: 6 | amiSelectorTerms: 7 | - id: <> 8 | - id: <> 9 | role: "<>" 10 | securityGroupSelectorTerms: 11 | - tags: 12 | karpenter.sh/discovery: <> 13 | subnetSelectorTerms: 14 | - tags: 15 | karpenter.sh/discovery: <> 16 | tags: 17 | KubernetesVersion: "1.31" 18 | --- 19 | apiVersion: karpenter.sh/v1 20 | kind: NodePool 21 | metadata: 22 | name: latest-current-ami 23 | spec: 24 | disruption: 25 | consolidationPolicy: WhenEmptyOrUnderutilized 26 | consolidateAfter: 1m 27 | limits: 28 | cpu: 100k 29 | memory: 5000Gi 30 | template: 31 | metadata: 32 | labels: 33 | intent: latest-current-ami 34 | spec: 35 | expireAfter: 168h0m0s 36 | nodeClassRef: 37 | group: karpenter.k8s.aws 38 | kind: EC2NodeClass 39 | name: latest-current-ami 40 | requirements: 41 | - key: karpenter.k8s.aws/instance-category 42 | operator: In 43 | values: 44 | - c 45 | - m 46 | - r 47 | - i 48 | - d 49 | - key: karpenter.k8s.aws/instance-cpu 50 | operator: In 51 | values: 52 | - "4" 53 | - "8" 54 | - "16" 55 | - "32" 56 | - "48" 57 | - "64" 58 | - key: karpenter.sh/capacity-type 59 | operator: In 60 | values: 61 | - spot 62 | - on-demand 63 | - key: kubernetes.io/os 64 | operator: In 65 | values: 66 | - linux 67 | - key: kubernetes.io/arch 68 | operator: In 69 | values: 70 | - amd64 71 | -------------------------------------------------------------------------------- /blueprints/batch-jobs/workloads-evicted.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: nginx 5 | spec: 6 | selector: 7 | matchLabels: 8 | app: nginx 9 | replicas: 1 10 | template: 11 | metadata: 12 | labels: 13 | app: nginx 14 | spec: 15 | nodeSelector: 16 | intent: apps 17 | karpenter.sh/capacity-type: on-demand 18 | 
containers: 19 | - name: nginx 20 | image: nginx 21 | imagePullPolicy: IfNotPresent 22 | resources: 23 | requests: 24 | cpu: "2" 25 | --- 26 | apiVersion: batch/v1 27 | kind: Job 28 | metadata: 29 | name: 2-min-job 30 | spec: 31 | ttlSecondsAfterFinished: 10 32 | template: 33 | spec: 34 | nodeSelector: 35 | intent: apps 36 | karpenter.sh/capacity-type: on-demand 37 | containers: 38 | - name: 2-min-job 39 | image: alpine 40 | imagePullPolicy: IfNotPresent 41 | resources: 42 | requests: 43 | cpu: "7" 44 | command: ['sh', '-c', 'echo 2 minutes Job Pod is Running ; sleep 120'] 45 | restartPolicy: Never 46 | terminationGracePeriodSeconds: 0 47 | backoffLimit: 2 48 | --- 49 | apiVersion: batch/v1 50 | kind: Job 51 | metadata: 52 | name: 5-min-job 53 | spec: 54 | ttlSecondsAfterFinished: 10 55 | template: 56 | spec: 57 | nodeSelector: 58 | intent: apps 59 | karpenter.sh/capacity-type: on-demand 60 | containers: 61 | - name: 5-min-job 62 | image: alpine 63 | imagePullPolicy: IfNotPresent 64 | resources: 65 | requests: 66 | cpu: "2" 67 | command: ['sh', '-c', 'echo 5 minutes Job Pod is Running ; sleep 300'] 68 | restartPolicy: Never 69 | terminationGracePeriodSeconds: 0 70 | backoffLimit: 2 71 | -------------------------------------------------------------------------------- /blueprints/userdata/userdata.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: karpenter.k8s.aws/v1 2 | kind: EC2NodeClass 3 | metadata: 4 | name: userdata-template 5 | spec: 6 | amiSelectorTerms: 7 | - alias: al2023@2023.0.20230222 # Amazon Linux 2023 8 | role: "<>" 9 | securityGroupSelectorTerms: 10 | - tags: 11 | karpenter.sh/discovery: "<>" 12 | subnetSelectorTerms: 13 | - tags: 14 | karpenter.sh/discovery: "<>" 15 | userData: | 16 | MIME-Version: 1.0 17 | Content-Type: multipart/mixed; boundary="BOUNDARY" 18 | 19 | --BOUNDARY 20 | Content-Type: text/x-shellscript; charset="us-ascii" 21 | 22 | #!/bin/bash 23 | echo "Running a custom user data script" 24 | 25 | --BOUNDARY-- 26 | --- 27 | apiVersion: karpenter.sh/v1 28 | kind: NodePool 29 | metadata: 30 | name: userdata 31 | spec: 32 | disruption: 33 | consolidationPolicy: WhenEmptyOrUnderutilized 34 | consolidateAfter: 1m 35 | template: 36 | metadata: 37 | labels: 38 | intent: userdata 39 | spec: 40 | nodeClassRef: 41 | group: karpenter.k8s.aws 42 | kind: EC2NodeClass 43 | name: userdata-template 44 | requirements: 45 | - key: karpenter.k8s.aws/instance-hypervisor 46 | operator: NotIn 47 | values: 48 | - "" 49 | - key: karpenter.sh/capacity-type 50 | operator: In 51 | values: 52 | - spot 53 | - on-demand 54 | - key: kubernetes.io/os 55 | operator: In 56 | values: 57 | - linux 58 | - key: kubernetes.io/arch 59 | operator: In 60 | values: 61 | - amd64 62 | - key: karpenter.k8s.aws/instance-category 63 | operator: In 64 | values: 65 | - c 66 | - m 67 | - r 68 | - key: karpenter.k8s.aws/instance-generation 69 | operator: Gt 70 | values: 71 | - "2" 72 | -------------------------------------------------------------------------------- /blueprints/multi-ebs/multi-ebs.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: karpenter.sh/v1 2 | kind: NodePool 3 | metadata: 4 | name: multi-ebs 5 | spec: 6 | disruption: 7 | consolidationPolicy: WhenEmptyOrUnderutilized 8 | consolidateAfter: 1m 9 | limits: 10 | cpu: 1k 11 | memory: 500Gi 12 | template: 13 | metadata: 14 | labels: 15 | intent: multi-ebs 16 | spec: 17 | expireAfter: 168h0m0s 18 | nodeClassRef: 19 | group: karpenter.k8s.aws 
20 | name: multi-ebs 21 | kind: EC2NodeClass 22 | requirements: 23 | - key: karpenter.k8s.aws/instance-category 24 | operator: In 25 | values: 26 | - c 27 | - m 28 | - r 29 | - i 30 | - d 31 | - key: karpenter.k8s.aws/instance-cpu 32 | operator: In 33 | values: 34 | - "4" 35 | - "8" 36 | - "16" 37 | - "32" 38 | - "48" 39 | - "64" 40 | - key: karpenter.sh/capacity-type 41 | operator: In 42 | values: 43 | - spot 44 | - on-demand 45 | - key: kubernetes.io/os 46 | operator: In 47 | values: 48 | - linux 49 | - key: kubernetes.io/arch 50 | operator: In 51 | values: 52 | - amd64 53 | --- 54 | apiVersion: karpenter.k8s.aws/v1 55 | kind: EC2NodeClass 56 | metadata: 57 | name: multi-ebs 58 | spec: 59 | amiFamily: Bottlerocket 60 | amiSelectorTerms: 61 | - alias: bottlerocket@v1.39.1 62 | blockDeviceMappings: 63 | - deviceName: /dev/xvda 64 | ebs: 65 | deleteOnTermination: true 66 | volumeSize: 20Gi 67 | volumeType: gp3 68 | - deviceName: /dev/xvdb 69 | ebs: 70 | deleteOnTermination: true 71 | volumeSize: 100Gi 72 | volumeType: gp3 73 | role: "<>" 74 | securityGroupSelectorTerms: 75 | - tags: 76 | karpenter.sh/discovery: <> 77 | subnetSelectorTerms: 78 | - tags: 79 | karpenter.sh/discovery: <> 80 | -------------------------------------------------------------------------------- /blueprints/batch-jobs/workloads-not-evicted.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: nginx 5 | spec: 6 | selector: 7 | matchLabels: 8 | app: nginx 9 | replicas: 1 10 | template: 11 | metadata: 12 | labels: 13 | app: nginx 14 | spec: 15 | nodeSelector: 16 | intent: apps 17 | karpenter.sh/capacity-type: on-demand 18 | containers: 19 | - name: nginx 20 | image: nginx 21 | imagePullPolicy: IfNotPresent 22 | resources: 23 | requests: 24 | cpu: "2" 25 | --- 26 | apiVersion: batch/v1 27 | kind: Job 28 | metadata: 29 | name: 2-min-job 30 | spec: 31 | ttlSecondsAfterFinished: 10 #Eliminate job pods after 10 seconds of being completed 32 | template: 33 | spec: 34 | nodeSelector: 35 | intent: apps 36 | karpenter.sh/capacity-type: on-demand 37 | containers: 38 | - name: 2-min-job 39 | image: alpine 40 | imagePullPolicy: IfNotPresent 41 | resources: 42 | requests: 43 | cpu: "7" 44 | command: ['sh', '-c', 'echo 2 minutes Job Pod is Running ; sleep 120'] 45 | restartPolicy: Never 46 | terminationGracePeriodSeconds: 0 47 | metadata: 48 | annotations: 49 | karpenter.sh/do-not-disrupt: "true" 50 | backoffLimit: 2 51 | --- 52 | apiVersion: batch/v1 53 | kind: Job 54 | metadata: 55 | name: 5-min-job 56 | spec: 57 | ttlSecondsAfterFinished: 10 #Eliminate job pods after 10 seconds of being completed 58 | template: 59 | spec: 60 | nodeSelector: 61 | intent: apps 62 | karpenter.sh/capacity-type: on-demand 63 | containers: 64 | - name: 5-min-job 65 | image: alpine 66 | imagePullPolicy: IfNotPresent 67 | resources: 68 | requests: 69 | cpu: "2" 70 | command: ['sh', '-c', 'echo 5 minutes Job Pod is Running ; sleep 300'] 71 | restartPolicy: Never 72 | terminationGracePeriodSeconds: 0 73 | metadata: 74 | annotations: 75 | karpenter.sh/do-not-disrupt: "true" 76 | backoffLimit: 2 77 | -------------------------------------------------------------------------------- /blueprints/od-spot-split/od-spot.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: karpenter.sh/v1 2 | kind: NodePool 3 | metadata: 4 | name: node-od 5 | spec: 6 | disruption: 7 | consolidationPolicy: WhenEmptyOrUnderutilized 8 | 
consolidateAfter: 1m 9 | limits: 10 | cpu: 1k 11 | memory: 500Gi 12 | template: 13 | metadata: 14 | labels: 15 | intent: apps 16 | spec: 17 | expireAfter: 168h0m0s 18 | nodeClassRef: 19 | group: karpenter.k8s.aws 20 | name: default 21 | kind: EC2NodeClass 22 | requirements: 23 | - key: capacity-spread 24 | operator: In 25 | values: 26 | - "1" 27 | - key: kubernetes.io/arch 28 | operator: In 29 | values: 30 | - amd64 31 | - key: karpenter.sh/capacity-type 32 | operator: In 33 | values: 34 | - on-demand 35 | - key: kubernetes.io/os 36 | operator: In 37 | values: 38 | - linux 39 | - key: karpenter.k8s.aws/instance-category 40 | operator: In 41 | values: 42 | - c 43 | - m 44 | - r 45 | - key: karpenter.k8s.aws/instance-generation 46 | operator: Gt 47 | values: 48 | - "2" 49 | taints: 50 | - effect: NoSchedule 51 | key: intent 52 | value: workload-split 53 | --- 54 | apiVersion: karpenter.sh/v1 55 | kind: NodePool 56 | metadata: 57 | name: node-spot 58 | spec: 59 | disruption: 60 | consolidationPolicy: WhenEmptyOrUnderutilized 61 | consolidateAfter: 1m 62 | limits: 63 | cpu: 1k 64 | memory: 500Gi 65 | template: 66 | metadata: 67 | labels: 68 | intent: apps 69 | spec: 70 | expireAfter: 168h0m0s 71 | nodeClassRef: 72 | group: karpenter.k8s.aws 73 | name: default 74 | kind: EC2NodeClass 75 | requirements: 76 | - key: capacity-spread 77 | operator: In 78 | values: 79 | - "2" 80 | - "3" 81 | - "4" 82 | - "5" 83 | - key: kubernetes.io/arch 84 | operator: In 85 | values: 86 | - amd64 87 | - key: karpenter.sh/capacity-type 88 | operator: In 89 | values: 90 | - spot 91 | - key: kubernetes.io/os 92 | operator: In 93 | values: 94 | - linux 95 | - key: karpenter.k8s.aws/instance-category 96 | operator: In 97 | values: 98 | - c 99 | - m 100 | - r 101 | - key: karpenter.k8s.aws/instance-generation 102 | operator: Gt 103 | values: 104 | - "2" 105 | taints: 106 | - effect: NoSchedule 107 | key: intent 108 | value: workload-split 109 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 
28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 
60 | -------------------------------------------------------------------------------- /blueprints/soci-snapshotter/workload.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: vllm-soci 5 | labels: 6 | app: vllm-soci 7 | spec: 8 | replicas: 1 9 | selector: 10 | matchLabels: 11 | app: vllm-soci 12 | template: 13 | metadata: 14 | labels: 15 | app: vllm-soci 16 | spec: 17 | containers: 18 | - name: vllm 19 | image: 763104351884.dkr.ecr.us-east-1.amazonaws.com/vllm:0.9-gpu-py312-ec2 20 | command: ["bash", "-c"] 21 | args: ["trap 'exit 0' TERM; sleep 9999 & wait"] 22 | nodeSelector: 23 | intent: soci-snapshotter 24 | kubernetes.io/arch: amd64 25 | affinity: 26 | nodeAffinity: 27 | requiredDuringSchedulingIgnoredDuringExecution: 28 | nodeSelectorTerms: 29 | - matchExpressions: 30 | - key: karpenter.k8s.aws/instance-ebs-bandwidth 31 | operator: Gt 32 | values: 33 | - "8000" 34 | - key: karpenter.k8s.aws/instance-network-bandwidth 35 | operator: Gt 36 | values: 37 | - "8000" 38 | --- 39 | apiVersion: apps/v1 40 | kind: Deployment 41 | metadata: 42 | name: vllm-soci-br 43 | labels: 44 | app: vllm-soci-br 45 | spec: 46 | replicas: 1 47 | selector: 48 | matchLabels: 49 | app: vllm-soci-br 50 | template: 51 | metadata: 52 | labels: 53 | app: vllm-soci-br 54 | spec: 55 | containers: 56 | - name: vllm 57 | image: 763104351884.dkr.ecr.us-east-1.amazonaws.com/vllm:0.9-gpu-py312-ec2 58 | command: ["bash", "-c"] 59 | args: ["trap 'exit 0' TERM; sleep 9999 & wait"] 60 | nodeSelector: 61 | intent: soci-snapshotter-br 62 | kubernetes.io/arch: amd64 63 | affinity: 64 | nodeAffinity: 65 | requiredDuringSchedulingIgnoredDuringExecution: 66 | nodeSelectorTerms: 67 | - matchExpressions: 68 | - key: karpenter.k8s.aws/instance-ebs-bandwidth 69 | operator: Gt 70 | values: 71 | - "8000" 72 | - key: karpenter.k8s.aws/instance-network-bandwidth 73 | operator: Gt 74 | values: 75 | - "8000" 76 | --- 77 | apiVersion: apps/v1 78 | kind: Deployment 79 | metadata: 80 | name: vllm 81 | labels: 82 | app: vllm 83 | spec: 84 | replicas: 1 85 | selector: 86 | matchLabels: 87 | app: vllm 88 | template: 89 | metadata: 90 | labels: 91 | app: vllm 92 | spec: 93 | containers: 94 | - name: vllm 95 | image: 763104351884.dkr.ecr.us-east-1.amazonaws.com/vllm:0.9-gpu-py312-ec2 96 | command: ["bash", "-c"] 97 | args: ["trap 'exit 0' TERM; sleep 9999 & wait"] 98 | nodeSelector: 99 | intent: non-soci-snapshotter 100 | kubernetes.io/arch: amd64 101 | affinity: 102 | nodeAffinity: 103 | requiredDuringSchedulingIgnoredDuringExecution: 104 | nodeSelectorTerms: 105 | - matchExpressions: 106 | - key: karpenter.k8s.aws/instance-ebs-bandwidth 107 | operator: Gt 108 | values: 109 | - "8000" 110 | - key: karpenter.k8s.aws/instance-network-bandwidth 111 | operator: Gt 112 | values: 113 | - "8000" -------------------------------------------------------------------------------- /blueprints/userdata/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Karpenter Blueprint: Customizing nodes with your own User Data automation 3 | 4 | ## Purpose 5 | 6 | When you need to bootstrap the data plane nodes to either overwrite certain Kubernetes settings, mount volumes or anything else you need to do when a node is launched. 
Within the `EC2NodeClass` there's a `userData` field you can use to control the [user data](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/user-data.html) that is applied to your worker nodes. This way, you can continue using the [EKS optimized AMI](https://docs.aws.amazon.com/eks/latest/userguide/eks-optimized-ami.html) with any additional configuration you need to run on top of the base AMI. 7 | 8 | ## Requirements 9 | 10 | * A Kubernetes cluster with Karpenter installed. You can use the blueprint we've used to test this pattern at the `cluster` folder in the root of this repository. 11 | 12 | ## Deploy 13 | 14 | You need to create a new `EC2NodeClass` with the `userData` field, along with a `NodePool` to use this new template. 15 | 16 | If you're using the Terraform template provided in this repo, run the following commands to get the EKS cluster name and the IAM Role name for the Karpenter nodes: 17 | 18 | ```sh 19 | export CLUSTER_NAME=$(terraform -chdir="../../cluster/terraform" output -raw cluster_name) 20 | export KARPENTER_NODE_IAM_ROLE_NAME=$(terraform -chdir="../../cluster/terraform" output -raw node_instance_role_name) 21 | ``` 22 | 23 | > ***NOTE***: If you're not using Terraform, you need to get those values manually. `CLUSTER_NAME` is the name of your EKS cluster (not the ARN). Karpenter auto-generates the [instance profile](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use_switch-role-ec2_instance-profiles) in your `EC2NodeClass` given the role that you specify in [spec.role](https://karpenter.sh/preview/concepts/nodeclasses/) with the placeholder `KARPENTER_NODE_IAM_ROLE_NAME`, which is a way to pass a single IAM role to the EC2 instance launched by the Karpenter `NodePool`. Typically, the instance profile name is the same as the IAM role(not the ARN). 24 | 25 | Now, make sure you're in this blueprint folder, then run the following command to create the new `EC2NodeClass` and `NodePool`: 26 | 27 | ```sh 28 | sed -i '' "s/<>/$CLUSTER_NAME/g" userdata.yaml 29 | sed -i '' "s/<>/$KARPENTER_NODE_IAM_ROLE_NAME/g" userdata.yaml 30 | kubectl apply -f . 31 | ``` 32 | 33 | ## Results 34 | 35 | The pods from the sample workload should be running: 36 | 37 | ```sh 38 | > kubectl get pods 39 | NAME READY STATUS RESTARTS AGE 40 | userdata-75d87b5b6c-6s978 1/1 Running 0 45s 41 | userdata-75d87b5b6c-gnglz 1/1 Running 0 45s 42 | userdata-75d87b5b6c-krmxm 1/1 Running 0 45s 43 | ``` 44 | 45 | You can confirm the Kubernetes settings have been added to the user data of the instance by running this command: 46 | 47 | ```sh 48 | aws ec2 describe-instance-attribute \ 49 | --instance-id $(aws ec2 describe-instances \ 50 | --filters "Name=tag:karpenter.sh/nodepool,Values=userdata" \ 51 | --output text --query 'Reservations[0].Instances[0].InstanceId') \ 52 | --attribute userData --query 'UserData.Value' --output text | base64 --decode 53 | ``` 54 | 55 | You should get an output like this with the `[settings.kubernetes]` configured in the `EC2NodeClass`: 56 | 57 | ```text 58 | MIME-Version: 1.0 59 | Content-Type: multipart/mixed; boundary="//" 60 | 61 | --// 62 | Content-Type: text/x-shellscript; charset="us-ascii" 63 | 64 | #!/bin/bash 65 | echo "Running a custom user data script" 66 | 67 | --// 68 | Content-Type: application/node.eks.aws 69 | ``` 70 | 71 | Look at how the `userdata` from the instance has the `userdata` you specified within the `EC2NodeClass` manifest. 
72 | 73 | ## Cleanup 74 | 75 | To remove all objects created, simply run the following commands: 76 | 77 | ```sh 78 | kubectl delete -f . 79 | ``` 80 | -------------------------------------------------------------------------------- /blueprints/od-spot-split/README.md: -------------------------------------------------------------------------------- 1 | # Karpenter Blueprint: Split Between On-Demand & Spot Instances 2 | 3 | ## Purpose 4 | 5 | This setup works if you're interested in having a portion of the EKS nodes running on On-Demand instances, and another portion on Spot. For example, a split of 20% On-Demand, and 80% on Spot. You can take advantage of the labels Karpenter adds automatically to each node, and use [Topology Spread Constraints (TSC)](https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/) within a `Deployment` or `Pod` to split capacity in a desired ratio. 6 | 7 | To do this, you can create one NodePool each for Spot and On-Demand with disjoint values for a unique new label called `capacity-spread`. Then, assign values to this label to configure the split. If you'd like to have a 20/80 split, you could add the values `["2","3","4","5"]` for the Spot NodePool, and `["1"]` for the On-Demand NodePool. 8 | 9 | ## Requirements 10 | 11 | * A Kubernetes cluster with Karpenter installed. You can use the blueprint we've used to test this pattern at the `cluster` folder in the root of this repository. 12 | * A `default` Karpenter NodePool as that's the one we'll use in this blueprint. You did this already in the ["Deploy a Karpenter Default EC2NodeClass and NodePool"](../../README.md) section from this repository. 13 | 14 | ## Deploy 15 | 16 | To deploy the Karpenter `NodePool` and the sample `workload`, simply run this command: 17 | 18 | ```sh 19 | kubectl apply -f .
20 | ``` 21 | 22 | You should see the following output: 23 | 24 | ```console 25 | nodepool.karpenter.sh/node-od created 26 | nodepool.karpenter.sh/node-spot created 27 | deployment.apps/workload-split created 28 | ``` 29 | 30 | ## Results 31 | 32 | You can review the Karpenter logs and watch how it's deciding to launch multiple nodes following the workload constraints: 33 | 34 | ```sh 35 | kubectl -n karpenter logs -l app.kubernetes.io/name=karpenter --all-containers=true -f --tail=20 36 | ``` 37 | 38 | Wait one minute and you should see the pods running within multiple nodes, run this command: 39 | 40 | ```sh 41 | kubectl get nodes -L karpenter.sh/capacity-type,beta.kubernetes.io/instance-type,karpenter.sh/nodepool,topology.kubernetes.io/zone -l karpenter.sh/initialized=true 42 | ``` 43 | 44 | You should see an output similar to this: 45 | 46 | ```console 47 | NAME STATUS ROLES AGE VERSION CAPACITY-TYPE INSTANCE-TYPE NODEPOOL ZONE 48 | ip-10-0-104-249.eu-west-2.compute.internal Ready 17s v1.32.3-eks-473151a spot c7i-flex.large node-spot eu-west-2c 49 | ip-10-0-40-176.eu-west-2.compute.internal Ready 6m29s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2a 50 | ip-10-0-47-113.eu-west-2.compute.internal Ready 6m29s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2a 51 | ip-10-0-53-185.eu-west-2.compute.internal Ready 6m29s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2a 52 | ip-10-0-54-129.eu-west-2.compute.internal Ready 6m29s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2a 53 | ip-10-0-83-213.eu-west-2.compute.internal Ready 20s v1.32.3-eks-473151a on-demand c6a.large node-od eu-west-2b 54 | ``` 55 | 56 | As you can see, pods were spread within the `spot` and `od` nodepools because of the `capacity-spread` TSC: 57 | 58 | ```yaml 59 | topologySpreadConstraints: 60 | - labelSelector: 61 | matchLabels: 62 | app: workload-split 63 | maxSkew: 1 64 | topologyKey: capacity-spread 65 | whenUnsatisfiable: DoNotSchedule 66 | ``` 67 | 68 | And each `NodePool` has a weight configured, the `od` NodePool has the following requirement: 69 | 70 | ```yaml 71 | - key: capacity-spread 72 | operator: In 73 | values: ["1"] 74 | ``` 75 | 76 | And the `spot` has the following requirement: 77 | 78 | ```yaml 79 | - key: capacity-spread 80 | operator: In 81 | values: ["2","3","4","5"] 82 | ``` 83 | 84 | ## Cleanup 85 | 86 | ```sh 87 | kubectl delete -f workload.yaml 88 | kubectl delete -f od-spot.yaml 89 | ``` 90 | -------------------------------------------------------------------------------- /blueprints/saving-plans/README.md: -------------------------------------------------------------------------------- 1 | # Karpenter Blueprint: Prioritize Savings Plans and/or Reserved Instances 2 | 3 | ## Purpose 4 | 5 | You might want to consume your Saving Plans and/or Reserved Instances before any other purchase model when using Karpenter. Currently, to cover this scenario you need to have a prioritized NodePool for the reserved instances. This NodePool needs to have a high weight configuration to tell Karpenter to user this NodePool first, along with a `limits` configuration to limit the number of EC2 instances to launch. When this NodePool meet the limits, Karpenter will continue launching instances from other NodePools, typically from the `default` one. 6 | 7 | ## Requirements 8 | 9 | * A Kubernetes cluster with Karpenter installed. You can use the blueprint we've used to test this pattern at the `cluster` folder in the root of this repository. 
10 | * A list of instance types or families that match with your Savings Plans and/or Reserved Instances, along with the total number of vCPUs you've reserved. 11 | * A `default` Karpenter `NodePool` as that's the one we'll use in this blueprint. You did this already in the ["Deploy a Karpenter Default EC2NodeClass and NodePool"](../../README.md) section from this repository. 12 | 13 | ## Deploy 14 | 15 | Let's suppose you purchased a Saving Plans of 20 vCPUs for `c4` family. Your NodePool should look like this: 16 | 17 | ```yaml 18 | apiVersion: karpenter.sh/v1 19 | kind: NodePool 20 | metadata: 21 | name: savings-plans 22 | spec: 23 | disruption: 24 | consolidationPolicy: WhenEmptyOrUnderutilized 25 | consolidateAfter: 1m 26 | limits: 27 | cpu: "20" # For example: Limit to launch up to 5 c4.xlarge instances 28 | template: 29 | metadata: 30 | labels: 31 | intent: apps 32 | spec: 33 | expireAfter: 168h0m0s 34 | nodeClassRef: 35 | group: karpenter.k8s.aws 36 | name: default 37 | kind: EC2NodeClass 38 | requirements: 39 | - key: karpenter.k8s.aws/instance-family 40 | operator: In 41 | values: 42 | - c4 43 | - key: kubernetes.io/os 44 | operator: In 45 | values: 46 | - linux 47 | - key: kubernetes.io/arch 48 | operator: In 49 | values: 50 | - amd64 51 | - key: karpenter.sh/capacity-type 52 | operator: In 53 | values: 54 | - on-demand 55 | weight: 100 56 | ``` 57 | 58 | Notice that the above `NodePool` has a `weight` configuration of `100` and a `cpu` limit of 20 (5 x c4.xlarge instances). 59 | 60 | Deploy the prioritized NodePool and the sample workload with 20 pods requesting `950m` cpu units: 61 | 62 | ```sh 63 | kubectl apply -f savings-plans.yaml 64 | kubectl apply -f workload.yaml 65 | ``` 66 | 67 | ## Results 68 | 69 | Wait around three minutes to get all the pods running. Run the following command to see the nodes launched by Karpenter including the `NodePool-name` column to see which `NodePool` was used: 70 | 71 | ```sh 72 | kubectl get nodes -L karpenter.sh/capacity-type,beta.kubernetes.io/instance-type,karpenter.sh/nodepool,topology.kubernetes.io/zone -l karpenter.sh/initialized=true 73 | ``` 74 | 75 | You should get a similar output like this: 76 | 77 | ```console 78 | NAME STATUS ROLES AGE VERSION CAPACITY-TYPE INSTANCE-TYPE NODEPOOL ZONE 79 | ip-10-0-119-235.eu-west-2.compute.internal Ready 23s v1.32.3-eks-473151a on-demand c4.4xlarge savings-plans eu-west-2c 80 | ip-10-0-127-154.eu-west-2.compute.internal Ready 35m v1.32.3-eks-473151a on-demand c6g.xlarge default eu-west-2c 81 | ip-10-0-78-33.eu-west-2.compute.internal Ready 24s v1.32.3-eks-473151a on-demand c4.xlarge savings-plans eu-west-2b 82 | ``` 83 | 84 | Notice how the `savings-plans` NodePool launched all the capacity it could. Two instances: `c4.xlarge` (4 vCPUs) and `c4.4xlarge` (16 vCPUs), which together reach the limit of 20 vCPUs you configured for this NodePool. Additionally, you see Karpenter launched a `c5.large` Spot instance for the rest of the pods using the `default` NodePool. Remember, each node always launch the `kubelet` and `kube-proxy` pods, that's why by Karpenter launched an extra node because 20 vCPUs of reserved capacity wasn't enough if system pods need to be included. 85 | 86 | ## Cleanup 87 | 88 | To remove all objects created, simply run the following commands: 89 | 90 | ```sh 91 | kubectl delete -f . 
92 | ``` 93 | -------------------------------------------------------------------------------- /cluster/terraform/karpenter.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | karpenter_namespace = "karpenter" 3 | } 4 | 5 | ################################################################################ 6 | # Controller & Node IAM roles, SQS Queue, Eventbridge Rules 7 | ################################################################################ 8 | 9 | module "karpenter" { 10 | source = "terraform-aws-modules/eks/aws//modules/karpenter" 11 | version = "20.37.0" 12 | 13 | cluster_name = module.eks.cluster_name 14 | enable_v1_permissions = true 15 | namespace = local.karpenter_namespace 16 | 17 | # Name needs to match role name passed to the EC2NodeClass 18 | node_iam_role_use_name_prefix = false 19 | node_iam_role_name = local.name 20 | create_pod_identity_association = true 21 | 22 | tags = local.tags 23 | } 24 | 25 | ################################################################################ 26 | # Helm charts 27 | ################################################################################ 28 | 29 | resource "helm_release" "karpenter" { 30 | name = "karpenter" 31 | namespace = local.karpenter_namespace 32 | create_namespace = true 33 | repository = "oci://public.ecr.aws/karpenter" 34 | repository_username = data.aws_ecrpublic_authorization_token.token.user_name 35 | repository_password = data.aws_ecrpublic_authorization_token.token.password 36 | chart = "karpenter" 37 | version = "1.5.0" 38 | wait = false 39 | 40 | values = [ 41 | <<-EOT 42 | nodeSelector: 43 | karpenter.sh/controller: 'true' 44 | settings: 45 | clusterName: ${module.eks.cluster_name} 46 | clusterEndpoint: ${module.eks.cluster_endpoint} 47 | interruptionQueue: ${module.karpenter.queue_name} 48 | tolerations: 49 | - key: CriticalAddonsOnly 50 | operator: Exists 51 | webhook: 52 | enabled: false 53 | EOT 54 | ] 55 | 56 | lifecycle { 57 | ignore_changes = [ 58 | repository_password 59 | ] 60 | } 61 | } 62 | 63 | # Karpenter default EC2NodeClass and NodePool 64 | 65 | resource "kubectl_manifest" "karpenter_default_ec2_node_class" { 66 | yaml_body = <<-YAML 67 | apiVersion: karpenter.k8s.aws/v1 68 | kind: EC2NodeClass 69 | metadata: 70 | name: default 71 | spec: 72 | role: "${module.karpenter.node_iam_role_name}" 73 | amiSelectorTerms: 74 | - alias: al2023@latest 75 | securityGroupSelectorTerms: 76 | - tags: 77 | karpenter.sh/discovery: ${module.eks.cluster_name} 78 | subnetSelectorTerms: 79 | - tags: 80 | karpenter.sh/discovery: ${module.eks.cluster_name} 81 | tags: 82 | IntentLabel: apps 83 | KarpenterNodePoolName: default 84 | NodeType: default 85 | intent: apps 86 | karpenter.sh/discovery: ${module.eks.cluster_name} 87 | project: karpenter-blueprints 88 | YAML 89 | 90 | depends_on = [ 91 | helm_release.karpenter, 92 | ] 93 | } 94 | 95 | resource "kubectl_manifest" "karpenter_default_node_pool" { 96 | yaml_body = <<-YAML 97 | apiVersion: karpenter.sh/v1 98 | kind: NodePool 99 | metadata: 100 | name: default 101 | spec: 102 | template: 103 | metadata: 104 | labels: 105 | intent: apps 106 | spec: 107 | requirements: 108 | - key: kubernetes.io/arch 109 | operator: In 110 | values: ["amd64", "arm64"] 111 | - key: "karpenter.k8s.aws/instance-cpu" 112 | operator: In 113 | values: ["4", "8", "16", "32", "48", "64"] 114 | - key: karpenter.sh/capacity-type 115 | operator: In 116 | values: ["spot", "on-demand"] 117 | - key: karpenter.k8s.aws/instance-category 118 | 
operator: In 119 | values: ["c", "m", "r", "i", "d"] 120 | nodeClassRef: 121 | name: default 122 | group: karpenter.k8s.aws 123 | kind: EC2NodeClass 124 | kubelet: 125 | containerRuntime: containerd 126 | systemReserved: 127 | cpu: 100m 128 | memory: 100Mi 129 | disruption: 130 | consolidationPolicy: WhenEmptyOrUnderutilized 131 | consolidateAfter: 1m 132 | YAML 133 | 134 | depends_on = [ 135 | helm_release.karpenter, 136 | kubectl_manifest.karpenter_default_ec2_node_class, 137 | ] 138 | } 139 | -------------------------------------------------------------------------------- /blueprints/multi-ebs/README.md: -------------------------------------------------------------------------------- 1 | # Karpenter Blueprint: Using multiple EBS volumes 2 | 3 | ## Purpose 4 | 5 | This blueprint shows how to attach more than one EBS volume to a data plane node. Maybe you need to use a volume for logs, cache, or any container resources such as images. You do this configuration in the `EC2NodeClass`, then you configure a `NodePool` to use such template when launching a machine. 6 | 7 | ## Requirements 8 | 9 | * A Kubernetes cluster with Karpenter installed. You can use the blueprint we've used to test this pattern at the `cluster` folder in the root of this repository. 10 | * An IAM Role name that Karpenter nodes will use 11 | * AWS CLI configured with permissions to describe EC2 instances (`ec2:DescribeInstances`) 12 | 13 | ## Deploy 14 | 15 | If you're using the Terraform template provided in this repo, run the following commands to get the EKS cluster name and the IAM Role name for the Karpenter nodes: 16 | 17 | ```sh 18 | export CLUSTER_NAME=$(terraform -chdir="../../cluster/terraform" output -raw cluster_name) 19 | export KARPENTER_NODE_IAM_ROLE_NAME=$(terraform -chdir="../../cluster/terraform" output -raw node_instance_role_name) 20 | ``` 21 | 22 | > ***NOTE***: If you're not using Terraform, you need to get those values manually. `CLUSTER_NAME` is the name of your EKS cluster (not the ARN). Karpenter auto-generates the [instance profile](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use_switch-role-ec2_instance-profiles) in your `EC2NodeClass` given the role that you specify in [spec.role](https://karpenter.sh/preview/concepts/nodeclasses/) with the placeholder `KARPENTER_NODE_IAM_ROLE_NAME`, which is a way to pass a single IAM role to the EC2 instance launched by the Karpenter `NodePool`. Typically, the instance profile name is the same as the IAM role (not the ARN). 23 | 24 | Now, make sure you're in this blueprint folder, then run the following command: 25 | 26 | ```sh 27 | sed -i '' "s/<>/$CLUSTER_NAME/g" multi-ebs.yaml 28 | sed -i '' "s/<>/$KARPENTER_NODE_IAM_ROLE_NAME/g" multi-ebs.yaml 29 | kubectl apply -f . 
30 | ``` 31 | 32 | Here's the important configuration block within the spec of an `EC2NodeClass`: 33 | 34 | ```yaml 35 | blockDeviceMappings: 36 | - deviceName: /dev/xvda 37 | ebs: 38 | volumeType: gp3 39 | volumeSize: 20Gi 40 | deleteOnTermination: true 41 | - deviceName: /dev/xvdb 42 | ebs: 43 | volumeType: gp3 44 | volumeSize: 100Gi 45 | deleteOnTermination: true 46 | ``` 47 | 48 | ## Results 49 | 50 | After waiting for about one minute, you should see a node claim ready, and all pods in a `Running` state, like this: 51 | 52 | ```sh 53 | ❯ kubectl get pods 54 | NAME READY STATUS RESTARTS AGE 55 | multi-ebs-f4fb69fdd-kstj9 1/1 Running 0 2m34s 56 | multi-ebs-f4fb69fdd-t9xnl 1/1 Running 0 2m34s 57 | multi-ebs-f4fb69fdd-x42ss 1/1 Running 0 2m34s 58 | ❯ kubectl get nodeclaims 59 | NAME TYPE ZONE NODE READY AGE 60 | multi-ebs-chvzv m5.xlarge eu-west-1a ip-10-0-43-92.eu-west-1.compute.internal True 3m55s 61 | ``` 62 | 63 | To validate that two EBS volumes have been attached to the EC2 instance, run this command: 64 | 65 | ```sh 66 | aws ec2 describe-instances --filters "Name=tag:karpenter.sh/nodepool,Values=multi-ebs" --query 'Reservations[*].Instances[*].{Instance:BlockDeviceMappings}' --output json 67 | ``` 68 | 69 | The output should be similar to this: 70 | 71 | ```json 72 | [ 73 | [ 74 | { 75 | "Instance": [ 76 | { 77 | "DeviceName": "/dev/xvda", 78 | "Ebs": { 79 | "AttachTime": "2024-08-16T12:39:36+00:00", 80 | "DeleteOnTermination": true, 81 | "Status": "attached", 82 | "VolumeId": "vol-0561b68b188d4e63a" 83 | } 84 | }, 85 | { 86 | "DeviceName": "/dev/xvdb", 87 | "Ebs": { 88 | "AttachTime": "2024-08-16T12:39:36+00:00", 89 | "DeleteOnTermination": true, 90 | "Status": "attached", 91 | "VolumeId": "vol-0ca5ca8b749f6bed0" 92 | } 93 | } 94 | ] 95 | } 96 | ] 97 | ] 98 | ``` 99 | 100 | ## Cleanup 101 | 102 | To remove all objects created, simply run the following commands: 103 | 104 | ```sh 105 | kubectl delete -f . 106 | ``` 107 | -------------------------------------------------------------------------------- /blueprints/custom-ami/README.md: -------------------------------------------------------------------------------- 1 | # Karpenter Blueprint: Launching nodes using custom AMIs 2 | 3 | ## Purpose 4 | 5 | When you need to launch nodes using a custom AMI that you've created (i.e. to pre-load base container images), you need to configure an `EC2NodeClass` properly to get the AMI you need. With Karpenter, you might be able to use AMIs for different CPU architectures or other specifications like GPUs. So, our recommendation is that you use a naming convention or a tag to easily identify which AMIs Karpenter can use to launch nodes. 6 | 7 | ## Requirements 8 | 9 | * A custom AMI to use (for this example, we'll skip this requirement) 10 | * An EKS Cluster name with Karpenter installed. You can use the blueprint we've used to test this pattern at the `cluster` folder in the root of this repository.
11 | * An IAM Role name that Karpenter nodes will use 12 | 13 | ## Deploy 14 | 15 | If you're using the Terraform template provided in this repo, run the following commands to get the EKS cluster name and the IAM Role name for the Karpenter nodes: 16 | 17 | ``` 18 | export CLUSTER_NAME=$(terraform -chdir="../../cluster/terraform" output -raw cluster_name) 19 | export KARPENTER_NODE_IAM_ROLE_NAME=$(terraform -chdir="../../cluster/terraform" output -raw node_instance_role_name) 20 | ``` 21 | 22 | > ***NOTE***: If you're not using Terraform, you need to get those values manually. `CLUSTER_NAME` is the name of your EKS cluster (not the ARN). Karpenter auto-generates the [instance profile](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use_switch-role-ec2_instance-profiles) in your `EC2NodeClass` given the role that you specify in [spec.role](https://karpenter.sh/preview/concepts/nodeclasses/) with the placeholder `KARPENTER_NODE_IAM_ROLE_NAME`, which is a way to pass a single IAM role to the EC2 instance launched by the Karpenter `NodePool`. Typically, the instance profile name is the same as the IAM role(not the ARN). 23 | 24 | Now, make sure you're in this blueprint folder, then run the following command: 25 | 26 | ```sh 27 | sed -i '' "s/<>/$CLUSTER_NAME/g" custom-ami.yaml 28 | sed -i '' "s/<>/$KARPENTER_NODE_IAM_ROLE_NAME/g" custom-ami.yaml 29 | kubectl apply -f . 30 | ``` 31 | 32 | Here's the important configuration block within the spec of an [`EC2NodeClass`](https://karpenter.sh/preview/concepts/nodeclasses/#specamiselectorterms): **spec.amiSelectorTerms** 33 | 34 | `amiSelectorTerms` are required and are used to configure AMIs for Karpenter to use. AMIs are discovered through alias, id, owner, name, and [tags](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/Using_Tags.html). 35 | 36 | If amiSelectorTerms match more than one AMI, Karpenter will automatically determine which AMI best fits the workloads on the launched worker node under the following constraints: 37 | 38 | * When launching nodes, Karpenter automatically determines which architecture a custom AMI is compatible with and will use images that match an instanceType's requirements. 39 | * Unless using an alias, Karpenter cannot detect requirements other than architecture. If you need to specify different AMIs for different kind of nodes (e.g. accelerated GPU AMIs), you should use a separate EC2NodeClass. 40 | * If multiple AMIs are found that can be used, Karpenter will choose the latest one. 41 | * If no AMIs are found that can be used, then no nodes will be provisioned. 42 | 43 | To select an AMI by name, use the `name` field in the selector term. To select an AMI by id, use the `id` field in the selector term. To select an AMI using an alias, use the `alias` field which supports version pinning (e.g. `al2023@v20240807`) or latest version (`al2023@latest`). To ensure that AMIs are owned by the expected owner, use the `owner` field - you can use a combination of account aliases (e.g. self amazon, your-aws-account-name) and account IDs. If this is not set, it defaults to `self,amazon`. 44 | 45 | > **Tip** 46 | > AMIs may be specified by any AWS tag, including Name. Selecting by tag 47 | > or by name using wildcards (*) is supported. 
48 | 49 | ```yaml 50 | amiSelectorTerms: 51 | - name: "*amazon-eks-node-al2023*" 52 | owner: self 53 | - name: "*amazon-eks-node-al2023*" 54 | owner: amazon 55 | ``` 56 | 57 | ***IMPORTANT NOTE:*** With this configuration, you're saying that you need to use the latest AL2023 EKS optimized AMI that matches the name pattern, either owned by you (customized) or by Amazon (official image). We're using a wildcard pattern to have the flexibility to use AMIs for either `x86` or `Arm`, for workloads that need GPUs, or even for nodes with a different OS (if your naming convention covers them). You're basically letting the workload (pod) decide which type of node(s) it needs. If you don't have a custom AMI created by you in your account, Karpenter will use the official EKS AMI owned by Amazon. 58 | 59 | ## Results 60 | 61 | After waiting for about one minute, you should see a node claim ready, and all pods in a `Running` state, like this: 62 | 63 | ```sh 64 | ❯ kubectl get pods 65 | NAME READY STATUS RESTARTS AGE 66 | custom-ami-bdf66b777-2g27q 1/1 Running 0 2m2s 67 | custom-ami-bdf66b777-dbkls 1/1 Running 0 2m2s 68 | custom-ami-bdf66b777-rzlsz 1/1 Running 0 2m2s 69 | ❯ kubectl get nodeclaims 70 | NAME TYPE CAPACITY ZONE NODE READY AGE 71 | custom-ami-jhdbh c5a.large spot eu-west-2c ip-10-0-117-230.eu-west-2.compute.internal True 114s 72 | ``` 73 | 74 | ## Cleanup 75 | 76 | To remove all objects created, simply run the following commands: 77 | 78 | ```sh 79 | kubectl delete -f . 80 | ``` 81 | -------------------------------------------------------------------------------- /blueprints/graviton/README.md: -------------------------------------------------------------------------------- 1 | # Karpenter Blueprint: Working with Graviton Instances 2 | 3 | ## Purpose 4 | 5 | You might be wondering how to use Graviton instances with Karpenter. Well, first you need to make sure that your application can run on different CPUs such as `arm64` or `x86-64`. The programming language you're using and its ecosystem needs to be multi-arch aware, as you'll need container images for both `arm64` and `x86-64` architectures. [AWS Graviton](https://aws.amazon.com/ec2/graviton/) processors are custom built by AWS using 64-bit Arm Neoverse cores. They power Amazon EC2 instances such as: M6g, M6gd, T4g, C6g, C6gd, C6gn, R6g, R6gd, X2gd, and more. Graviton instances provide up to 40% better price performance over comparable current generation x86-based instances for a wide variety of workloads. 6 | 7 | Karpenter sets the default architecture constraint on your NodePool to the one that supports most common user workloads, which today is `amd64` (or the `x86-64` architecture). However, if you're flexible enough to support either `arm64` or `x86-64`, you can defer the decision of which architecture to use to Karpenter, which chooses based on the purchase model: `On-Demand` or `Spot`. 8 | 9 | If it's an On-Demand Instance, Karpenter uses the `lowest-price` (LP) allocation strategy to launch the cheapest instance type that has available capacity. If it's a Spot Instance, Karpenter uses the `price-capacity-optimized` (PCO) allocation strategy. PCO looks at both price and capacity availability to launch from the [Spot Instance pools](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-spot-instances.html#spot-features) that are the least likely to be interrupted and have the lowest possible price. 10 | 11 | ## Requirements 12 | 13 | * A Kubernetes cluster with Karpenter installed. You can use the blueprint we've used to test this pattern at the `cluster` folder in the root of this repository.
14 | * A `default` Karpenter `NodePool` as that's the one we'll use in this blueprint. You did this already in the ["Deploy a Karpenter Default EC2NodeClass and NodePool"](../../README.md) section from this repository. 15 | * A container image built for `arm64` architecture hosted in a container image registry such as ECR. 16 | 17 | **NOTE:** To build a multi-arch container image, you can use Docker's [buildx](https://www.docker.com/blog/multi-arch-build-and-images-the-simple-way/) or, alternatively, a [remote](https://community.arm.com/developer/tools-software/tools/b/tools-software-ides-blog/posts/unifying-arm-software-development-with-docker) build. In this context, you want to check the [multi-arch readiness](https://github.com/aws-samples/aws-multiarch-container-build-pipeline) of your automated build and test pipeline, for example, [support in Travis](https://docs.travis-ci.com/user/multi-cpu-architectures/#example-multi-architecture-build-matrix). Next, you need to [push your container images to a registry such as ECR](https://aws.amazon.com/blogs/containers/introducing-multi-architecture-container-images-for-amazon-ecr/). 18 | 19 | **NOTE:** The sample `workload` in this repository already supports `arm64`. 20 | 21 | ## Deploy 22 | 23 | You're going to use the `default` NodePool as there's no need to create a separate NodePool to launch Graviton instances. 24 | 25 | ## Results 26 | 27 | You can inspect the pods from the `workload-flexible` deployment, but they don't have anything specific to Graviton instances other than requesting On-Demand capacity (`karpenter.sh/capacity-type: on-demand`) through a node selector. So, let's deploy the following assets: 28 | 29 | ```sh 30 | kubectl apply -f workload-flexible.yaml 31 | ``` 32 | 33 | Wait for about one minute, and you'll see a new Graviton instance coming up: 34 | 35 | ```sh 36 | $> kubectl get nodeclaims 37 | NAME TYPE ZONE NODE READY AGE 38 | default-sgmkw c6g.xlarge eu-west-1b ip-10-0-66-182.eu-west-1.compute.internal True 42s 39 | ``` 40 | 41 | **NOTE:** All pods should be running now, and you didn't have to say anything special to Karpenter about which container image to use. Why? In Kubernetes, and by extension in Amazon EKS, the worker node-local supervisor called `kubelet` instructs the container runtime via a [standardized interface](https://kubernetes.io/blog/2016/12/container-runtime-interface-cri-in-kubernetes/) to pull container images from a registry such as Amazon ECR and launch them accordingly. All of this is multi-arch aware and automated. 42 | 43 | Now, let's suppose you've made the decision to go all-in with Graviton. Instead of creating a new NodePool, you can control that behavior within the `Deployment` by using a `nodeSelector` of `kubernetes.io/arch: arm64`, and without limiting the deployment to On-Demand only. This means that Karpenter will now likely launch a Spot instance, as it offers the better price; you can double-check the architecture of the launched nodes with the command sketched below.
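If you want to keep an eye on which CPU architecture Karpenter actually picks as you go through the next steps, here's a small sketch that adds the architecture and capacity type as columns to `kubectl get nodes`; adjust the label columns to your liking:

```sh
kubectl get nodes -L kubernetes.io/arch,karpenter.sh/capacity-type -l karpenter.sh/initialized=true
```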
Let's see, deploy the other workload: 44 | 45 | ```sh 46 | kubectl apply -f workload-graviton.yaml 47 | ``` 48 | 49 | Wait for about one minute, and run the following command to see which nodes Karpenter has launched and see if it's On-Demand or Spot: 50 | 51 | ```sh 52 | kubectl get nodes -L karpenter.sh/capacity-type,beta.kubernetes.io/instance-type,karpenter.sh/nodepool,topology.kubernetes.io/zone -l karpenter.sh/initialized=true 53 | ``` 54 | 55 | You should see something similar to this: 56 | 57 | ```console 58 | NAME STATUS ROLES AGE VERSION CAPACITY-TYPE 59 | ip-10-0-87-181.eu-west-2.compute.internal Ready 114s v1.32.3-eks-473151a on-demand c6g.xlarge default eu-west-2b 60 | ``` 61 | 62 | Notice that now Karpenter decided to launch a `c6g.2xlarge` Spot instance because the workload and the NodePool support both pricing models, and the one that has a better price at this moment was a Graviton Spot instance. 63 | 64 | ## Cleanup 65 | 66 | ```sh 67 | kubectl delete -f . 68 | ``` 69 | -------------------------------------------------------------------------------- /blueprints/soci-snapshotter/soci-snapshotter.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: karpenter.k8s.aws/v1 2 | kind: EC2NodeClass 3 | metadata: 4 | name: soci-snapshotter 5 | spec: 6 | amiSelectorTerms: 7 | - alias: al2023@latest 8 | role: "<>" 9 | blockDeviceMappings: 10 | - deviceName: /dev/xvda 11 | ebs: 12 | volumeSize: 100Gi 13 | volumeType: gp3 14 | throughput: 1000 15 | iops: 16000 16 | securityGroupSelectorTerms: 17 | - tags: 18 | karpenter.sh/discovery: "<>" 19 | subnetSelectorTerms: 20 | - tags: 21 | karpenter.sh/discovery: "<>" 22 | userData: | 23 | apiVersion: node.eks.aws/v1alpha1 24 | kind: NodeConfig 25 | spec: 26 | featureGates: 27 | FastImagePull: true 28 | --- 29 | apiVersion: karpenter.sh/v1 30 | kind: NodePool 31 | metadata: 32 | name: soci-snapshotter 33 | spec: 34 | disruption: 35 | consolidationPolicy: WhenEmptyOrUnderutilized 36 | consolidateAfter: 5m 37 | template: 38 | metadata: 39 | labels: 40 | intent: soci-snapshotter 41 | spec: 42 | nodeClassRef: 43 | group: karpenter.k8s.aws 44 | kind: EC2NodeClass 45 | name: soci-snapshotter 46 | requirements: 47 | - key: kubernetes.io/os 48 | operator: In 49 | values: 50 | - linux 51 | - key: karpenter.k8s.aws/instance-category 52 | operator: In 53 | values: 54 | - c 55 | - m 56 | - r 57 | - key: kubernetes.io/arch 58 | operator: In 59 | values: 60 | - amd64 61 | - key: karpenter.sh/capacity-type 62 | operator: In 63 | values: 64 | - spot 65 | - on-demand 66 | --- 67 | apiVersion: karpenter.k8s.aws/v1 68 | kind: EC2NodeClass 69 | metadata: 70 | name: soci-snapshotter-br 71 | spec: 72 | amiSelectorTerms: 73 | - alias: bottlerocket@latest 74 | role: "<>" 75 | blockDeviceMappings: 76 | - deviceName: /dev/xvda 77 | ebs: 78 | volumeSize: 4Gi 79 | volumeType: gp3 80 | encrypted: true 81 | - deviceName: /dev/xvdb 82 | ebs: 83 | volumeSize: 100Gi 84 | volumeType: gp3 85 | throughput: 1000 86 | iops: 16000 87 | encrypted: true 88 | securityGroupSelectorTerms: 89 | - tags: 90 | karpenter.sh/discovery: "<>" 91 | subnetSelectorTerms: 92 | - tags: 93 | karpenter.sh/discovery: "<>" 94 | userData: | 95 | [settings.container-runtime] 96 | snapshotter = "soci" 97 | [settings.container-runtime-plugins.soci-snapshotter] 98 | pull-mode = "parallel-pull-unpack" 99 | [settings.container-runtime-plugins.soci-snapshotter.parallel-pull-unpack] 100 | max-concurrent-downloads-per-image = 20 101 | 
concurrent-download-chunk-size = "16mb" 102 | max-concurrent-unpacks-per-image = 12 103 | discard-unpacked-layers = true 104 | [settings.bootstrap-commands.k8s-ephemeral-storage] 105 | commands = [ 106 | ["apiclient", "ephemeral-storage", "init"], 107 | ["apiclient", "ephemeral-storage" ,"bind", "--dirs", "/var/lib/containerd", "/var/lib/kubelet", "/var/log/pods", "/var/lib/soci-snapshotter"] 108 | ] 109 | essential = true 110 | mode = "always" 111 | --- 112 | apiVersion: karpenter.sh/v1 113 | kind: NodePool 114 | metadata: 115 | name: soci-snapshotter-br 116 | spec: 117 | disruption: 118 | consolidationPolicy: WhenEmptyOrUnderutilized 119 | consolidateAfter: 5m 120 | template: 121 | metadata: 122 | labels: 123 | intent: soci-snapshotter-br 124 | spec: 125 | nodeClassRef: 126 | group: karpenter.k8s.aws 127 | kind: EC2NodeClass 128 | name: soci-snapshotter-br 129 | requirements: 130 | - key: kubernetes.io/os 131 | operator: In 132 | values: 133 | - linux 134 | - key: karpenter.k8s.aws/instance-category 135 | operator: In 136 | values: 137 | - c 138 | - m 139 | - r 140 | - key: kubernetes.io/arch 141 | operator: In 142 | values: 143 | - amd64 144 | - key: karpenter.sh/capacity-type 145 | operator: In 146 | values: 147 | - spot 148 | - on-demand 149 | --- 150 | apiVersion: karpenter.k8s.aws/v1 151 | kind: EC2NodeClass 152 | metadata: 153 | name: non-soci-snapshotter 154 | spec: 155 | amiSelectorTerms: 156 | - alias: al2023@latest 157 | role: "<>" 158 | blockDeviceMappings: 159 | - deviceName: /dev/xvda 160 | ebs: 161 | volumeSize: 100Gi 162 | volumeType: gp3 163 | throughput: 1000 164 | iops: 16000 165 | securityGroupSelectorTerms: 166 | - tags: 167 | karpenter.sh/discovery: "<>" 168 | subnetSelectorTerms: 169 | - tags: 170 | karpenter.sh/discovery: "<>" 171 | --- 172 | apiVersion: karpenter.sh/v1 173 | kind: NodePool 174 | metadata: 175 | name: non-soci-snapshotter 176 | spec: 177 | disruption: 178 | consolidationPolicy: WhenEmptyOrUnderutilized 179 | consolidateAfter: 5m 180 | template: 181 | metadata: 182 | labels: 183 | intent: non-soci-snapshotter 184 | spec: 185 | nodeClassRef: 186 | group: karpenter.k8s.aws 187 | kind: EC2NodeClass 188 | name: non-soci-snapshotter 189 | requirements: 190 | - key: kubernetes.io/os 191 | operator: In 192 | values: 193 | - linux 194 | - key: karpenter.k8s.aws/instance-category 195 | operator: In 196 | values: 197 | - c 198 | - m 199 | - r 200 | - key: kubernetes.io/arch 201 | operator: In 202 | values: 203 | - amd64 204 | - key: karpenter.sh/capacity-type 205 | operator: In 206 | values: 207 | - spot 208 | - on-demand -------------------------------------------------------------------------------- /blueprints/overprovision/README.md: -------------------------------------------------------------------------------- 1 | # Karpenter Blueprint: Overprovision capacity in advanced to increase responsiveness 2 | 3 | ## Purpose 4 | 5 | Let's say you have a data pipeline process that knows it will need to have the capacity to launch 100 pods at the same time. To reduce the initiation time, you could overprovision capacity in advanced to increase responsiveness so when the data pipeline launches the pods, the capacity is already there. 6 | 7 | To achieve this, you deploy a "dummy" workload with a low [PriorityClass](https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/#priorityclass) to reserve capacity (to make Karpenter launch nodes). 
Then, when you deploy the workload you actually need, the "dummy" pods are evicted so that your pods can start right away. 8 | 9 | ## Requirements 10 | 11 | * A Kubernetes cluster with Karpenter installed. You can use the blueprint we've used to test this pattern at the `cluster` folder in the root of this repository. 12 | * A `default` Karpenter `NodePool` as that's the one we'll use in this blueprint. You did this already in the ["Deploy a Karpenter Default EC2NodeClass and NodePool"](../../README.md) section from this repository. 13 | 14 | ## Deploy 15 | 16 | Let's start by deploying the "dummy" workload: 17 | 18 | ```sh 19 | kubectl apply -f dummy-workload.yaml 20 | ``` 21 | 22 | After waiting for around two minutes, notice how Karpenter provisions the node(s) needed to run the "dummy" workload: 23 | 24 | ```sh 25 | > kubectl get nodeclaims 26 | NAME TYPE ZONE NODE READY AGE 27 | default-kpj7k c6i.2xlarge eu-west-1b ip-10-0-73-34.eu-west-1.compute.internal True 57s 28 | ``` 29 | 30 | And the "dummy" pods are now running simply to reserve this capacity: 31 | 32 | ```sh 33 | > kubectl get pods 34 | NAME READY STATUS RESTARTS AGE 35 | dummy-workload-6bf87d68f-2ftbq 1/1 Running 0 53s 36 | dummy-workload-6bf87d68f-8pnp8 1/1 Running 0 53s 37 | dummy-workload-6bf87d68f-ctlvc 1/1 Running 0 53s 38 | dummy-workload-6bf87d68f-fznv6 1/1 Running 0 53s 39 | dummy-workload-6bf87d68f-hp4qs 1/1 Running 0 53s 40 | dummy-workload-6bf87d68f-pwtp9 1/1 Running 0 53s 41 | dummy-workload-6bf87d68f-rg7tj 1/1 Running 0 53s 42 | dummy-workload-6bf87d68f-t7bqz 1/1 Running 0 53s 43 | dummy-workload-6bf87d68f-xwln7 1/1 Running 0 53s 44 | dummy-workload-6bf87d68f-zmhk8 1/1 Running 0 53s 45 | ``` 46 | 47 | ## Results 48 | 49 | Now, when you deploy the actual workload that does the work (such as a data pipeline process), the "dummy" pods are going to be evicted.
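At any point, you can check the priority each pod was admitted with; the eviction behavior relies on the "dummy" pods having a lower priority than your real workload. A quick sketch (the actual priority class names come from the manifests in this blueprint):

```sh
kubectl get pods -o custom-columns=NAME:.metadata.name,PRIORITY:.spec.priority,PRIORITYCLASS:.spec.priorityClassName
```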
So, let's deploy the following workload to test it: 50 | 51 | ```sh 52 | kubectl apply -f workload.yaml 53 | ``` 54 | 55 | Notice how your new pods are almost immediately running, and some of the "dummy" pods are "Pending": 56 | 57 | ```sh 58 | > kubectl get pods 59 | NAME READY STATUS RESTARTS AGE 60 | dummy-workload-6bf87d68f-2ftbq 1/1 Running 0 11m 61 | dummy-workload-6bf87d68f-6bq4v 0/1 Pending 0 15s 62 | dummy-workload-6bf87d68f-8nkp8 0/1 Pending 0 14s 63 | dummy-workload-6bf87d68f-cchqx 0/1 Pending 0 15s 64 | dummy-workload-6bf87d68f-fznv6 1/1 Running 0 11m 65 | dummy-workload-6bf87d68f-hp4qs 1/1 Running 0 11m 66 | dummy-workload-6bf87d68f-r69g6 0/1 Pending 0 15s 67 | dummy-workload-6bf87d68f-rg7tj 1/1 Running 0 11m 68 | dummy-workload-6bf87d68f-w4zk8 0/1 Pending 0 15s 69 | dummy-workload-6bf87d68f-zmhk8 1/1 Running 0 11m 70 | workload-679c759476-6h47j 1/1 Running 0 15s 71 | workload-679c759476-hhjmp 1/1 Running 0 15s 72 | workload-679c759476-jxnc2 1/1 Running 0 15s 73 | workload-679c759476-lqv5t 1/1 Running 0 15s 74 | workload-679c759476-n269j 1/1 Running 0 15s 75 | workload-679c759476-nfjtp 1/1 Running 0 15s 76 | workload-679c759476-nv7sg 1/1 Running 0 15s 77 | workload-679c759476-p277d 1/1 Running 0 15s 78 | workload-679c759476-qw8sk 1/1 Running 0 15s 79 | workload-679c759476-sxjpt 1/1 Running 0 15s 80 | ``` 81 | 82 | After waiting for around two minutes, you'll see all pods running and a new machine registered: 83 | 84 | ```sh 85 | > kubectl get nodeclaims 18s 86 | NAME TYPE ZONE NODE READY AGE 87 | default-4q9dn c6g.xlarge on-demand eu-west-2c ip-10-0-127-154.eu-west-2.compute.internal True 29m 88 | default-xwbvp c7g.xlarge spot eu-west-2c ip-10-0-100-21.eu-west-2.compute.internal True 75s 89 | ``` 90 | 91 | The new machine is simply there because some "dummy" pods were pending and they exist to reserve capacity. If you think you won't need those "dummy" pods while your workload is running, you can simply reduce the "dummy" deployment replicas to 0, and Karpenter consolidation will kick in to remove unnecessary machines. 92 | 93 | ```sh 94 | > kubectl scale deployment dummy-workload --replicas 0 95 | deployment.apps/dummy-workload scaled 96 | > kubectl get nodeclaims 97 | NAME TYPE ZONE NODE READY AGE 98 | default-kpj7k c6i.2xlarge eu-west-1b ip-10-0-73-34.eu-west-1.compute.internal True 16m 99 | ``` 100 | 101 | ## Cleanup 102 | 103 | To remove all objects created, simply run the following commands: 104 | 105 | ```sh 106 | kubectl delete -f . 
107 | ``` 108 | -------------------------------------------------------------------------------- /cluster/terraform/main.tf: -------------------------------------------------------------------------------- 1 | ## THIS TO AUTHENTICATE TO ECR, DON'T CHANGE IT 2 | provider "aws" { 3 | region = "us-east-1" 4 | alias = "virginia" 5 | } 6 | 7 | provider "aws" { 8 | region = var.region 9 | } 10 | 11 | provider "kubernetes" { 12 | host = module.eks.cluster_endpoint 13 | cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) 14 | token = data.aws_eks_cluster_auth.this.token 15 | } 16 | 17 | provider "helm" { 18 | kubernetes { 19 | host = module.eks.cluster_endpoint 20 | cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) 21 | token = data.aws_eks_cluster_auth.this.token 22 | } 23 | } 24 | 25 | provider "kubectl" { 26 | apply_retry_count = 10 27 | host = module.eks.cluster_endpoint 28 | cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) 29 | load_config_file = false 30 | token = data.aws_eks_cluster_auth.this.token 31 | } 32 | 33 | data "aws_eks_cluster_auth" "this" { 34 | name = module.eks.cluster_name 35 | } 36 | 37 | data "aws_ecrpublic_authorization_token" "token" { 38 | provider = aws.virginia 39 | } 40 | 41 | data "aws_availability_zones" "available" { 42 | filter { 43 | name = "opt-in-status" 44 | values = ["opt-in-not-required"] 45 | } 46 | } 47 | 48 | locals { 49 | name = "karpenter-blueprints" 50 | 51 | vpc_cidr = "10.0.0.0/16" 52 | # NOTE: You might need to change this less number of AZs depending on the region you're deploying to 53 | azs = slice(data.aws_availability_zones.available.names, 0, 3) 54 | 55 | tags = { 56 | blueprint = local.name 57 | } 58 | } 59 | 60 | ################################################################################ 61 | # Cluster 62 | ################################################################################ 63 | 64 | module "eks" { 65 | source = "terraform-aws-modules/eks/aws" 66 | version = "20.37.0" 67 | 68 | cluster_name = local.name 69 | cluster_version = "1.32" 70 | cluster_endpoint_public_access = true 71 | enable_cluster_creator_admin_permissions = true 72 | 73 | cluster_addons = { 74 | aws-ebs-csi-driver = { 75 | most_recent = true 76 | } 77 | coredns = { 78 | most_recent = true 79 | } 80 | eks-pod-identity-agent = { 81 | before_compute = true 82 | most_recent = true 83 | } 84 | kube-proxy = { 85 | most_recent = true 86 | } 87 | metrics-server = { 88 | most_recent = true 89 | } 90 | vpc-cni = { 91 | most_recent = true 92 | before_compute = true 93 | configuration_values = jsonencode({ 94 | env = { 95 | ENABLE_PREFIX_DELEGATION = "true" 96 | WARM_PREFIX_TARGET = "1" 97 | } 98 | }) 99 | } 100 | } 101 | 102 | vpc_id = module.vpc.vpc_id 103 | subnet_ids = module.vpc.private_subnets 104 | 105 | create_cloudwatch_log_group = false 106 | 107 | eks_managed_node_groups = { 108 | mng = { 109 | instance_types = ["m4.large", "m5.large", "m5a.large", "m5ad.large", "m5d.large", "t2.large", "t3.large", "t3a.large"] 110 | 111 | subnet_ids = module.vpc.private_subnets 112 | max_size = 2 113 | desired_size = 2 114 | min_size = 2 115 | 116 | labels = { 117 | # Used to ensure Karpenter runs on nodes that it does not manage 118 | "karpenter.sh/controller" = "true" 119 | } 120 | } 121 | } 122 | 123 | node_security_group_tags = merge(local.tags, { 124 | # NOTE - if creating multiple security groups with this module, only tag the 125 | # security group that Karpenter 
should utilize with the following tag 126 | # (i.e. - at most, only one security group should have this tag in your account) 127 | "karpenter.sh/discovery" = local.name 128 | }) 129 | 130 | tags = local.tags 131 | } 132 | 133 | module "eks_blueprints_addons" { 134 | source = "aws-ia/eks-blueprints-addons/aws" 135 | version = "1.21.0" 136 | 137 | cluster_name = module.eks.cluster_name 138 | cluster_endpoint = module.eks.cluster_endpoint 139 | cluster_version = module.eks.cluster_version 140 | oidc_provider_arn = module.eks.oidc_provider_arn 141 | 142 | create_delay_dependencies = [for grp in module.eks.eks_managed_node_groups : grp.node_group_arn] 143 | 144 | enable_aws_load_balancer_controller = true 145 | 146 | enable_aws_for_fluentbit = true 147 | aws_for_fluentbit = { 148 | set = [ 149 | { 150 | name = "cloudWatchLogs.region" 151 | value = var.region 152 | } 153 | ] 154 | } 155 | 156 | tags = local.tags 157 | } 158 | 159 | module "aws_ebs_csi_pod_identity" { 160 | source = "terraform-aws-modules/eks-pod-identity/aws" 161 | 162 | name = "aws-ebs-csi" 163 | version = "1.12.0" 164 | 165 | attach_aws_ebs_csi_policy = true 166 | 167 | # Pod Identity Associations 168 | association_defaults = { 169 | namespace = "kube-system" 170 | service_account = "ebs-csi-controller-sa" 171 | } 172 | 173 | associations = { 174 | default = { 175 | cluster_name = module.eks.cluster_name 176 | } 177 | } 178 | 179 | tags = local.tags 180 | } 181 | 182 | #--------------------------------------------------------------- 183 | # Supporting Resources 184 | #--------------------------------------------------------------- 185 | 186 | module "vpc" { 187 | source = "terraform-aws-modules/vpc/aws" 188 | version = "5.21.0" 189 | 190 | name = local.name 191 | cidr = local.vpc_cidr 192 | 193 | azs = local.azs 194 | private_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 4, k)] 195 | public_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 8, k + 48)] 196 | 197 | enable_nat_gateway = true 198 | single_nat_gateway = true 199 | enable_dns_hostnames = true 200 | 201 | # Manage so we can name 202 | manage_default_network_acl = true 203 | default_network_acl_tags = { Name = "${local.name}-default" } 204 | manage_default_route_table = true 205 | default_route_table_tags = { Name = "${local.name}-default" } 206 | manage_default_security_group = true 207 | default_security_group_tags = { Name = "${local.name}-default" } 208 | 209 | public_subnet_tags = { 210 | "kubernetes.io/cluster/${local.name}" = "shared" 211 | "kubernetes.io/role/elb" = 1 212 | } 213 | 214 | private_subnet_tags = { 215 | "kubernetes.io/cluster/${local.name}" = "shared" 216 | "kubernetes.io/role/internal-elb" = 1 217 | "karpenter.sh/discovery" = local.name 218 | } 219 | 220 | tags = local.tags 221 | } 222 | -------------------------------------------------------------------------------- /blueprints/ha-az-nodes/README.md: -------------------------------------------------------------------------------- 1 | # Karpenter Blueprint: High-Availability - Spread Pods across AZs & Nodes 2 | 3 | ## Purpose 4 | 5 | Karpenter can launch only one node for all pending pods. However, putting all application pods in the same node is not recommended if you want to have high-availability. To avoid this, and make the workload more highly-available, you can spread the pods within multiple availability zones (AZs). Additionally, you can configure a constraint to spread pods within multiple nodes in the same AZ. 
To do so, you configure [Topology Spread Constraints (TSC)](https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/) within a `Deployment` or `Pod`. 6 | 7 | ## Requirements 8 | 9 | * A Kubernetes cluster with Karpenter installed. You can use the blueprint we've used to test this pattern at the `cluster` folder in the root of this repository. 10 | * A `default` Karpenter `NodePool` as that's the one we'll use in this blueprint. You did this already in the ["Deploy a Karpenter Default EC2NodeClass and NodePool"](../../README.md) section from this repository. 11 | 12 | ## Deploy 13 | 14 | To deploy the sample `workload`, simply run this command: 15 | 16 | ```sh 17 | kubectl apply -f workload.yaml 18 | ``` 19 | 20 | ## Results 21 | 22 | You can review the Karpenter logs and watch how it's deciding to launch multiple nodes following the workload constraints: 23 | 24 | ```sh 25 | kubectl -n karpenter logs -l app.kubernetes.io/name=karpenter --all-containers=true -f --tail=20 26 | ``` 27 | 28 | Wait one minute and you should see the pods running within two nodes in each AZ, run this command: 29 | 30 | ```sh 31 | kubectl get nodes -L karpenter.sh/capacity-type,beta.kubernetes.io/instance-type,karpenter.sh/nodepool,topology.kubernetes.io/zone -l karpenter.sh/initialized=true 32 | ``` 33 | 34 | You should see an output similar to this: 35 | 36 | ```console 37 | NAME STATUS ROLES AGE VERSION CAPACITY-TYPE INSTANCE-TYPE NODEPOOL ZONE 38 | ip-10-0-101-160.eu-west-2.compute.internal Ready 19s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2c 39 | ip-10-0-109-204.eu-west-2.compute.internal Ready 20s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2c 40 | ip-10-0-112-15.eu-west-2.compute.internal Ready 20s v1.32.3-eks-473151a spot m8g.xlarge default eu-west-2c 41 | ip-10-0-117-72.eu-west-2.compute.internal Ready 2m51s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2c 42 | ip-10-0-36-130.eu-west-2.compute.internal Ready 22s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2a 43 | ip-10-0-37-110.eu-west-2.compute.internal Ready 21s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2a 44 | ip-10-0-40-176.eu-west-2.compute.internal Ready 22s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2a 45 | ip-10-0-44-135.eu-west-2.compute.internal Ready 21s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2a 46 | ip-10-0-45-90.eu-west-2.compute.internal Ready 22s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2a 47 | ip-10-0-47-113.eu-west-2.compute.internal Ready 22s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2a 48 | ip-10-0-48-218.eu-west-2.compute.internal Ready 21s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2a 49 | ip-10-0-53-185.eu-west-2.compute.internal Ready 22s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2a 50 | ip-10-0-54-107.eu-west-2.compute.internal Ready 23s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2a 51 | ip-10-0-54-129.eu-west-2.compute.internal Ready 22s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2a 52 | ip-10-0-66-57.eu-west-2.compute.internal Ready 22s v1.32.3-eks-473151a spot c7gd.xlarge default eu-west-2b 53 | ip-10-0-77-61.eu-west-2.compute.internal Ready 21s v1.32.3-eks-473151a spot c7gd.xlarge default eu-west-2b 54 | ip-10-0-85-117.eu-west-2.compute.internal Ready 23s v1.32.3-eks-473151a spot c8g.xlarge default eu-west-2b 55 | ip-10-0-87-181.eu-west-2.compute.internal Ready 4m22s v1.32.3-eks-473151a on-demand c6g.xlarge default eu-west-2b 56 | ``` 57 | 58 | As you can see, pods were spread 
within the available AZs because of the `topology.kubernetes.io/zone` TSC. But at the same time, pods were spread within multiple nodes in each AZ because of the `kubernetes.io/hostname` TSC. 59 | 60 | ```yaml 61 | topologySpreadConstraints: 62 | - labelSelector: 63 | matchLabels: 64 | app: workload-multi-az-nodes 65 | maxSkew: 1 66 | topologyKey: kubernetes.io/hostname 67 | whenUnsatisfiable: ScheduleAnyway 68 | - labelSelector: 69 | matchLabels: 70 | app: workload-multi-az-nodes 71 | maxSkew: 1 72 | topologyKey: topology.kubernetes.io/zone 73 | whenUnsatisfiable: ScheduleAnyway 74 | ``` 75 | 76 | Depending on the run, you might notice that pods are scheduled in only a subset of the available AZs. This is because `whenUnsatisfiable` is set to `ScheduleAnyway`, which is a soft constraint: the `kube-scheduler` only gives higher precedence to topologies that would help reduce the skew. 77 | 78 | **NOTE**: If you strictly need to spread across all available AZs, you can set `minDomains` to the number of AZs; this tells the `kube-scheduler` how many domains you expect to exist. If fewer than that many AZs currently have eligible nodes, pods are marked as unschedulable and Karpenter will launch a node in each missing AZ. Keep in mind that `minDomains` only takes effect together with `whenUnsatisfiable: DoNotSchedule`, and that `DoNotSchedule` will leave pods unschedulable if the topology spread constraint can't be fulfilled. It should only be used if it's preferable for pods not to run at all rather than violate the topology spread constraint. 79 | 80 | In case you want to enforce this spread within `Deployments`, you can use projects like [Kyverno](https://kyverno.io) to mutate a `Deployment` object and set the TSC you've seen in this blueprint. Here's a [Kyverno policy example](https://kyverno.io/policies/other/s-z/spread-pods-across-topology/spread-pods-across-topology/) that mutates a `Deployment` to include a TSC; just make sure it replicates the same rule from this blueprint (`whenUnsatisfiable: ScheduleAnyway`). 81 | 82 | ## Cleanup 83 | 84 | ```sh 85 | kubectl delete -f workload.yaml 86 | ``` 87 | -------------------------------------------------------------------------------- /blueprints/update-nodes-with-drift/README.md: -------------------------------------------------------------------------------- 1 | # Karpenter Blueprint: Update Nodes using Drift 2 | 3 | ## Purpose 4 | 5 | After upgrading the Kubernetes control plane version, you might be wondering how to properly upgrade the data plane nodes launched by Karpenter. Karpenter detects and marks nodes as drifted. A drifted node is one whose spec and metadata do not match the spec of its `NodePool` and `nodeClassRef`. A node can drift when a user changes their `NodePool` or `nodeClassRef`, and the underlying infrastructure referenced by the `NodePool` can also change outside of the cluster. For example, you can configure `amiSelectorTerms` with static AMI IDs that match the control plane version. This allows you to control when to upgrade the nodes' version: pointing the `EC2NodeClass` at a newer EKS optimized AMI creates drifted nodes. 6 | 7 | Karpenter's drift logic reconciles when a node's AMI drifts from the `NodePool` requirements. When upgrading a node, Karpenter will minimize the downtime of the applications on the node by initiating `NodePool` logic for a replacement node before terminating drifted nodes (you can check for the `Drifted` status condition on a `NodeClaim`, as sketched below).
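A quick way to see whether Karpenter considers a node drifted is to look at the `Drifted` status condition on its `NodeClaim`. Here's a sketch (condition names and output format can vary slightly between Karpenter versions):

```sh
kubectl get nodeclaims -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.conditions[?(@.type=="Drifted")].status}{"\n"}{end}'
```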
Once Karpenter has begun launching the replacement node, it will cordon and drain the old node, terminating it when it's fully drained and finishing the upgrade. 8 | 9 | ## Requirements 10 | 11 | * A Kubernetes cluster with Karpenter installed. You can use the blueprint we've used to test this pattern at the `cluster` folder in the root of this repository. 12 | 13 | ## Deploy 14 | 15 | Let's create a new `EC2NodeClass` to be more precise about the AMIs you'd like to use. For now, you'll intentionally create new nodes using a previous EKS version to simulate where you'll be after upgrading the control plane. Within the `amiSelectorTerms` you'll configure the most recent AMIs (both for `amd64` and `arm64`) from a previous version of the control plane to test the drift feature. 16 | 17 | ```yaml 18 | amiSelectorTerms: 19 | - id: <> 20 | - id: <> 21 | ``` 22 | 23 | If you're using the Terraform template provided in this repo, run the following commands to get the EKS cluster name and the IAM Role name for the Karpenter nodes: 24 | 25 | ```sh 26 | export CLUSTER_NAME=$(terraform -chdir="../../cluster/terraform" output -raw cluster_name) 27 | export KARPENTER_NODE_IAM_ROLE_NAME=$(terraform -chdir="../../cluster/terraform" output -raw node_instance_role_name) 28 | ``` 29 | 30 | > ***NOTE***: If you're not using Terraform, you need to get those values manually. `CLUSTER_NAME` is the name of your EKS cluster (not the ARN). Karpenter auto-generates the [instance profile](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use_switch-role-ec2_instance-profiles) in your `EC2NodeClass` given the role that you specify in [spec.role](https://karpenter.sh/preview/concepts/nodeclasses/) with the placeholder `KARPENTER_NODE_IAM_ROLE_NAME`, which is a way to pass a single IAM role to the EC2 instance launched by the Karpenter `NodePool`. Typically, the instance profile name is the same as the IAM role (not the ARN). 31 | 32 | Karpenter will use the latest EKS-optimized AMIs, so when there's a new AMI available or after you update the Kubernetes control plane, the nodes with older AMIs are recycled automatically. To test this feature, you need to configure static AMIs within the `EC2NodeClass`. Run the following commands to create environment variables with the AMI IDs to use (note these point to the previous Kubernetes version, `1.31`): 33 | 34 | ```sh 35 | export amd64PrevAMI=$(aws ssm get-parameter --name /aws/service/bottlerocket/aws-k8s-1.31/x86_64/latest/image_id --region $AWS_REGION --query "Parameter.Value" --output text) 36 | export arm64PrevAMI=$(aws ssm get-parameter --name /aws/service/bottlerocket/aws-k8s-1.31/arm64/latest/image_id --region $AWS_REGION --query "Parameter.Value" --output text) 37 | ``` 38 | 39 | Now, make sure you're in this blueprint folder, then run the following command to create the new `NodePool` and `EC2NodeClass`: 40 | 41 | ```sh 42 | sed -i '' "s/<>/$CLUSTER_NAME/g" latest-current-ami.yaml 43 | sed -i '' "s/<>/$KARPENTER_NODE_IAM_ROLE_NAME/g" latest-current-ami.yaml 44 | sed -i '' "s/<>/$amd64PrevAMI/g" latest-current-ami.yaml 45 | sed -i '' "s/<>/$arm64PrevAMI/g" latest-current-ami.yaml 46 | kubectl apply -f . 47 | ``` 48 | 49 | ## Results 50 | 51 | Wait for around two minutes. The pods from the sample workload should be running even if the node has a version that doesn't match the control plane's.
52 | 53 | ```sh 54 | > kubectl get pods 55 | NAME READY STATUS RESTARTS AGE 56 | latest-current-ami-5bbfbc98f7-6hxkw 1/1 Running 0 3m 57 | latest-current-ami-5bbfbc98f7-n7mgs 1/1 Running 0 3m 58 | latest-current-ami-5bbfbc98f7-rxjjx 1/1 Running 0 3m 59 | ``` 60 | 61 | You should see a new node registered with the latest AMI for EKS `v1.31`, like this: 62 | 63 | ```sh 64 | > kubectl get nodes -l karpenter.sh/initialized=true 65 | NAME STATUS ROLES AGE VERSION 66 | ip-10-0-103-18.eu-west-2.compute.internal Ready 5m6s v1.31.6-eks-aad632c 67 | ``` 68 | 69 | Let's simulate a node upgrade by changing the EKS version in the `EC2NodeClass`, run this command: 70 | 71 | ```sh 72 | export amd64LatestAMI=$(aws ssm get-parameter --name /aws/service/bottlerocket/aws-k8s-1.32/x86_64/latest/image_id --region $AWS_REGION --query "Parameter.Value" --output text) 73 | export arm64LatestAMI=$(aws ssm get-parameter --name /aws/service/bottlerocket/aws-k8s-1.32/arm64/latest/image_id --region $AWS_REGION --query "Parameter.Value" --output text) 74 | sed -i '' "s/$amd64PrevAMI/$amd64LatestAMI/g" latest-current-ami.yaml 75 | sed -i '' "s/$arm64PrevAMI/$arm64LatestAMI/g" latest-current-ami.yaml 76 | sed -i '' "s/1.31/1.32/g" latest-current-ami.yaml 77 | kubectl apply -f latest-current-ami.yaml 78 | ``` 79 | 80 | You can confirm the update has been applied by running this command: 81 | 82 | ```sh 83 | kubectl get ec2nodeclass latest-current-ami -o yaml 84 | ``` 85 | 86 | Wait around five minutes, in the mean time, you can monitor Karpenter logs until you see something like this: 87 | 88 | ```json 89 | {"level":"INFO","time":"2024-08-16T13:32:10.187Z","logger":"controller","message":"disrupting nodeclaim(s) via replace, terminating 1 nodes (3 pods) ip-10-0-119-175.eu-west-2.compute.internal/c7i-flex.xlarge/spot and replacing with node from types c6a.xlarge, m5.xlarge, c7i-flex.xlarge, m6a.xlarge, c5a.xlarge and 55 other(s)","commit":"5bdf9c3","controller":"disruption","namespace":"","name":"","reconcileID":"be617b33-df37-44fc-897d-737fd3198cee","command-id":"26f7f912-a8f5-4e94-aaaf-386f8da44988","reason":"drifted"} 90 | {"level":"INFO","time":"2024-08-16T13:32:10.222Z","logger":"controller","message":"created nodeclaim","commit":"5bdf9c3","controller":"disruption","namespace":"","name":"","reconcileID":"be617b33-df37-44fc-897d-737fd3198cee","NodePool":{"name":"latest-current-ami"},"NodeClaim":{"name":"latest-current-ami-smlh7"},"requests":{"cpu":"1766m","memory":"1706Mi","pods":"7"},"instance-types":"c4.2xlarge, c4.xlarge, c5.2xlarge, c5.xlarge, c5a.2xlarge and 55 other(s)"} 91 | ``` 92 | 93 | Wait around two minutes. You should now see a new node with the latest AMI version that matches the control plane's version. 94 | 95 | ```sh 96 | > kubectl get nodes -l karpenter.sh/initialized=true 97 | NAME STATUS ROLES AGE VERSION 98 | ip-10-0-102-231.eu-west-2.compute.internal Ready 51s v1.32.2-eks-677bac1 99 | ``` 100 | 101 | You can repeat this process every time you need to run a controlled upgrade of the nodes. Also, if you'd like to control when to replace a node, you can learn more about [Disruption Budgets](//blueprints/disruption-budgets/). 102 | 103 | ## Cleanup 104 | 105 | To remove all objects created, simply run the following commands: 106 | 107 | ```sh 108 | kubectl delete -f . 
109 | ``` 110 | -------------------------------------------------------------------------------- /blueprints/stateful/README.md: -------------------------------------------------------------------------------- 1 | # Karpenter Blueprint: Working with Stateful Workloads using EBS 2 | 3 | ## Purpose 4 | 5 | For stateful workloads that use persistent volumes, Karpenter detects storage scheduling requirements when deciding which instance type to launch and in which AZ. If you have a `StorageClass` configured for multiple AZs, Karpenter randomly selects one AZ when the pod is created for the first time. If the same pod is then removed, a new pod is created to request the same Persistent Volume Claim (PVC) and Karpenter takes this into consideration when choosing the AZ of an existing claim. 6 | 7 | ## Requirements 8 | 9 | * A Kubernetes cluster with Karpenter installed. You can use the blueprint we've used to test this pattern at the `cluster` folder in the root of this repository. 10 | * A `default` Karpenter `NodePool` as that's the one we'll use in this blueprint. You did this already in the ["Deploy a Karpenter Default EC2NodeClass and NodePool"](../../README.md) section from this repository. 11 | * The [Amazon EBS CSI driver](https://docs.aws.amazon.com/eks/latest/userguide/managing-ebs-csi.html) installed in the cluster. If you're using the Terraform template in this repository, it's already configured. 12 | 13 | ## Deploy 14 | 15 | Let's start by creating the `PersistentVolumeClaim` and `StorageClass` to use only one AZ. To do so,first choose one of the AZs in the region where you deployed the EKS cluster. Run this command to get one automatically: 16 | 17 | ```sh 18 | export FIRSTAZ=$(aws ec2 describe-availability-zones --query 'AvailabilityZones[0].ZoneName' --output text) 19 | echo $FIRSTAZ 20 | ``` 21 | 22 | Then, run these commands to replace the placeholder with the AZ, and deploy the storage resources: 23 | 24 | ```sh 25 | sed -i '' "s/<>/$FIRSTAZ/g" storage.yaml 26 | kubectl apply -f storage.yaml 27 | ``` 28 | 29 | Wait around one minute, as long as you get an event of `WaitForFirstConsumer` in the PVC, you're good to continue: 30 | 31 | ```sh 32 | > kubectl describe pvc ebs-claim 33 | ... 
34 | Events: 35 | Type Reason Age From Message 36 | ---- ------ ---- ---- ------- 37 | Normal WaitForFirstConsumer 14s (x16 over 3m47s) persistentvolume-controller waiting for first consumer to be created before binding 38 | ``` 39 | 40 | Deploy a sample workload: 41 | 42 | ```sh 43 | kubectl apply -f workload.yaml 44 | ``` 45 | 46 | ## Results 47 | 48 | After waiting for around two minutes, you should see the pods running, and the PVC claimed: 49 | 50 | ```sh 51 | > kubectl get pods 52 | NAME READY STATUS RESTARTS AGE 53 | stateful-7b68c8d7bc-6mkvn 1/1 Running 0 2m 54 | stateful-7b68c8d7bc-6mrj5 1/1 Running 0 2m 55 | stateful-7b68c8d7bc-858nd 1/1 Running 0 2m 56 | > kubectl get pvc 57 | NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS AGE 58 | ebs-claim Bound pvc-d4c11e32-9da0-41d6-a477-d454a4aade94 4Gi RWO storage-gp3 116s 59 | ``` 60 | 61 | Notice that Karpenter launched a node in the AZ (using the value from `$FIRSTAZ` env var), following the constraint defined in the `StorageClass` (no need to constraint it within the `Deployment` or `Pod`): 62 | 63 | ```sh 64 | > kubectl get nodes -L karpenter.sh/capacity-type,beta.kubernetes.io/instance-type,karpenter.sh/nodepool,topology.kubernetes.io/zone -l karpenter.sh/initialized=true 65 | NAME STATUS ROLES AGE VERSION CAPACITY-TYPE INSTANCE-TYPE NODEPOOL ZONE 66 | ip-10-0-52-243.eu-west-2.compute.internal Ready 16s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2a 67 | ``` 68 | 69 | Let's read the file that the pods are writing to, like this: 70 | 71 | ```sh 72 | export POD=$(kubectl get pods -l app=stateful -o name | cut -d/ -f2 | tail -n1) 73 | kubectl exec $POD -- cat /data/out.txt 74 | ``` 75 | 76 | You should see that the three pods are writing something every three minutes, like this: 77 | 78 | ```console 79 | Writing content every three minutes! Printing a random number: 795 80 | Writing content every three minutes! Printing a random number: 600 81 | Writing content every three minutes! Printing a random number: 987 82 | ``` 83 | 84 | If you delete one pod, the new pod will continue using the same PVC and will be in a `Running` state: 85 | 86 | ```sh 87 | kubectl delete pod $POD 88 | ``` 89 | 90 | You can read the content of the file using the new pod: 91 | 92 | ```sh 93 | export POD=$(kubectl get pods -l app=stateful -o name | cut -d/ -f2 | tail -n1) 94 | kubectl exec $POD -- cat /data/out.txt 95 | ``` 96 | 97 | You should still see the previous content plus any additional content if three minutes have passed, like this: 98 | 99 | ```console 100 | Writing content every three minutes! Printing a random number: 795 101 | Writing content every three minutes! Printing a random number: 600 102 | Writing content every three minutes! Printing a random number: 987 103 | Writing content every three minutes! Printing a random number: 224 104 | Writing content every three minutes! Printing a random number: 307 105 | Writing content every three minutes! Printing a random number: 325 106 | ``` 107 | 108 | Lastly, you can simulate a scale-down event for the workload and scale the replicas to 0, like this: 109 | 110 | ```sh 111 | kubectl scale deployment stateful --replicas 0 112 | ``` 113 | 114 | Wait around two minutes, and consolidation will make sure to remove the node. 
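Before scaling back up, you can confirm which AZ the volume is pinned to; Karpenter will have to launch the replacement node in that same AZ. Here's a sketch using the PVC from this blueprint (the exact node-affinity keys can vary with the EBS CSI driver version):

```sh
kubectl get pv "$(kubectl get pvc ebs-claim -o jsonpath='{.spec.volumeName}')" -o jsonpath='{.spec.nodeAffinity}' && echo
```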
You can then scale-out the workload again, like this: 115 | 116 | ```sh 117 | kubectl scale deployment stateful --replicas 3 118 | ``` 119 | 120 | And you should see that Karpenter launches a replacement node in the AZ you choose, and the pods are soon going to be in a `Running` state. 121 | 122 | **NOTE:** You might have a experience/simulate a node loss which can result in data corruption or loss. If this happens, when the new node launched by Karpenter is ready, pods might have a warning event like `Multi-Attach error for volume "pvc-19af27b8-fc0a-428d-bda5-552cb52b9806" Volume is already exclusively attached to one node and can't be attached to another`. You can wait around five minutes and the volume will try to get unattached, and attached again, making your pods successfully run again. Look at this series of events for reference: 123 | 124 | ```console 125 | Events: 126 | Type Reason Age From Message 127 | ---- ------ ---- ---- ------- 128 | Warning FailedScheduling 14m default-scheduler 0/3 nodes are available: 1 node(s) were unschedulable, 2 node(s) didn't match Pod's node affinity/selector. preemption: 0/3 nodes are available: 3 Preemption is not helpful for scheduling.. 129 | Normal Nominated 14m karpenter Pod should schedule on: machine/default-75hvl 130 | Warning FailedScheduling 14m (x2 over 14m) default-scheduler 0/3 nodes are available: 1 node(s) had volume node affinity conflict, 2 node(s) didn't match Pod's node affinity/selector. preemption: 0/3 nodes are available: 3 Preemption is not helpful for scheduling.. 131 | Normal Scheduled 14m default-scheduler Successfully assigned default/stateful-7b68c8d7bc-6mkvn to ip-10-0-63-154.eu-west-1.compute.internal 132 | Warning FailedAttachVolume 14m attachdetach-controller Multi-Attach error for volume "pvc-19af27b8-fc0a-428d-bda5-552cb52b9806" Volume is already exclusively attached to one node and can't be attached to another 133 | Warning FailedMount 9m52s (x2 over 12m) kubelet Unable to attach or mount volumes: unmounted volumes=[persistent-storage], unattached volumes=[persistent-storage], failed to process volumes=[]: timed out waiting for the condition 134 | Normal SuccessfulAttachVolume 8m53s attachdetach-controller AttachVolume.Attach succeeded for volume "pvc-19af27b8-fc0a-428d-bda5-552cb52b9806" 135 | Normal Pulling 8m51s kubelet Pulling image "centos" 136 | Normal Pulled 8m47s kubelet Successfully pulled image "centos" in 4.871822072s (4.871840882s including waiting) 137 | Normal Created 8m47s kubelet Created container stateful 138 | Normal Started 8m46s kubelet Started container stateful 139 | ``` 140 | 141 | Finally, you can read the content of the file again: 142 | 143 | ```sh 144 | export POD=$(kubectl get pods -l app=stateful -o name | cut -d/ -f2 | tail -n1) 145 | kubectl exec $POD -- cat /data/out.txt 146 | ``` 147 | 148 | ## Cleanup 149 | 150 | To remove all objects created, simply run the following commands: 151 | 152 | ```sh 153 | kubectl delete -f . 154 | ``` 155 | -------------------------------------------------------------------------------- /blueprints/nvidia-gpu-workload/README.md: -------------------------------------------------------------------------------- 1 | # Karpenter Blueprint: Deploy an NVIDIA GPU workload 2 | 3 | ## Purpose 4 | 5 | Karpenter streamlines node lifecycle management, and it can help provide the right compute just-in-time based on your workloads scheduling constraints. 
This is particularly helpful for your machine learning workflows with variable and heterogeneous compute demands (e.g., NVIDIA GPU-based inference followed by CPU-based plotting). When your Kubernetes workload requires accelerated instance, Karpenter automatically selects the appropriate [Amazon EKS optimized accelerated AMI](https://docs.aws.amazon.com/eks/latest/userguide/eks-optimized-ami.html). 6 | 7 | Therefore, the purpose of this Karpenter blueprint is to demonstrate how to launch a GPU-based workload on Amazon EKS with Karpenter and AL2023 EKS optimized accelerated AMI. This example assumes a simple one-to-one mapping between a Kubernetes Pod and a GPU. This blueprint does not go into the details about GPU sharing techniques such as MiG, time slicing or other software based GPU fractional scheduling. 8 | 9 | Before you start seeing Karpenter in action, when using AL2023 you need to deploy a Kubernetes device plugin to advertise GPU information from the host. 10 | 11 | ## Requirements 12 | 13 | * A Kubernetes cluster with Karpenter installed. You can use the blueprint we've used to test this pattern at the `cluster` folder in the root of this repository. 14 | 15 | ## Deploy NVIDIA device plugin for Kubernetes 16 | 17 | The [NVIDIA device plugin for Kubernetes](https://github.com/NVIDIA/k8s-device-plugin) is used to advertise the number of GPUs on the host to Kubernetes so that this information can be used for scheduling purposes. You can install the NVIDIA device plugin with helm. 18 | 19 | To install the device plugin run the following: 20 | 21 | ```sh 22 | helm repo add nvdp https://nvidia.github.io/k8s-device-plugin 23 | helm repo update 24 | helm upgrade -i nvdp nvdp/nvidia-device-plugin \ 25 | --namespace nvidia-device-plugin \ 26 | --create-namespace \ 27 | --version 0.17.2 28 | ``` 29 | 30 | Now that you have the device set-up, let’s enable Karpenter to launch NVIDIA GPU instances. 31 | 32 | ## Create a NodeClass and NodePool with GPU-instances (AL2023) 33 | 34 | The following NodeClass, specify the Security Group and Subnet selector, along with AMI. We are using AL2023 here, and when launching an accelerated instance Karpenter will pick the respective EKS optimized accelerated AMI. AL2023 comes packaged with the NVIDIA GPU drivers, and the container runtime is configured out of the box. 35 | 36 | Before applying the `gpu-nodeclass.yaml` replace `KARPENTER_NODE_IAM_ROLE_NAME` and `CLUSTER_NAME` in the file with your specific cluster details. If you're using the Terraform template provided in this repo, run the following commands to get the EKS cluster name and the IAM Role name for the Karpenter nodes: 37 | 38 | ```sh 39 | export CLUSTER_NAME=$(terraform -chdir="../../cluster/terraform" output -raw cluster_name) 40 | export KARPENTER_NODE_IAM_ROLE_NAME=$(terraform -chdir="../../cluster/terraform" output -raw node_instance_role_name) 41 | ``` 42 | 43 | > ***NOTE***: If you're not using Terraform, you need to get those values manually. `CLUSTER_NAME` is the name of your EKS cluster (not the ARN). Karpenter auto-generates the [instance profile](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use_switch-role-ec2_instance-profiles) in your `EC2NodeClass` given the role that you specify in [spec.role](https://karpenter.sh/preview/concepts/nodeclasses/) with the placeholder `KARPENTER_NODE_IAM_ROLE_NAME`, which is a way to pass a single IAM role to the EC2 instance launched by the Karpenter `NodePool`. 
Typically, the instance profile name is the same as the IAM role (not the ARN). 44 | 45 | The EC2NodeClass we’ll deploy looks like this. Execute the following command to create the EC2NodeClass file: 46 | 47 | ```sh 48 | cat << EOF > gpu-nodeclass.yaml 49 | apiVersion: karpenter.k8s.aws/v1 50 | kind: EC2NodeClass 51 | metadata: 52 | name: gpu 53 | spec: 54 | amiSelectorTerms: 55 | - alias: al2023@latest 56 | role: "$KARPENTER_NODE_IAM_ROLE_NAME" 57 | blockDeviceMappings: 58 | - deviceName: /dev/xvda 59 | ebs: 60 | deleteOnTermination: true 61 | iops: 10000 62 | throughput: 125 63 | volumeSize: 100Gi 64 | volumeType: gp3 65 | securityGroupSelectorTerms: 66 | - tags: 67 | karpenter.sh/discovery: $CLUSTER_NAME 68 | subnetSelectorTerms: 69 | - tags: 70 | karpenter.sh/discovery: $CLUSTER_NAME 71 | EOF 72 | ``` 73 | 74 | A separate [EC2NodeClass](https://karpenter.sh/docs/concepts/nodeclasses/) was created because you may want to tune node properties such as ephemeral storage size, block device mappings, or the [capacity reservations selector](https://karpenter.sh/docs/concepts/nodeclasses/). 75 | 76 | The next step is to create a dedicated NodePool to provision instances from the `g` Amazon EC2 instance category with NVIDIA as the GPU manufacturer, and to only allow workloads that tolerate the `nvidia.com/gpu` taint to be scheduled. Such a NodePool will look like this. Execute the following command to create the NodePool file: 77 | 78 | ```sh 79 | cat << EOF > gpu-nodepool.yaml 80 | apiVersion: karpenter.sh/v1 81 | kind: NodePool 82 | metadata: 83 | name: gpu 84 | spec: 85 | limits: 86 | cpu: 100 87 | memory: 100Gi 88 | nvidia.com/gpu: 5 89 | template: 90 | metadata: 91 | labels: 92 | nvidia.com/gpu.present: "true" 93 | spec: 94 | nodeClassRef: 95 | group: karpenter.k8s.aws 96 | name: gpu 97 | kind: EC2NodeClass 98 | requirements: 99 | - key: karpenter.sh/capacity-type 100 | operator: In 101 | values: ["on-demand"] 102 | - key: karpenter.k8s.aws/instance-category 103 | operator: In 104 | values: ["g"] 105 | - key: karpenter.k8s.aws/instance-gpu-manufacturer 106 | operator: In 107 | values: ["nvidia"] 108 | expireAfter: 720h 109 | taints: 110 | - key: nvidia.com/gpu 111 | effect: NoSchedule 112 | disruption: 113 | consolidationPolicy: WhenEmpty 114 | consolidateAfter: 5m 115 | EOF 116 | ``` 117 | 118 | We’ve added the `nvidia.com/gpu` taint in the NodePool to prevent workloads that do not tolerate this taint from being scheduled on nodes managed by this NodePool (they might not take advantage of it). Also, notice that the `.spec.disruption` policy has been set to WhenEmpty with consolidation only after 5 minutes; this is to support spiky workloads like high-churn jobs. You’ll likely want to tweak this based on your workload's requirements. 119 | 120 | Once the placeholders are replaced, apply the EC2NodeClass and NodePool by executing the following: 121 | 122 | ```sh 123 | $> kubectl apply -f gpu-nodeclass.yaml 124 | ec2nodeclass.karpenter.k8s.aws/gpu created 125 | 126 | $> kubectl apply -f gpu-nodepool.yaml 127 | nodepool.karpenter.sh/gpu created 128 | ``` 129 | 130 | Now let’s deploy a test workload to see how Karpenter launches the GPU node. 131 | 132 | ### Deploy a test workload to verify GPU drivers are loaded 133 | 134 | The following Pod manifest launches a pod that calls the NVIDIA System Management Interface CLI to check whether a GPU is detected and print the driver version to standard output, which you can see when you check the logs, like this: `kubectl logs pod/nvidia-smi`.
Execute the following command to create the `workload.yaml`: 135 | 136 | ```sh 137 | cat << EOF > workload.yaml 138 | apiVersion: v1 139 | kind: Pod 140 | metadata: 141 | name: nvidia-smi 142 | spec: 143 | nodeSelector: 144 | nvidia.com/gpu.present: "true" 145 | karpenter.k8s.aws/instance-gpu-name: "t4" 146 | restartPolicy: OnFailure 147 | containers: 148 | - name: nvidia-smi 149 | image: public.ecr.aws/amazonlinux/amazonlinux:2023-minimal 150 | args: 151 | - "nvidia-smi" 152 | resources: 153 | requests: 154 | memory: "8Gi" 155 | cpu: "3500m" 156 | limits: 157 | memory: "8Gi" 158 | nvidia.com/gpu: 1 159 | tolerations: 160 | - key: nvidia.com/gpu 161 | effect: NoSchedule 162 | operator: Exists 163 | EOF 164 | ``` 165 | 166 | As GPU-based workloads are likely sensitive to the specific GPU model (e.g., GPU memory), we've specified a `karpenter.k8s.aws/instance-gpu-name` node selector to request an instance with a specific GPU for this workload. The nodeSelector `karpenter.k8s.aws/instance-gpu-name: "t4"` influences Karpenter node provisioning and launches the workload on a node with an [NVIDIA T4 GPU](https://aws.amazon.com/ec2/instance-types/g4/). Review the [Karpenter documentation](https://karpenter.sh/docs/reference/instance-types/) for different Amazon EC2 instances and their labels. 167 | 168 | To deploy the workload, execute the following: 169 | 170 | ```sh 171 | $> kubectl apply -f workload.yaml 172 | pod/nvidia-smi created 173 | ``` 174 | 175 | You can check the pod's status by executing: 176 | 177 | ```sh 178 | $> kubectl get pods 179 | NAME READY STATUS RESTARTS AGE 180 | nvidia-smi 1/1 Running 0 3s 181 | ``` 182 | 183 | You can view the pod's nvidia-smi logs by executing: 184 | 185 | ```sh 186 | $> kubectl logs pod/nvidia-smi 187 | 188 | +-----------------------------------------------------------------------------------------+ 189 | | NVIDIA-SMI 570.133.20 Driver Version: 570.133.20 CUDA Version: 12.8 | 190 | |-----------------------------------------+------------------------+----------------------+ 191 | | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | 192 | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | 193 | | | | MIG M.
| 194 | |=========================================+========================+======================| 195 | | 0 Tesla T4 On | 00000000:00:1E.0 Off | 0 | 196 | | N/A 29C P8 17W / 70W | 0MiB / 15360MiB | 0% Default | 197 | | | | N/A | 198 | +-----------------------------------------+------------------------+----------------------+ 199 | 200 | +-----------------------------------------------------------------------------------------+ 201 | | Processes: | 202 | | GPU GI CI PID Type Process name GPU Memory | 203 | | ID ID Usage | 204 | |=========================================================================================| 205 | | No running processes found | 206 | +-----------------------------------------------------------------------------------------+ 207 | ``` 208 | 209 | To review which node was launched by Karpenter, execute the following: 210 | 211 | ```sh 212 | $> kubectl get nodeclaims 213 | 214 | NAME TYPE CAPACITY ZONE NODE READY AGE 215 | gpu-f69tm g4dn.2xlarge on-demand eu-west-1c ip-xxx-xxx-xxx-xxx.eu-west-1.compute.internal True 5m44s 216 | ``` 217 | 218 | ## Clean-up 219 | 220 | To clean-up execute the following commands: 221 | 222 | ```sh 223 | kubectl delete -f workload.yaml 224 | kubectl delete -f gpu-nodepool.yaml 225 | kubectl delete -f gpu-nodeclass.yaml 226 | helm -n nvidia-device-plugin uninstall nvdp 227 | ``` 228 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Karpenter Blueprints for Amazon EKS 2 | 3 | ## Motivation 4 | 5 | [Karpenter](https://karpenter.sh/), a node provisioning project built for Kubernetes has been helping many companies to improve the efficiency and cost of running workloads on Kubernetes. However, as Karpenter takes an application-first approach to provision compute capacity for the Kubernetes data plane, there are common workload scenarios that you might be wondering how to configure them properly. This repository includes a list of common workload scenarios, some of them go in depth with the explanation of why configuring Karpenter and Kubernetes objects in such a way is important. 6 | 7 | ## Blueprint Structure 8 | 9 | Each blueprint follows the same structure to help you better understand what's the motivation and the expected results: 10 | 11 | | Concept | Description | 12 | | -------------- | ----------------------------------------------------------------------------------------------- | 13 | | Purpose | Explains what the blueprint is about, and what problem is solving. | 14 | | Requirements | Any pre-requisites you might need to use the blueprint (i.e. An `arm64` container image). | 15 | | Deploy | The steps to follow to deploy the blueprint into an existing Kubernetes cluster. | 16 | | Results | The expected results when using the blueprint. | 17 | 18 | ## How to use these Blueprints? 19 | 20 | Before you get started, you need to have a Kubernetes cluster with Karpenter installed. If you're planning to work with an existing cluster, just make sure you've configured Karpenter following the [official guide](https://karpenter.sh/docs/getting-started/getting-started-with-karpenter/). This project also has a template to create a cluster with everything you'll need to test each blueprint. 21 | 22 | ## Support & Feedback 23 | 24 | > [!IMPORTANT] 25 | > Karpenter Blueprints for Amazon EKS is maintained by AWS Solution Architects. 
It is not part of an AWS 26 | > service and support is provided as a best-effort by the Karpenter Blueprints community. To provide feedback, 27 | > please use the [issues templates](https://github.com/aws-samples/karpenter-blueprints/issues) 28 | > provided. If you are interested in contributing to Karpenter Blueprints, see the 29 | > [Contribution guide](https://github.com/aws-samples/karpenter-blueprints/blob/main/CONTRIBUTING.md). 30 | 31 | ### Requirements 32 | 33 | * You need access to an AWS account with IAM permissions to create an EKS cluster, and an AWS Cloud9 environment if you're running the commands listed in this tutorial. 34 | * Install and configure the [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) 35 | * Install the [Kubernetes CLI (kubectl)](https://kubernetes.io/docs/tasks/tools/install-kubectl-linux/) 36 | * (Optional*) Install the [Terraform CLI](https://developer.hashicorp.com/terraform/tutorials/aws-get-started/install-cli) 37 | * (Optional*) Install Helm ([the package manager for Kubernetes](https://helm.sh/docs/intro/install/)) 38 | 39 | ***NOTE:** If you're planning to use an existing EKS cluster, you don't need the **optional** prerequisites. 40 | 41 | ### Preparing to Deploy Blueprints 42 | 43 | Before you start deploying and testing blueprints, make sure you follow the next steps. For example, all blueprints assume that you have an EKS cluster with Karpenter deployed, and some even require that you have a `default` Karpenter `NodePool` deployed. 44 | 45 | #### Create an EKS Cluster using Terraform (Optional) 46 | 47 | If you're planning on using an existing EKS cluster, you can use an existing node group with On-Demand instances to deploy the Karpenter controller. To do so, you need to follow the [Karpenter getting started guide](https://karpenter.sh/docs/getting-started/). 48 | 49 | You'll create an Amazon EKS cluster using the [EKS Blueprints for Terraform project](https://github.com/aws-ia/terraform-aws-eks-blueprints). The Terraform template included in this repository creates a VPC, an EKS control plane, and a Kubernetes service account along with an IAM role, associating them using IAM Roles for Service Accounts (IRSA) to let Karpenter launch instances. Additionally, the template adds the Karpenter node role to the `aws-auth` configmap to allow nodes to connect, and creates an On-Demand managed node group for the `kube-system` and `karpenter` namespaces. 50 | 51 | To create the cluster, clone this repository and open the `cluster/terraform` folder. Then, run the following commands: 52 | 53 | ```sh 54 | cd cluster/terraform 55 | helm registry logout public.ecr.aws 56 | export TF_VAR_region=$AWS_REGION 57 | terraform init 58 | terraform apply -target="module.vpc" -auto-approve 59 | terraform apply -target="module.eks" -auto-approve 60 | terraform apply --auto-approve 61 | ``` 62 | 63 | Before you continue, you need to enable your AWS account to launch Spot instances if you haven't launched any yet. To do so, create the [service-linked role for Spot](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/spot-requests.html#service-linked-roles-spot-instance-requests) by running the following command: 64 | 65 | ```sh 66 | aws iam create-service-linked-role --aws-service-name spot.amazonaws.com || true 67 | ``` 68 | 69 | You might see the following error if the role has already been successfully created.
You don't need to worry about this error; you only had to run the above command to make sure you have the service-linked role to launch Spot instances: 70 | 71 | ```console 72 | An error occurred (InvalidInput) when calling the CreateServiceLinkedRole operation: Service role name AWSServiceRoleForEC2Spot has been taken in this account, please try a different suffix. 73 | ``` 74 | 75 | Once complete (after waiting about 15 minutes), run the following command to update your kubeconfig file so you can interact with the cluster through `kubectl`: 76 | 77 | ```sh 78 | aws eks --region $AWS_REGION update-kubeconfig --name karpenter-blueprints 79 | ``` 80 | 81 | You need to make sure you can interact with the cluster and that the Karpenter pods are running: 82 | 83 | ```sh 84 | $> kubectl get pods -n karpenter 85 | NAME READY STATUS RESTARTS AGE 86 | karpenter-5f97c944df-bm85s 1/1 Running 0 15m 87 | karpenter-5f97c944df-xr9jf 1/1 Running 0 15m 88 | ``` 89 | 90 | You can now proceed to deploy the default Karpenter NodePool, and deploy any blueprint you want to test. 91 | 92 | #### Deploy a Karpenter Default EC2NodeClass and NodePool 93 | 94 | Before you start deploying a blueprint, you need to have a default [EC2NodeClass](https://karpenter.sh/preview/concepts/nodeclasses/) and a default [NodePool](https://karpenter.sh/docs/concepts/nodepools/) as some blueprints need them. An `EC2NodeClass` enables configuration of AWS-specific settings for EC2 instances launched by Karpenter. The `NodePool` sets constraints on the nodes that can be created by Karpenter and the pods that can run on those nodes. Each NodePool must reference an `EC2NodeClass` using `spec.nodeClassRef`. 95 | 96 | If you create a new EKS cluster following the previous steps, a Karpenter `EC2NodeClass` "default" and a Karpenter `NodePool` "default" are installed automatically. 97 | 98 | **NOTE:** For an existing EKS cluster, you have to adapt the provided `./cluster/terraform/karpenter.tf` to your setup by properly modifying `securityGroupSelectorTerms` and `subnetSelectorTerms` and removing the `depends_on` section. ***If you're not using Terraform***, you need to get those values manually. `CLUSTER_NAME` is the name of your EKS cluster (not the ARN). Karpenter auto-generates the [instance profile](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use_switch-role-ec2_instance-profiles) in your `EC2NodeClass` given the role that you specify in [spec.role](https://karpenter.sh/preview/concepts/nodeclasses/) with the placeholder `KARPENTER_NODE_IAM_ROLE_NAME`, which is a way to pass a single IAM role to the EC2 instance launched by the Karpenter `NodePool`. Typically, the instance profile name is the same as the IAM role (not the ARN). 99 | 100 | You can see that the NodePool has been deployed by running this: 101 | 102 | ```sh 103 | kubectl get nodepool 104 | ``` 105 | 106 | You can see that the `EC2NodeClass` has been deployed by running this: 107 | 108 | ```sh 109 | kubectl get ec2nodeclass 110 | ``` 111 | 112 | Throughout all the blueprints, you might need to review the Karpenter logs, so let's create an alias so you can read them by simply running `kl`: 113 | 114 | ```sh 115 | alias kl="kubectl -n karpenter logs -l app.kubernetes.io/name=karpenter --all-containers=true -f --tail=20" 116 | ``` 117 | 118 | You can now proceed to deploy any blueprint you want to test.
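If you're not using the Terraform template and need to create these `default` objects yourself, a minimal sketch could look like the following (illustrative only; the AMI alias, capacity types, and discovery tags are assumptions, so adjust them to your cluster and see `./cluster/terraform/karpenter.tf` for the exact objects the template creates):

```yaml
apiVersion: karpenter.k8s.aws/v1
kind: EC2NodeClass
metadata:
  name: default
spec:
  amiSelectorTerms:
    - alias: al2023@latest # assumption: AL2023 EKS optimized AMI
  role: "KARPENTER_NODE_IAM_ROLE_NAME" # replace with your Karpenter node IAM role name
  securityGroupSelectorTerms:
    - tags:
        karpenter.sh/discovery: CLUSTER_NAME # replace with your cluster name
  subnetSelectorTerms:
    - tags:
        karpenter.sh/discovery: CLUSTER_NAME # replace with your cluster name
---
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
  name: default
spec:
  template:
    spec:
      nodeClassRef:
        group: karpenter.k8s.aws
        kind: EC2NodeClass
        name: default
      requirements:
        - key: karpenter.sh/capacity-type
          operator: In
          values: ["on-demand", "spot"]
  disruption:
    consolidationPolicy: WhenEmptyOrUnderutilized
```

Apply it with `kubectl apply -f`, then verify with the `kubectl get nodepool` and `kubectl get ec2nodeclass` commands above.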
119 | 120 | #### Terraform Cleanup (Optional) 121 | 122 | Once you're done testing the blueprints, if you used the Terraform template from this repository, you can proceed to remove all the resources that Terraform created. To do so, run the following commands: 123 | 124 | ```sh 125 | kubectl delete --all nodeclaim 126 | kubectl delete --all nodepool 127 | kubectl delete --all ec2nodeclass 128 | export TF_VAR_region=$AWS_REGION 129 | terraform destroy -target="module.eks_blueprints_addons" --auto-approve 130 | terraform destroy -target="module.eks" --auto-approve 131 | terraform destroy --auto-approve 132 | ``` 133 | 134 | ## Deploying a Blueprint 135 | 136 | After you have a cluster up and running with Karpenter installed, you can start testing each blueprint. A blueprint might have a `NodePool`, an `EC2NodeClass` and a workload example. You need to open the blueprint folder and follow the steps to deploy the resources needed to test the blueprint. 137 | 138 | Here's the list of blueprints we have so far: 139 | 140 | * [High-Availability: Spread Pods across AZs & Nodes](/blueprints/ha-az-nodes/) 141 | * [Split Between On-Demand & Spot Instances](/blueprints/od-spot-split/) 142 | * [Prioritize Savings Plans and/or Reserved Instances](/blueprints/saving-plans/) 143 | * [Working with Graviton Instances](/blueprints/graviton) 144 | * [Overprovision capacity in advance to increase responsiveness](/blueprints/overprovision/) 145 | * [Using multiple EBS volumes](/blueprints/multi-ebs/) 146 | * [Working with Stateful Workloads using EBS](/blueprints/stateful/) 147 | * [Update Nodes using Drift](/blueprints/update-nodes-with-drift/) 148 | * [Launching nodes using custom AMIs](/blueprints/custom-ami/) 149 | * [Customizing nodes with your own User Data automation](/blueprints/userdata/) 150 | * [Protecting batch jobs during the consolidation process](/blueprints/batch-jobs/) 151 | * [NodePool Disruption Budgets](/blueprints/disruption-budgets/) 152 | * [Deploy an NVIDIA GPU workload](/blueprints/nvidia-gpu-workload/) 153 | * [Accelerating image pull time using SOCI parallel mode](/blueprints/soci-snapshotter/) 154 | 155 | **NOTE:** Each blueprint is independent of the others, so you can deploy and test multiple blueprints at the same time in the same Kubernetes cluster. However, to reduce noise, we recommend you test one blueprint at a time. 156 | 157 | ## Supported Versions 158 | 159 | The following table lists the resources and tools along with the versions with which the blueprints in this repo have been tested. 160 | 161 | | Resources/Tool | Version | 162 | | --------------- | ------------------- | 163 | | [Kubernetes](https://kubernetes.io/releases/) | 1.32 | 164 | | [Karpenter](https://github.com/aws/karpenter/releases) | v1.5.0 | 165 | | [Terraform](https://github.com/hashicorp/terraform/releases) | v1.12.1 | 166 | | [AWS EKS](https://github.com/terraform-aws-modules/terraform-aws-eks/releases) | v20.37.0 | 167 | | [EKS Blueprints Addons](https://github.com/aws-ia/terraform-aws-eks-blueprints-addons/releases) | v1.21.0 | 168 | 169 | ## License 170 | 171 | MIT-0 Licensed. See [LICENSE](/LICENSE).
172 | -------------------------------------------------------------------------------- /blueprints/soci-snapshotter/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Karpenter Blueprint: Using SOCI snapshotter parallel pull/unpack mode 3 | 4 | ## Purpose 5 | 6 | Container image pull performance has become a bottleneck as container images grow larger, compared to when typical images were just a few hundred megabytes. 7 | The default pulling method uses sequential layer downloading and unpacking. SOCI parallel pull/unpack mode accelerates container image loading through concurrent downloads and unpacking operations, reducing image pull time by up to 50%. This makes it ideal for AI/ML and batch workloads, where it is common for those applications to have large container images. 8 | 9 | This blueprint demonstrates how to set up SOCI snapshotter parallel pull/unpack mode on AL2023 and Bottlerocket through a custom `EC2NodeClass` that customizes the `userData` field. 10 | 11 | > ***NOTE***: SOCI snapshotter parallel mode is supported on [Amazon Linux 2023 (AL2023) > v20250821](https://github.com/awslabs/amazon-eks-ami/releases/tag/v20250821) and [Bottlerocket > v1.44.0](https://github.com/bottlerocket-os/bottlerocket/releases/tag/v1.44.0) 12 | 13 | If you would like to learn more about SOCI snapshotter's new parallel pull/unpack mode, you can visit the following resources: 14 | 1. [SOCI snapshotter parallel mode feature docs](https://github.com/awslabs/soci-snapshotter/blob/main/docs/parallel-mode.md) in the [SOCI project repository](https://github.com/awslabs/soci-snapshotter) on GitHub. 15 | 16 | ## Requirements 17 | 18 | * A Kubernetes cluster with Karpenter installed. You can use the blueprint we've used to test this pattern at the `cluster` folder in the root of this repository. 19 | * A container registry that supports HTTP range GET requests, such as [Amazon Elastic Container Registry (ECR)](https://aws.amazon.com/ecr/) 20 | 21 | ## Deploy 22 | 23 | You need to create a new `EC2NodeClass` with the `userData` field and a customized root EBS volume via `blockDeviceMappings`, along with a `NodePool` that uses this new template. 24 | 25 | If you're using the Terraform template provided in this repo, run the following commands to get the EKS cluster name and the IAM Role name for the Karpenter nodes: 26 | 27 | ```sh 28 | export CLUSTER_NAME=$(terraform -chdir="../../cluster/terraform" output -raw cluster_name) 29 | export KARPENTER_NODE_IAM_ROLE_NAME=$(terraform -chdir="../../cluster/terraform" output -raw node_instance_role_name) 30 | ``` 31 | 32 | > ***NOTE***: If you're not using Terraform, you need to get those values manually. `CLUSTER_NAME` is the name of your EKS cluster (not the ARN). Karpenter auto-generates the [instance profile](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use_switch-role-ec2_instance-profiles) in your `EC2NodeClass` given the role that you specify in [spec.role](https://karpenter.sh/preview/concepts/nodeclasses/) with the placeholder `KARPENTER_NODE_IAM_ROLE_NAME`, which is a way to pass a single IAM role to the EC2 instance launched by the Karpenter `NodePool`. Typically, the instance profile name is the same as the IAM role (not the ARN). 33 | 34 | Now, make sure you're in this blueprint folder, then run the following command: 35 | 36 | ```sh 37 | sed -i '' "s/<>/$CLUSTER_NAME/g" soci-snapshotter.yaml 38 | sed -i '' "s/<>/$KARPENTER_NODE_IAM_ROLE_NAME/g" soci-snapshotter.yaml 39 | kubectl apply -f .
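# Note: the `sed -i ''` form above is BSD/macOS sed syntax; on Linux (GNU sed) you
# would typically drop the empty string argument, for example:
#   sed -i "s/<>/$CLUSTER_NAME/g" soci-snapshotter.yaml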
40 | ``` 41 | 42 | > ***NOTE***: It can take a couple of minutes for the resources to be created; while they are being created you can continue reading. 43 | 44 | Those commands create the following: 45 | 1. `EC2NodeClass` and `NodePool` named `soci-snapshotter` for using SOCI snapshotter parallel pull/unpack mode with customized `blockDeviceMappings` for increased I/O and storage size on Amazon Linux 2023. 46 | 2. `EC2NodeClass` and `NodePool` named `soci-snapshotter-br` for using SOCI snapshotter parallel pull/unpack mode with customized `blockDeviceMappings` for increased I/O and storage size on Bottlerocket. 47 | 3. `EC2NodeClass` and `NodePool` named `non-soci-snapshotter` for using the default containerd implementation with customized `blockDeviceMappings` for increased I/O and storage size. 48 | 4. Kubernetes `Deployment` named `vllm-soci` that uses the `soci-snapshotter` `NodePool` 49 | 5. Kubernetes `Deployment` named `vllm-soci-br` that uses the `soci-snapshotter-br` `NodePool` 50 | 6. Kubernetes `Deployment` named `vllm` that uses the `non-soci-snapshotter` `NodePool` 51 | 52 | > ***NOTE***: For our example, the deployments will request instances that have network and EBS bandwidth greater than 8000 Mbps by using `nodeAffinity`, in order to eliminate network and storage I/O bottlenecks and demonstrate SOCI parallel mode capabilities. 53 | ```yaml 54 | affinity: 55 | nodeAffinity: 56 | requiredDuringSchedulingIgnoredDuringExecution: 57 | nodeSelectorTerms: 58 | - matchExpressions: 59 | - key: karpenter.k8s.aws/instance-ebs-bandwidth 60 | operator: Gt 61 | values: 62 | - "8000" 63 | - key: karpenter.k8s.aws/instance-network-bandwidth 64 | operator: Gt 65 | values: 66 | - "8000" 67 | ``` 68 | ## Configuration 69 | 70 | The SOCI snapshotter `EC2NodeClass` configuration has several parameters that affect SOCI parallel mode performance. 71 | 72 | The `blockDeviceMappings` field is used to increase root volume EBS performance and storage size. 73 | As SOCI parallel mode downloads layers, it buffers them on disk instead of in memory, so a high-performance storage subsystem is crucial to support it, as well as enough storage to hold the container images. 74 | The example configures the root volume with 16,000 IOPS and a throughput of 1,000 MiB/s, which is the maximum for gp3; it is recommended that you modify those settings to trade off between performance and cost. 75 | > ***NOTE***: From our benchmarks, we have also seen a good starting point by setting the throughput to 600 MiB/s and keeping the base 3,000 IOPS. 76 | 77 |
78 | Amazon Linux 2023 79 | 80 | ```yaml 81 | apiVersion: karpenter.k8s.aws/v1 82 | kind: EC2NodeClass 83 | metadata: 84 | name: soci-snapshotter 85 | ... 86 | ... 87 | spec: 88 | blockDeviceMappings: 89 | - deviceName: /dev/xvda 90 | ebs: 91 | volumeSize: 100Gi 92 | volumeType: gp3 93 | throughput: 1000 94 | iops: 16000 95 | ... 96 | ... 97 | ``` 98 |
99 |
100 | Bottlerocket 101 | 102 | Bottlerocket defaults to two block devices, one for Bottlerocket's control volume and the other for container resources such as images and logs, in the example below we have configured Bottlerocket's secondary block device with increased EBS storage & throughput to support SOCI parallel mode. 103 | 104 | ```yaml 105 | apiVersion: karpenter.k8s.aws/v1 106 | kind: EC2NodeClass 107 | metadata: 108 | name: soci-snapshotter-br 109 | ... 110 | ... 111 | spec: 112 | blockDeviceMappings: 113 | - deviceName: /dev/xvda 114 | ebs: 115 | volumeSize: 4Gi 116 | volumeType: gp3 117 | encrypted: true 118 | - deviceName: /dev/xvdb 119 | ebs: 120 | volumeSize: 100Gi 121 | volumeType: gp3 122 | throughput: 1000 123 | iops: 16000 124 | encrypted: true 125 | ... 126 | ... 127 | ``` 128 | 129 |
130 |
131 | 132 | The `userData` field is used to enable and configure the SOCI snapshotter on AL2023 and Bottlerocket. 133 | 134 | SOCI parallel mode configuration is controlled by several key settings. While the default values align with containerd's standard configuration to ensure stability and safety, you can adjust these parameters to optimize performance based on your specific needs, but ensure the infrastructure can support it. 135 | 136 | 1. `max_concurrent_downloads_per_image`: Limits the maximum concurrent downloads per individual image. Default is 3 for Bottlerocket and 20 for AL2023. For images hosted on Amazon ECR, we recommend setting this to 10-20. 137 | 2. `max_concurrent_unpacks_per_image`: Sets the limit for concurrent unpacking of layers per image. Default is 1 for Bottlerocket and 12 for AL2023. Tune this to match the average layer count of your container images. 138 | 3. `concurrent_download_chunk_size`: Specifies the size of each download chunk when pulling image layers in parallel. Default is "unlimited" for Bottlerocket and "16mb" for AL2023. This feature enables multiple concurrent downloads per layer; we recommend setting this value to >0 if your registry supports HTTP range requests. If you're using ECR, we recommend setting this to "16mb". 139 | 4. `discard_unpacked_layers`: Controls whether layer blobs are discarded after unpacking. Enabling this can reduce disk space usage and speed up pull times. Default is false for Bottlerocket and true for AL2023. We recommend setting this to true on EKS nodes. 140 | 141 | To learn more about other configuration options, visit the [official SOCI snapshotter doc](https://github.com/awslabs/soci-snapshotter/blob/main/docs/parallel-mode.md#configuration) 142 | 143 | As installing a snapshotter for containerd on EKS requires several configuration steps, this is all done for you automatically in AL2023 and Bottlerocket, as SOCI is already pre-installed in the latest AMIs. 144 | 145 |
146 | Amazon Linux 2023 147 | 148 | SOCI snapshotter parallel mode can be enabled in AL2023 through a feature gate named "FastImagePull"; in AL2023 we use [`NodeConfig`](https://awslabs.github.io/amazon-eks-ami/nodeadm/doc/examples/#enabling-fast-image-pull-experimental) to simplify various data plane configurations. 149 | 150 | 151 | ```yaml 152 | apiVersion: karpenter.k8s.aws/v1 153 | kind: EC2NodeClass 154 | metadata: 155 | name: soci-snapshotter 156 | ... 157 | ... 158 | spec: 159 | ... 160 | ... 161 | userData: | 162 | apiVersion: node.eks.aws/v1alpha1 163 | kind: NodeConfig 164 | spec: 165 | featureGates: 166 | FastImagePull: true 167 | ``` 168 | 169 | Modifying the SOCI snapshotter parallel mode configuration in AL2023 requires modifying the `/etc/soci-snapshotter-grpc/config.toml` file; this can be achieved with a `userData` script in addition to the `NodeConfig` configuration. 170 | 171 | The following sets both `max_concurrent_downloads_per_image` and `max_concurrent_unpacks_per_image` to `10`: 172 | 173 | ```yaml 174 | apiVersion: karpenter.k8s.aws/v1 175 | kind: EC2NodeClass 176 | metadata: 177 | name: soci-snapshotter 178 | ... 179 | ... 180 | spec: 181 | ... 182 | ... 183 | userData: | 184 | MIME-Version: 1.0 185 | Content-Type: multipart/mixed; boundary="//" 186 | 187 | --// 188 | Content-Type: text/x-shellscript; charset="us-ascii" 189 | 190 | #!/bin/bash 191 | max_concurrent_downloads_per_image=10 192 | max_concurrent_unpacks_per_image=10 193 | 194 | sed -i "s/^max_concurrent_downloads_per_image = .*$/max_concurrent_downloads_per_image = $max_concurrent_downloads_per_image/" /etc/soci-snapshotter-grpc/config.toml 195 | sed -i "s/^max_concurrent_unpacks_per_image = .*$/max_concurrent_unpacks_per_image = $max_concurrent_unpacks_per_image/" /etc/soci-snapshotter-grpc/config.toml 196 | 197 | --// 198 | Content-Type: application/node.eks.aws 199 | 200 | apiVersion: node.eks.aws/v1alpha1 201 | kind: NodeConfig 202 | spec: 203 | featureGates: 204 | FastImagePull: true 205 | --// 206 | ``` 207 | 208 |
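If you want to confirm the overrides landed on a node, one possible spot check (our own suggestion, not part of the blueprint; it assumes you can run a node debug pod) is to inspect the file from a debug session:

```sh
# Hypothetical check: inspect the SOCI config on an AL2023 node launched by this NodePool.
kubectl debug node/<node-name> -it --image=public.ecr.aws/amazonlinux/amazonlinux:2023-minimal -- \
  chroot /host grep -E '^max_concurrent_(downloads|unpacks)_per_image' /etc/soci-snapshotter-grpc/config.toml
```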
209 | 210 |
211 | Bottlerocket 212 | 213 | SOCI snapshotter parallel mode can be enabled and configured in Bottlerocket through the [Settings API](https://bottlerocket.dev/en/os/1.44.x/api/settings/container-runtime-plugins/#tag-soci-parallel-pull-configuration). 214 | 215 | In Bottlerocket, SOCI's data dir is configured at `/var/lib/soci-snapshotter`, to take advantage of instances with NVMe disks, we will need to configure ephemeral storage through Bottlerocket's Settings API, with `[settings.bootstrap-commands.k8s-ephemeral-storage]` as you can see below, we added `/var/lib/soci-snapshotter` as a bind dir. 216 | 217 | ```yaml 218 | apiVersion: karpenter.k8s.aws/v1 219 | kind: EC2NodeClass 220 | metadata: 221 | name: soci-snapshotter-br 222 | ... 223 | ... 224 | spec: 225 | ... 226 | ... 227 | userData: | 228 | [settings.container-runtime] 229 | snapshotter = "soci" 230 | [settings.container-runtime-plugins.soci-snapshotter] 231 | pull-mode = "parallel-pull-unpack" 232 | [settings.container-runtime-plugins.soci-snapshotter.parallel-pull-unpack] 233 | max-concurrent-downloads-per-image = 20 234 | concurrent-download-chunk-size = "16mb" 235 | max-concurrent-unpacks-per-image = 12 236 | discard-unpacked-layers = true 237 | [settings.bootstrap-commands.k8s-ephemeral-storage] 238 | commands = [ 239 | ["apiclient", "ephemeral-storage", "init"], 240 | ["apiclient", "ephemeral-storage" ,"bind", "--dirs", "/var/lib/containerd", "/var/lib/kubelet", "/var/log/pods", "/var/lib/soci-snapshotter"] 241 | ] 242 | essential = true 243 | mode = "always" 244 | ``` 245 |
246 | 247 | ## Results 248 | 249 | Wait until the pods from the sample workload are in a `Running` state: 250 | ```sh 251 | > kubectl wait --for=condition=Ready pods --all --namespace default --timeout=300s 252 | pod/vllm-59bfb6f86c-9nfxb condition met 253 | pod/vllm-soci-6d9bfd996d-vhr4j condition met 254 | pod/vllm-soci-br-74b59cc4bd-rq8cw condition met 255 | ``` 256 | 257 | The sample workload deploys three Deployments running the [Amazon Deep Learning Container (DLC) for vLLM](https://docs.aws.amazon.com/deep-learning-containers/latest/devguide/dlc-vllm-x86-ec2.html): two using SOCI parallel pull/unpack mode (AL2023 and Bottlerocket) and one using the default containerd implementation. 258 | > ***NOTE*** The Amazon DLC for vLLM container image size is about **~10GB** 259 | 260 | Let's examine the pull time for each Deployment: 261 | 262 | The `vllm` deployment using the default containerd implementation results in a pull time of **1m52.33s**. 263 | ```sh 264 | > kubectl describe pod -l app=vllm | grep Pulled 265 | Normal Pulled 7m2s kubelet Successfully pulled image "763104351884.dkr.ecr.us-east-1.amazonaws.com/vllm:0.9-gpu-py312-ec2" 266 | in 1m52.33s (1m52.33s including waiting). Image size: 10778400361 bytes. 267 | ``` 268 | 269 | The `vllm-soci` deployment using SOCI snapshotter's parallel pull/unpack mode results in a pull time of **59.813s**. 270 | ```sh 271 | > kubectl describe pod -l app=vllm-soci | grep Pulled 272 | Normal Pulled 8m27s kubelet Successfully pulled image "763104351884.dkr.ecr.us-east-1.amazonaws.com/vllm:0.9-gpu-py312-ec2" 273 | in 59.813s (59.813s including waiting). Image size: 10778400361 bytes. 274 | ``` 275 | 276 | The `vllm-soci-br` deployment using SOCI snapshotter's parallel pull/unpack mode on Bottlerocket results in a pull time of **44.974s**. 277 | ```sh 278 | > kubectl describe pod -l app=vllm-soci-br | grep Pulled 279 | Normal Pulled 9m46s kubelet Successfully pulled image "763104351884.dkr.ecr.us-east-1.amazonaws.com/vllm:0.9-gpu-py312-ec2" 280 | in 44.974s (44.974s including waiting). Image size: 10778400361 bytes. 281 | ``` 282 | 283 | We can see that using the SOCI snapshotter improved container pull time by about **50%** on Amazon Linux 2023 and about **60%** on Bottlerocket; the reason is that Bottlerocket has an improved decompression library for Intel-based CPUs ([bottlerocket-core-kit PR #443](https://github.com/bottlerocket-os/bottlerocket-core-kit/pull/443)) 284 | 285 | 286 | ## Cleanup 287 | 288 | To remove all objects created, simply run the following commands: 289 | 290 | ```sh 291 | kubectl delete -f . 292 | ``` 293 | 294 | -------------------------------------------------------------------------------- /blueprints/batch-jobs/README.md: -------------------------------------------------------------------------------- 1 | # Karpenter Blueprint: Protecting batch jobs during the disruption (consolidation) process 2 | 3 | ## Purpose 4 | 5 | Karpenter can actively reduce the cluster cost by identifying when nodes can be removed or replaced because they are empty or there is a cheaper one available after some workload change. This process is called [consolidation](https://karpenter.sh/preview/concepts/disruption/#consolidation), and it implies the disruption of pods that are running on the node, if any, as they need to be rescheduled onto another node. In some cases, like when running long batch jobs, you don't want those pods to be disrupted.
You want to run them from start to finish without disruption, and replace or delete the node once they finish. To achieve that, you can set the `karpenter.sh/do-not-disrupt: "true"` annotation on the pod (more information [here](https://karpenter.sh/preview/concepts/disruption/#pod-level-controls)). By opting pods out of this disruption, you are telling Karpenter that it should not voluntarily remove a node containing this pod. 6 | 7 | ## Requirements 8 | 9 | * A Kubernetes cluster with Karpenter installed. You can use the blueprint you have used to test this pattern at the `cluster` folder in the root of this repository. 10 | * A `default` Karpenter `NodePool`, as that is the one you will use in this blueprint. You did this already in the ["Deploy a Karpenter Default EC2NodeClass and NodePool"](../../README.md) section from this repository. 11 | 12 | ## Deploy 13 | 14 | You are going to use the `default` NodePool. 15 | 16 | If you want to first observe the default behaviour of pods being disrupted during the consolidation process, jump to [(Optional) Simulating the default behaviour](#(optional)-simulating-the-default-behaviour). 17 | 18 | If you want to directly see how to avoid the disruption of jobs by the consolidation process, jump to [Preventing jobs of being evicted](#preventing-jobs-of-being-evicted). 19 | 20 | ### (optional) Simulating the default behaviour 21 | 22 | This section simulates the default behaviour of the pods explained before, in which the Karpenter consolidation process disrupts the pods running the jobs and re-schedules them onto the cheaper node. To simulate it, deploy the [workloads-evicted yaml](/blueprints/batch-jobs/workloads-evicted.yaml): 23 | 24 | ```sh 25 | $> kubectl apply -f workloads-evicted.yaml 26 | deployment.apps/nginx created 27 | job.batch/2-min-job created 28 | job.batch/5-min-job created 29 | ``` 30 | 31 | This will create three pods that require **11 vCPU** in total: 32 | * NGINX server - 2 vCPU required 33 | * 2-minutes job - 7 vCPU required 34 | * 5-minutes job - 2 vCPU required 35 | 36 | During this test, Karpenter decided to launch a **c6g.4xlarge** on-demand instance (16 vCPU, 32 GiB). You can check this by executing: 37 | 38 | ```sh 39 | kubectl get nodes --label-columns node.kubernetes.io/instance-type 40 | ``` 41 | 42 | After two minutes, the first job finishes and the pod is terminated: 43 | 44 | ```sh 45 | $> kubectl get events --field-selector involvedObject.kind=Job --sort-by='.lastTimestamp' 46 | LAST SEEN TYPE REASON OBJECT MESSAGE 47 | 5m Normal SuccessfulCreate job/2-min-job Created pod: 2-min-job-rst5w 48 | 5m Normal SuccessfulCreate job/5-min-job Created pod: 5-min-job-l72p8 49 | 3m Normal Completed job/2-min-job Job completed 50 | ``` 51 | 52 | ```sh 53 | $> kubectl get pods 54 | NAME READY STATUS RESTARTS AGE 55 | 5-min-job-6ffsg 1/1 Running 0 2m50s 56 | nginx-8467c776-r8j24 1/1 Running 0 2m50s 57 | ``` 58 | 59 | Now, the running pods require **4 vCPU** in total: 60 | * NGINX server - 2 vCPU required 61 | * 5-minutes job - 2 vCPU required 62 | 63 | The default behaviour is the one defined in the NodePool: `consolidationPolicy: WhenEmptyOrUnderutilized`. Karpenter identifies that the **c6g.4xlarge** (16 vCPU) is underutilized and performs a consolidation replacement of the node. It launches a cheaper and smaller node: a **c6g.2xlarge** (8 vCPU) instance.
You can check these logs by executing the following command in another terminal: 64 | 65 | ```sh 66 | kubectl -n karpenter logs -l app.kubernetes.io/name=karpenter --all-containers=true -f --tail=20 67 | ``` 68 | 69 | You should see these logs: 70 | 71 | ```json 72 | {"level":"INFO","time":"2025-05-30T10:15:01.605Z","logger":"controller","message":"disrupting node(s)","commit":"9458bb5","controller":"disruption","namespace":"","name":"","reconcileID":"f44d738f-e895-428b-b0ec-f1b5e5a96996","command-id":"dae73246-b739-42d8-91b8-c80ee651b6ac","reason":"underutilized","decision":"replace","disrupted-node-count":1,"replacement-node-count":1,"pod-count":2,"disrupted-nodes":[{"Node":{"name":"ip-10-0-116-149.eu-west-2.compute.internal"},"NodeClaim":{"name":"default-8t7np"},"capacity-type":"on-demand","instance-type":"c6g.4xlarge"}],"replacement-nodes":[{"capacity-type":"on-demand","instance-types":"c6g.2xlarge, c7g.2xlarge, m6g.2xlarge, c6a.2xlarge, c5a.2xlarge and 36 other(s)"}]} 73 | 74 | ... 75 | {"level":"INFO","time":"2025-05-30T10:10:49.907Z","logger":"controller","message":"launched nodeclaim","commit":"9458bb5","controller":"nodeclaim.lifecycle","controllerGroup":"karpenter.sh","controllerKind":"NodeClaim","NodeClaim":{"name":"default-8t7np"},"namespace":"","name":"default-8t7np","reconcileID":"08263c3f-5565-4916-8932-db4596bd1f40","provider-id":"aws:///eu-west-2c/i-0f4e940ab58541307","instance-type":"c6g.4xlarge","zone":"eu-west-2c","capacity-type":"on-demand","allocatable":{"cpu":"15890m","ephemeral-storage":"17Gi","memory":"27322Mi","pods":"234","vpc.amazonaws.com/pod-eni":"54"}} 76 | ``` 77 | 78 | The NGINX server and the 5-min job pods are rescheduled into the new c6g.2xlarge node, so **the job is restarted**, which will cause a disruption the job might not be prepared to handle like doing a checkpoint. 79 | 80 | After five more minutes, the job will finish, and Karpenter will replace the node with a **c6g.xlarge** instance (4 vCPU) for the NGINX server. You can repeat the previous steps to verify this behaviour. 81 | 82 | To clean up, execute: 83 | 84 | ```sh 85 | kubectl delete -f workloads-evicted.yaml 86 | ``` 87 | 88 | To learn how to avoid this behaviour and wait for the job to be finished before replacing the node, go to [Preventing jobs of being evicted](#preventing-jobs-of-being-evicted). 89 | 90 | ### Preventing jobs of being evicted 91 | 92 | If you executed the [optional](#optional-simulating-the-default-behaviour) part, make sure to delete the `workloads-evicted` deployment: 93 | 94 | ```sh 95 | kubectl delete -f workloads-evicted.yaml 96 | ``` 97 | 98 | Let's start by deploying the workloads defined in the [workloads-not-evicted yaml](/blueprints/batch-jobs/workloads-not-evicted.yaml): 99 | 100 | ```sh 101 | $> kubectl apply -f workloads-not-evicted.yaml 102 | deployment.apps/nginx created 103 | job.batch/2-min-job created 104 | job.batch/5-min-job created 105 | ``` 106 | 107 | This will create three pods that require **11 vCPU** in total: 108 | * NGINX server - 2 vCPU required 109 | * 2-minutes job - 7 vCPU required 110 | * 5-minutes job - 2 vCPU required 111 | 112 | If you explore the [workloads-not-evicted yaml](/blueprints/batch-jobs/workloads-not-evicted.yaml), the `karpenter.sh/do-not-disrupt: "true"` annotations have been added to both jobs specifications. 113 | 114 | Go to [Results section](#results) to check the behaviour. 
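For reference, this is roughly where the annotation sits in a Job manifest; a minimal sketch with made-up names and sizes (the actual jobs are defined in `workloads-not-evicted.yaml`). Note that the annotation goes on the Pod template's metadata, not on the Job object itself:

```yaml
apiVersion: batch/v1
kind: Job
metadata:
  name: example-job # hypothetical name, for illustration only
spec:
  template:
    metadata:
      annotations:
        karpenter.sh/do-not-disrupt: "true" # opts this Job's pod out of voluntary disruption
    spec:
      restartPolicy: Never
      nodeSelector:
        karpenter.sh/capacity-type: on-demand
      containers:
        - name: work
          image: public.ecr.aws/docker/library/busybox:stable
          command: ["sh", "-c", "sleep 300"] # simulates a 5-minute batch job
          resources:
            requests:
              cpu: "2"
```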
115 | 116 | ***NOTE:*** 117 | The sample deployment only allows scheduling pods on on-demand instances (`nodeSelector: karpenter.sh/capacity-type: on-demand`) to show the replace consolidation mechanism, as for spot nodes Karpenter only uses the deletion consolidation mechanism to avoid breaking the price-capacity-optimized strategy, as explained [here](https://karpenter.sh/preview/concepts/disruption/#consolidation). 118 | 119 | ## Results 120 | 121 | ### Deployment verification 122 | 123 | Karpenter launches the cheapest EC2 instance for the workloads with at least **11 vCPU**: a **c6g.4xlarge** on-demand instance (16 vCPU, 32 GiB). You can check this by executing: 124 | 125 | ```sh 126 | kubectl get nodes --label-columns node.kubernetes.io/instance-type 127 | ``` 128 | 129 | You should see something similar to this, where a new node just appeared: 130 | 131 | ```console 132 | NAME STATUS ROLES AGE VERSION INSTANCE-TYPE 133 | ip-10-0-125-209.eu-west-1.compute.internal Ready 16d v1.32.3-eks-473151a m4.large 134 | ip-10-0-46-139.eu-west-1.compute.internal Ready 16d v1.32.3-eks-473151a m4.large 135 | ip-10-0-47-60.eu-west-1.compute.internal Ready 44s v1.32.3-eks-473151a c6g.4xlarge 136 | ``` 137 | 138 | Check the three new pods are running by executing: 139 | 140 | ```sh 141 | $> kubectl get pods 142 | NAME READY STATUS RESTARTS AGE 143 | 2-min-job-ml6qj 1/1 Running 0 25s 144 | 5-min-job-9jc4b 1/1 Running 0 24s 145 | nginx-8467c776-bbl8w 1/1 Running 0 25s 146 | ``` 147 | 148 | You can check the jobs status by executing: 149 | 150 | ```sh 151 | $> kubectl get jobs 152 | NAME COMPLETIONS DURATION AGE 153 | 2-min-job 0/1 52s 52s 154 | 5-min-job 0/1 51s 51s 155 | $> kubectl get jobs 156 | NAME COMPLETIONS DURATION AGE 157 | 2-min-job 0/1 52s 52s 158 | 5-min-job 0/1 51s 51s 159 | ``` 160 | 161 | In a different terminal, execute the following command that will display the Karpenter logs in real time: 162 | 163 | ```sh 164 | kubectl -n karpenter logs -l app.kubernetes.io/name=karpenter --all-containers=true -f --tail=20 165 | ``` 166 | 167 | You should see the following events indicating that Karpenter identified the need of a new node, and that it selected an instance type and purchase option: 168 | 169 | ```json 170 | {"level":"INFO","time":"2024-08-16T10:10:47.683Z","logger":"controller","message":"found provisionable pod(s)","commit":"5bdf9c3","controller":"provisioner","namespace":"","name":"","reconcileID":"d8e8907d-5b93-46bb-893a-63520f3ec12f","Pods":"default/2-min-job-czp5x","duration":"39.859328ms"} 171 | 172 | {"level":"INFO","time":"2024-08-16T10:10:47.683Z","logger":"controller","message":"computed new nodeclaim(s) to fit pod(s)","commit":"5bdf9c3","controller":"provisioner","namespace":"","name":"","reconcileID":"d8e8907d-5b93-46bb-893a-63520f3ec12f","nodeclaims":1,"pods":1} 173 | 174 | {"level":"INFO","time":"2024-08-16T10:10:47.699Z","logger":"controller","message":"created nodeclaim","commit":"5bdf9c3","controller":"provisioner","namespace":"","name":"","reconcileID":"d8e8907d-5b93-46bb-893a-63520f3ec12f","NodePool":{"name":"default"},"NodeClaim":{"name":"default-g4kgp"},"requests":{"cpu":"7260m","memory":"290Mi","pods":"6"},"instance-types":"c4.2xlarge, c5.2xlarge, c5.4xlarge, c5a.2xlarge, c5a.4xlarge and 55 other(s)"} 175 | ... 
176 | {"level":"INFO","time":"2024-08-16T10:10:49.959Z","logger":"controller","message":"launched nodeclaim","commit":"5bdf9c3","controller":"nodeclaim.lifecycle","controllerGroup":"karpenter.sh","controllerKind":"NodeClaim","NodeClaim":{"name":"default-g4kgp"},"namespace":"","name":"default-g4kgp","reconcileID":"ff5b7f6e-c52e-495e-94b1-3a30385c3439","provider-id":"aws:///eu-west-2a/i-022a05d79bceda579","instance-type":"c6g.2xlarge","zone":"eu-west-2a","capacity-type":"on-demand","allocatable":{"cpu":"7910m","ephemeral-storage":"17Gi","memory":"14103Mi","pods":"58","vpc.amazonaws.com/pod-eni":"38"}} 177 | 178 | ``` 179 | 180 | ### Consolidation Replace blocked due to ongoing job 181 | 182 | Around two minutes after the deployment, the first job finishes: 183 | 184 | ```sh 185 | $> kubectl get jobs 186 | NAME COMPLETIONS DURATION AGE 187 | 2-min-job 1/1 2m41s 2m46s 188 | 5-min-job 0/1 2m45s 2m45s 189 | ``` 190 | 191 | The pod executing the job is terminated. Now you should just see two pods, one for the NGINX server and one for the other 5-minutes job: 192 | 193 | ```sh 194 | $> kubectl get pods 195 | NAME READY STATUS RESTARTS AGE 196 | 5-min-job-9jc4b 1/1 Running 0 2m56s 197 | nginx-8467c776-bbl8w 1/1 Running 0 2m57s 198 | ``` 199 | 200 | Now, the total number of vCPU required by the running pods are **4 vCPU**: 201 | * NGINX server - 2 vCPU required 202 | * 5-minutes job - 2 vCPU required 203 | 204 | In contrast to the default behaviour, even though a smaller and cheaper instance could be used, Karpenter reads the `karpenter.sh/do-not-disrupt: "true"` annotation on the 5-minutes job pod and **blocks the consolidation replace** process for that node: 205 | 206 | ```sh 207 | $> kubectl describe node 208 | ... 209 | Normal NodeReady 6m7s kubelet Node ip-10-0-97-15.eu-west-1.compute.internal status is now: NodeReady 210 | Normal DisruptionBlocked 4m12s karpenter Cannot disrupt Node: pod "default/2-min-job-2fssd" has "karpenter.sh/do-not-disrupt" annotation 211 | Normal DisruptionBlocked 2m12s karpenter Cannot disrupt Node: pod "default/5-min-job-7pqdt" has "karpenter.sh/do-not-disrupt" annotation 212 | ``` 213 | 214 | ### Consolidation Replace allowed after last job finishes 215 | 216 | Around five minutes after the deployment, the other job finishes: 217 | 218 | ```sh 219 | $> kubectl get jobs 220 | NAME COMPLETIONS DURATION AGE 221 | 5-min-job 1/1 5m40s 5m46s 222 | ``` 223 | 224 | Now, **it is possible to replace the node** by a cheaper and smaller instance because the the NGINX server can be disrupted as it does't contain the `karpenter.sh/do-not-disrupt: "true"` annotation. You can check this in the Karpenter logs terminal: 225 | 226 | ```json 227 | {"level":"INFO","time":"2024-08-16T10:17:21.322Z","logger":"controller","message":"created nodeclaim","commit":"5bdf9c3","controller":"disruption","namespace":"","name":"","reconcileID":"1135db0e-45ef-4529-9492-63789a9837c6","NodePool":{"name":"default"},"NodeClaim":{"name":"default-9m4bv"},"requests":{"cpu":"2260m","memory":"290Mi","pods":"6"},"instance-types":"c4.xlarge, c5.xlarge, c5a.xlarge, c5d.xlarge, c5n.xlarge and 32 other(s)"} 228 | ... 
229 | {"level":"INFO","time":"2024-08-16T10:17:23.452Z","logger":"controller","message":"launched nodeclaim","commit":"5bdf9c3","controller":"nodeclaim.lifecycle","controllerGroup":"karpenter.sh","controllerKind":"NodeClaim","NodeClaim":{"name":"default-9m4bv"},"namespace":"","name":"default-9m4bv","reconcileID":"f0e0cc47-45a9-479c-a1c7-b5f0f0341026","provider-id":"aws:///eu-west-2a/i-0a4fa068af5550afa","instance-type":"c6g.xlarge","zone":"eu-west-2a","capacity-type":"on-demand","allocatable":{"cpu":"3920m","ephemeral-storage":"17Gi","memory":"6525Mi","pods":"58","vpc.amazonaws.com/pod-eni":"18"}} 230 | ... 231 | {"level":"INFO","time":"2024-08-16T10:18:07.430Z","logger":"controller","message":"tainted node","commit":"5bdf9c3","controller":"node.termination","controllerGroup":"","controllerKind":"Node","Node":{"name":"ip-10-0-42-175.eu-west-2.compute.internal"},"namespace":"","name":"ip-10-0-42-175.eu-west-2.compute.internal","reconcileID":"a57044a6-f00f-41e5-a1ab-31e4b19dd838","taint.Key":"karpenter.sh/disrupted","taint.Value":"","taint.Effect":"NoSchedule"} 232 | 233 | {"level":"INFO","time":"2024-08-16T10:18:50.331Z","logger":"controller","message":"deleted node","commit":"5bdf9c3","controller":"node.termination","controllerGroup":"","controllerKind":"Node","Node":{"name":"ip-10-0-42-175.eu-west-2.compute.internal"},"namespace":"","name":"ip-10-0-42-175.eu-west-2.compute.internal","reconcileID":"2a51acf8-702f-4c75-988d-92052d690b01"} 234 | ``` 235 | 236 | Karpenter replaces the **c6g.4xlarge** (16 vCPU, 32 GiB) with a **c6g.xlarge** node (4 vCPU, 8 GiB), enough for the NGINX server: 237 | 238 | ```sh 239 | $> kubectl get nodes --label-columns node.kubernetes.io/instance-type 240 | NAME STATUS ROLES AGE VERSION INSTANCE-TYPE 241 | ip-10-0-105-122.eu-west-2.compute.internal Ready 10m v1.32.3-eks-473151a m4.large 242 | ip-10-0-34-49.eu-west-2.compute.internal Ready 10m v1.32.3-eks-473151a m4.large 243 | ip-10-0-85-30.eu-west-1.compute.internal Ready 10m v1.32.3-eks-473151a c6g.xlarge 244 | ``` 245 | 246 | Finally, you can check that the NGINX server pod has been re-scheduled onto the new node: 247 | 248 | ```sh 249 | $> kubectl get pods 250 | NAME READY STATUS RESTARTS AGE 251 | nginx-8467c776-vjwgv 1/1 Running 0 22s 252 | ``` 253 | 254 | ## Cleanup 255 | 256 | ```sh 257 | kubectl delete -f . 258 | ``` 259 | -------------------------------------------------------------------------------- /blueprints/disruption-budgets/README.md: -------------------------------------------------------------------------------- 1 | # Karpenter Blueprint: NodePool Disruption Budgets 2 | 3 | ## Purpose 4 | 5 | Karpenter's actions, like consolidation, drift detection, and `expireAfter`, allow users to optimize for cost (in the case of consolidation), keep up with the latest security patches and desired configuration, or ensure governance best practices, like refreshing instances every N days. As a trade-off, these actions cause some level of expected disruption in the cluster. To control the trade-off between, for example, being on the latest AMI (drift detection) and nodes restarting when that happens, we can use disruption controls and configure `disruption budgets` in the Karpenter `NodePool` configuration. If no disruption budget is configured, there is a default budget with `nodes: 10%`. When calculating if a budget will block nodes from disruption, Karpenter checks if the number of nodes being deleted is greater than the number of allowed disruptions.
Budgets take into consideration voluntary disruptions through expiration, drift, emptiness and consolidation. If there are multiple budgets defined in the `NodePool`, Karpenter will honour the most restrictive of the budgets. 6 | 7 | By applying a combination of disruptions budgets and Pod Disruptions Budgets (PDBs) you get both application and platform voluntary disruption controls, this can help you move towards continually operations to protect workload availability. You can learn more about Karpenter NodePool disruption budgets and how the Kapenter disruption controller works in the [Karpenter documentation](https://karpenter.sh/docs/concepts/disruption/#disruption-controller). 8 | 9 | ## Examples 10 | 11 | The following provides a set of example disruption budgets: 12 | 13 | ### Limit Disruptions to a Percentage of Nodes 14 | 15 | To prevent disruptions from affecting more than a certain percentage of nodes in a NodePool 16 | 17 | The following Disruption Budgets says, at any-point in time only disrupt 20% of the Nodes managed by the NodePool. For instance, if there were 19 nodes owned by the NodePool, 4 disruptions would be allowed, rounding up from 19 * .2 = 3.8. 18 | 19 | ```yaml 20 | apiVersion: karpenter.sh/v1 21 | kind: NodePool 22 | metadata: 23 | name: default 24 | spec: 25 | ... 26 | disruption: 27 | consolidationPolicy: WhenEmptyOrUnderutilized 28 | budgets: 29 | - nodes: "20%" 30 | template: 31 | spec: 32 | expireAfter: 720h # 30 days 33 | ``` 34 | 35 | ### No Disruptions During Peak Hours 36 | 37 | This configuration ensures that Karpenter avoids disrupting workloads during peak traffic periods. Specifically, it prevents disruptions from UTC 9:00 for an 8-hour window and limits disruptions to 20% outside of this window. 38 | 39 | ```yaml 40 | apiVersion: karpenter.sh/v1 41 | kind: NodePool 42 | metadata: 43 | name: default 44 | spec: 45 | disruption: 46 | consolidationPolicy: WhenEmptyOrUnderutilized 47 | consolidateAfter: 1m 48 | budgets: 49 | - nodes: "0" 50 | schedule: "0 9 * * *" 51 | duration: 8h 52 | - nodes: "20%" 53 | schedule: "0 17 * * *" 54 | duration: 16h 55 | ``` 56 | 57 | ### Allow 20% disruptions during a maintenance window from UTC 22:00 to 2:00, but only 10% disruptions outside of a maintenance window 58 | 59 | By setting multiple disruption budgets, you can gain precise control over node disruptions. Karpenter will use the most restrictive budget applicable at any given time. 60 | 61 | In the following example, disruptions are limited to 20% of nodes during a 4-hour period starting from UTC 22:00. During the remaining hours (UTC 2:00 - 22:00), disruptions are limited to 10% of nodes. 62 | 63 | ```yaml 64 | apiVersion: karpenter.sh/v1 65 | kind: NodePool 66 | metadata: 67 | name: default 68 | spec: 69 | disruption: 70 | consolidationPolicy: WhenEmptyOrUnderutilized 71 | consolidateAfter: 1m 72 | budgets: 73 | - nodes: "20%" 74 | schedule: "0 22 * * *" 75 | duration: 4h 76 | - nodes: "10%" 77 | schedule: "0 2 * * *" 78 | duration: 20h 79 | ``` 80 | 81 | ### Multiple Budgets Defined 82 | 83 | The following configuration illustrates a NodePool with three disruption budgets: 84 | 85 | The first budget allows up to 20% of nodes to be disrupted at any time. 86 | The second budget imposes a maximum of 5 disruptions. 87 | The third budget blocks all disruptions during the first 10 minutes of each day. 88 | 89 | While the first and second budgets are always in effect, they work together to limit disruptions to a maximum of 5 nodes at any given time. 
Karpenter will apply the most restrictive budget when multiple budgets overlap, enabling flexible disruption policies for different scenarios, such as during maintenance windows. 90 | 91 | > **Note:** If multiple budgets are active at the same time, Karpenter will consider the most restrictive budget. You might consider using multiple disruption budgets to establish a default policy while providing an alternative policy for specific times, such as allowing more disruptions during maintenance windows to roll out new Amazon Machine Images faster. 92 | 93 | ```yaml 94 | apiVersion: karpenter.sh/v1 95 | kind: NodePool 96 | metadata: 97 | name: default 98 | spec: 99 | disruption: 100 | consolidationPolicy: WhenEmptyOrUnderutilized 101 | consolidateAfter: 1m 102 | budgets: 103 | - nodes: "20%" 104 | - nodes: "5" 105 | - nodes: "0" 106 | schedule: "@daily" 107 | duration: 10m 108 | ``` 109 | 110 | ### Disrupting by Reasons 111 | 112 | Karpenter allows specifying if a budget applies to any of `Drifted`, `Underutilized`, or `Empty`. When a budget has no reasons, it’s assumed that it applies to all reasons. When calculating allowed disruptions for a given reason, Karpenter will take the minimum of the budgets that have listed the reason or have left reasons undefined. 113 | 114 | #### Only Drifted Nodes 115 | 116 | This example sets a budget that applies only to nodes classified as Drifted. During times when nodes are identified as Drifted, Karpenter will only disrupt up to 20% of those nodes. 117 | 118 | ```yaml 119 | apiVersion: karpenter.sh/v1 120 | kind: NodePool 121 | metadata: 122 | name: example-drifted 123 | spec: 124 | disruption: 125 | consolidationPolicy: WhenEmptyOrUnderutilized 126 | budgets: 127 | - nodes: "20%" 128 | reasons: 129 | - "Drifted" 130 | ``` 131 | 132 | #### Only Underutilized Nodes 133 | 134 | This example sets a budget that applies only to nodes classified as Underutilized. During times when nodes are identified as Underutilized, Karpenter will only disrupt up to 30% of those nodes. 135 | 136 | ```yaml 137 | apiVersion: karpenter.sh/v1 138 | kind: NodePool 139 | metadata: 140 | name: example-underutilized 141 | spec: 142 | disruption: 143 | consolidationPolicy: WhenEmptyOrUnderutilized 144 | budgets: 145 | - nodes: "30%" 146 | reasons: 147 | - "Underutilized" 148 | ``` 149 | 150 | #### Only Empty Nodes 151 | 152 | This example sets a budget that applies only to nodes classified as Empty. During times when nodes are identified as Empty, Karpenter will only disrupt up to 10% of those nodes. 153 | 154 | ```yaml 155 | apiVersion: karpenter.sh/v1 156 | kind: NodePool 157 | metadata: 158 | name: example-empty 159 | spec: 160 | disruption: 161 | consolidationPolicy: WhenEmptyOrUnderutilized 162 | budgets: 163 | - nodes: "10%" 164 | reasons: 165 | - "Empty" 166 | ``` 167 | 168 | ## Requirements 169 | 170 | * A Kubernetes cluster with Karpenter installed. You can use the blueprint we've used to test this pattern at the cluster folder in the root of this repository. 171 | 172 | ## Deploy 173 | 174 | Let's say you want to control how nodes are upgraded when switching to Bottlerocket via Karpenter Drift, in this example we deploy a disruption budget, that prevents disruptions 24 hours a day 7 days a week. You can use the schedule and duration of the budget to control when disruptions via Drift can take place. 
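For instance, a sketch of how you could adapt this blueprint's budget so Drift is only allowed during a nightly maintenance window, following the examples above (illustrative only; the blueprint's own `disruption-budgets.yaml` ships with the fully blocking budget shown in the Results section):

```yaml
disruption:
  consolidationPolicy: WhenEmptyOrUnderutilized
  budgets:
    # Block all voluntary disruptions from UTC 02:00 for 20 hours...
    - nodes: "0"
      schedule: "0 2 * * *"
      duration: 20h
    # ...so outside that window (UTC 22:00-02:00) only this budget applies and
    # up to 20% of nodes can be disrupted, e.g. to roll out a new AMI via Drift.
    - nodes: "20%"
```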
If you're using the Terraform template provided in this repo, run the following commands to get the EKS cluster name and the IAM Role name for the Karpenter nodes:

```sh
export CLUSTER_NAME=$(terraform -chdir="../../cluster/terraform" output -raw cluster_name)
export KARPENTER_NODE_IAM_ROLE_NAME=$(terraform -chdir="../../cluster/terraform" output -raw node_instance_role_name)
```

> ***NOTE***: If you're not using Terraform, you need to get those values manually. `CLUSTER_NAME` is the name of your EKS cluster (not the ARN). Karpenter auto-generates the [instance profile](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use_switch-role-ec2_instance-profiles) in your `EC2NodeClass` given the role that you specify in [spec.role](https://karpenter.sh/preview/concepts/nodeclasses/) with the placeholder `KARPENTER_NODE_IAM_ROLE_NAME`, which is a way to pass a single IAM role to the EC2 instances launched by the Karpenter `NodePool`. Typically, the instance profile name is the same as the IAM role name (not the ARN).

To deploy the Karpenter NodePool and the sample workload, simply run these commands:

```sh
sed -i '' "s/<<CLUSTER_NAME>>/$CLUSTER_NAME/g" disruption-budgets.yaml
sed -i '' "s/<<KARPENTER_NODE_IAM_ROLE_NAME>>/$KARPENTER_NODE_IAM_ROLE_NAME/g" disruption-budgets.yaml
kubectl apply -f .
```

You should see the following output:

```console
nodepool.karpenter.sh/disruption-budget created
ec2nodeclass.karpenter.k8s.aws/disruption-budget created
deployment.apps/disruption-budget created
```

You should now see new nodes provisioned in your Amazon EKS cluster:

```sh
> kubectl get nodes
NAME                                         STATUS   ROLES    AGE     VERSION
ip-10-0-103-232.eu-west-2.compute.internal   Ready    <none>   2m8s    v1.32.2-eks-677bac1
ip-10-0-120-141.eu-west-2.compute.internal   Ready    <none>   2m44s   v1.32.2-eks-677bac1
ip-10-0-38-179.eu-west-2.compute.internal    Ready    <none>   2m8s    v1.32.2-eks-677bac1
ip-10-0-39-106.eu-west-2.compute.internal    Ready    <none>   2m18s   v1.32.2-eks-677bac1
ip-10-0-50-60.eu-west-2.compute.internal     Ready    <none>   17m     v1.32.2-eks-677bac1
ip-10-0-55-94.eu-west-2.compute.internal     Ready    <none>   2m47s   v1.32.2-eks-677bac1
ip-10-0-63-247.eu-west-2.compute.internal    Ready    <none>   2m40s   v1.32.2-eks-677bac1
ip-10-0-66-70.eu-west-2.compute.internal     Ready    <none>   17m     v1.32.2-eks-677bac1
ip-10-0-72-85.eu-west-2.compute.internal     Ready    <none>   2m50s   v1.32.2-eks-677bac1
ip-10-0-82-100.eu-west-2.compute.internal    Ready    <none>   2m29s   v1.32.2-eks-677bac1
ip-10-0-95-228.eu-west-2.compute.internal    Ready    <none>   2m19s   v1.32.2-eks-677bac1
ip-10-0-96-121.eu-west-2.compute.internal    Ready    <none>   2m26s   v1.32.2-eks-677bac1
```

Now, use the `kubectl patch` command to change the `spec.amiSelectorTerms` alias from `al20232023.0.20230222` to `bottlerocket@v1.39.1`:

```sh
kubectl patch ec2nodeclass disruption-budget --type='json' -p='[
  {"op": "replace", "path": "/spec/amiSelectorTerms/0/alias", "value": "bottlerocket@v1.39.1"}
]'
```

## Results

This is an example of an overly restrictive budget for demo purposes, as it prevents any voluntary disruptions via expiration, drift, emptiness, and consolidation. We learn from this that the schedule states when the budget first becomes active and the duration specifies how long it stays active; a duration must be specified if a schedule is set, otherwise the budget is always active.
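To illustrate that rule, the two hypothetical budgets below show the difference: the first has neither `schedule` nor `duration`, so it is always active, while the second is only active for the four hours following UTC 22:00 each day.

```yaml
budgets:
# Always active: no schedule or duration specified
- nodes: "10%"
# Only active from UTC 22:00 for 4 hours each day;
# a duration is required whenever a schedule is set
- nodes: "0"
  schedule: "0 22 * * *"
  duration: 4h
```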
Karpenter will try to replace nodes via the Drift mechanism when the AMI changes. However, if you watch the nodes, you'll notice that they're not being replaced with new instances provisioned from the Bottlerocket Amazon EKS optimized AMI.

```sh
> kubectl get nodes -o wide -w

NAME                                         STATUS   ROLES    AGE     VERSION               INTERNAL-IP    EXTERNAL-IP   OS-IMAGE                       KERNEL-VERSION                    CONTAINER-RUNTIME
ip-10-0-103-232.eu-west-2.compute.internal   Ready    <none>   3m22s   v1.32.2-eks-677bac1   10.0.103.232   <none>        Amazon Linux 2                 5.10.223-211.872.amzn2.aarch64    containerd://1.7.20
ip-10-0-120-141.eu-west-2.compute.internal   Ready    <none>   3m58s   v1.32.2-eks-677bac1   10.0.120.141   <none>        Amazon Linux 2                 5.10.223-211.872.amzn2.aarch64    containerd://1.7.20
ip-10-0-38-179.eu-west-2.compute.internal    Ready    <none>   3m22s   v1.32.2-eks-677bac1   10.0.38.179    <none>        Amazon Linux 2                 5.10.223-211.872.amzn2.aarch64    containerd://1.7.20
ip-10-0-39-106.eu-west-2.compute.internal    Ready    <none>   3m32s   v1.32.2-eks-677bac1   10.0.39.106    <none>        Amazon Linux 2                 5.10.223-211.872.amzn2.aarch64    containerd://1.7.20
ip-10-0-50-60.eu-west-2.compute.internal     Ready    <none>   18m     v1.32.2-eks-677bac1   10.0.50.60     <none>        Amazon Linux 2023.5.20240805   6.1.102-108.177.amzn2023.x86_64   containerd://1.7.20
ip-10-0-55-94.eu-west-2.compute.internal     Ready    <none>   4m1s    v1.32.2-eks-677bac1   10.0.55.94     <none>        Amazon Linux 2                 5.10.223-211.872.amzn2.aarch64    containerd://1.7.20
ip-10-0-63-247.eu-west-2.compute.internal    Ready    <none>   3m54s   v1.32.2-eks-677bac1   10.0.63.247    <none>        Amazon Linux 2                 5.10.223-211.872.amzn2.aarch64    containerd://1.7.20
ip-10-0-66-70.eu-west-2.compute.internal     Ready    <none>   18m     v1.32.2-eks-677bac1   10.0.66.70     <none>        Amazon Linux 2023.5.20240805   6.1.102-108.177.amzn2023.x86_64   containerd://1.7.20
ip-10-0-72-85.eu-west-2.compute.internal     Ready    <none>   4m4s    v1.32.2-eks-677bac1   10.0.72.85     <none>        Amazon Linux 2                 5.10.223-211.872.amzn2.x86_64     containerd://1.7.20
ip-10-0-82-100.eu-west-2.compute.internal    Ready    <none>   3m43s   v1.32.2-eks-677bac1   10.0.82.100    <none>        Amazon Linux 2                 5.10.223-211.872.amzn2.aarch64    containerd://1.7.20
ip-10-0-95-228.eu-west-2.compute.internal    Ready    <none>   3m33s   v1.32.2-eks-677bac1   10.0.95.228    <none>        Amazon Linux 2                 5.10.223-211.872.amzn2.aarch64    containerd://1.7.20
ip-10-0-96-121.eu-west-2.compute.internal    Ready    <none>   3m40s   v1.32.2-eks-677bac1   10.0.96.121    <none>        Amazon Linux 2                 5.10.223-211.872.amzn2.aarch64    containerd://1.7.20
```

You will also see the following messages in the Kubernetes events stating that disruptions are blocked:

```sh
> kubectl get events -w

0s          Normal    DisruptionBlocked   nodepool/disruption-budget   No allowed disruptions for disruption reason Drifted due to blocking budget
0s          Normal    DisruptionBlocked   nodepool/disruption-budget   No allowed disruptions for disruption reason Underutilized due to blocking budget
0s          Normal    DisruptionBlocked   nodepool/disruption-budget   No allowed disruptions for disruption reason Empty due to blocking budget
0s          Normal    DisruptionBlocked   nodepool/disruption-budget   No allowed disruptions due to blocking budget
```

This is because the NodePool defines the following budget, which states that, starting at UTC 00:00 every day and for a period of 24 hours, no nodes can be voluntarily disrupted. This is a great fit when you want consolidation but might not want to apply it all the time.

```yaml
budgets:
- nodes: "0"
  schedule: "0 0 * * *"
  duration: 24h
```

If you edit the NodePool and relax the budget as shown below, Karpenter will be able to drift up to 20% of the nodes at a time.
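Assuming only the `nodes` value changes and the schedule and duration stay as they were deployed, the relaxed budget would look roughly like this:

```yaml
budgets:
- nodes: "20%"           # now allow up to 20% of nodes to be disrupted at a time
  schedule: "0 0 * * *"  # still becomes active at UTC 00:00 every day...
  duration: 24h          # ...and stays active around the clock
```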
You can make this change with the `kubectl patch` command:

```sh
kubectl patch nodepool disruption-budget --type='json' -p='[
  {"op": "replace", "path": "/spec/disruption/budgets/0/nodes", "value": "20%"}
]'
```

After modifying the NodePool's budget, you should observe the nodes drifting and new nodes being provisioned with the Bottlerocket Amazon EKS optimized AMI.

```sh
> kubectl get nodes -o custom-columns=NAME:.metadata.name,OS-IMAGE:.status.nodeInfo.osImage

NAME                                         OS-IMAGE
ip-10-0-103-176.eu-west-2.compute.internal   Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-106-25.eu-west-2.compute.internal    Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-108-51.eu-west-2.compute.internal    Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-115-104.eu-west-2.compute.internal   Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-116-220.eu-west-2.compute.internal   Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-121-123.eu-west-2.compute.internal   Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-43-37.eu-west-2.compute.internal     Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-50-60.eu-west-2.compute.internal     Amazon Linux 2023.5.20240805
ip-10-0-57-199.eu-west-2.compute.internal    Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-59-82.eu-west-2.compute.internal     Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-62-198.eu-west-2.compute.internal    Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-62-228.eu-west-2.compute.internal    Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-66-249.eu-west-2.compute.internal    Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-66-70.eu-west-2.compute.internal     Amazon Linux 2023.5.20240805
ip-10-0-67-142.eu-west-2.compute.internal    Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-67-203.eu-west-2.compute.internal    Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-67-255.eu-west-2.compute.internal    Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-68-97.eu-west-2.compute.internal     Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-70-55.eu-west-2.compute.internal     Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-73-112.eu-west-2.compute.internal    Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-75-130.eu-west-2.compute.internal    Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-77-110.eu-west-2.compute.internal    Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-78-43.eu-west-2.compute.internal     Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-91-17.eu-west-2.compute.internal     Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-97-201.eu-west-2.compute.internal    Bottlerocket OS 1.39.1 (aws-k8s-1.32)
```

You will also see messages like the following in the Kubernetes events, stating that a node has been drifted:

```console
0s          Normal    DisruptionTerminating   node/ip-10-0-96-121.eu-west-2.compute.internal   Disrupting Node: Drifted/Delete
0s          Warning   InstanceTerminating     node/ip-10-0-96-121.eu-west-2.compute.internal   Instance is terminating
0s          Normal    RemovingNode            node/ip-10-0-96-121.eu-west-2.compute.internal   Node ip-10-0-96-121.eu-west-2.compute.internal event: Removing Node ip-10-0-96-121.eu-west-2.compute.internal from Controller
```

## Clean-up

To remove all objects created, simply run the following command:

```sh
kubectl delete -f .
```
--------------------------------------------------------------------------------