├── .gitignore
├── CODE_OF_CONDUCT.md
├── cluster
│   └── terraform
│       ├── variables.tf
│       ├── versions.tf
│       ├── outputs.tf
│       ├── karpenter.tf
│       └── main.tf
├── blueprints
│   ├── overprovision
│   │   ├── workload.yaml
│   │   ├── dummy-workload.yaml
│   │   └── README.md
│   ├── userdata
│   │   ├── workload.yaml
│   │   ├── userdata.yaml
│   │   └── README.md
│   ├── multi-ebs
│   │   ├── workload.yaml
│   │   ├── multi-ebs.yaml
│   │   └── README.md
│   ├── custom-ami
│   │   ├── workload.yaml
│   │   ├── custom-ami.yaml
│   │   └── README.md
│   ├── saving-plans
│   │   ├── workload.yaml
│   │   ├── savings-plans.yaml
│   │   └── README.md
│   ├── stateful
│   │   ├── storage.yaml
│   │   ├── workload.yaml
│   │   └── README.md
│   ├── update-nodes-with-drift
│   │   ├── workload.yaml
│   │   ├── latest-current-ami.yaml
│   │   └── README.md
│   ├── graviton
│   │   ├── workload-graviton.yaml
│   │   ├── workload-flexible.yaml
│   │   └── README.md
│   ├── od-spot-split
│   │   ├── workload.yaml
│   │   ├── od-spot.yaml
│   │   └── README.md
│   ├── disruption-budgets
│   │   ├── workload.yaml
│   │   ├── disruption-budgets.yaml
│   │   └── README.md
│   ├── ha-az-nodes
│   │   ├── workload.yaml
│   │   └── README.md
│   ├── batch-jobs
│   │   ├── workloads-evicted.yaml
│   │   ├── workloads-not-evicted.yaml
│   │   └── README.md
│   ├── soci-snapshotter
│   │   ├── workload.yaml
│   │   ├── soci-snapshotter.yaml
│   │   └── README.md
│   └── nvidia-gpu-workload
│       └── README.md
├── LICENSE
├── CONTRIBUTING.md
└── README.md

/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .terraform*
3 | terraform.tfstate*
4 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | ## Code of Conduct
2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
4 | opensource-codeofconduct@amazon.com with any additional questions or comments.
5 | -------------------------------------------------------------------------------- /cluster/terraform/variables.tf: -------------------------------------------------------------------------------- 1 | ## NOTE: It's going to use your AWS_REGION or AWS_DEFAULT_REGION environment variable, 2 | ## but you can define which on to use in terraform.tfvars file as well, or pass it as an argument 3 | ## in the CLI like this "terraform apply -var 'region=eu-west-1'" 4 | variable "region" { 5 | description = "Region to deploy the resources" 6 | type = string 7 | } 8 | -------------------------------------------------------------------------------- /cluster/terraform/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 1.3.2" 3 | 4 | required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | version = "~> 5.95" 8 | } 9 | kubernetes = { 10 | source = "hashicorp/kubernetes" 11 | version = ">= 2.30" 12 | } 13 | helm = { 14 | source = "hashicorp/helm" 15 | version = "~> 2.17" 16 | } 17 | kubectl = { 18 | source = "alekc/kubectl" 19 | version = ">= 2.1" 20 | } 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /blueprints/overprovision/workload.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: workload 5 | spec: 6 | replicas: 10 7 | selector: 8 | matchLabels: 9 | app: workload 10 | template: 11 | metadata: 12 | labels: 13 | app: workload 14 | spec: 15 | nodeSelector: 16 | intent: apps 17 | containers: 18 | - name: workload 19 | image: public.ecr.aws/eks-distro/kubernetes/pause:v1.33.0-eks-1-33-4 20 | resources: 21 | requests: 22 | cpu: 512m 23 | memory: 512Mi 24 | -------------------------------------------------------------------------------- /blueprints/userdata/workload.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: userdata 5 | spec: 6 | replicas: 3 7 | selector: 8 | matchLabels: 9 | app: userdata 10 | template: 11 | metadata: 12 | labels: 13 | app: userdata 14 | spec: 15 | nodeSelector: 16 | intent: userdata 17 | containers: 18 | - name: userdata 19 | image: public.ecr.aws/eks-distro/kubernetes/pause:v1.33.0-eks-1-33-4 20 | resources: 21 | requests: 22 | cpu: 512m 23 | memory: 512Mi 24 | -------------------------------------------------------------------------------- /blueprints/multi-ebs/workload.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: multi-ebs 5 | spec: 6 | replicas: 3 7 | selector: 8 | matchLabels: 9 | app: multi-ebs 10 | template: 11 | metadata: 12 | labels: 13 | app: multi-ebs 14 | spec: 15 | nodeSelector: 16 | intent: multi-ebs 17 | containers: 18 | - name: multi-ebs 19 | image: public.ecr.aws/eks-distro/kubernetes/pause:v1.33.0-eks-1-33-4 20 | resources: 21 | requests: 22 | cpu: 512m 23 | memory: 512Mi 24 | -------------------------------------------------------------------------------- /blueprints/custom-ami/workload.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: custom-ami 5 | spec: 6 | replicas: 3 7 | selector: 8 | matchLabels: 9 | app: custom-ami 10 | template: 11 | metadata: 12 | labels: 13 | app: custom-ami 14 | spec: 
15 | nodeSelector: 16 | intent: custom-ami 17 | containers: 18 | - name: custom-ami 19 | image: public.ecr.aws/eks-distro/kubernetes/pause:v1.33.0-eks-1-33-4 20 | resources: 21 | requests: 22 | cpu: 512m 23 | memory: 512Mi 24 | -------------------------------------------------------------------------------- /blueprints/saving-plans/workload.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: reserved-first 5 | spec: 6 | replicas: 20 7 | selector: 8 | matchLabels: 9 | app: reserved-first 10 | template: 11 | metadata: 12 | labels: 13 | app: reserved-first 14 | spec: 15 | nodeSelector: 16 | intent: apps 17 | containers: 18 | - name: reserved-first 19 | image: public.ecr.aws/eks-distro/kubernetes/pause:v1.33.0-eks-1-33-4 20 | resources: 21 | requests: 22 | cpu: 950m 23 | memory: 512Mi 24 | -------------------------------------------------------------------------------- /blueprints/stateful/storage.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: storage.k8s.io/v1 2 | kind: StorageClass 3 | metadata: 4 | name: storage-gp3 5 | provisioner: ebs.csi.aws.com 6 | parameters: 7 | type: gp3 8 | volumeBindingMode: WaitForFirstConsumer 9 | allowedTopologies: 10 | - matchLabelExpressions: 11 | - key: topology.ebs.csi.aws.com/zone 12 | values: ["<>"] 13 | --- 14 | apiVersion: v1 15 | kind: PersistentVolumeClaim 16 | metadata: 17 | name: ebs-claim 18 | spec: 19 | accessModes: 20 | - ReadWriteOnce 21 | storageClassName: storage-gp3 22 | resources: 23 | requests: 24 | storage: 4Gi 25 | -------------------------------------------------------------------------------- /blueprints/update-nodes-with-drift/workload.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: latest-current-ami 5 | spec: 6 | replicas: 3 7 | selector: 8 | matchLabels: 9 | app: latest-current-ami 10 | template: 11 | metadata: 12 | labels: 13 | app: latest-current-ami 14 | spec: 15 | nodeSelector: 16 | intent: latest-current-ami 17 | containers: 18 | - name: latest-current-ami 19 | image: public.ecr.aws/eks-distro/kubernetes/pause:v1.33.0-eks-1-33-4 20 | resources: 21 | requests: 22 | cpu: 512m 23 | memory: 512Mi 24 | -------------------------------------------------------------------------------- /blueprints/graviton/workload-graviton.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: workload-graviton 5 | spec: 6 | replicas: 5 7 | selector: 8 | matchLabels: 9 | app: workload-graviton 10 | template: 11 | metadata: 12 | labels: 13 | app: workload-graviton 14 | spec: 15 | nodeSelector: 16 | intent: apps 17 | kubernetes.io/arch: arm64 18 | containers: 19 | - name: workload-flexible 20 | image: public.ecr.aws/eks-distro/kubernetes/pause:v1.33.0-eks-1-33-4 21 | imagePullPolicy: Always 22 | resources: 23 | requests: 24 | cpu: 512m 25 | memory: 512Mi 26 | -------------------------------------------------------------------------------- /blueprints/graviton/workload-flexible.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: workload-flexible 5 | spec: 6 | replicas: 5 7 | selector: 8 | matchLabels: 9 | app: workload-flexible 10 | template: 11 | metadata: 12 | labels: 13 | app: 
workload-flexible 14 | spec: 15 | nodeSelector: 16 | intent: apps 17 | karpenter.sh/capacity-type: on-demand 18 | containers: 19 | - name: workload-flexible 20 | image: public.ecr.aws/eks-distro/kubernetes/pause:v1.33.0-eks-1-33-4 21 | imagePullPolicy: Always 22 | resources: 23 | requests: 24 | cpu: 512m 25 | memory: 512Mi 26 | -------------------------------------------------------------------------------- /cluster/terraform/outputs.tf: -------------------------------------------------------------------------------- 1 | output "configure_kubectl" { 2 | description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" 3 | value = "aws eks --region ${var.region} update-kubeconfig --name ${module.eks.cluster_name}" 4 | } 5 | 6 | output "cluster_name" { 7 | description = "Cluster name of the EKS cluster" 8 | value = module.eks.cluster_name 9 | } 10 | 11 | output "vpc_id" { 12 | description = "VPC ID that the EKS cluster is using" 13 | value = module.vpc.vpc_id 14 | } 15 | 16 | output "node_instance_role_name" { 17 | description = "IAM Role name that each Karpenter node will use" 18 | value = local.name 19 | } 20 | -------------------------------------------------------------------------------- /blueprints/overprovision/dummy-workload.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: scheduling.k8s.io/v1 2 | kind: PriorityClass 3 | metadata: 4 | name: overprovisioning 5 | value: -10 6 | globalDefault: false 7 | description: "Priority class used by overprovisioning." 8 | --- 9 | apiVersion: apps/v1 10 | kind: Deployment 11 | metadata: 12 | name: dummy-workload 13 | spec: 14 | replicas: 10 15 | selector: 16 | matchLabels: 17 | app: workload 18 | template: 19 | metadata: 20 | labels: 21 | app: workload 22 | spec: 23 | nodeSelector: 24 | intent: apps 25 | containers: 26 | - name: workload 27 | image: public.ecr.aws/eks-distro/kubernetes/pause:v1.33.0-eks-1-33-4 28 | resources: 29 | requests: 30 | cpu: 512m 31 | memory: 512Mi 32 | priorityClassName: overprovisioning 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT No Attribution 2 | 3 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 13 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 14 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 15 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 16 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
17 | -------------------------------------------------------------------------------- /blueprints/stateful/workload.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: stateful 5 | spec: 6 | replicas: 3 7 | selector: 8 | matchLabels: 9 | app: stateful 10 | template: 11 | metadata: 12 | labels: 13 | app: stateful 14 | spec: 15 | terminationGracePeriodSeconds: 0 16 | nodeSelector: 17 | intent: apps 18 | containers: 19 | - name: stateful 20 | image: public.ecr.aws/docker/library/centos:centos7.9.2009 21 | command: ["/bin/sh"] 22 | args: ["-c", "while true; do echo Writing content every three minutes! Printing a random number: $(( $RANDOM % 1000 + 1 ))>> /data/out.txt; sleep 180; done"] 23 | volumeMounts: 24 | - name: persistent-storage 25 | mountPath: /data 26 | resources: 27 | requests: 28 | cpu: 1 29 | volumes: 30 | - name: persistent-storage 31 | persistentVolumeClaim: 32 | claimName: ebs-claim 33 | -------------------------------------------------------------------------------- /blueprints/od-spot-split/workload.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: workload-split 5 | spec: 6 | replicas: 10 7 | selector: 8 | matchLabels: 9 | app: workload-split 10 | template: 11 | metadata: 12 | labels: 13 | app: workload-split 14 | spec: 15 | nodeSelector: 16 | intent: apps 17 | tolerations: 18 | - key: "intent" 19 | operator: "Equal" 20 | value: "workload-split" 21 | effect: "NoSchedule" 22 | containers: 23 | - name: workload-split 24 | image: public.ecr.aws/eks-distro/kubernetes/pause:v1.33.0-eks-1-33-4 25 | imagePullPolicy: Always 26 | resources: 27 | requests: 28 | cpu: 512m 29 | memory: 512Mi 30 | topologySpreadConstraints: 31 | - labelSelector: 32 | matchLabels: 33 | app: workload-split 34 | maxSkew: 1 35 | topologyKey: capacity-spread 36 | whenUnsatisfiable: DoNotSchedule 37 | -------------------------------------------------------------------------------- /blueprints/disruption-budgets/workload.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: disruption-budget 5 | spec: 6 | replicas: 30 7 | selector: 8 | matchLabels: 9 | intent: disruption-budget 10 | template: 11 | metadata: 12 | labels: 13 | intent: disruption-budget 14 | spec: 15 | nodeSelector: 16 | intent: disruption-budget 17 | containers: 18 | - name: disruption-budget 19 | image: public.ecr.aws/eks-distro/kubernetes/pause:v1.33.0-eks-1-33-4 20 | imagePullPolicy: Always 21 | resources: 22 | requests: 23 | cpu: 512m 24 | memory: 512Mi 25 | topologySpreadConstraints: 26 | - labelSelector: 27 | matchLabels: 28 | intent: disruption-budget 29 | maxSkew: 1 30 | topologyKey: kubernetes.io/hostname 31 | whenUnsatisfiable: ScheduleAnyway 32 | - labelSelector: 33 | matchLabels: 34 | intent: disruption-budget 35 | maxSkew: 1 36 | topologyKey: topology.kubernetes.io/zone 37 | whenUnsatisfiable: ScheduleAnyway 38 | -------------------------------------------------------------------------------- /blueprints/ha-az-nodes/workload.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: workload-multi-az-nodes 5 | spec: 6 | replicas: 30 7 | selector: 8 | matchLabels: 9 | app: workload-multi-az-nodes 10 | template: 11 | metadata: 12 | 
labels: 13 | app: workload-multi-az-nodes 14 | spec: 15 | nodeSelector: 16 | intent: apps 17 | containers: 18 | - name: workload-multi-az-nodes 19 | image: public.ecr.aws/eks-distro/kubernetes/pause:v1.33.0-eks-1-33-4 20 | imagePullPolicy: Always 21 | resources: 22 | requests: 23 | cpu: 512m 24 | memory: 512Mi 25 | topologySpreadConstraints: 26 | - labelSelector: 27 | matchLabels: 28 | app: workload-multi-az-nodes 29 | maxSkew: 1 30 | topologyKey: kubernetes.io/hostname 31 | whenUnsatisfiable: ScheduleAnyway 32 | - labelSelector: 33 | matchLabels: 34 | app: workload-multi-az-nodes 35 | maxSkew: 1 36 | topologyKey: topology.kubernetes.io/zone 37 | whenUnsatisfiable: ScheduleAnyway 38 | -------------------------------------------------------------------------------- /blueprints/saving-plans/savings-plans.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: karpenter.sh/v1 2 | kind: NodePool 3 | metadata: 4 | name: savings-plans 5 | spec: 6 | disruption: 7 | consolidationPolicy: WhenEmptyOrUnderutilized 8 | consolidateAfter: 1m 9 | limits: 10 | cpu: "20" # For example: Limit to launch up to 5 c4.xlarge instances 11 | template: 12 | metadata: 13 | labels: 14 | intent: apps 15 | spec: 16 | expireAfter: 168h0m0s 17 | nodeClassRef: 18 | group: karpenter.k8s.aws 19 | name: default 20 | kind: EC2NodeClass 21 | requirements: 22 | - key: karpenter.k8s.aws/instance-family 23 | operator: In 24 | values: 25 | - c4 26 | # Alternatively, you can configure fixed instance types 27 | # - key: "node.kubernetes.io/instance-type" 28 | # operator: In 29 | # values: ["c4.xlarge"] # 4 vCPUs 30 | - key: kubernetes.io/os 31 | operator: In 32 | values: 33 | - linux 34 | - key: kubernetes.io/arch 35 | operator: In 36 | values: 37 | - amd64 38 | - key: karpenter.sh/capacity-type 39 | operator: In 40 | values: 41 | - on-demand 42 | weight: 100 43 | -------------------------------------------------------------------------------- /blueprints/disruption-budgets/disruption-budgets.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: karpenter.sh/v1 2 | kind: NodePool 3 | metadata: 4 | name: disruption-budget 5 | spec: 6 | limits: 7 | cpu: 100 8 | memory: 100Gi 9 | template: 10 | metadata: 11 | labels: 12 | intent: disruption-budget 13 | spec: 14 | nodeClassRef: 15 | group: karpenter.k8s.aws 16 | name: disruption-budget 17 | kind: EC2NodeClass 18 | requirements: 19 | - key: karpenter.sh/capacity-type 20 | operator: In 21 | values: ["on-demand"] 22 | - key: karpenter.k8s.aws/instance-category 23 | operator: In 24 | values: ["c","m","r"] 25 | - key: karpenter.k8s.aws/instance-size 26 | operator: NotIn 27 | values: ["nano","micro","small","medium"] 28 | - key: karpenter.k8s.aws/instance-hypervisor 29 | operator: In 30 | values: ["nitro"] 31 | expireAfter: 720h 32 | disruption: 33 | consolidationPolicy: WhenEmptyOrUnderutilized 34 | consolidateAfter: 1m 35 | budgets: 36 | - nodes: "0" 37 | schedule: "0 0 * * *" 38 | duration: 24h 39 | --- 40 | apiVersion: karpenter.k8s.aws/v1 41 | kind: EC2NodeClass 42 | metadata: 43 | name: disruption-budget 44 | spec: 45 | amiSelectorTerms: 46 | - alias: bottlerocket@latest 47 | role: "<>" 48 | securityGroupSelectorTerms: 49 | - tags: 50 | karpenter.sh/discovery: <> 51 | subnetSelectorTerms: 52 | - tags: 53 | karpenter.sh/discovery: <> 54 | -------------------------------------------------------------------------------- /blueprints/custom-ami/custom-ami.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: karpenter.k8s.aws/v1 2 | kind: EC2NodeClass 3 | metadata: 4 | name: custom-ami-template 5 | spec: 6 | amiFamily: AL2023 7 | amiSelectorTerms: 8 | - name: '*amazon-eks-node-al2023*' 9 | role: "<>" 10 | securityGroupSelectorTerms: 11 | - tags: 12 | karpenter.sh/discovery: <> 13 | subnetSelectorTerms: 14 | - tags: 15 | karpenter.sh/discovery: <> 16 | --- 17 | apiVersion: karpenter.sh/v1 18 | kind: NodePool 19 | metadata: 20 | name: custom-ami 21 | spec: 22 | disruption: 23 | consolidationPolicy: WhenEmptyOrUnderutilized 24 | consolidateAfter: 1m 25 | limits: 26 | cpu: 1k 27 | memory: 500Gi 28 | template: 29 | metadata: 30 | labels: 31 | intent: custom-ami 32 | spec: 33 | nodeClassRef: 34 | group: karpenter.k8s.aws 35 | name: custom-ami-template 36 | kind: EC2NodeClass 37 | requirements: 38 | - key: karpenter.k8s.aws/instance-hypervisor 39 | operator: NotIn 40 | values: 41 | - "" 42 | - key: karpenter.sh/capacity-type 43 | operator: In 44 | values: 45 | - spot 46 | - on-demand 47 | - key: kubernetes.io/os 48 | operator: In 49 | values: 50 | - linux 51 | - key: kubernetes.io/arch 52 | operator: In 53 | values: 54 | - amd64 55 | - key: karpenter.k8s.aws/instance-category 56 | operator: In 57 | values: 58 | - c 59 | - m 60 | - r 61 | - key: karpenter.k8s.aws/instance-generation 62 | operator: Gt 63 | values: 64 | - "2" 65 | -------------------------------------------------------------------------------- /blueprints/update-nodes-with-drift/latest-current-ami.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: karpenter.k8s.aws/v1 2 | kind: EC2NodeClass 3 | metadata: 4 | name: latest-current-ami 5 | spec: 6 | amiSelectorTerms: 7 | - id: <> 8 | - id: <> 9 | role: "<>" 10 | securityGroupSelectorTerms: 11 | - tags: 12 | karpenter.sh/discovery: <> 13 | subnetSelectorTerms: 14 | - tags: 15 | karpenter.sh/discovery: <> 16 | tags: 17 | KubernetesVersion: "1.31" 18 | --- 19 | apiVersion: karpenter.sh/v1 20 | kind: NodePool 21 | metadata: 22 | name: latest-current-ami 23 | spec: 24 | disruption: 25 | consolidationPolicy: WhenEmptyOrUnderutilized 26 | consolidateAfter: 1m 27 | limits: 28 | cpu: 100k 29 | memory: 5000Gi 30 | template: 31 | metadata: 32 | labels: 33 | intent: latest-current-ami 34 | spec: 35 | expireAfter: 168h0m0s 36 | nodeClassRef: 37 | group: karpenter.k8s.aws 38 | kind: EC2NodeClass 39 | name: latest-current-ami 40 | requirements: 41 | - key: karpenter.k8s.aws/instance-category 42 | operator: In 43 | values: 44 | - c 45 | - m 46 | - r 47 | - i 48 | - d 49 | - key: karpenter.k8s.aws/instance-cpu 50 | operator: In 51 | values: 52 | - "4" 53 | - "8" 54 | - "16" 55 | - "32" 56 | - "48" 57 | - "64" 58 | - key: karpenter.sh/capacity-type 59 | operator: In 60 | values: 61 | - spot 62 | - on-demand 63 | - key: kubernetes.io/os 64 | operator: In 65 | values: 66 | - linux 67 | - key: kubernetes.io/arch 68 | operator: In 69 | values: 70 | - amd64 71 | -------------------------------------------------------------------------------- /blueprints/batch-jobs/workloads-evicted.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: nginx 5 | spec: 6 | selector: 7 | matchLabels: 8 | app: nginx 9 | replicas: 1 10 | template: 11 | metadata: 12 | labels: 13 | app: nginx 14 | spec: 15 | nodeSelector: 16 | intent: apps 17 | karpenter.sh/capacity-type: on-demand 18 | 
containers: 19 | - name: nginx 20 | image: nginx 21 | imagePullPolicy: IfNotPresent 22 | resources: 23 | requests: 24 | cpu: "2" 25 | --- 26 | apiVersion: batch/v1 27 | kind: Job 28 | metadata: 29 | name: 2-min-job 30 | spec: 31 | ttlSecondsAfterFinished: 10 32 | template: 33 | spec: 34 | nodeSelector: 35 | intent: apps 36 | karpenter.sh/capacity-type: on-demand 37 | containers: 38 | - name: 2-min-job 39 | image: alpine 40 | imagePullPolicy: IfNotPresent 41 | resources: 42 | requests: 43 | cpu: "7" 44 | command: ['sh', '-c', 'echo 2 minutes Job Pod is Running ; sleep 120'] 45 | restartPolicy: Never 46 | terminationGracePeriodSeconds: 0 47 | backoffLimit: 2 48 | --- 49 | apiVersion: batch/v1 50 | kind: Job 51 | metadata: 52 | name: 5-min-job 53 | spec: 54 | ttlSecondsAfterFinished: 10 55 | template: 56 | spec: 57 | nodeSelector: 58 | intent: apps 59 | karpenter.sh/capacity-type: on-demand 60 | containers: 61 | - name: 5-min-job 62 | image: alpine 63 | imagePullPolicy: IfNotPresent 64 | resources: 65 | requests: 66 | cpu: "2" 67 | command: ['sh', '-c', 'echo 5 minutes Job Pod is Running ; sleep 300'] 68 | restartPolicy: Never 69 | terminationGracePeriodSeconds: 0 70 | backoffLimit: 2 71 | -------------------------------------------------------------------------------- /blueprints/userdata/userdata.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: karpenter.k8s.aws/v1 2 | kind: EC2NodeClass 3 | metadata: 4 | name: userdata-template 5 | spec: 6 | amiSelectorTerms: 7 | - alias: al2023@2023.0.20230222 # Amazon Linux 2023 8 | role: "<>" 9 | securityGroupSelectorTerms: 10 | - tags: 11 | karpenter.sh/discovery: "<>" 12 | subnetSelectorTerms: 13 | - tags: 14 | karpenter.sh/discovery: "<>" 15 | userData: | 16 | MIME-Version: 1.0 17 | Content-Type: multipart/mixed; boundary="BOUNDARY" 18 | 19 | --BOUNDARY 20 | Content-Type: text/x-shellscript; charset="us-ascii" 21 | 22 | #!/bin/bash 23 | echo "Running a custom user data script" 24 | 25 | --BOUNDARY-- 26 | --- 27 | apiVersion: karpenter.sh/v1 28 | kind: NodePool 29 | metadata: 30 | name: userdata 31 | spec: 32 | disruption: 33 | consolidationPolicy: WhenEmptyOrUnderutilized 34 | consolidateAfter: 1m 35 | template: 36 | metadata: 37 | labels: 38 | intent: userdata 39 | spec: 40 | nodeClassRef: 41 | group: karpenter.k8s.aws 42 | kind: EC2NodeClass 43 | name: userdata-template 44 | requirements: 45 | - key: karpenter.k8s.aws/instance-hypervisor 46 | operator: NotIn 47 | values: 48 | - "" 49 | - key: karpenter.sh/capacity-type 50 | operator: In 51 | values: 52 | - spot 53 | - on-demand 54 | - key: kubernetes.io/os 55 | operator: In 56 | values: 57 | - linux 58 | - key: kubernetes.io/arch 59 | operator: In 60 | values: 61 | - amd64 62 | - key: karpenter.k8s.aws/instance-category 63 | operator: In 64 | values: 65 | - c 66 | - m 67 | - r 68 | - key: karpenter.k8s.aws/instance-generation 69 | operator: Gt 70 | values: 71 | - "2" 72 | -------------------------------------------------------------------------------- /blueprints/multi-ebs/multi-ebs.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: karpenter.sh/v1 2 | kind: NodePool 3 | metadata: 4 | name: multi-ebs 5 | spec: 6 | disruption: 7 | consolidationPolicy: WhenEmptyOrUnderutilized 8 | consolidateAfter: 1m 9 | limits: 10 | cpu: 1k 11 | memory: 500Gi 12 | template: 13 | metadata: 14 | labels: 15 | intent: multi-ebs 16 | spec: 17 | expireAfter: 168h0m0s 18 | nodeClassRef: 19 | group: karpenter.k8s.aws 
20 | name: multi-ebs 21 | kind: EC2NodeClass 22 | requirements: 23 | - key: karpenter.k8s.aws/instance-category 24 | operator: In 25 | values: 26 | - c 27 | - m 28 | - r 29 | - i 30 | - d 31 | - key: karpenter.k8s.aws/instance-cpu 32 | operator: In 33 | values: 34 | - "4" 35 | - "8" 36 | - "16" 37 | - "32" 38 | - "48" 39 | - "64" 40 | - key: karpenter.sh/capacity-type 41 | operator: In 42 | values: 43 | - spot 44 | - on-demand 45 | - key: kubernetes.io/os 46 | operator: In 47 | values: 48 | - linux 49 | - key: kubernetes.io/arch 50 | operator: In 51 | values: 52 | - amd64 53 | --- 54 | apiVersion: karpenter.k8s.aws/v1 55 | kind: EC2NodeClass 56 | metadata: 57 | name: multi-ebs 58 | spec: 59 | amiFamily: Bottlerocket 60 | amiSelectorTerms: 61 | - alias: bottlerocket@v1.39.1 62 | blockDeviceMappings: 63 | - deviceName: /dev/xvda 64 | ebs: 65 | deleteOnTermination: true 66 | volumeSize: 20Gi 67 | volumeType: gp3 68 | - deviceName: /dev/xvdb 69 | ebs: 70 | deleteOnTermination: true 71 | volumeSize: 100Gi 72 | volumeType: gp3 73 | role: "<>" 74 | securityGroupSelectorTerms: 75 | - tags: 76 | karpenter.sh/discovery: <> 77 | subnetSelectorTerms: 78 | - tags: 79 | karpenter.sh/discovery: <> 80 | -------------------------------------------------------------------------------- /blueprints/batch-jobs/workloads-not-evicted.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: nginx 5 | spec: 6 | selector: 7 | matchLabels: 8 | app: nginx 9 | replicas: 1 10 | template: 11 | metadata: 12 | labels: 13 | app: nginx 14 | spec: 15 | nodeSelector: 16 | intent: apps 17 | karpenter.sh/capacity-type: on-demand 18 | containers: 19 | - name: nginx 20 | image: nginx 21 | imagePullPolicy: IfNotPresent 22 | resources: 23 | requests: 24 | cpu: "2" 25 | --- 26 | apiVersion: batch/v1 27 | kind: Job 28 | metadata: 29 | name: 2-min-job 30 | spec: 31 | ttlSecondsAfterFinished: 10 #Eliminate job pods after 10 seconds of being completed 32 | template: 33 | spec: 34 | nodeSelector: 35 | intent: apps 36 | karpenter.sh/capacity-type: on-demand 37 | containers: 38 | - name: 2-min-job 39 | image: alpine 40 | imagePullPolicy: IfNotPresent 41 | resources: 42 | requests: 43 | cpu: "7" 44 | command: ['sh', '-c', 'echo 2 minutes Job Pod is Running ; sleep 120'] 45 | restartPolicy: Never 46 | terminationGracePeriodSeconds: 0 47 | metadata: 48 | annotations: 49 | karpenter.sh/do-not-disrupt: "true" 50 | backoffLimit: 2 51 | --- 52 | apiVersion: batch/v1 53 | kind: Job 54 | metadata: 55 | name: 5-min-job 56 | spec: 57 | ttlSecondsAfterFinished: 10 #Eliminate job pods after 10 seconds of being completed 58 | template: 59 | spec: 60 | nodeSelector: 61 | intent: apps 62 | karpenter.sh/capacity-type: on-demand 63 | containers: 64 | - name: 5-min-job 65 | image: alpine 66 | imagePullPolicy: IfNotPresent 67 | resources: 68 | requests: 69 | cpu: "2" 70 | command: ['sh', '-c', 'echo 5 minutes Job Pod is Running ; sleep 300'] 71 | restartPolicy: Never 72 | terminationGracePeriodSeconds: 0 73 | metadata: 74 | annotations: 75 | karpenter.sh/do-not-disrupt: "true" 76 | backoffLimit: 2 77 | -------------------------------------------------------------------------------- /blueprints/od-spot-split/od-spot.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: karpenter.sh/v1 2 | kind: NodePool 3 | metadata: 4 | name: node-od 5 | spec: 6 | disruption: 7 | consolidationPolicy: WhenEmptyOrUnderutilized 8 | 
consolidateAfter: 1m 9 | limits: 10 | cpu: 1k 11 | memory: 500Gi 12 | template: 13 | metadata: 14 | labels: 15 | intent: apps 16 | spec: 17 | expireAfter: 168h0m0s 18 | nodeClassRef: 19 | group: karpenter.k8s.aws 20 | name: default 21 | kind: EC2NodeClass 22 | requirements: 23 | - key: capacity-spread 24 | operator: In 25 | values: 26 | - "1" 27 | - key: kubernetes.io/arch 28 | operator: In 29 | values: 30 | - amd64 31 | - key: karpenter.sh/capacity-type 32 | operator: In 33 | values: 34 | - on-demand 35 | - key: kubernetes.io/os 36 | operator: In 37 | values: 38 | - linux 39 | - key: karpenter.k8s.aws/instance-category 40 | operator: In 41 | values: 42 | - c 43 | - m 44 | - r 45 | - key: karpenter.k8s.aws/instance-generation 46 | operator: Gt 47 | values: 48 | - "2" 49 | taints: 50 | - effect: NoSchedule 51 | key: intent 52 | value: workload-split 53 | --- 54 | apiVersion: karpenter.sh/v1 55 | kind: NodePool 56 | metadata: 57 | name: node-spot 58 | spec: 59 | disruption: 60 | consolidationPolicy: WhenEmptyOrUnderutilized 61 | consolidateAfter: 1m 62 | limits: 63 | cpu: 1k 64 | memory: 500Gi 65 | template: 66 | metadata: 67 | labels: 68 | intent: apps 69 | spec: 70 | expireAfter: 168h0m0s 71 | nodeClassRef: 72 | group: karpenter.k8s.aws 73 | name: default 74 | kind: EC2NodeClass 75 | requirements: 76 | - key: capacity-spread 77 | operator: In 78 | values: 79 | - "2" 80 | - "3" 81 | - "4" 82 | - "5" 83 | - key: kubernetes.io/arch 84 | operator: In 85 | values: 86 | - amd64 87 | - key: karpenter.sh/capacity-type 88 | operator: In 89 | values: 90 | - spot 91 | - key: kubernetes.io/os 92 | operator: In 93 | values: 94 | - linux 95 | - key: karpenter.k8s.aws/instance-category 96 | operator: In 97 | values: 98 | - c 99 | - m 100 | - r 101 | - key: karpenter.k8s.aws/instance-generation 102 | operator: Gt 103 | values: 104 | - "2" 105 | taints: 106 | - effect: NoSchedule 107 | key: intent 108 | value: workload-split 109 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 
28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 
60 | -------------------------------------------------------------------------------- /blueprints/soci-snapshotter/workload.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: vllm-soci 5 | labels: 6 | app: vllm-soci 7 | spec: 8 | replicas: 1 9 | selector: 10 | matchLabels: 11 | app: vllm-soci 12 | template: 13 | metadata: 14 | labels: 15 | app: vllm-soci 16 | spec: 17 | containers: 18 | - name: vllm 19 | image: 763104351884.dkr.ecr.us-east-1.amazonaws.com/vllm:0.9-gpu-py312-ec2 20 | command: ["bash", "-c"] 21 | args: ["trap 'exit 0' TERM; sleep 9999 & wait"] 22 | nodeSelector: 23 | intent: soci-snapshotter 24 | kubernetes.io/arch: amd64 25 | affinity: 26 | nodeAffinity: 27 | requiredDuringSchedulingIgnoredDuringExecution: 28 | nodeSelectorTerms: 29 | - matchExpressions: 30 | - key: karpenter.k8s.aws/instance-ebs-bandwidth 31 | operator: Gt 32 | values: 33 | - "8000" 34 | - key: karpenter.k8s.aws/instance-network-bandwidth 35 | operator: Gt 36 | values: 37 | - "8000" 38 | --- 39 | apiVersion: apps/v1 40 | kind: Deployment 41 | metadata: 42 | name: vllm-soci-br 43 | labels: 44 | app: vllm-soci-br 45 | spec: 46 | replicas: 1 47 | selector: 48 | matchLabels: 49 | app: vllm-soci-br 50 | template: 51 | metadata: 52 | labels: 53 | app: vllm-soci-br 54 | spec: 55 | containers: 56 | - name: vllm 57 | image: 763104351884.dkr.ecr.us-east-1.amazonaws.com/vllm:0.9-gpu-py312-ec2 58 | command: ["bash", "-c"] 59 | args: ["trap 'exit 0' TERM; sleep 9999 & wait"] 60 | nodeSelector: 61 | intent: soci-snapshotter-br 62 | kubernetes.io/arch: amd64 63 | affinity: 64 | nodeAffinity: 65 | requiredDuringSchedulingIgnoredDuringExecution: 66 | nodeSelectorTerms: 67 | - matchExpressions: 68 | - key: karpenter.k8s.aws/instance-ebs-bandwidth 69 | operator: Gt 70 | values: 71 | - "8000" 72 | - key: karpenter.k8s.aws/instance-network-bandwidth 73 | operator: Gt 74 | values: 75 | - "8000" 76 | --- 77 | apiVersion: apps/v1 78 | kind: Deployment 79 | metadata: 80 | name: vllm 81 | labels: 82 | app: vllm 83 | spec: 84 | replicas: 1 85 | selector: 86 | matchLabels: 87 | app: vllm 88 | template: 89 | metadata: 90 | labels: 91 | app: vllm 92 | spec: 93 | containers: 94 | - name: vllm 95 | image: 763104351884.dkr.ecr.us-east-1.amazonaws.com/vllm:0.9-gpu-py312-ec2 96 | command: ["bash", "-c"] 97 | args: ["trap 'exit 0' TERM; sleep 9999 & wait"] 98 | nodeSelector: 99 | intent: non-soci-snapshotter 100 | kubernetes.io/arch: amd64 101 | affinity: 102 | nodeAffinity: 103 | requiredDuringSchedulingIgnoredDuringExecution: 104 | nodeSelectorTerms: 105 | - matchExpressions: 106 | - key: karpenter.k8s.aws/instance-ebs-bandwidth 107 | operator: Gt 108 | values: 109 | - "8000" 110 | - key: karpenter.k8s.aws/instance-network-bandwidth 111 | operator: Gt 112 | values: 113 | - "8000" -------------------------------------------------------------------------------- /blueprints/userdata/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Karpenter Blueprint: Customizing nodes with your own User Data automation 3 | 4 | ## Purpose 5 | 6 | When you need to bootstrap the data plane nodes to either overwrite certain Kubernetes settings, mount volumes or anything else you need to do when a node is launched. 
Within the `EC2NodeClass` there's a `userData` field you can use to control the [user data](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/user-data.html) that is applied to your worker nodes. This way, you can continue using the [EKS optimized AMI](https://docs.aws.amazon.com/eks/latest/userguide/eks-optimized-ami.html) with any additional configuration you need to run on top of the base AMI. 7 | 8 | ## Requirements 9 | 10 | * A Kubernetes cluster with Karpenter installed. You can use the blueprint we've used to test this pattern at the `cluster` folder in the root of this repository. 11 | 12 | ## Deploy 13 | 14 | You need to create a new `EC2NodeClass` with the `userData` field, along with a `NodePool` to use this new template. 15 | 16 | If you're using the Terraform template provided in this repo, run the following commands to get the EKS cluster name and the IAM Role name for the Karpenter nodes: 17 | 18 | ```sh 19 | export CLUSTER_NAME=$(terraform -chdir="../../cluster/terraform" output -raw cluster_name) 20 | export KARPENTER_NODE_IAM_ROLE_NAME=$(terraform -chdir="../../cluster/terraform" output -raw node_instance_role_name) 21 | ``` 22 | 23 | > ***NOTE***: If you're not using Terraform, you need to get those values manually. `CLUSTER_NAME` is the name of your EKS cluster (not the ARN). Karpenter auto-generates the [instance profile](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use_switch-role-ec2_instance-profiles) in your `EC2NodeClass` given the role that you specify in [spec.role](https://karpenter.sh/preview/concepts/nodeclasses/) with the placeholder `KARPENTER_NODE_IAM_ROLE_NAME`, which is a way to pass a single IAM role to the EC2 instance launched by the Karpenter `NodePool`. Typically, the instance profile name is the same as the IAM role(not the ARN). 24 | 25 | Now, make sure you're in this blueprint folder, then run the following command to create the new `EC2NodeClass` and `NodePool`: 26 | 27 | ```sh 28 | sed -i '' "s/<>/$CLUSTER_NAME/g" userdata.yaml 29 | sed -i '' "s/<>/$KARPENTER_NODE_IAM_ROLE_NAME/g" userdata.yaml 30 | kubectl apply -f . 31 | ``` 32 | 33 | ## Results 34 | 35 | The pods from the sample workload should be running: 36 | 37 | ```sh 38 | > kubectl get pods 39 | NAME READY STATUS RESTARTS AGE 40 | userdata-75d87b5b6c-6s978 1/1 Running 0 45s 41 | userdata-75d87b5b6c-gnglz 1/1 Running 0 45s 42 | userdata-75d87b5b6c-krmxm 1/1 Running 0 45s 43 | ``` 44 | 45 | You can confirm the Kubernetes settings have been added to the user data of the instance by running this command: 46 | 47 | ```sh 48 | aws ec2 describe-instance-attribute \ 49 | --instance-id $(aws ec2 describe-instances \ 50 | --filters "Name=tag:karpenter.sh/nodepool,Values=userdata" \ 51 | --output text --query 'Reservations[0].Instances[0].InstanceId') \ 52 | --attribute userData --query 'UserData.Value' --output text | base64 --decode 53 | ``` 54 | 55 | You should get an output like this with the `[settings.kubernetes]` configured in the `EC2NodeClass`: 56 | 57 | ```text 58 | MIME-Version: 1.0 59 | Content-Type: multipart/mixed; boundary="//" 60 | 61 | --// 62 | Content-Type: text/x-shellscript; charset="us-ascii" 63 | 64 | #!/bin/bash 65 | echo "Running a custom user data script" 66 | 67 | --// 68 | Content-Type: application/node.eks.aws 69 | ``` 70 | 71 | Look at how the `userdata` from the instance has the `userdata` you specified within the `EC2NodeClass` manifest. 
72 | 73 | ## Cleanup 74 | 75 | To remove all objects created, simply run the following commands: 76 | 77 | ```sh 78 | kubectl delete -f . 79 | ``` 80 | -------------------------------------------------------------------------------- /blueprints/od-spot-split/README.md: -------------------------------------------------------------------------------- 1 | # Karpenter Blueprint: Split Between On-Demand & Spot Instances 2 | 3 | ## Purpose 4 | 5 | This setup works if you're interested in having a portion of the EKS nodes running on On-Demand instances, and another portion on Spot. For example, a split of 20% On-Demand, and 80% on Spot. You can take advantage of the labels Karpenter adds automatically to each node, and use [Topology Spread Constraints (TSC)](https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/) within a `Deployment` or `Pod` to split capacity in a desired ratio. 6 | 7 | To do this, you can create one NodePool each for Spot and On-Demand with disjoint values for a unique new label called `capacity-spread`. Then, assign values to this label to configure the split. If you'd like to have a 20/80 split, you could add the values `["2","3","4","5"]` for the Spot NodePool, and `["1"]` for the On-Demand NodePool. 8 | 9 | ## Requirements 10 | 11 | * A Kubernetes cluster with Karpenter installed. You can use the blueprint we've used to test this pattern at the `cluster` folder in the root of this repository. 12 | * A `default` Karpenter NodePool as that's the one we'll use in this blueprint. You did this already in the ["Deploy a Karpenter Default EC2NodeClass and NodePool"](../../README.md) section from this repository. 13 | 14 | ## Deploy 15 | 16 | To deploy the Karpenter `NodePool` and the sample `workload`, simply run this command: 17 | 18 | ```sh 19 | kubectl apply -f .
20 | ``` 21 | 22 | You should see the following output: 23 | 24 | ```console 25 | nodepool.karpenter.sh/node-od created 26 | nodepool.karpenter.sh/node-spot created 27 | deployment.apps/workload-split created 28 | ``` 29 | 30 | ## Results 31 | 32 | You can review the Karpenter logs and watch how it's deciding to launch multiple nodes following the workload constraints: 33 | 34 | ```sh 35 | kubectl -n karpenter logs -l app.kubernetes.io/name=karpenter --all-containers=true -f --tail=20 36 | ``` 37 | 38 | Wait one minute and you should see the pods running within multiple nodes, run this command: 39 | 40 | ```sh 41 | kubectl get nodes -L karpenter.sh/capacity-type,beta.kubernetes.io/instance-type,karpenter.sh/nodepool,topology.kubernetes.io/zone -l karpenter.sh/initialized=true 42 | ``` 43 | 44 | You should see an output similar to this: 45 | 46 | ```console 47 | NAME STATUS ROLES AGE VERSION CAPACITY-TYPE INSTANCE-TYPE NODEPOOL ZONE 48 | ip-10-0-104-249.eu-west-2.compute.internal Ready 17s v1.32.3-eks-473151a spot c7i-flex.large node-spot eu-west-2c 49 | ip-10-0-40-176.eu-west-2.compute.internal Ready 6m29s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2a 50 | ip-10-0-47-113.eu-west-2.compute.internal Ready 6m29s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2a 51 | ip-10-0-53-185.eu-west-2.compute.internal Ready 6m29s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2a 52 | ip-10-0-54-129.eu-west-2.compute.internal Ready 6m29s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2a 53 | ip-10-0-83-213.eu-west-2.compute.internal Ready 20s v1.32.3-eks-473151a on-demand c6a.large node-od eu-west-2b 54 | ``` 55 | 56 | As you can see, pods were spread within the `spot` and `od` nodepools because of the `capacity-spread` TSC: 57 | 58 | ```yaml 59 | topologySpreadConstraints: 60 | - labelSelector: 61 | matchLabels: 62 | app: workload-split 63 | maxSkew: 1 64 | topologyKey: capacity-spread 65 | whenUnsatisfiable: DoNotSchedule 66 | ``` 67 | 68 | And each `NodePool` has a weight configured, the `od` NodePool has the following requirement: 69 | 70 | ```yaml 71 | - key: capacity-spread 72 | operator: In 73 | values: ["1"] 74 | ``` 75 | 76 | And the `spot` has the following requirement: 77 | 78 | ```yaml 79 | - key: capacity-spread 80 | operator: In 81 | values: ["2","3","4","5"] 82 | ``` 83 | 84 | ## Cleanup 85 | 86 | ```sh 87 | kubectl delete -f workload.yaml 88 | kubectl delete -f od-spot.yaml 89 | ``` 90 | -------------------------------------------------------------------------------- /blueprints/saving-plans/README.md: -------------------------------------------------------------------------------- 1 | # Karpenter Blueprint: Prioritize Savings Plans and/or Reserved Instances 2 | 3 | ## Purpose 4 | 5 | You might want to consume your Saving Plans and/or Reserved Instances before any other purchase model when using Karpenter. Currently, to cover this scenario you need to have a prioritized NodePool for the reserved instances. This NodePool needs to have a high weight configuration to tell Karpenter to user this NodePool first, along with a `limits` configuration to limit the number of EC2 instances to launch. When this NodePool meet the limits, Karpenter will continue launching instances from other NodePools, typically from the `default` one. 6 | 7 | ## Requirements 8 | 9 | * A Kubernetes cluster with Karpenter installed. You can use the blueprint we've used to test this pattern at the `cluster` folder in the root of this repository. 
10 | * A list of instance types or families that match with your Savings Plans and/or Reserved Instances, along with the total number of vCPUs you've reserved. 11 | * A `default` Karpenter `NodePool` as that's the one we'll use in this blueprint. You did this already in the ["Deploy a Karpenter Default EC2NodeClass and NodePool"](../../README.md) section from this repository. 12 | 13 | ## Deploy 14 | 15 | Let's suppose you purchased a Saving Plans of 20 vCPUs for `c4` family. Your NodePool should look like this: 16 | 17 | ```yaml 18 | apiVersion: karpenter.sh/v1 19 | kind: NodePool 20 | metadata: 21 | name: savings-plans 22 | spec: 23 | disruption: 24 | consolidationPolicy: WhenEmptyOrUnderutilized 25 | consolidateAfter: 1m 26 | limits: 27 | cpu: "20" # For example: Limit to launch up to 5 c4.xlarge instances 28 | template: 29 | metadata: 30 | labels: 31 | intent: apps 32 | spec: 33 | expireAfter: 168h0m0s 34 | nodeClassRef: 35 | group: karpenter.k8s.aws 36 | name: default 37 | kind: EC2NodeClass 38 | requirements: 39 | - key: karpenter.k8s.aws/instance-family 40 | operator: In 41 | values: 42 | - c4 43 | - key: kubernetes.io/os 44 | operator: In 45 | values: 46 | - linux 47 | - key: kubernetes.io/arch 48 | operator: In 49 | values: 50 | - amd64 51 | - key: karpenter.sh/capacity-type 52 | operator: In 53 | values: 54 | - on-demand 55 | weight: 100 56 | ``` 57 | 58 | Notice that the above `NodePool` has a `weight` configuration of `100` and a `cpu` limit of 20 (5 x c4.xlarge instances). 59 | 60 | Deploy the prioritized NodePool and the sample workload with 20 pods requesting `950m` cpu units: 61 | 62 | ```sh 63 | kubectl apply -f savings-plans.yaml 64 | kubectl apply -f workload.yaml 65 | ``` 66 | 67 | ## Results 68 | 69 | Wait around three minutes to get all the pods running. Run the following command to see the nodes launched by Karpenter including the `NodePool-name` column to see which `NodePool` was used: 70 | 71 | ```sh 72 | kubectl get nodes -L karpenter.sh/capacity-type,beta.kubernetes.io/instance-type,karpenter.sh/nodepool,topology.kubernetes.io/zone -l karpenter.sh/initialized=true 73 | ``` 74 | 75 | You should get a similar output like this: 76 | 77 | ```console 78 | NAME STATUS ROLES AGE VERSION CAPACITY-TYPE INSTANCE-TYPE NODEPOOL ZONE 79 | ip-10-0-119-235.eu-west-2.compute.internal Ready 23s v1.32.3-eks-473151a on-demand c4.4xlarge savings-plans eu-west-2c 80 | ip-10-0-127-154.eu-west-2.compute.internal Ready 35m v1.32.3-eks-473151a on-demand c6g.xlarge default eu-west-2c 81 | ip-10-0-78-33.eu-west-2.compute.internal Ready 24s v1.32.3-eks-473151a on-demand c4.xlarge savings-plans eu-west-2b 82 | ``` 83 | 84 | Notice how the `savings-plans` NodePool launched all the capacity it could. Two instances: `c4.xlarge` (4 vCPUs) and `c4.4xlarge` (16 vCPUs), which together reach the limit of 20 vCPUs you configured for this NodePool. Additionally, you see Karpenter launched a `c5.large` Spot instance for the rest of the pods using the `default` NodePool. Remember, each node always launch the `kubelet` and `kube-proxy` pods, that's why by Karpenter launched an extra node because 20 vCPUs of reserved capacity wasn't enough if system pods need to be included. 85 | 86 | ## Cleanup 87 | 88 | To remove all objects created, simply run the following commands: 89 | 90 | ```sh 91 | kubectl delete -f . 
92 | ``` 93 | -------------------------------------------------------------------------------- /cluster/terraform/karpenter.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | karpenter_namespace = "karpenter" 3 | } 4 | 5 | ################################################################################ 6 | # Controller & Node IAM roles, SQS Queue, Eventbridge Rules 7 | ################################################################################ 8 | 9 | module "karpenter" { 10 | source = "terraform-aws-modules/eks/aws//modules/karpenter" 11 | version = "20.37.0" 12 | 13 | cluster_name = module.eks.cluster_name 14 | enable_v1_permissions = true 15 | namespace = local.karpenter_namespace 16 | 17 | # Name needs to match role name passed to the EC2NodeClass 18 | node_iam_role_use_name_prefix = false 19 | node_iam_role_name = local.name 20 | create_pod_identity_association = true 21 | 22 | tags = local.tags 23 | } 24 | 25 | ################################################################################ 26 | # Helm charts 27 | ################################################################################ 28 | 29 | resource "helm_release" "karpenter" { 30 | name = "karpenter" 31 | namespace = local.karpenter_namespace 32 | create_namespace = true 33 | repository = "oci://public.ecr.aws/karpenter" 34 | repository_username = data.aws_ecrpublic_authorization_token.token.user_name 35 | repository_password = data.aws_ecrpublic_authorization_token.token.password 36 | chart = "karpenter" 37 | version = "1.5.0" 38 | wait = false 39 | 40 | values = [ 41 | <<-EOT 42 | nodeSelector: 43 | karpenter.sh/controller: 'true' 44 | settings: 45 | clusterName: ${module.eks.cluster_name} 46 | clusterEndpoint: ${module.eks.cluster_endpoint} 47 | interruptionQueue: ${module.karpenter.queue_name} 48 | tolerations: 49 | - key: CriticalAddonsOnly 50 | operator: Exists 51 | webhook: 52 | enabled: false 53 | EOT 54 | ] 55 | 56 | lifecycle { 57 | ignore_changes = [ 58 | repository_password 59 | ] 60 | } 61 | } 62 | 63 | # Karpenter default EC2NodeClass and NodePool 64 | 65 | resource "kubectl_manifest" "karpenter_default_ec2_node_class" { 66 | yaml_body = <<-YAML 67 | apiVersion: karpenter.k8s.aws/v1 68 | kind: EC2NodeClass 69 | metadata: 70 | name: default 71 | spec: 72 | role: "${module.karpenter.node_iam_role_name}" 73 | amiSelectorTerms: 74 | - alias: al2023@latest 75 | securityGroupSelectorTerms: 76 | - tags: 77 | karpenter.sh/discovery: ${module.eks.cluster_name} 78 | subnetSelectorTerms: 79 | - tags: 80 | karpenter.sh/discovery: ${module.eks.cluster_name} 81 | tags: 82 | IntentLabel: apps 83 | KarpenterNodePoolName: default 84 | NodeType: default 85 | intent: apps 86 | karpenter.sh/discovery: ${module.eks.cluster_name} 87 | project: karpenter-blueprints 88 | YAML 89 | 90 | depends_on = [ 91 | helm_release.karpenter, 92 | ] 93 | } 94 | 95 | resource "kubectl_manifest" "karpenter_default_node_pool" { 96 | yaml_body = <<-YAML 97 | apiVersion: karpenter.sh/v1 98 | kind: NodePool 99 | metadata: 100 | name: default 101 | spec: 102 | template: 103 | metadata: 104 | labels: 105 | intent: apps 106 | spec: 107 | requirements: 108 | - key: kubernetes.io/arch 109 | operator: In 110 | values: ["amd64", "arm64"] 111 | - key: "karpenter.k8s.aws/instance-cpu" 112 | operator: In 113 | values: ["4", "8", "16", "32", "48", "64"] 114 | - key: karpenter.sh/capacity-type 115 | operator: In 116 | values: ["spot", "on-demand"] 117 | - key: karpenter.k8s.aws/instance-category 118 | 
operator: In 119 | values: ["c", "m", "r", "i", "d"] 120 | nodeClassRef: 121 | name: default 122 | group: karpenter.k8s.aws 123 | kind: EC2NodeClass 124 | kubelet: 125 | containerRuntime: containerd 126 | systemReserved: 127 | cpu: 100m 128 | memory: 100Mi 129 | disruption: 130 | consolidationPolicy: WhenEmptyOrUnderutilized 131 | consolidateAfter: 1m 132 | YAML 133 | 134 | depends_on = [ 135 | helm_release.karpenter, 136 | kubectl_manifest.karpenter_default_ec2_node_class, 137 | ] 138 | } 139 | -------------------------------------------------------------------------------- /blueprints/multi-ebs/README.md: -------------------------------------------------------------------------------- 1 | # Karpenter Blueprint: Using multiple EBS volumes 2 | 3 | ## Purpose 4 | 5 | This blueprint shows how to attach more than one EBS volume to a data plane node. Maybe you need to use a volume for logs, cache, or any container resources such as images. You do this configuration in the `EC2NodeClass`, then you configure a `NodePool` to use such template when launching a machine. 6 | 7 | ## Requirements 8 | 9 | * A Kubernetes cluster with Karpenter installed. You can use the blueprint we've used to test this pattern at the `cluster` folder in the root of this repository. 10 | * An IAM Role name that Karpenter nodes will use 11 | * AWS CLI configured with permissions to describe EC2 instances (`ec2:DescribeInstances`) 12 | 13 | ## Deploy 14 | 15 | If you're using the Terraform template provided in this repo, run the following commands to get the EKS cluster name and the IAM Role name for the Karpenter nodes: 16 | 17 | ```sh 18 | export CLUSTER_NAME=$(terraform -chdir="../../cluster/terraform" output -raw cluster_name) 19 | export KARPENTER_NODE_IAM_ROLE_NAME=$(terraform -chdir="../../cluster/terraform" output -raw node_instance_role_name) 20 | ``` 21 | 22 | > ***NOTE***: If you're not using Terraform, you need to get those values manually. `CLUSTER_NAME` is the name of your EKS cluster (not the ARN). Karpenter auto-generates the [instance profile](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use_switch-role-ec2_instance-profiles) in your `EC2NodeClass` given the role that you specify in [spec.role](https://karpenter.sh/preview/concepts/nodeclasses/) with the placeholder `KARPENTER_NODE_IAM_ROLE_NAME`, which is a way to pass a single IAM role to the EC2 instance launched by the Karpenter `NodePool`. Typically, the instance profile name is the same as the IAM role (not the ARN). 23 | 24 | Now, make sure you're in this blueprint folder, then run the following command: 25 | 26 | ```sh 27 | sed -i '' "s/<>/$CLUSTER_NAME/g" multi-ebs.yaml 28 | sed -i '' "s/<>/$KARPENTER_NODE_IAM_ROLE_NAME/g" multi-ebs.yaml 29 | kubectl apply -f . 
30 | ``` 31 | 32 | Here's the important configuration block within the spec of an `EC2NodeClass`: 33 | 34 | ```yaml 35 | blockDeviceMappings: 36 | - deviceName: /dev/xvda 37 | ebs: 38 | volumeType: gp3 39 | volumeSize: 20Gi 40 | deleteOnTermination: true 41 | - deviceName: /dev/xvdb 42 | ebs: 43 | volumeType: gp3 44 | volumeSize: 100Gi 45 | deleteOnTermination: true 46 | ``` 47 | 48 | ## Results 49 | 50 | After waiting for about one minute, you should see a node claim ready, and all pods in a `Running` state, like this: 51 | 52 | ```sh 53 | ❯ kubectl get pods 54 | NAME READY STATUS RESTARTS AGE 55 | multi-ebs-f4fb69fdd-kstj9 1/1 Running 0 2m34s 56 | multi-ebs-f4fb69fdd-t9xnl 1/1 Running 0 2m34s 57 | multi-ebs-f4fb69fdd-x42ss 1/1 Running 0 2m34s 58 | ❯ kubectl get nodeclaims 59 | NAME TYPE ZONE NODE READY AGE 60 | multi-ebs-chvzv m5.xlarge eu-west-1a ip-10-0-43-92.eu-west-1.compute.internal True 3m55s 61 | ``` 62 | 63 | To validate that two EBS volumes have been attached to the EC2 instance, run this command: 64 | 65 | ```sh 66 | aws ec2 describe-instances --filters "Name=tag:karpenter.sh/nodepool,Values=multi-ebs" --query 'Reservations[*].Instances[*].{Instance:BlockDeviceMappings}' --output json 67 | ``` 68 | 69 | The output should be similar to this: 70 | 71 | ```json 72 | [ 73 | [ 74 | { 75 | "Instance": [ 76 | { 77 | "DeviceName": "/dev/xvda", 78 | "Ebs": { 79 | "AttachTime": "2024-08-16T12:39:36+00:00", 80 | "DeleteOnTermination": true, 81 | "Status": "attached", 82 | "VolumeId": "vol-0561b68b188d4e63a" 83 | } 84 | }, 85 | { 86 | "DeviceName": "/dev/xvdb", 87 | "Ebs": { 88 | "AttachTime": "2024-08-16T12:39:36+00:00", 89 | "DeleteOnTermination": true, 90 | "Status": "attached", 91 | "VolumeId": "vol-0ca5ca8b749f6bed0" 92 | } 93 | } 94 | ] 95 | } 96 | ] 97 | ] 98 | ``` 99 | 100 | ## Cleanup 101 | 102 | To remove all objects created, simply run the following commands: 103 | 104 | ```sh 105 | kubectl delete -f . 106 | ``` 107 | -------------------------------------------------------------------------------- /blueprints/custom-ami/README.md: -------------------------------------------------------------------------------- 1 | # Karpenter Blueprint: Launching nodes using custom AMIs 2 | 3 | ## Purpose 4 | 5 | When you need to launch nodes using a custom AMI that you've created (i.e. to pre-load base container images), you need to configure an `EC2NodeClass` properly to get the AMI you need. With Karpenter, you might be able to use AMIs for different CPU architectures or other specifications like GPUs. So, our recommendation is that you use a naming convention or a tag to easily identify which AMIs Karpenter can use to launch nodes. 6 | 7 | ## Requirements 8 | 9 | * A custom AMI to use (for this example, we'll skip this requirement) 10 | * An EKS Cluster name with Karpenter installed. You can use the blueprint we've used to test this pattern at the `cluster` folder in the root of this repository.
11 | * An IAM Role name that Karpenter nodes will use 12 | 13 | ## Deploy 14 | 15 | If you're using the Terraform template provided in this repo, run the following commands to get the EKS cluster name and the IAM Role name for the Karpenter nodes: 16 | 17 | ``` 18 | export CLUSTER_NAME=$(terraform -chdir="../../cluster/terraform" output -raw cluster_name) 19 | export KARPENTER_NODE_IAM_ROLE_NAME=$(terraform -chdir="../../cluster/terraform" output -raw node_instance_role_name) 20 | ``` 21 | 22 | > ***NOTE***: If you're not using Terraform, you need to get those values manually. `CLUSTER_NAME` is the name of your EKS cluster (not the ARN). Karpenter auto-generates the [instance profile](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use_switch-role-ec2_instance-profiles) in your `EC2NodeClass` given the role that you specify in [spec.role](https://karpenter.sh/preview/concepts/nodeclasses/) with the placeholder `KARPENTER_NODE_IAM_ROLE_NAME`, which is a way to pass a single IAM role to the EC2 instance launched by the Karpenter `NodePool`. Typically, the instance profile name is the same as the IAM role(not the ARN). 23 | 24 | Now, make sure you're in this blueprint folder, then run the following command: 25 | 26 | ```sh 27 | sed -i '' "s/<>/$CLUSTER_NAME/g" custom-ami.yaml 28 | sed -i '' "s/<>/$KARPENTER_NODE_IAM_ROLE_NAME/g" custom-ami.yaml 29 | kubectl apply -f . 30 | ``` 31 | 32 | Here's the important configuration block within the spec of an [`EC2NodeClass`](https://karpenter.sh/preview/concepts/nodeclasses/#specamiselectorterms): **spec.amiSelectorTerms** 33 | 34 | `amiSelectorTerms` are required and are used to configure AMIs for Karpenter to use. AMIs are discovered through alias, id, owner, name, and [tags](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/Using_Tags.html). 35 | 36 | If amiSelectorTerms match more than one AMI, Karpenter will automatically determine which AMI best fits the workloads on the launched worker node under the following constraints: 37 | 38 | * When launching nodes, Karpenter automatically determines which architecture a custom AMI is compatible with and will use images that match an instanceType's requirements. 39 | * Unless using an alias, Karpenter cannot detect requirements other than architecture. If you need to specify different AMIs for different kind of nodes (e.g. accelerated GPU AMIs), you should use a separate EC2NodeClass. 40 | * If multiple AMIs are found that can be used, Karpenter will choose the latest one. 41 | * If no AMIs are found that can be used, then no nodes will be provisioned. 42 | 43 | To select an AMI by name, use the `name` field in the selector term. To select an AMI by id, use the `id` field in the selector term. To select an AMI using an alias, use the `alias` field which supports version pinning (e.g. `al2023@v20240807`) or latest version (`al2023@latest`). To ensure that AMIs are owned by the expected owner, use the `owner` field - you can use a combination of account aliases (e.g. self amazon, your-aws-account-name) and account IDs. If this is not set, it defaults to `self,amazon`. 44 | 45 | > **Tip** 46 | > AMIs may be specified by any AWS tag, including Name. Selecting by tag 47 | > or by name using wildcards (*) is supported. 
48 | 49 | ```yaml 50 | amiSelectorTerms: 51 | - name: "*amazon-eks-node-al2023*" 52 | owner: self 53 | - name: "*amazon-eks-node-al2023*" 54 | owner: amazon 55 | ``` 56 | 57 | ***IMPORTANT NOTE:*** With this configuration, you're saying that you need to use the latest AL2023 EKS optimized AMI that matches the name pattern, either owned by you (customized) or by Amazon (official image). We're using a wildcard pattern to have the flexibility to use AMIs for either `x86` or `Arm`, for workloads that need GPUs, or even for nodes with a different OS (if your naming convention covers them). You're basically letting the workload (pod) decide which type of node(s) it needs. If you don't have a custom AMI created by you in your account, Karpenter will use the official EKS AMI owned by Amazon. 58 | 59 | ## Results 60 | 61 | After waiting for about one minute, you should see a node claim ready, and all pods in a `Running` state, like this: 62 | 63 | ```sh 64 | ❯ kubectl get pods 65 | NAME READY STATUS RESTARTS AGE 66 | custom-ami-bdf66b777-2g27q 1/1 Running 0 2m2s 67 | custom-ami-bdf66b777-dbkls 1/1 Running 0 2m2s 68 | custom-ami-bdf66b777-rzlsz 1/1 Running 0 2m2s 69 | ❯ kubectl get nodeclaims 70 | NAME TYPE CAPACITY ZONE NODE READY AGE 71 | custom-ami-jhdbh c5a.large spot eu-west-2c ip-10-0-117-230.eu-west-2.compute.internal True 114s 72 | ``` 73 | 74 | ## Cleanup 75 | 76 | To remove all objects created, simply run the following commands: 77 | 78 | ```sh 79 | kubectl delete -f . 80 | ``` 81 | -------------------------------------------------------------------------------- /blueprints/graviton/README.md: -------------------------------------------------------------------------------- 1 | # Karpenter Blueprint: Working with Graviton Instances 2 | 3 | ## Purpose 4 | 5 | You might be wondering how to use Graviton instances with Karpenter. Well, first you need to make sure that your application can run on different CPUs such as `arm64` or `x86-64`. The programming language you're using and its ecosystem needs to be multi-arch aware, as you'll need container images for both `arm64` and `x86-64` architectures. [AWS Graviton](https://aws.amazon.com/ec2/graviton/) processors are custom built by AWS using 64-bit Arm Neoverse cores. They power Amazon EC2 instances such as: M6g, M6gd, T4g, C6g, C6gd, C6gn, R6g, R6gd, X2gd, and more. Graviton instances provide up to 40% better price performance over comparable current generation x86-based instances for a wide variety of workloads. 6 | 7 | Karpenter sets the default architecture constraint on your NodePool to the one that supports most common user workloads, which today is `amd64` (or the `x86-64` architecture). However, if you're flexible enough to support either `arm64` or `x86-64`, you can defer the decision of which architecture to use to Karpenter, which chooses based on the purchase model: `On-Demand` or `Spot`. 8 | 9 | If it's an On-Demand Instance, Karpenter uses the `lowest-price` (LP) allocation strategy to launch the cheapest instance type that has available capacity. If it's a Spot Instance, Karpenter uses the `price-capacity-optimized` (PCO) allocation strategy. PCO looks at both price and capacity availability to launch from the [Spot Instance pools](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-spot-instances.html#spot-features) that are the least likely to be interrupted and have the lowest possible price. 10 | 11 | ## Requirements 12 | 13 | * A Kubernetes cluster with Karpenter installed. You can use the blueprint we've used to test this pattern at the `cluster` folder in the root of this repository.
14 | * A `default` Karpenter `NodePool` as that's the one we'll use in this blueprint. You did this already in the ["Deploy a Karpenter Default EC2NodeClass and NodePool"](../../README.md) section from this repository. 15 | * A container image built for `arm64` architecture hosted in a container image registry such as ECR. 16 | 17 | **NOTE:** To build a multi-arch container image, you can use Docker's [buildx](https://www.docker.com/blog/multi-arch-build-and-images-the-simple-way/) or, alternatively, a [remote](https://community.arm.com/developer/tools-software/tools/b/tools-software-ides-blog/posts/unifying-arm-software-development-with-docker) build. In this context, you want to check the [multi-arch readiness](https://github.com/aws-samples/aws-multiarch-container-build-pipeline) of your automated build and test pipeline, for example, [support in Travis](https://docs.travis-ci.com/user/multi-cpu-architectures/#example-multi-architecture-build-matrix). Next, you need to [push your container images to a registry such as ECR](https://aws.amazon.com/blogs/containers/introducing-multi-architecture-container-images-for-amazon-ecr/). 18 | 19 | **NOTE:** The sample `workload` in this repository already supports `arm64`. 20 | 21 | ## Deploy 22 | 23 | You're going to use the `default` NodePool as there's no need to create a separate NodePool to launch Graviton instances. 24 | 25 | ## Results 26 | 27 | You can inspect the pods from the `workload-flexible` deployment, but they don't have anything specific to Graviton instances other than requesting On-Demand capacity (`karpenter.sh/capacity-type: on-demand`) through a node selector. So, let's deploy the following assets: 28 | 29 | ```sh 30 | kubectl apply -f workload-flexible.yaml 31 | ``` 32 | 33 | Wait for about one minute, and you'll see a new Graviton instance coming up: 34 | 35 | ```sh 36 | $> kubectl get nodeclaims 37 | NAME TYPE ZONE NODE READY AGE 38 | default-sgmkw c6g.xlarge eu-west-1b ip-10-0-66-182.eu-west-1.compute.internal True 42s 39 | ``` 40 | 41 | **NOTE:** All pods should be running now, and you didn't have to say anything special to Karpenter about which container image to use. Why? In Kubernetes, and by extension in Amazon EKS, the worker node-local supervisor called `kubelet` instructs the container runtime via a [standardized interface](https://kubernetes.io/blog/2016/12/container-runtime-interface-cri-in-kubernetes/) to pull container images from a registry such as Amazon ECR and launch them accordingly. All of this is multi-arch aware and automated. 42 | 43 | Now, let's suppose you've made the decision to go all-in with Graviton. Instead of creating a new NodePool, you can control that behavior within the `Deployment` by using a `nodeSelector` of `kubernetes.io/arch: arm64`, and without limiting the deployment to On-Demand only. This means that Karpenter will now likely launch a Spot instance, as it offers the better price; you can double-check the architecture of the launched nodes with the command sketched below.
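If you want to keep an eye on which CPU architecture Karpenter actually picks as you go through the next steps, here's a small sketch that adds the architecture and capacity type as columns to `kubectl get nodes`; adjust the label columns to your liking:

```sh
kubectl get nodes -L kubernetes.io/arch,karpenter.sh/capacity-type -l karpenter.sh/initialized=true
```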
Let's see, deploy the other workload: 44 | 45 | ```sh 46 | kubectl apply -f workload-graviton.yaml 47 | ``` 48 | 49 | Wait for about one minute, and run the following command to see which nodes Karpenter has launched and see if it's On-Demand or Spot: 50 | 51 | ```sh 52 | kubectl get nodes -L karpenter.sh/capacity-type,beta.kubernetes.io/instance-type,karpenter.sh/nodepool,topology.kubernetes.io/zone -l karpenter.sh/initialized=true 53 | ``` 54 | 55 | You should see something similar to this: 56 | 57 | ```console 58 | NAME STATUS ROLES AGE VERSION CAPACITY-TYPE 59 | ip-10-0-87-181.eu-west-2.compute.internal Ready 114s v1.32.3-eks-473151a on-demand c6g.xlarge default eu-west-2b 60 | ``` 61 | 62 | Notice that now Karpenter decided to launch a `c6g.2xlarge` Spot instance because the workload and the NodePool support both pricing models, and the one that has a better price at this moment was a Graviton Spot instance. 63 | 64 | ## Cleanup 65 | 66 | ```sh 67 | kubectl delete -f . 68 | ``` 69 | -------------------------------------------------------------------------------- /blueprints/soci-snapshotter/soci-snapshotter.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: karpenter.k8s.aws/v1 2 | kind: EC2NodeClass 3 | metadata: 4 | name: soci-snapshotter 5 | spec: 6 | amiSelectorTerms: 7 | - alias: al2023@latest 8 | role: "<>" 9 | blockDeviceMappings: 10 | - deviceName: /dev/xvda 11 | ebs: 12 | volumeSize: 100Gi 13 | volumeType: gp3 14 | throughput: 1000 15 | iops: 16000 16 | securityGroupSelectorTerms: 17 | - tags: 18 | karpenter.sh/discovery: "<>" 19 | subnetSelectorTerms: 20 | - tags: 21 | karpenter.sh/discovery: "<>" 22 | userData: | 23 | apiVersion: node.eks.aws/v1alpha1 24 | kind: NodeConfig 25 | spec: 26 | featureGates: 27 | FastImagePull: true 28 | --- 29 | apiVersion: karpenter.sh/v1 30 | kind: NodePool 31 | metadata: 32 | name: soci-snapshotter 33 | spec: 34 | disruption: 35 | consolidationPolicy: WhenEmptyOrUnderutilized 36 | consolidateAfter: 5m 37 | template: 38 | metadata: 39 | labels: 40 | intent: soci-snapshotter 41 | spec: 42 | nodeClassRef: 43 | group: karpenter.k8s.aws 44 | kind: EC2NodeClass 45 | name: soci-snapshotter 46 | requirements: 47 | - key: kubernetes.io/os 48 | operator: In 49 | values: 50 | - linux 51 | - key: karpenter.k8s.aws/instance-category 52 | operator: In 53 | values: 54 | - c 55 | - m 56 | - r 57 | - key: kubernetes.io/arch 58 | operator: In 59 | values: 60 | - amd64 61 | - key: karpenter.sh/capacity-type 62 | operator: In 63 | values: 64 | - spot 65 | - on-demand 66 | --- 67 | apiVersion: karpenter.k8s.aws/v1 68 | kind: EC2NodeClass 69 | metadata: 70 | name: soci-snapshotter-br 71 | spec: 72 | amiSelectorTerms: 73 | - alias: bottlerocket@latest 74 | role: "<>" 75 | blockDeviceMappings: 76 | - deviceName: /dev/xvda 77 | ebs: 78 | volumeSize: 4Gi 79 | volumeType: gp3 80 | encrypted: true 81 | - deviceName: /dev/xvdb 82 | ebs: 83 | volumeSize: 100Gi 84 | volumeType: gp3 85 | throughput: 1000 86 | iops: 16000 87 | encrypted: true 88 | securityGroupSelectorTerms: 89 | - tags: 90 | karpenter.sh/discovery: "<>" 91 | subnetSelectorTerms: 92 | - tags: 93 | karpenter.sh/discovery: "<>" 94 | userData: | 95 | [settings.container-runtime] 96 | snapshotter = "soci" 97 | [settings.container-runtime-plugins.soci-snapshotter] 98 | pull-mode = "parallel-pull-unpack" 99 | [settings.container-runtime-plugins.soci-snapshotter.parallel-pull-unpack] 100 | max-concurrent-downloads-per-image = 20 101 | 
concurrent-download-chunk-size = "16mb" 102 | max-concurrent-unpacks-per-image = 12 103 | discard-unpacked-layers = true 104 | [settings.bootstrap-commands.k8s-ephemeral-storage] 105 | commands = [ 106 | ["apiclient", "ephemeral-storage", "init"], 107 | ["apiclient", "ephemeral-storage" ,"bind", "--dirs", "/var/lib/containerd", "/var/lib/kubelet", "/var/log/pods", "/var/lib/soci-snapshotter"] 108 | ] 109 | essential = true 110 | mode = "always" 111 | --- 112 | apiVersion: karpenter.sh/v1 113 | kind: NodePool 114 | metadata: 115 | name: soci-snapshotter-br 116 | spec: 117 | disruption: 118 | consolidationPolicy: WhenEmptyOrUnderutilized 119 | consolidateAfter: 5m 120 | template: 121 | metadata: 122 | labels: 123 | intent: soci-snapshotter-br 124 | spec: 125 | nodeClassRef: 126 | group: karpenter.k8s.aws 127 | kind: EC2NodeClass 128 | name: soci-snapshotter-br 129 | requirements: 130 | - key: kubernetes.io/os 131 | operator: In 132 | values: 133 | - linux 134 | - key: karpenter.k8s.aws/instance-category 135 | operator: In 136 | values: 137 | - c 138 | - m 139 | - r 140 | - key: kubernetes.io/arch 141 | operator: In 142 | values: 143 | - amd64 144 | - key: karpenter.sh/capacity-type 145 | operator: In 146 | values: 147 | - spot 148 | - on-demand 149 | --- 150 | apiVersion: karpenter.k8s.aws/v1 151 | kind: EC2NodeClass 152 | metadata: 153 | name: non-soci-snapshotter 154 | spec: 155 | amiSelectorTerms: 156 | - alias: al2023@latest 157 | role: "<>" 158 | blockDeviceMappings: 159 | - deviceName: /dev/xvda 160 | ebs: 161 | volumeSize: 100Gi 162 | volumeType: gp3 163 | throughput: 1000 164 | iops: 16000 165 | securityGroupSelectorTerms: 166 | - tags: 167 | karpenter.sh/discovery: "<>" 168 | subnetSelectorTerms: 169 | - tags: 170 | karpenter.sh/discovery: "<>" 171 | --- 172 | apiVersion: karpenter.sh/v1 173 | kind: NodePool 174 | metadata: 175 | name: non-soci-snapshotter 176 | spec: 177 | disruption: 178 | consolidationPolicy: WhenEmptyOrUnderutilized 179 | consolidateAfter: 5m 180 | template: 181 | metadata: 182 | labels: 183 | intent: non-soci-snapshotter 184 | spec: 185 | nodeClassRef: 186 | group: karpenter.k8s.aws 187 | kind: EC2NodeClass 188 | name: non-soci-snapshotter 189 | requirements: 190 | - key: kubernetes.io/os 191 | operator: In 192 | values: 193 | - linux 194 | - key: karpenter.k8s.aws/instance-category 195 | operator: In 196 | values: 197 | - c 198 | - m 199 | - r 200 | - key: kubernetes.io/arch 201 | operator: In 202 | values: 203 | - amd64 204 | - key: karpenter.sh/capacity-type 205 | operator: In 206 | values: 207 | - spot 208 | - on-demand -------------------------------------------------------------------------------- /blueprints/overprovision/README.md: -------------------------------------------------------------------------------- 1 | # Karpenter Blueprint: Overprovision capacity in advanced to increase responsiveness 2 | 3 | ## Purpose 4 | 5 | Let's say you have a data pipeline process that knows it will need to have the capacity to launch 100 pods at the same time. To reduce the initiation time, you could overprovision capacity in advanced to increase responsiveness so when the data pipeline launches the pods, the capacity is already there. 6 | 7 | To achieve this, you deploy a "dummy" workload with a low [PriorityClass](https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/#priorityclass) to reserve capacity (to make Karpenter launch nodes). 
Then, when you deploy the workload you actually need, the "dummy" pods are evicted so that your pods can start right away. 8 | 9 | ## Requirements 10 | 11 | * A Kubernetes cluster with Karpenter installed. You can use the blueprint we've used to test this pattern at the `cluster` folder in the root of this repository. 12 | * A `default` Karpenter `NodePool` as that's the one we'll use in this blueprint. You did this already in the ["Deploy a Karpenter Default EC2NodeClass and NodePool"](../../README.md) section from this repository. 13 | 14 | ## Deploy 15 | 16 | Let's start by deploying the "dummy" workload: 17 | 18 | ```sh 19 | kubectl apply -f dummy-workload.yaml 20 | ``` 21 | 22 | After waiting for around two minutes, notice how Karpenter provisions the node(s) needed to run the "dummy" workload: 23 | 24 | ```sh 25 | > kubectl get nodeclaims 26 | NAME TYPE ZONE NODE READY AGE 27 | default-kpj7k c6i.2xlarge eu-west-1b ip-10-0-73-34.eu-west-1.compute.internal True 57s 28 | ``` 29 | 30 | And the "dummy" pods are now running simply to reserve this capacity: 31 | 32 | ```sh 33 | > kubectl get pods 34 | NAME READY STATUS RESTARTS AGE 35 | dummy-workload-6bf87d68f-2ftbq 1/1 Running 0 53s 36 | dummy-workload-6bf87d68f-8pnp8 1/1 Running 0 53s 37 | dummy-workload-6bf87d68f-ctlvc 1/1 Running 0 53s 38 | dummy-workload-6bf87d68f-fznv6 1/1 Running 0 53s 39 | dummy-workload-6bf87d68f-hp4qs 1/1 Running 0 53s 40 | dummy-workload-6bf87d68f-pwtp9 1/1 Running 0 53s 41 | dummy-workload-6bf87d68f-rg7tj 1/1 Running 0 53s 42 | dummy-workload-6bf87d68f-t7bqz 1/1 Running 0 53s 43 | dummy-workload-6bf87d68f-xwln7 1/1 Running 0 53s 44 | dummy-workload-6bf87d68f-zmhk8 1/1 Running 0 53s 45 | ``` 46 | 47 | ## Results 48 | 49 | Now, when you deploy the actual workload that does the work (such as a data pipeline process), the "dummy" pods are going to be evicted.
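At any point, you can check the priority each pod was admitted with; the eviction behavior relies on the "dummy" pods having a lower priority than your real workload. A quick sketch (the actual priority class names come from the manifests in this blueprint):

```sh
kubectl get pods -o custom-columns=NAME:.metadata.name,PRIORITY:.spec.priority,PRIORITYCLASS:.spec.priorityClassName
```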
So, let's deploy the following workload to test it: 50 | 51 | ```sh 52 | kubectl apply -f workload.yaml 53 | ``` 54 | 55 | Notice how your new pods are almost immediately running, and some of the "dummy" pods are "Pending": 56 | 57 | ```sh 58 | > kubectl get pods 59 | NAME READY STATUS RESTARTS AGE 60 | dummy-workload-6bf87d68f-2ftbq 1/1 Running 0 11m 61 | dummy-workload-6bf87d68f-6bq4v 0/1 Pending 0 15s 62 | dummy-workload-6bf87d68f-8nkp8 0/1 Pending 0 14s 63 | dummy-workload-6bf87d68f-cchqx 0/1 Pending 0 15s 64 | dummy-workload-6bf87d68f-fznv6 1/1 Running 0 11m 65 | dummy-workload-6bf87d68f-hp4qs 1/1 Running 0 11m 66 | dummy-workload-6bf87d68f-r69g6 0/1 Pending 0 15s 67 | dummy-workload-6bf87d68f-rg7tj 1/1 Running 0 11m 68 | dummy-workload-6bf87d68f-w4zk8 0/1 Pending 0 15s 69 | dummy-workload-6bf87d68f-zmhk8 1/1 Running 0 11m 70 | workload-679c759476-6h47j 1/1 Running 0 15s 71 | workload-679c759476-hhjmp 1/1 Running 0 15s 72 | workload-679c759476-jxnc2 1/1 Running 0 15s 73 | workload-679c759476-lqv5t 1/1 Running 0 15s 74 | workload-679c759476-n269j 1/1 Running 0 15s 75 | workload-679c759476-nfjtp 1/1 Running 0 15s 76 | workload-679c759476-nv7sg 1/1 Running 0 15s 77 | workload-679c759476-p277d 1/1 Running 0 15s 78 | workload-679c759476-qw8sk 1/1 Running 0 15s 79 | workload-679c759476-sxjpt 1/1 Running 0 15s 80 | ``` 81 | 82 | After waiting for around two minutes, you'll see all pods running and a new machine registered: 83 | 84 | ```sh 85 | > kubectl get nodeclaims 18s 86 | NAME TYPE ZONE NODE READY AGE 87 | default-4q9dn c6g.xlarge on-demand eu-west-2c ip-10-0-127-154.eu-west-2.compute.internal True 29m 88 | default-xwbvp c7g.xlarge spot eu-west-2c ip-10-0-100-21.eu-west-2.compute.internal True 75s 89 | ``` 90 | 91 | The new machine is simply there because some "dummy" pods were pending and they exist to reserve capacity. If you think you won't need those "dummy" pods while your workload is running, you can simply reduce the "dummy" deployment replicas to 0, and Karpenter consolidation will kick in to remove unnecessary machines. 92 | 93 | ```sh 94 | > kubectl scale deployment dummy-workload --replicas 0 95 | deployment.apps/dummy-workload scaled 96 | > kubectl get nodeclaims 97 | NAME TYPE ZONE NODE READY AGE 98 | default-kpj7k c6i.2xlarge eu-west-1b ip-10-0-73-34.eu-west-1.compute.internal True 16m 99 | ``` 100 | 101 | ## Cleanup 102 | 103 | To remove all objects created, simply run the following commands: 104 | 105 | ```sh 106 | kubectl delete -f . 
107 | ``` 108 | -------------------------------------------------------------------------------- /cluster/terraform/main.tf: -------------------------------------------------------------------------------- 1 | ## THIS TO AUTHENTICATE TO ECR, DON'T CHANGE IT 2 | provider "aws" { 3 | region = "us-east-1" 4 | alias = "virginia" 5 | } 6 | 7 | provider "aws" { 8 | region = var.region 9 | } 10 | 11 | provider "kubernetes" { 12 | host = module.eks.cluster_endpoint 13 | cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) 14 | token = data.aws_eks_cluster_auth.this.token 15 | } 16 | 17 | provider "helm" { 18 | kubernetes { 19 | host = module.eks.cluster_endpoint 20 | cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) 21 | token = data.aws_eks_cluster_auth.this.token 22 | } 23 | } 24 | 25 | provider "kubectl" { 26 | apply_retry_count = 10 27 | host = module.eks.cluster_endpoint 28 | cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) 29 | load_config_file = false 30 | token = data.aws_eks_cluster_auth.this.token 31 | } 32 | 33 | data "aws_eks_cluster_auth" "this" { 34 | name = module.eks.cluster_name 35 | } 36 | 37 | data "aws_ecrpublic_authorization_token" "token" { 38 | provider = aws.virginia 39 | } 40 | 41 | data "aws_availability_zones" "available" { 42 | filter { 43 | name = "opt-in-status" 44 | values = ["opt-in-not-required"] 45 | } 46 | } 47 | 48 | locals { 49 | name = "karpenter-blueprints" 50 | 51 | vpc_cidr = "10.0.0.0/16" 52 | # NOTE: You might need to change this less number of AZs depending on the region you're deploying to 53 | azs = slice(data.aws_availability_zones.available.names, 0, 3) 54 | 55 | tags = { 56 | blueprint = local.name 57 | } 58 | } 59 | 60 | ################################################################################ 61 | # Cluster 62 | ################################################################################ 63 | 64 | module "eks" { 65 | source = "terraform-aws-modules/eks/aws" 66 | version = "20.37.0" 67 | 68 | cluster_name = local.name 69 | cluster_version = "1.32" 70 | cluster_endpoint_public_access = true 71 | enable_cluster_creator_admin_permissions = true 72 | 73 | cluster_addons = { 74 | aws-ebs-csi-driver = { 75 | most_recent = true 76 | } 77 | coredns = { 78 | most_recent = true 79 | } 80 | eks-pod-identity-agent = { 81 | before_compute = true 82 | most_recent = true 83 | } 84 | kube-proxy = { 85 | most_recent = true 86 | } 87 | metrics-server = { 88 | most_recent = true 89 | } 90 | vpc-cni = { 91 | most_recent = true 92 | before_compute = true 93 | configuration_values = jsonencode({ 94 | env = { 95 | ENABLE_PREFIX_DELEGATION = "true" 96 | WARM_PREFIX_TARGET = "1" 97 | } 98 | }) 99 | } 100 | } 101 | 102 | vpc_id = module.vpc.vpc_id 103 | subnet_ids = module.vpc.private_subnets 104 | 105 | create_cloudwatch_log_group = false 106 | 107 | eks_managed_node_groups = { 108 | mng = { 109 | instance_types = ["m4.large", "m5.large", "m5a.large", "m5ad.large", "m5d.large", "t2.large", "t3.large", "t3a.large"] 110 | 111 | subnet_ids = module.vpc.private_subnets 112 | max_size = 2 113 | desired_size = 2 114 | min_size = 2 115 | 116 | labels = { 117 | # Used to ensure Karpenter runs on nodes that it does not manage 118 | "karpenter.sh/controller" = "true" 119 | } 120 | } 121 | } 122 | 123 | node_security_group_tags = merge(local.tags, { 124 | # NOTE - if creating multiple security groups with this module, only tag the 125 | # security group that Karpenter 
should utilize with the following tag 126 | # (i.e. - at most, only one security group should have this tag in your account) 127 | "karpenter.sh/discovery" = local.name 128 | }) 129 | 130 | tags = local.tags 131 | } 132 | 133 | module "eks_blueprints_addons" { 134 | source = "aws-ia/eks-blueprints-addons/aws" 135 | version = "1.21.0" 136 | 137 | cluster_name = module.eks.cluster_name 138 | cluster_endpoint = module.eks.cluster_endpoint 139 | cluster_version = module.eks.cluster_version 140 | oidc_provider_arn = module.eks.oidc_provider_arn 141 | 142 | create_delay_dependencies = [for grp in module.eks.eks_managed_node_groups : grp.node_group_arn] 143 | 144 | enable_aws_load_balancer_controller = true 145 | 146 | enable_aws_for_fluentbit = true 147 | aws_for_fluentbit = { 148 | set = [ 149 | { 150 | name = "cloudWatchLogs.region" 151 | value = var.region 152 | } 153 | ] 154 | } 155 | 156 | tags = local.tags 157 | } 158 | 159 | module "aws_ebs_csi_pod_identity" { 160 | source = "terraform-aws-modules/eks-pod-identity/aws" 161 | 162 | name = "aws-ebs-csi" 163 | version = "1.12.0" 164 | 165 | attach_aws_ebs_csi_policy = true 166 | 167 | # Pod Identity Associations 168 | association_defaults = { 169 | namespace = "kube-system" 170 | service_account = "ebs-csi-controller-sa" 171 | } 172 | 173 | associations = { 174 | default = { 175 | cluster_name = module.eks.cluster_name 176 | } 177 | } 178 | 179 | tags = local.tags 180 | } 181 | 182 | #--------------------------------------------------------------- 183 | # Supporting Resources 184 | #--------------------------------------------------------------- 185 | 186 | module "vpc" { 187 | source = "terraform-aws-modules/vpc/aws" 188 | version = "5.21.0" 189 | 190 | name = local.name 191 | cidr = local.vpc_cidr 192 | 193 | azs = local.azs 194 | private_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 4, k)] 195 | public_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 8, k + 48)] 196 | 197 | enable_nat_gateway = true 198 | single_nat_gateway = true 199 | enable_dns_hostnames = true 200 | 201 | # Manage so we can name 202 | manage_default_network_acl = true 203 | default_network_acl_tags = { Name = "${local.name}-default" } 204 | manage_default_route_table = true 205 | default_route_table_tags = { Name = "${local.name}-default" } 206 | manage_default_security_group = true 207 | default_security_group_tags = { Name = "${local.name}-default" } 208 | 209 | public_subnet_tags = { 210 | "kubernetes.io/cluster/${local.name}" = "shared" 211 | "kubernetes.io/role/elb" = 1 212 | } 213 | 214 | private_subnet_tags = { 215 | "kubernetes.io/cluster/${local.name}" = "shared" 216 | "kubernetes.io/role/internal-elb" = 1 217 | "karpenter.sh/discovery" = local.name 218 | } 219 | 220 | tags = local.tags 221 | } 222 | -------------------------------------------------------------------------------- /blueprints/ha-az-nodes/README.md: -------------------------------------------------------------------------------- 1 | # Karpenter Blueprint: High-Availability - Spread Pods across AZs & Nodes 2 | 3 | ## Purpose 4 | 5 | Karpenter can launch only one node for all pending pods. However, putting all application pods in the same node is not recommended if you want to have high-availability. To avoid this, and make the workload more highly-available, you can spread the pods within multiple availability zones (AZs). Additionally, you can configure a constraint to spread pods within multiple nodes in the same AZ. 
To do so, you configure [Topology Spread Constraints (TSC)](https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/) within a `Deployment` or `Pod`. 6 | 7 | ## Requirements 8 | 9 | * A Kubernetes cluster with Karpenter installed. You can use the blueprint we've used to test this pattern at the `cluster` folder in the root of this repository. 10 | * A `default` Karpenter `NodePool` as that's the one we'll use in this blueprint. You did this already in the ["Deploy a Karpenter Default EC2NodeClass and NodePool"](../../README.md) section from this repository. 11 | 12 | ## Deploy 13 | 14 | To deploy the sample `workload`, simply run this command: 15 | 16 | ```sh 17 | kubectl apply -f workload.yaml 18 | ``` 19 | 20 | ## Results 21 | 22 | You can review the Karpenter logs and watch how it's deciding to launch multiple nodes following the workload constraints: 23 | 24 | ```sh 25 | kubectl -n karpenter logs -l app.kubernetes.io/name=karpenter --all-containers=true -f --tail=20 26 | ``` 27 | 28 | Wait one minute and you should see the pods running within two nodes in each AZ, run this command: 29 | 30 | ```sh 31 | kubectl get nodes -L karpenter.sh/capacity-type,beta.kubernetes.io/instance-type,karpenter.sh/nodepool,topology.kubernetes.io/zone -l karpenter.sh/initialized=true 32 | ``` 33 | 34 | You should see an output similar to this: 35 | 36 | ```console 37 | NAME STATUS ROLES AGE VERSION CAPACITY-TYPE INSTANCE-TYPE NODEPOOL ZONE 38 | ip-10-0-101-160.eu-west-2.compute.internal Ready 19s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2c 39 | ip-10-0-109-204.eu-west-2.compute.internal Ready 20s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2c 40 | ip-10-0-112-15.eu-west-2.compute.internal Ready 20s v1.32.3-eks-473151a spot m8g.xlarge default eu-west-2c 41 | ip-10-0-117-72.eu-west-2.compute.internal Ready 2m51s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2c 42 | ip-10-0-36-130.eu-west-2.compute.internal Ready 22s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2a 43 | ip-10-0-37-110.eu-west-2.compute.internal Ready 21s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2a 44 | ip-10-0-40-176.eu-west-2.compute.internal Ready 22s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2a 45 | ip-10-0-44-135.eu-west-2.compute.internal Ready 21s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2a 46 | ip-10-0-45-90.eu-west-2.compute.internal Ready 22s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2a 47 | ip-10-0-47-113.eu-west-2.compute.internal Ready 22s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2a 48 | ip-10-0-48-218.eu-west-2.compute.internal Ready 21s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2a 49 | ip-10-0-53-185.eu-west-2.compute.internal Ready 22s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2a 50 | ip-10-0-54-107.eu-west-2.compute.internal Ready 23s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2a 51 | ip-10-0-54-129.eu-west-2.compute.internal Ready 22s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2a 52 | ip-10-0-66-57.eu-west-2.compute.internal Ready 22s v1.32.3-eks-473151a spot c7gd.xlarge default eu-west-2b 53 | ip-10-0-77-61.eu-west-2.compute.internal Ready 21s v1.32.3-eks-473151a spot c7gd.xlarge default eu-west-2b 54 | ip-10-0-85-117.eu-west-2.compute.internal Ready 23s v1.32.3-eks-473151a spot c8g.xlarge default eu-west-2b 55 | ip-10-0-87-181.eu-west-2.compute.internal Ready 4m22s v1.32.3-eks-473151a on-demand c6g.xlarge default eu-west-2b 56 | ``` 57 | 58 | As you can see, pods were spread 
within the available AZs because of the `topology.kubernetes.io/zone` TSC. But at the same time, pods were spread within multiple nodes in each AZ because of the `kubernetes.io/hostname` TSC. 59 | 60 | ```yaml 61 | topologySpreadConstraints: 62 | - labelSelector: 63 | matchLabels: 64 | app: workload-multi-az-nodes 65 | maxSkew: 1 66 | topologyKey: kubernetes.io/hostname 67 | whenUnsatisfiable: ScheduleAnyway 68 | - labelSelector: 69 | matchLabels: 70 | app: workload-multi-az-nodes 71 | maxSkew: 1 72 | topologyKey: topology.kubernetes.io/zone 73 | whenUnsatisfiable: ScheduleAnyway 74 | ``` 75 | 76 | Depending on the run, you might notice that pods are scheduled in only a subset of the available AZs. This is because `whenUnsatisfiable` is set to `ScheduleAnyway`, which is a soft constraint: the `kube-scheduler` only gives higher precedence to topologies that would help reduce the skew. 77 | 78 | **NOTE**: If you strictly need to spread across all available AZs, you can set `minDomains` to the number of AZs; this tells the `kube-scheduler` how many domains you expect to exist. If fewer than that many AZs currently have eligible nodes, pods are marked as unschedulable and Karpenter will launch a node in each missing AZ. Keep in mind that `minDomains` only takes effect together with `whenUnsatisfiable: DoNotSchedule`, and that `DoNotSchedule` will leave pods unschedulable if the topology spread constraint can't be fulfilled. It should only be used if it's preferable for pods not to run at all rather than violate the topology spread constraint. 79 | 80 | In case you want to enforce this spread within `Deployments`, you can use projects like [Kyverno](https://kyverno.io) to mutate a `Deployment` object and set the TSC you've seen in this blueprint. Here's a [Kyverno policy example](https://kyverno.io/policies/other/s-z/spread-pods-across-topology/spread-pods-across-topology/) that mutates a `Deployment` to include a TSC; just make sure it replicates the same rule from this blueprint (`whenUnsatisfiable: ScheduleAnyway`). 81 | 82 | ## Cleanup 83 | 84 | ```sh 85 | kubectl delete -f workload.yaml 86 | ``` 87 | -------------------------------------------------------------------------------- /blueprints/update-nodes-with-drift/README.md: -------------------------------------------------------------------------------- 1 | # Karpenter Blueprint: Update Nodes using Drift 2 | 3 | ## Purpose 4 | 5 | After upgrading the Kubernetes control plane version, you might be wondering how to properly upgrade the data plane nodes launched by Karpenter. Karpenter detects and marks nodes as drifted. A drifted node is one whose spec and metadata do not match the spec of its `NodePool` and `nodeClassRef`. A node can drift when a user changes their `NodePool` or `nodeClassRef`, and the underlying infrastructure referenced by the `NodePool` can also change outside of the cluster. For example, you can configure `amiSelectorTerms` with static AMI IDs that match the control plane version. This allows you to control when to upgrade the nodes' version: pointing the `EC2NodeClass` at a newer EKS optimized AMI creates drifted nodes. 6 | 7 | Karpenter's drift logic reconciles when a node's AMI drifts from the `NodePool` requirements. When upgrading a node, Karpenter will minimize the downtime of the applications on the node by initiating `NodePool` logic for a replacement node before terminating drifted nodes (you can check for the `Drifted` status condition on a `NodeClaim`, as sketched below).
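A quick way to see whether Karpenter considers a node drifted is to look at the `Drifted` status condition on its `NodeClaim`. Here's a sketch (condition names and output format can vary slightly between Karpenter versions):

```sh
kubectl get nodeclaims -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.conditions[?(@.type=="Drifted")].status}{"\n"}{end}'
```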
Once Karpenter has begun launching the replacement node, it will cordon and drain the old node, terminating it when it's fully drained and finishing the upgrade. 8 | 9 | ## Requirements 10 | 11 | * A Kubernetes cluster with Karpenter installed. You can use the blueprint we've used to test this pattern at the `cluster` folder in the root of this repository. 12 | 13 | ## Deploy 14 | 15 | Let's create a new `EC2NodeClass` to be more precise about the AMIs you'd like to use. For now, you'll intentionally create new nodes using a previous EKS version to simulate where you'll be after upgrading the control plane. Within the `amiSelectorTerms` you'll configure the most recent AMIs (both for `amd64` and `arm64`) from a previous version of the control plane to test the drift feature. 16 | 17 | ```yaml 18 | amiSelectorTerms: 19 | - id: <> 20 | - id: <> 21 | ``` 22 | 23 | If you're using the Terraform template provided in this repo, run the following commands to get the EKS cluster name and the IAM Role name for the Karpenter nodes: 24 | 25 | ```sh 26 | export CLUSTER_NAME=$(terraform -chdir="../../cluster/terraform" output -raw cluster_name) 27 | export KARPENTER_NODE_IAM_ROLE_NAME=$(terraform -chdir="../../cluster/terraform" output -raw node_instance_role_name) 28 | ``` 29 | 30 | > ***NOTE***: If you're not using Terraform, you need to get those values manually. `CLUSTER_NAME` is the name of your EKS cluster (not the ARN). Karpenter auto-generates the [instance profile](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use_switch-role-ec2_instance-profiles) in your `EC2NodeClass` given the role that you specify in [spec.role](https://karpenter.sh/preview/concepts/nodeclasses/) with the placeholder `KARPENTER_NODE_IAM_ROLE_NAME`, which is a way to pass a single IAM role to the EC2 instance launched by the Karpenter `NodePool`. Typically, the instance profile name is the same as the IAM role (not the ARN). 31 | 32 | Karpenter will use the latest EKS-optimized AMIs, so when there's a new AMI available or after you update the Kubernetes control plane, the nodes with older AMIs are recycled automatically. To test this feature, you need to configure static AMIs within the `EC2NodeClass`. Run the following commands to create environment variables with the AMI IDs to use (note these point to the previous Kubernetes version, `1.31`): 33 | 34 | ```sh 35 | export amd64PrevAMI=$(aws ssm get-parameter --name /aws/service/bottlerocket/aws-k8s-1.31/x86_64/latest/image_id --region $AWS_REGION --query "Parameter.Value" --output text) 36 | export arm64PrevAMI=$(aws ssm get-parameter --name /aws/service/bottlerocket/aws-k8s-1.31/arm64/latest/image_id --region $AWS_REGION --query "Parameter.Value" --output text) 37 | ``` 38 | 39 | Now, make sure you're in this blueprint folder, then run the following command to create the new `NodePool` and `EC2NodeClass`: 40 | 41 | ```sh 42 | sed -i '' "s/<>/$CLUSTER_NAME/g" latest-current-ami.yaml 43 | sed -i '' "s/<>/$KARPENTER_NODE_IAM_ROLE_NAME/g" latest-current-ami.yaml 44 | sed -i '' "s/<>/$amd64PrevAMI/g" latest-current-ami.yaml 45 | sed -i '' "s/<>/$arm64PrevAMI/g" latest-current-ami.yaml 46 | kubectl apply -f . 47 | ``` 48 | 49 | ## Results 50 | 51 | Wait for around two minutes. The pods from the sample workload should be running even if the node has a version that doesn't match the control plane's.
52 | 53 | ```sh 54 | > kubectl get pods 55 | NAME READY STATUS RESTARTS AGE 56 | latest-current-ami-5bbfbc98f7-6hxkw 1/1 Running 0 3m 57 | latest-current-ami-5bbfbc98f7-n7mgs 1/1 Running 0 3m 58 | latest-current-ami-5bbfbc98f7-rxjjx 1/1 Running 0 3m 59 | ``` 60 | 61 | You should see a new node registered with the latest AMI for EKS `v1.31`, like this: 62 | 63 | ```sh 64 | > kubectl get nodes -l karpenter.sh/initialized=true 65 | NAME STATUS ROLES AGE VERSION 66 | ip-10-0-103-18.eu-west-2.compute.internal Ready 5m6s v1.31.6-eks-aad632c 67 | ``` 68 | 69 | Let's simulate a node upgrade by changing the EKS version in the `EC2NodeClass`, run this command: 70 | 71 | ```sh 72 | export amd64LatestAMI=$(aws ssm get-parameter --name /aws/service/bottlerocket/aws-k8s-1.32/x86_64/latest/image_id --region $AWS_REGION --query "Parameter.Value" --output text) 73 | export arm64LatestAMI=$(aws ssm get-parameter --name /aws/service/bottlerocket/aws-k8s-1.32/arm64/latest/image_id --region $AWS_REGION --query "Parameter.Value" --output text) 74 | sed -i '' "s/$amd64PrevAMI/$amd64LatestAMI/g" latest-current-ami.yaml 75 | sed -i '' "s/$arm64PrevAMI/$arm64LatestAMI/g" latest-current-ami.yaml 76 | sed -i '' "s/1.31/1.32/g" latest-current-ami.yaml 77 | kubectl apply -f latest-current-ami.yaml 78 | ``` 79 | 80 | You can confirm the update has been applied by running this command: 81 | 82 | ```sh 83 | kubectl get ec2nodeclass latest-current-ami -o yaml 84 | ``` 85 | 86 | Wait around five minutes, in the mean time, you can monitor Karpenter logs until you see something like this: 87 | 88 | ```json 89 | {"level":"INFO","time":"2024-08-16T13:32:10.187Z","logger":"controller","message":"disrupting nodeclaim(s) via replace, terminating 1 nodes (3 pods) ip-10-0-119-175.eu-west-2.compute.internal/c7i-flex.xlarge/spot and replacing with node from types c6a.xlarge, m5.xlarge, c7i-flex.xlarge, m6a.xlarge, c5a.xlarge and 55 other(s)","commit":"5bdf9c3","controller":"disruption","namespace":"","name":"","reconcileID":"be617b33-df37-44fc-897d-737fd3198cee","command-id":"26f7f912-a8f5-4e94-aaaf-386f8da44988","reason":"drifted"} 90 | {"level":"INFO","time":"2024-08-16T13:32:10.222Z","logger":"controller","message":"created nodeclaim","commit":"5bdf9c3","controller":"disruption","namespace":"","name":"","reconcileID":"be617b33-df37-44fc-897d-737fd3198cee","NodePool":{"name":"latest-current-ami"},"NodeClaim":{"name":"latest-current-ami-smlh7"},"requests":{"cpu":"1766m","memory":"1706Mi","pods":"7"},"instance-types":"c4.2xlarge, c4.xlarge, c5.2xlarge, c5.xlarge, c5a.2xlarge and 55 other(s)"} 91 | ``` 92 | 93 | Wait around two minutes. You should now see a new node with the latest AMI version that matches the control plane's version. 94 | 95 | ```sh 96 | > kubectl get nodes -l karpenter.sh/initialized=true 97 | NAME STATUS ROLES AGE VERSION 98 | ip-10-0-102-231.eu-west-2.compute.internal Ready 51s v1.32.2-eks-677bac1 99 | ``` 100 | 101 | You can repeat this process every time you need to run a controlled upgrade of the nodes. Also, if you'd like to control when to replace a node, you can learn more about [Disruption Budgets](//blueprints/disruption-budgets/). 102 | 103 | ## Cleanup 104 | 105 | To remove all objects created, simply run the following commands: 106 | 107 | ```sh 108 | kubectl delete -f . 
109 | ``` 110 | -------------------------------------------------------------------------------- /blueprints/stateful/README.md: -------------------------------------------------------------------------------- 1 | # Karpenter Blueprint: Working with Stateful Workloads using EBS 2 | 3 | ## Purpose 4 | 5 | For stateful workloads that use persistent volumes, Karpenter detects storage scheduling requirements when deciding which instance type to launch and in which AZ. If you have a `StorageClass` configured for multiple AZs, Karpenter randomly selects one AZ when the pod is created for the first time. If the same pod is then removed, a new pod is created to request the same Persistent Volume Claim (PVC) and Karpenter takes this into consideration when choosing the AZ of an existing claim. 6 | 7 | ## Requirements 8 | 9 | * A Kubernetes cluster with Karpenter installed. You can use the blueprint we've used to test this pattern at the `cluster` folder in the root of this repository. 10 | * A `default` Karpenter `NodePool` as that's the one we'll use in this blueprint. You did this already in the ["Deploy a Karpenter Default EC2NodeClass and NodePool"](../../README.md) section from this repository. 11 | * The [Amazon EBS CSI driver](https://docs.aws.amazon.com/eks/latest/userguide/managing-ebs-csi.html) installed in the cluster. If you're using the Terraform template in this repository, it's already configured. 12 | 13 | ## Deploy 14 | 15 | Let's start by creating the `PersistentVolumeClaim` and `StorageClass` to use only one AZ. To do so,first choose one of the AZs in the region where you deployed the EKS cluster. Run this command to get one automatically: 16 | 17 | ```sh 18 | export FIRSTAZ=$(aws ec2 describe-availability-zones --query 'AvailabilityZones[0].ZoneName' --output text) 19 | echo $FIRSTAZ 20 | ``` 21 | 22 | Then, run these commands to replace the placeholder with the AZ, and deploy the storage resources: 23 | 24 | ```sh 25 | sed -i '' "s/<>/$FIRSTAZ/g" storage.yaml 26 | kubectl apply -f storage.yaml 27 | ``` 28 | 29 | Wait around one minute, as long as you get an event of `WaitForFirstConsumer` in the PVC, you're good to continue: 30 | 31 | ```sh 32 | > kubectl describe pvc ebs-claim 33 | ... 
34 | Events: 35 | Type Reason Age From Message 36 | ---- ------ ---- ---- ------- 37 | Normal WaitForFirstConsumer 14s (x16 over 3m47s) persistentvolume-controller waiting for first consumer to be created before binding 38 | ``` 39 | 40 | Deploy a sample workload: 41 | 42 | ```sh 43 | kubectl apply -f workload.yaml 44 | ``` 45 | 46 | ## Results 47 | 48 | After waiting for around two minutes, you should see the pods running, and the PVC claimed: 49 | 50 | ```sh 51 | > kubectl get pods 52 | NAME READY STATUS RESTARTS AGE 53 | stateful-7b68c8d7bc-6mkvn 1/1 Running 0 2m 54 | stateful-7b68c8d7bc-6mrj5 1/1 Running 0 2m 55 | stateful-7b68c8d7bc-858nd 1/1 Running 0 2m 56 | > kubectl get pvc 57 | NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS AGE 58 | ebs-claim Bound pvc-d4c11e32-9da0-41d6-a477-d454a4aade94 4Gi RWO storage-gp3 116s 59 | ``` 60 | 61 | Notice that Karpenter launched a node in the AZ (using the value from `$FIRSTAZ` env var), following the constraint defined in the `StorageClass` (no need to constraint it within the `Deployment` or `Pod`): 62 | 63 | ```sh 64 | > kubectl get nodes -L karpenter.sh/capacity-type,beta.kubernetes.io/instance-type,karpenter.sh/nodepool,topology.kubernetes.io/zone -l karpenter.sh/initialized=true 65 | NAME STATUS ROLES AGE VERSION CAPACITY-TYPE INSTANCE-TYPE NODEPOOL ZONE 66 | ip-10-0-52-243.eu-west-2.compute.internal Ready 16s v1.32.3-eks-473151a spot m7g.xlarge default eu-west-2a 67 | ``` 68 | 69 | Let's read the file that the pods are writing to, like this: 70 | 71 | ```sh 72 | export POD=$(kubectl get pods -l app=stateful -o name | cut -d/ -f2 | tail -n1) 73 | kubectl exec $POD -- cat /data/out.txt 74 | ``` 75 | 76 | You should see that the three pods are writing something every three minutes, like this: 77 | 78 | ```console 79 | Writing content every three minutes! Printing a random number: 795 80 | Writing content every three minutes! Printing a random number: 600 81 | Writing content every three minutes! Printing a random number: 987 82 | ``` 83 | 84 | If you delete one pod, the new pod will continue using the same PVC and will be in a `Running` state: 85 | 86 | ```sh 87 | kubectl delete pod $POD 88 | ``` 89 | 90 | You can read the content of the file using the new pod: 91 | 92 | ```sh 93 | export POD=$(kubectl get pods -l app=stateful -o name | cut -d/ -f2 | tail -n1) 94 | kubectl exec $POD -- cat /data/out.txt 95 | ``` 96 | 97 | You should still see the previous content plus any additional content if three minutes have passed, like this: 98 | 99 | ```console 100 | Writing content every three minutes! Printing a random number: 795 101 | Writing content every three minutes! Printing a random number: 600 102 | Writing content every three minutes! Printing a random number: 987 103 | Writing content every three minutes! Printing a random number: 224 104 | Writing content every three minutes! Printing a random number: 307 105 | Writing content every three minutes! Printing a random number: 325 106 | ``` 107 | 108 | Lastly, you can simulate a scale-down event for the workload and scale the replicas to 0, like this: 109 | 110 | ```sh 111 | kubectl scale deployment stateful --replicas 0 112 | ``` 113 | 114 | Wait around two minutes, and consolidation will make sure to remove the node. 
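Before scaling back up, you can confirm which AZ the volume is pinned to; Karpenter will have to launch the replacement node in that same AZ. Here's a sketch using the PVC from this blueprint (the exact node-affinity keys can vary with the EBS CSI driver version):

```sh
kubectl get pv "$(kubectl get pvc ebs-claim -o jsonpath='{.spec.volumeName}')" -o jsonpath='{.spec.nodeAffinity}' && echo
```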
You can then scale-out the workload again, like this: 115 | 116 | ```sh 117 | kubectl scale deployment stateful --replicas 3 118 | ``` 119 | 120 | And you should see that Karpenter launches a replacement node in the AZ you choose, and the pods are soon going to be in a `Running` state. 121 | 122 | **NOTE:** You might have a experience/simulate a node loss which can result in data corruption or loss. If this happens, when the new node launched by Karpenter is ready, pods might have a warning event like `Multi-Attach error for volume "pvc-19af27b8-fc0a-428d-bda5-552cb52b9806" Volume is already exclusively attached to one node and can't be attached to another`. You can wait around five minutes and the volume will try to get unattached, and attached again, making your pods successfully run again. Look at this series of events for reference: 123 | 124 | ```console 125 | Events: 126 | Type Reason Age From Message 127 | ---- ------ ---- ---- ------- 128 | Warning FailedScheduling 14m default-scheduler 0/3 nodes are available: 1 node(s) were unschedulable, 2 node(s) didn't match Pod's node affinity/selector. preemption: 0/3 nodes are available: 3 Preemption is not helpful for scheduling.. 129 | Normal Nominated 14m karpenter Pod should schedule on: machine/default-75hvl 130 | Warning FailedScheduling 14m (x2 over 14m) default-scheduler 0/3 nodes are available: 1 node(s) had volume node affinity conflict, 2 node(s) didn't match Pod's node affinity/selector. preemption: 0/3 nodes are available: 3 Preemption is not helpful for scheduling.. 131 | Normal Scheduled 14m default-scheduler Successfully assigned default/stateful-7b68c8d7bc-6mkvn to ip-10-0-63-154.eu-west-1.compute.internal 132 | Warning FailedAttachVolume 14m attachdetach-controller Multi-Attach error for volume "pvc-19af27b8-fc0a-428d-bda5-552cb52b9806" Volume is already exclusively attached to one node and can't be attached to another 133 | Warning FailedMount 9m52s (x2 over 12m) kubelet Unable to attach or mount volumes: unmounted volumes=[persistent-storage], unattached volumes=[persistent-storage], failed to process volumes=[]: timed out waiting for the condition 134 | Normal SuccessfulAttachVolume 8m53s attachdetach-controller AttachVolume.Attach succeeded for volume "pvc-19af27b8-fc0a-428d-bda5-552cb52b9806" 135 | Normal Pulling 8m51s kubelet Pulling image "centos" 136 | Normal Pulled 8m47s kubelet Successfully pulled image "centos" in 4.871822072s (4.871840882s including waiting) 137 | Normal Created 8m47s kubelet Created container stateful 138 | Normal Started 8m46s kubelet Started container stateful 139 | ``` 140 | 141 | Finally, you can read the content of the file again: 142 | 143 | ```sh 144 | export POD=$(kubectl get pods -l app=stateful -o name | cut -d/ -f2 | tail -n1) 145 | kubectl exec $POD -- cat /data/out.txt 146 | ``` 147 | 148 | ## Cleanup 149 | 150 | To remove all objects created, simply run the following commands: 151 | 152 | ```sh 153 | kubectl delete -f . 154 | ``` 155 | -------------------------------------------------------------------------------- /blueprints/nvidia-gpu-workload/README.md: -------------------------------------------------------------------------------- 1 | # Karpenter Blueprint: Deploy an NVIDIA GPU workload 2 | 3 | ## Purpose 4 | 5 | Karpenter streamlines node lifecycle management, and it can help provide the right compute just-in-time based on your workloads scheduling constraints. 
This is particularly helpful for your machine learning workflows with variable and heterogeneous compute demands (e.g., NVIDIA GPU-based inference followed by CPU-based plotting). When your Kubernetes workload requires accelerated instance, Karpenter automatically selects the appropriate [Amazon EKS optimized accelerated AMI](https://docs.aws.amazon.com/eks/latest/userguide/eks-optimized-ami.html). 6 | 7 | Therefore, the purpose of this Karpenter blueprint is to demonstrate how to launch a GPU-based workload on Amazon EKS with Karpenter and AL2023 EKS optimized accelerated AMI. This example assumes a simple one-to-one mapping between a Kubernetes Pod and a GPU. This blueprint does not go into the details about GPU sharing techniques such as MiG, time slicing or other software based GPU fractional scheduling. 8 | 9 | Before you start seeing Karpenter in action, when using AL2023 you need to deploy a Kubernetes device plugin to advertise GPU information from the host. 10 | 11 | ## Requirements 12 | 13 | * A Kubernetes cluster with Karpenter installed. You can use the blueprint we've used to test this pattern at the `cluster` folder in the root of this repository. 14 | 15 | ## Deploy NVIDIA device plugin for Kubernetes 16 | 17 | The [NVIDIA device plugin for Kubernetes](https://github.com/NVIDIA/k8s-device-plugin) is used to advertise the number of GPUs on the host to Kubernetes so that this information can be used for scheduling purposes. You can install the NVIDIA device plugin with helm. 18 | 19 | To install the device plugin run the following: 20 | 21 | ```sh 22 | helm repo add nvdp https://nvidia.github.io/k8s-device-plugin 23 | helm repo update 24 | helm upgrade -i nvdp nvdp/nvidia-device-plugin \ 25 | --namespace nvidia-device-plugin \ 26 | --create-namespace \ 27 | --version 0.17.2 28 | ``` 29 | 30 | Now that you have the device set-up, let’s enable Karpenter to launch NVIDIA GPU instances. 31 | 32 | ## Create a NodeClass and NodePool with GPU-instances (AL2023) 33 | 34 | The following NodeClass, specify the Security Group and Subnet selector, along with AMI. We are using AL2023 here, and when launching an accelerated instance Karpenter will pick the respective EKS optimized accelerated AMI. AL2023 comes packaged with the NVIDIA GPU drivers, and the container runtime is configured out of the box. 35 | 36 | Before applying the `gpu-nodeclass.yaml` replace `KARPENTER_NODE_IAM_ROLE_NAME` and `CLUSTER_NAME` in the file with your specific cluster details. If you're using the Terraform template provided in this repo, run the following commands to get the EKS cluster name and the IAM Role name for the Karpenter nodes: 37 | 38 | ```sh 39 | export CLUSTER_NAME=$(terraform -chdir="../../cluster/terraform" output -raw cluster_name) 40 | export KARPENTER_NODE_IAM_ROLE_NAME=$(terraform -chdir="../../cluster/terraform" output -raw node_instance_role_name) 41 | ``` 42 | 43 | > ***NOTE***: If you're not using Terraform, you need to get those values manually. `CLUSTER_NAME` is the name of your EKS cluster (not the ARN). Karpenter auto-generates the [instance profile](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use_switch-role-ec2_instance-profiles) in your `EC2NodeClass` given the role that you specify in [spec.role](https://karpenter.sh/preview/concepts/nodeclasses/) with the placeholder `KARPENTER_NODE_IAM_ROLE_NAME`, which is a way to pass a single IAM role to the EC2 instance launched by the Karpenter `NodePool`. 
Typically, the instance profile name is the same as the IAM role (not the ARN). 44 | 45 | The EC2NodeClass we’ll deploy looks like this. Execute the following command to create the EC2NodeClass file: 46 | 47 | ```sh 48 | cat << EOF > gpu-nodeclass.yaml 49 | apiVersion: karpenter.k8s.aws/v1 50 | kind: EC2NodeClass 51 | metadata: 52 | name: gpu 53 | spec: 54 | amiSelectorTerms: 55 | - alias: al2023@latest 56 | role: "$KARPENTER_NODE_IAM_ROLE_NAME" 57 | blockDeviceMappings: 58 | - deviceName: /dev/xvda 59 | ebs: 60 | deleteOnTermination: true 61 | iops: 10000 62 | throughput: 125 63 | volumeSize: 100Gi 64 | volumeType: gp3 65 | securityGroupSelectorTerms: 66 | - tags: 67 | karpenter.sh/discovery: $CLUSTER_NAME 68 | subnetSelectorTerms: 69 | - tags: 70 | karpenter.sh/discovery: $CLUSTER_NAME 71 | EOF 72 | ``` 73 | 74 | A separate [EC2NodeClass](https://karpenter.sh/docs/concepts/nodeclasses/) was created because you may want to tune node properties such as ephemeral storage size, block device mappings, or the [capacity reservations selector](https://karpenter.sh/docs/concepts/nodeclasses/). 75 | 76 | The next step is to create a dedicated NodePool to provision instances from the `g` Amazon EC2 instance category with NVIDIA as the GPU manufacturer, and to only allow workloads that tolerate the `nvidia.com/gpu` taint to be scheduled. Such a NodePool will look like this. Execute the following command to create the NodePool file: 77 | 78 | ```sh 79 | cat << EOF > gpu-nodepool.yaml 80 | apiVersion: karpenter.sh/v1 81 | kind: NodePool 82 | metadata: 83 | name: gpu 84 | spec: 85 | limits: 86 | cpu: 100 87 | memory: 100Gi 88 | nvidia.com/gpu: 5 89 | template: 90 | metadata: 91 | labels: 92 | nvidia.com/gpu.present: "true" 93 | spec: 94 | nodeClassRef: 95 | group: karpenter.k8s.aws 96 | name: gpu 97 | kind: EC2NodeClass 98 | requirements: 99 | - key: karpenter.sh/capacity-type 100 | operator: In 101 | values: ["on-demand"] 102 | - key: karpenter.k8s.aws/instance-category 103 | operator: In 104 | values: ["g"] 105 | - key: karpenter.k8s.aws/instance-gpu-manufacturer 106 | operator: In 107 | values: ["nvidia"] 108 | expireAfter: 720h 109 | taints: 110 | - key: nvidia.com/gpu 111 | effect: NoSchedule 112 | disruption: 113 | consolidationPolicy: WhenEmpty 114 | consolidateAfter: 5m 115 | EOF 116 | ``` 117 | 118 | We’ve added the `nvidia.com/gpu` taint in the NodePool to prevent workloads that do not tolerate this taint from being scheduled on nodes managed by this NodePool (they might not take advantage of it). Also, notice that the `.spec.disruption` policy has been set to WhenEmpty with consolidation only after 5 minutes; this is to support spiky workloads like high-churn jobs. You’ll likely want to tweak this based on your workload's requirements. 119 | 120 | Once the placeholders are replaced, apply the EC2NodeClass and NodePool by executing the following: 121 | 122 | ```sh 123 | $> kubectl apply -f gpu-nodeclass.yaml 124 | ec2nodeclass.karpenter.k8s.aws/gpu created 125 | 126 | $> kubectl apply -f gpu-nodepool.yaml 127 | nodepool.karpenter.sh/gpu created 128 | ``` 129 | 130 | Now let’s deploy a test workload to see how Karpenter launches the GPU node. 131 | 132 | ### Deploy a test workload to verify GPU drivers are loaded 133 | 134 | The following Pod manifest launches a pod that calls the NVIDIA System Management Interface CLI to check whether a GPU is detected and print the driver version to standard output, which you can see when you check the logs, like this: `kubectl logs pod/nvidia-smi`.
Execute the following command to create the `workload.yaml`: 135 | 136 | ```sh 137 | cat << EOF > workload.yaml 138 | apiVersion: v1 139 | kind: Pod 140 | metadata: 141 | name: nvidia-smi 142 | spec: 143 | nodeSelector: 144 | nvidia.com/gpu.present: "true" 145 | karpenter.k8s.aws/instance-gpu-name: "t4" 146 | restartPolicy: OnFailure 147 | containers: 148 | - name: nvidia-smi 149 | image: public.ecr.aws/amazonlinux/amazonlinux:2023-minimal 150 | args: 151 | - "nvidia-smi" 152 | resources: 153 | requests: 154 | memory: "8Gi" 155 | cpu: "3500m" 156 | limits: 157 | memory: "8Gi" 158 | nvidia.com/gpu: 1 159 | tolerations: 160 | - key: nvidia.com/gpu 161 | effect: NoSchedule 162 | operator: Exists 163 | EOF 164 | ``` 165 | 166 | As GPU-based workloads are likely sensitive to the specific GPU model (e.g., GPU memory), we've specified a `karpenter.k8s.aws/instance-gpu-name` node selector to request an instance with a specific GPU for this workload. The nodeSelector `karpenter.k8s.aws/instance-gpu-name: "t4"` influences Karpenter node provisioning and launches the workload on a node with an [NVIDIA T4 GPU](https://aws.amazon.com/ec2/instance-types/g4/). Review the [Karpenter documentation](https://karpenter.sh/docs/reference/instance-types/) for different Amazon EC2 instances and their labels. 167 | 168 | To deploy the workload, execute the following: 169 | 170 | ```sh 171 | $> kubectl apply -f workload.yaml 172 | pod/nvidia-smi created 173 | ``` 174 | 175 | You can check the pod's status by executing: 176 | 177 | ```sh 178 | $> kubectl get pods 179 | NAME READY STATUS RESTARTS AGE 180 | nvidia-smi 1/1 Running 0 3s 181 | ``` 182 | 183 | You can view the pod's nvidia-smi logs by executing: 184 | 185 | ```sh 186 | $> kubectl logs pod/nvidia-smi 187 | 188 | +-----------------------------------------------------------------------------------------+ 189 | | NVIDIA-SMI 570.133.20 Driver Version: 570.133.20 CUDA Version: 12.8 | 190 | |-----------------------------------------+------------------------+----------------------+ 191 | | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | 192 | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | 193 | | | | MIG M.
| 194 | |=========================================+========================+======================| 195 | | 0 Tesla T4 On | 00000000:00:1E.0 Off | 0 | 196 | | N/A 29C P8 17W / 70W | 0MiB / 15360MiB | 0% Default | 197 | | | | N/A | 198 | +-----------------------------------------+------------------------+----------------------+ 199 | 200 | +-----------------------------------------------------------------------------------------+ 201 | | Processes: | 202 | | GPU GI CI PID Type Process name GPU Memory | 203 | | ID ID Usage | 204 | |=========================================================================================| 205 | | No running processes found | 206 | +-----------------------------------------------------------------------------------------+ 207 | ``` 208 | 209 | To review which node was launched by Karpenter, execute the following: 210 | 211 | ```sh 212 | $> kubectl get nodeclaims 213 | 214 | NAME TYPE CAPACITY ZONE NODE READY AGE 215 | gpu-f69tm g4dn.2xlarge on-demand eu-west-1c ip-xxx-xxx-xxx-xxx.eu-west-1.compute.internal True 5m44s 216 | ``` 217 | 218 | ## Clean-up 219 | 220 | To clean-up execute the following commands: 221 | 222 | ```sh 223 | kubectl delete -f workload.yaml 224 | kubectl delete -f gpu-nodepool.yaml 225 | kubectl delete -f gpu-nodeclass.yaml 226 | helm -n nvidia-device-plugin uninstall nvdp 227 | ``` 228 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Karpenter Blueprints for Amazon EKS 2 | 3 | ## Motivation 4 | 5 | [Karpenter](https://karpenter.sh/), a node provisioning project built for Kubernetes has been helping many companies to improve the efficiency and cost of running workloads on Kubernetes. However, as Karpenter takes an application-first approach to provision compute capacity for the Kubernetes data plane, there are common workload scenarios that you might be wondering how to configure them properly. This repository includes a list of common workload scenarios, some of them go in depth with the explanation of why configuring Karpenter and Kubernetes objects in such a way is important. 6 | 7 | ## Blueprint Structure 8 | 9 | Each blueprint follows the same structure to help you better understand what's the motivation and the expected results: 10 | 11 | | Concept | Description | 12 | | -------------- | ----------------------------------------------------------------------------------------------- | 13 | | Purpose | Explains what the blueprint is about, and what problem is solving. | 14 | | Requirements | Any pre-requisites you might need to use the blueprint (i.e. An `arm64` container image). | 15 | | Deploy | The steps to follow to deploy the blueprint into an existing Kubernetes cluster. | 16 | | Results | The expected results when using the blueprint. | 17 | 18 | ## How to use these Blueprints? 19 | 20 | Before you get started, you need to have a Kubernetes cluster with Karpenter installed. If you're planning to work with an existing cluster, just make sure you've configured Karpenter following the [official guide](https://karpenter.sh/docs/getting-started/getting-started-with-karpenter/). This project also has a template to create a cluster with everything you'll need to test each blueprint. 21 | 22 | ## Support & Feedback 23 | 24 | > [!IMPORTANT] 25 | > Karpenter Blueprints for Amazon EKS is maintained by AWS Solution Architects. 
It is not part of an AWS 26 | > service and support is provided as a best-effort by the Karpenter Blueprints community. To provide feedback, 27 | > please use the [issues templates](https://github.com/aws-samples/karpenter-blueprints/issues) 28 | > provided. If you are interested in contributing to Karpenter Blueprints, see the 29 | > [Contribution guide](https://github.com/aws-samples/karpenter-blueprints/blob/main/CONTRIBUTING.md). 30 | 31 | ### Requirements 32 | 33 | * You need access to an AWS account with IAM permissions to create an EKS cluster, and an AWS Cloud9 environment if you're running the commands listed in this tutorial. 34 | * Install and configure the [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) 35 | * Install the [Kubernetes CLI (kubectl)](https://kubernetes.io/docs/tasks/tools/install-kubectl-linux/) 36 | * (Optional*) Install the [Terraform CLI](https://developer.hashicorp.com/terraform/tutorials/aws-get-started/install-cli) 37 | * (Optional*) Install Helm ([the package manager for Kubernetes](https://helm.sh/docs/intro/install/)) 38 | 39 | ***NOTE:** If you're planning to use an existing EKS cluster, you don't need the **optional** prerequisites. 40 | 41 | ### Preparing to Deploy Blueprints 42 | 43 | Before you start deploying and testing blueprints, make sure you follow the next steps. For example, all blueprints assume that you have an EKS cluster with Karpenter deployed, and some even require that you have a `default` Karpenter `NodePool` deployed. 44 | 45 | #### Create an EKS Cluster using Terraform (Optional) 46 | 47 | If you're planning on using an existing EKS cluster, you can use an existing node group with On-Demand instances to deploy the Karpenter controller. To do so, you need to follow the [Karpenter getting started guide](https://karpenter.sh/docs/getting-started/). 48 | 49 | You'll create an Amazon EKS cluster using the [EKS Blueprints for Terraform project](https://github.com/aws-ia/terraform-aws-eks-blueprints). The Terraform template included in this repository creates a VPC, an EKS control plane, and a Kubernetes service account along with an IAM role, associating them using IAM Roles for Service Accounts (IRSA) to let Karpenter launch instances. Additionally, the template adds the Karpenter node role to the `aws-auth` configmap to allow nodes to connect, and creates an On-Demand managed node group for the `kube-system` and `karpenter` namespaces. 50 | 51 | To create the cluster, clone this repository and open the `cluster/terraform` folder. Then, run the following commands: 52 | 53 | ```sh 54 | cd cluster/terraform 55 | helm registry logout public.ecr.aws 56 | export TF_VAR_region=$AWS_REGION 57 | terraform init 58 | terraform apply -target="module.vpc" -auto-approve 59 | terraform apply -target="module.eks" -auto-approve 60 | terraform apply --auto-approve 61 | ``` 62 | 63 | Before you continue, you need to enable your AWS account to launch Spot instances if you haven't launched any yet. To do so, create the [service-linked role for Spot](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/spot-requests.html#service-linked-roles-spot-instance-requests) by running the following command: 64 | 65 | ```sh 66 | aws iam create-service-linked-role --aws-service-name spot.amazonaws.com || true 67 | ``` 68 | 69 | You might see the following error if the role has already been successfully created.
You don't need to worry about this error; you only had to run the above command to make sure you have the service-linked role to launch Spot instances: 70 | 71 | ```console 72 | An error occurred (InvalidInput) when calling the CreateServiceLinkedRole operation: Service role name AWSServiceRoleForEC2Spot has been taken in this account, please try a different suffix. 73 | ``` 74 | 75 | Once complete (after waiting about 15 minutes), run the following command to update your kubeconfig file so you can interact with the cluster through `kubectl`: 76 | 77 | ```sh 78 | aws eks --region $AWS_REGION update-kubeconfig --name karpenter-blueprints 79 | ``` 80 | 81 | You need to make sure you can interact with the cluster and that the Karpenter pods are running: 82 | 83 | ```sh 84 | $> kubectl get pods -n karpenter 85 | NAME READY STATUS RESTARTS AGE 86 | karpenter-5f97c944df-bm85s 1/1 Running 0 15m 87 | karpenter-5f97c944df-xr9jf 1/1 Running 0 15m 88 | ``` 89 | 90 | You can now proceed to deploy the default Karpenter NodePool, and deploy any blueprint you want to test. 91 | 92 | #### Deploy a Karpenter Default EC2NodeClass and NodePool 93 | 94 | Before you start deploying a blueprint, you need to have a default [EC2NodeClass](https://karpenter.sh/preview/concepts/nodeclasses/) and a default [NodePool](https://karpenter.sh/docs/concepts/nodepools/) as some blueprints need them. An `EC2NodeClass` enables configuration of AWS-specific settings for EC2 instances launched by Karpenter. The `NodePool` sets constraints on the nodes that can be created by Karpenter and the pods that can run on those nodes. Each NodePool must reference an `EC2NodeClass` using `spec.nodeClassRef`. 95 | 96 | If you create a new EKS cluster following the previous steps, a Karpenter `EC2NodeClass` "default" and a Karpenter `NodePool` "default" are installed automatically. 97 | 98 | **NOTE:** For an existing EKS cluster, you have to adapt the provided `./cluster/terraform/karpenter.tf` to your setup by properly modifying `securityGroupSelectorTerms` and `subnetSelectorTerms` and removing the `depends_on` section. ***If you're not using Terraform***, you need to get those values manually. `CLUSTER_NAME` is the name of your EKS cluster (not the ARN). Karpenter auto-generates the [instance profile](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use_switch-role-ec2_instance-profiles) in your `EC2NodeClass` given the role that you specify in [spec.role](https://karpenter.sh/preview/concepts/nodeclasses/) with the placeholder `KARPENTER_NODE_IAM_ROLE_NAME`, which is a way to pass a single IAM role to the EC2 instance launched by the Karpenter `NodePool`. Typically, the instance profile name is the same as the IAM role (not the ARN). 99 | 100 | You can see that the NodePool has been deployed by running this: 101 | 102 | ```sh 103 | kubectl get nodepool 104 | ``` 105 | 106 | You can see that the `EC2NodeClass` has been deployed by running this: 107 | 108 | ```sh 109 | kubectl get ec2nodeclass 110 | ``` 111 | 112 | Throughout all the blueprints, you might need to review the Karpenter logs, so let's create an alias so you can read them by simply running `kl`: 113 | 114 | ```sh 115 | alias kl="kubectl -n karpenter logs -l app.kubernetes.io/name=karpenter --all-containers=true -f --tail=20" 116 | ``` 117 | 118 | You can now proceed to deploy any blueprint you want to test.
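If you're not using the Terraform template and need to create these `default` objects yourself, a minimal sketch could look like the following (illustrative only; the AMI alias, capacity types, and discovery tags are assumptions, so adjust them to your cluster and see `./cluster/terraform/karpenter.tf` for the exact objects the template creates):

```yaml
apiVersion: karpenter.k8s.aws/v1
kind: EC2NodeClass
metadata:
  name: default
spec:
  amiSelectorTerms:
    - alias: al2023@latest # assumption: AL2023 EKS optimized AMI
  role: "KARPENTER_NODE_IAM_ROLE_NAME" # replace with your Karpenter node IAM role name
  securityGroupSelectorTerms:
    - tags:
        karpenter.sh/discovery: CLUSTER_NAME # replace with your cluster name
  subnetSelectorTerms:
    - tags:
        karpenter.sh/discovery: CLUSTER_NAME # replace with your cluster name
---
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
  name: default
spec:
  template:
    spec:
      nodeClassRef:
        group: karpenter.k8s.aws
        kind: EC2NodeClass
        name: default
      requirements:
        - key: karpenter.sh/capacity-type
          operator: In
          values: ["on-demand", "spot"]
  disruption:
    consolidationPolicy: WhenEmptyOrUnderutilized
```

Apply it with `kubectl apply -f`, then verify with the `kubectl get nodepool` and `kubectl get ec2nodeclass` commands above.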
119 | 120 | #### Terraform Cleanup (Optional) 121 | 122 | Once you're done testing the blueprints, if you used the Terraform template from this repository, you can proceed to remove all the resources that Terraform created. To do so, run the following commands: 123 | 124 | ```sh 125 | kubectl delete --all nodeclaim 126 | kubectl delete --all nodepool 127 | kubectl delete --all ec2nodeclass 128 | export TF_VAR_region=$AWS_REGION 129 | terraform destroy -target="module.eks_blueprints_addons" --auto-approve 130 | terraform destroy -target="module.eks" --auto-approve 131 | terraform destroy --auto-approve 132 | ``` 133 | 134 | ## Deploying a Blueprint 135 | 136 | After you have a cluster up and running with Karpenter installed, you can start testing each blueprint. A blueprint might have a `NodePool`, an `EC2NodeClass` and a workload example. You need to open the blueprint folder and follow the steps to deploy the resources needed to test the blueprint. 137 | 138 | Here's the list of blueprints we have so far: 139 | 140 | * [High-Availability: Spread Pods across AZs & Nodes](/blueprints/ha-az-nodes/) 141 | * [Split Between On-Demand & Spot Instances](/blueprints/od-spot-split/) 142 | * [Prioritize Savings Plans and/or Reserved Instances](/blueprints/saving-plans/) 143 | * [Working with Graviton Instances](/blueprints/graviton) 144 | * [Overprovision capacity in advance to increase responsiveness](/blueprints/overprovision/) 145 | * [Using multiple EBS volumes](/blueprints/multi-ebs/) 146 | * [Working with Stateful Workloads using EBS](/blueprints/stateful/) 147 | * [Update Nodes using Drift](/blueprints/update-nodes-with-drift/) 148 | * [Launching nodes using custom AMIs](/blueprints/custom-ami/) 149 | * [Customizing nodes with your own User Data automation](/blueprints/userdata/) 150 | * [Protecting batch jobs during the consolidation process](/blueprints/batch-jobs/) 151 | * [NodePool Disruption Budgets](/blueprints/disruption-budgets/) 152 | * [Deploy an NVIDIA GPU workload](/blueprints/nvidia-gpu-workload/) 153 | * [Accelerating image pull time using SOCI parallel mode](/blueprints/soci-snapshotter/) 154 | 155 | **NOTE:** Each blueprint is independent of the others, so you can deploy and test multiple blueprints at the same time in the same Kubernetes cluster. However, to reduce noise, we recommend you test one blueprint at a time. 156 | 157 | ## Supported Versions 158 | 159 | The following table lists the resources and tools along with the versions with which the blueprints in this repo have been tested. 160 | 161 | | Resources/Tool | Version | 162 | | --------------- | ------------------- | 163 | | [Kubernetes](https://kubernetes.io/releases/) | 1.32 | 164 | | [Karpenter](https://github.com/aws/karpenter/releases) | v1.5.0 | 165 | | [Terraform](https://github.com/hashicorp/terraform/releases) | v1.12.1 | 166 | | [AWS EKS](https://github.com/terraform-aws-modules/terraform-aws-eks/releases) | v20.37.0 | 167 | | [EKS Blueprints Addons](https://github.com/aws-ia/terraform-aws-eks-blueprints-addons/releases) | v1.21.0 | 168 | 169 | ## License 170 | 171 | MIT-0 Licensed. See [LICENSE](/LICENSE).
172 | -------------------------------------------------------------------------------- /blueprints/soci-snapshotter/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Karpenter Blueprint: Using SOCI snapshotter parallel pull/unpack mode 3 | 4 | ## Purpose 5 | 6 | Container image pull performance has become a bottleneck as container images grow larger, compared to when typical images were just a few hundred megabytes. 7 | The default pulling method uses sequential layer downloading and unpacking. SOCI parallel pull/unpack mode accelerates container image loading through concurrent downloads and unpacking operations, reducing image pull time by up to 50%. This makes it ideal for AI/ML and batch workloads, where it is common for those applications to have large container images. 8 | 9 | This blueprint demonstrates how to set up SOCI snapshotter parallel pull/unpack mode on AL2023 and Bottlerocket through a custom `EC2NodeClass` that customizes the `userData` field. 10 | 11 | > ***NOTE***: SOCI snapshotter parallel mode is supported on [Amazon Linux 2023 (AL2023) > v20250821](https://github.com/awslabs/amazon-eks-ami/releases/tag/v20250821) and [Bottlerocket > v1.44.0](https://github.com/bottlerocket-os/bottlerocket/releases/tag/v1.44.0) 12 | 13 | If you would like to learn more about SOCI snapshotter's new parallel pull/unpack mode, you can visit the following resources: 14 | 1. [SOCI snapshotter parallel mode feature docs](https://github.com/awslabs/soci-snapshotter/blob/main/docs/parallel-mode.md) in the [SOCI project repository](https://github.com/awslabs/soci-snapshotter) on GitHub. 15 | 16 | ## Requirements 17 | 18 | * A Kubernetes cluster with Karpenter installed. You can use the blueprint we've used to test this pattern at the `cluster` folder in the root of this repository. 19 | * A container registry that supports HTTP range GET requests, such as [Amazon Elastic Container Registry (ECR)](https://aws.amazon.com/ecr/) 20 | 21 | ## Deploy 22 | 23 | You need to create a new `EC2NodeClass` with the `userData` field and a customized root EBS volume via `blockDeviceMappings`, along with a `NodePool` that uses this new template. 24 | 25 | If you're using the Terraform template provided in this repo, run the following commands to get the EKS cluster name and the IAM Role name for the Karpenter nodes: 26 | 27 | ```sh 28 | export CLUSTER_NAME=$(terraform -chdir="../../cluster/terraform" output -raw cluster_name) 29 | export KARPENTER_NODE_IAM_ROLE_NAME=$(terraform -chdir="../../cluster/terraform" output -raw node_instance_role_name) 30 | ``` 31 | 32 | > ***NOTE***: If you're not using Terraform, you need to get those values manually. `CLUSTER_NAME` is the name of your EKS cluster (not the ARN). Karpenter auto-generates the [instance profile](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use_switch-role-ec2_instance-profiles) in your `EC2NodeClass` given the role that you specify in [spec.role](https://karpenter.sh/preview/concepts/nodeclasses/) with the placeholder `KARPENTER_NODE_IAM_ROLE_NAME`, which is a way to pass a single IAM role to the EC2 instance launched by the Karpenter `NodePool`. Typically, the instance profile name is the same as the IAM role (not the ARN). 33 | 34 | Now, make sure you're in this blueprint folder, then run the following command: 35 | 36 | ```sh 37 | sed -i '' "s/<>/$CLUSTER_NAME/g" soci-snapshotter.yaml 38 | sed -i '' "s/<>/$KARPENTER_NODE_IAM_ROLE_NAME/g" soci-snapshotter.yaml 39 | kubectl apply -f .
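# Note: the `sed -i ''` form above is BSD/macOS sed syntax; on Linux (GNU sed) you
# would typically drop the empty string argument, for example:
#   sed -i "s/<>/$CLUSTER_NAME/g" soci-snapshotter.yaml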
40 | ``` 41 | 42 | > ***NOTE***: It can take a couple of minutes for the resources to be created; while they are being created you can continue reading. 43 | 44 | Those commands create the following: 45 | 1. `EC2NodeClass` and `NodePool` named `soci-snapshotter` for using SOCI snapshotter parallel pull/unpack mode with customized `blockDeviceMappings` for increased I/O and storage size on Amazon Linux 2023. 46 | 2. `EC2NodeClass` and `NodePool` named `soci-snapshotter-br` for using SOCI snapshotter parallel pull/unpack mode with customized `blockDeviceMappings` for increased I/O and storage size on Bottlerocket. 47 | 3. `EC2NodeClass` and `NodePool` named `non-soci-snapshotter` for using the default containerd implementation with customized `blockDeviceMappings` for increased I/O and storage size. 48 | 4. Kubernetes `Deployment` named `vllm-soci` that uses the `soci-snapshotter` `NodePool` 49 | 5. Kubernetes `Deployment` named `vllm-soci-br` that uses the `soci-snapshotter-br` `NodePool` 50 | 6. Kubernetes `Deployment` named `vllm` that uses the `non-soci-snapshotter` `NodePool` 51 | 52 | > ***NOTE***: For our example, the deployments will request instances that have network and EBS bandwidth greater than 8000 Mbps by using `nodeAffinity`, in order to eliminate network and storage I/O bottlenecks and demonstrate SOCI parallel mode capabilities. 53 | ```yaml 54 | affinity: 55 | nodeAffinity: 56 | requiredDuringSchedulingIgnoredDuringExecution: 57 | nodeSelectorTerms: 58 | - matchExpressions: 59 | - key: karpenter.k8s.aws/instance-ebs-bandwidth 60 | operator: Gt 61 | values: 62 | - "8000" 63 | - key: karpenter.k8s.aws/instance-network-bandwidth 64 | operator: Gt 65 | values: 66 | - "8000" 67 | ``` 68 | ## Configuration 69 | 70 | The SOCI snapshotter `EC2NodeClass` configuration has several parameters that affect SOCI parallel mode performance. 71 | 72 | The `blockDeviceMappings` field is used to increase root volume EBS performance and storage size. 73 | As SOCI parallel mode downloads layers, it buffers them on disk instead of in memory, so a high-performance storage subsystem is crucial to support it, as well as enough storage to hold the container images. 74 | The example configures the root volume with 16,000 IOPS and a throughput of 1,000 MiB/s, which is the maximum for gp3; it is recommended that you modify those settings to trade off between performance and cost. 75 | > ***NOTE***: From our benchmarks, we have also seen a good starting point by setting the throughput to 600 MiB/s and keeping the base 3,000 IOPS. 76 | 77 |
78 | Amazon Linux 2023 79 | 80 | ```yaml 81 | apiVersion: karpenter.k8s.aws/v1 82 | kind: EC2NodeClass 83 | metadata: 84 | name: soci-snapshotter 85 | ... 86 | ... 87 | spec: 88 | blockDeviceMappings: 89 | - deviceName: /dev/xvda 90 | ebs: 91 | volumeSize: 100Gi 92 | volumeType: gp3 93 | throughput: 1000 94 | iops: 16000 95 | ... 96 | ... 97 | ``` 98 |
99 |
100 | Bottlerocket 101 | 102 | Bottlerocket defaults to two block devices, one for Bottlerocket's control volume and the other for container resources such as images and logs, in the example below we have configured Bottlerocket's secondary block device with increased EBS storage & throughput to support SOCI parallel mode. 103 | 104 | ```yaml 105 | apiVersion: karpenter.k8s.aws/v1 106 | kind: EC2NodeClass 107 | metadata: 108 | name: soci-snapshotter-br 109 | ... 110 | ... 111 | spec: 112 | blockDeviceMappings: 113 | - deviceName: /dev/xvda 114 | ebs: 115 | volumeSize: 4Gi 116 | volumeType: gp3 117 | encrypted: true 118 | - deviceName: /dev/xvdb 119 | ebs: 120 | volumeSize: 100Gi 121 | volumeType: gp3 122 | throughput: 1000 123 | iops: 16000 124 | encrypted: true 125 | ... 126 | ... 127 | ``` 128 | 129 |
130 |
131 | 132 | The `userData` field is used to enable and configure the SOCI snapshotter on AL2023 and Bottlerocket. 133 | 134 | SOCI parallel mode configuration is controlled by several key settings. While the default values align with containerd's standard configuration to ensure stability and safety, you can adjust these parameters to optimize performance based on your specific needs, but ensure the infrastructure can support it. 135 | 136 | 1. `max_concurrent_downloads_per_image`: Limits the maximum concurrent downloads per individual image. Default is 3 for Bottlerocket and 20 for AL2023. For images hosted on Amazon ECR, we recommend setting this to 10-20. 137 | 2. `max_concurrent_unpacks_per_image`: Sets the limit for concurrent unpacking of layers per image. Default is 1 for Bottlerocket and 12 for AL2023. Tune this to match the average layer count of your container images. 138 | 3. `concurrent_download_chunk_size`: Specifies the size of each download chunk when pulling image layers in parallel. Default is "unlimited" for Bottlerocket and "16mb" for AL2023. This feature enables multiple concurrent downloads per layer; we recommend setting this value to >0 if your registry supports HTTP range requests. If you're using ECR, we recommend setting this to "16mb". 139 | 4. `discard_unpacked_layers`: Controls whether layer blobs are discarded after unpacking. Enabling this can reduce disk space usage and speed up pull times. Default is false for Bottlerocket and true for AL2023. We recommend setting this to true on EKS nodes. 140 | 141 | To learn more about other configuration options, visit the [official SOCI snapshotter doc](https://github.com/awslabs/soci-snapshotter/blob/main/docs/parallel-mode.md#configuration) 142 | 143 | As installing a snapshotter for containerd on EKS requires several configuration steps, this is all done for you automatically in AL2023 and Bottlerocket, as SOCI is already pre-installed in the latest AMIs. 144 | 145 |
146 | Amazon Linux 2023 147 | 148 | SOCI snapshotter parallel mode can be enabled in AL2023 through a feature gate named "FastImagePull"; in AL2023 we use [`NodeConfig`](https://awslabs.github.io/amazon-eks-ami/nodeadm/doc/examples/#enabling-fast-image-pull-experimental) to simplify various data plane configurations. 149 | 150 | 151 | ```yaml 152 | apiVersion: karpenter.k8s.aws/v1 153 | kind: EC2NodeClass 154 | metadata: 155 | name: soci-snapshotter 156 | ... 157 | ... 158 | spec: 159 | ... 160 | ... 161 | userData: | 162 | apiVersion: node.eks.aws/v1alpha1 163 | kind: NodeConfig 164 | spec: 165 | featureGates: 166 | FastImagePull: true 167 | ``` 168 | 169 | Modifying the SOCI snapshotter parallel mode configuration in AL2023 requires modifying the `/etc/soci-snapshotter-grpc/config.toml` file; this can be achieved with a `userData` script in addition to the `NodeConfig` configuration. 170 | 171 | The following sets both `max_concurrent_downloads_per_image` and `max_concurrent_unpacks_per_image` to `10`: 172 | 173 | ```yaml 174 | apiVersion: karpenter.k8s.aws/v1 175 | kind: EC2NodeClass 176 | metadata: 177 | name: soci-snapshotter 178 | ... 179 | ... 180 | spec: 181 | ... 182 | ... 183 | userData: | 184 | MIME-Version: 1.0 185 | Content-Type: multipart/mixed; boundary="//" 186 | 187 | --// 188 | Content-Type: text/x-shellscript; charset="us-ascii" 189 | 190 | #!/bin/bash 191 | max_concurrent_downloads_per_image=10 192 | max_concurrent_unpacks_per_image=10 193 | 194 | sed -i "s/^max_concurrent_downloads_per_image = .*$/max_concurrent_downloads_per_image = $max_concurrent_downloads_per_image/" /etc/soci-snapshotter-grpc/config.toml 195 | sed -i "s/^max_concurrent_unpacks_per_image = .*$/max_concurrent_unpacks_per_image = $max_concurrent_unpacks_per_image/" /etc/soci-snapshotter-grpc/config.toml 196 | 197 | --// 198 | Content-Type: application/node.eks.aws 199 | 200 | apiVersion: node.eks.aws/v1alpha1 201 | kind: NodeConfig 202 | spec: 203 | featureGates: 204 | FastImagePull: true 205 | --// 206 | ``` 207 | 208 |
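If you want to confirm the overrides landed on a node, one possible spot check (our own suggestion, not part of the blueprint; it assumes you can run a node debug pod) is to inspect the file from a debug session:

```sh
# Hypothetical check: inspect the SOCI config on an AL2023 node launched by this NodePool.
kubectl debug node/<node-name> -it --image=public.ecr.aws/amazonlinux/amazonlinux:2023-minimal -- \
  chroot /host grep -E '^max_concurrent_(downloads|unpacks)_per_image' /etc/soci-snapshotter-grpc/config.toml
```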
209 | 210 |
211 | Bottlerocket 212 | 213 | SOCI snapshotter parallel mode can be enabled and configured in Bottlerocket through the [Settings API](https://bottlerocket.dev/en/os/1.44.x/api/settings/container-runtime-plugins/#tag-soci-parallel-pull-configuration). 214 | 215 | In Bottlerocket, SOCI's data dir is configured at `/var/lib/soci-snapshotter`, to take advantage of instances with NVMe disks, we will need to configure ephemeral storage through Bottlerocket's Settings API, with `[settings.bootstrap-commands.k8s-ephemeral-storage]` as you can see below, we added `/var/lib/soci-snapshotter` as a bind dir. 216 | 217 | ```yaml 218 | apiVersion: karpenter.k8s.aws/v1 219 | kind: EC2NodeClass 220 | metadata: 221 | name: soci-snapshotter-br 222 | ... 223 | ... 224 | spec: 225 | ... 226 | ... 227 | userData: | 228 | [settings.container-runtime] 229 | snapshotter = "soci" 230 | [settings.container-runtime-plugins.soci-snapshotter] 231 | pull-mode = "parallel-pull-unpack" 232 | [settings.container-runtime-plugins.soci-snapshotter.parallel-pull-unpack] 233 | max-concurrent-downloads-per-image = 20 234 | concurrent-download-chunk-size = "16mb" 235 | max-concurrent-unpacks-per-image = 12 236 | discard-unpacked-layers = true 237 | [settings.bootstrap-commands.k8s-ephemeral-storage] 238 | commands = [ 239 | ["apiclient", "ephemeral-storage", "init"], 240 | ["apiclient", "ephemeral-storage" ,"bind", "--dirs", "/var/lib/containerd", "/var/lib/kubelet", "/var/log/pods", "/var/lib/soci-snapshotter"] 241 | ] 242 | essential = true 243 | mode = "always" 244 | ``` 245 |
246 | 247 | ## Results 248 | 249 | Wait until the pods from the sample workload are in a `Running` state: 250 | ```sh 251 | > kubectl wait --for=condition=Ready pods --all --namespace default --timeout=300s 252 | pod/vllm-59bfb6f86c-9nfxb condition met 253 | pod/vllm-soci-6d9bfd996d-vhr4j condition met 254 | pod/vllm-soci-br-74b59cc4bd-rq8cw condition met 255 | ``` 256 | 257 | The sample workload deploys three Deployments running the [Amazon Deep Learning Container (DLC) for vLLM](https://docs.aws.amazon.com/deep-learning-containers/latest/devguide/dlc-vllm-x86-ec2.html): two using SOCI parallel pull/unpack mode (AL2023 and Bottlerocket) and one using the default containerd implementation. 258 | > ***NOTE*** The Amazon DLC for vLLM container image size is about **~10GB** 259 | 260 | Let's examine the pull time for each Deployment: 261 | 262 | The `vllm` deployment using the default containerd implementation results in a pull time of **1m52.33s**. 263 | ```sh 264 | > kubectl describe pod -l app=vllm | grep Pulled 265 | Normal Pulled 7m2s kubelet Successfully pulled image "763104351884.dkr.ecr.us-east-1.amazonaws.com/vllm:0.9-gpu-py312-ec2" 266 | in 1m52.33s (1m52.33s including waiting). Image size: 10778400361 bytes. 267 | ``` 268 | 269 | The `vllm-soci` deployment using SOCI snapshotter's parallel pull/unpack mode results in a pull time of **59.813s**. 270 | ```sh 271 | > kubectl describe pod -l app=vllm-soci | grep Pulled 272 | Normal Pulled 8m27s kubelet Successfully pulled image "763104351884.dkr.ecr.us-east-1.amazonaws.com/vllm:0.9-gpu-py312-ec2" 273 | in 59.813s (59.813s including waiting). Image size: 10778400361 bytes. 274 | ``` 275 | 276 | The `vllm-soci-br` deployment using SOCI snapshotter's parallel pull/unpack mode on Bottlerocket results in a pull time of **44.974s**. 277 | ```sh 278 | > kubectl describe pod -l app=vllm-soci-br | grep Pulled 279 | Normal Pulled 9m46s kubelet Successfully pulled image "763104351884.dkr.ecr.us-east-1.amazonaws.com/vllm:0.9-gpu-py312-ec2" 280 | in 44.974s (44.974s including waiting). Image size: 10778400361 bytes. 281 | ``` 282 | 283 | We can see that using the SOCI snapshotter improved container pull time by about **50%** on Amazon Linux 2023 and about **60%** on Bottlerocket; the reason is that Bottlerocket has an improved decompression library for Intel-based CPUs ([bottlerocket-core-kit PR #443](https://github.com/bottlerocket-os/bottlerocket-core-kit/pull/443)) 284 | 285 | 286 | ## Cleanup 287 | 288 | To remove all objects created, simply run the following commands: 289 | 290 | ```sh 291 | kubectl delete -f . 292 | ``` 293 | 294 | -------------------------------------------------------------------------------- /blueprints/batch-jobs/README.md: -------------------------------------------------------------------------------- 1 | # Karpenter Blueprint: Protecting batch jobs during the disruption (consolidation) process 2 | 3 | ## Purpose 4 | 5 | Karpenter can actively reduce the cluster cost by identifying when nodes can be removed or replaced because they are empty or there is a cheaper one available after some workload change. This process is called [consolidation](https://karpenter.sh/preview/concepts/disruption/#consolidation), and it implies the disruption of pods that are running on the node, if any, as they need to be rescheduled onto another node. In some cases, like when running long batch jobs, you don't want those pods to be disrupted.
You want to run them from start to finish without disruption, and replace or delete the node once they finish. To achieve that, you can set the `karpenter.sh/do-not-disrupt: "true"` annotation on the pod (more information [here](https://karpenter.sh/preview/concepts/disruption/#pod-level-controls)). By opting pods out of this disruption, you are telling Karpenter that it should not voluntarily remove a node containing this pod. 6 | 7 | ## Requirements 8 | 9 | * A Kubernetes cluster with Karpenter installed. You can use the blueprint you have used to test this pattern at the `cluster` folder in the root of this repository. 10 | * A `default` Karpenter `NodePool`, as that is the one you will use in this blueprint. You did this already in the ["Deploy a Karpenter Default EC2NodeClass and NodePool"](../../README.md) section from this repository. 11 | 12 | ## Deploy 13 | 14 | You are going to use the `default` NodePool. 15 | 16 | If you want to first observe the default behaviour of pods being disrupted during the consolidation process, jump to [(Optional) Simulating the default behaviour](#(optional)-simulating-the-default-behaviour). 17 | 18 | If you want to directly see how to avoid the disruption of jobs by the consolidation process, jump to [Preventing jobs of being evicted](#preventing-jobs-of-being-evicted). 19 | 20 | ### (optional) Simulating the default behaviour 21 | 22 | This section simulates the default behaviour of the pods explained before, in which the Karpenter consolidation process disrupts the pods running the jobs and re-schedules them onto the cheaper node. To simulate it, deploy the [workloads-evicted yaml](/blueprints/batch-jobs/workloads-evicted.yaml): 23 | 24 | ```sh 25 | $> kubectl apply -f workloads-evicted.yaml 26 | deployment.apps/nginx created 27 | job.batch/2-min-job created 28 | job.batch/5-min-job created 29 | ``` 30 | 31 | This will create three pods that require **11 vCPU** in total: 32 | * NGINX server - 2 vCPU required 33 | * 2-minutes job - 7 vCPU required 34 | * 5-minutes job - 2 vCPU required 35 | 36 | During this test, Karpenter decided to launch a **c6g.4xlarge** on-demand instance (16 vCPU, 32 GiB). You can check this by executing: 37 | 38 | ```sh 39 | kubectl get nodes --label-columns node.kubernetes.io/instance-type 40 | ``` 41 | 42 | After two minutes, the first job finishes and the pod is terminated: 43 | 44 | ```sh 45 | $> kubectl get events --field-selector involvedObject.kind=Job --sort-by='.lastTimestamp' 46 | LAST SEEN TYPE REASON OBJECT MESSAGE 47 | 5m Normal SuccessfulCreate job/2-min-job Created pod: 2-min-job-rst5w 48 | 5m Normal SuccessfulCreate job/5-min-job Created pod: 5-min-job-l72p8 49 | 3m Normal Completed job/2-min-job Job completed 50 | ``` 51 | 52 | ```sh 53 | $> kubectl get pods 54 | NAME READY STATUS RESTARTS AGE 55 | 5-min-job-6ffsg 1/1 Running 0 2m50s 56 | nginx-8467c776-r8j24 1/1 Running 0 2m50s 57 | ``` 58 | 59 | Now, the running pods require **4 vCPU** in total: 60 | * NGINX server - 2 vCPU required 61 | * 5-minutes job - 2 vCPU required 62 | 63 | The default behaviour is the one defined in the NodePool: `consolidationPolicy: WhenEmptyOrUnderutilized`. Karpenter identifies that the **c6g.4xlarge** (16 vCPU) is underutilized and performs a consolidation replacement of the node. It launches a cheaper and smaller node: a **c6g.2xlarge** (8 vCPU) instance.
You can check these logs by executing the following command in another terminal: 64 | 65 | ```sh 66 | kubectl -n karpenter logs -l app.kubernetes.io/name=karpenter --all-containers=true -f --tail=20 67 | ``` 68 | 69 | You should see these logs: 70 | 71 | ```json 72 | {"level":"INFO","time":"2025-05-30T10:15:01.605Z","logger":"controller","message":"disrupting node(s)","commit":"9458bb5","controller":"disruption","namespace":"","name":"","reconcileID":"f44d738f-e895-428b-b0ec-f1b5e5a96996","command-id":"dae73246-b739-42d8-91b8-c80ee651b6ac","reason":"underutilized","decision":"replace","disrupted-node-count":1,"replacement-node-count":1,"pod-count":2,"disrupted-nodes":[{"Node":{"name":"ip-10-0-116-149.eu-west-2.compute.internal"},"NodeClaim":{"name":"default-8t7np"},"capacity-type":"on-demand","instance-type":"c6g.4xlarge"}],"replacement-nodes":[{"capacity-type":"on-demand","instance-types":"c6g.2xlarge, c7g.2xlarge, m6g.2xlarge, c6a.2xlarge, c5a.2xlarge and 36 other(s)"}]} 73 | 74 | ... 75 | {"level":"INFO","time":"2025-05-30T10:10:49.907Z","logger":"controller","message":"launched nodeclaim","commit":"9458bb5","controller":"nodeclaim.lifecycle","controllerGroup":"karpenter.sh","controllerKind":"NodeClaim","NodeClaim":{"name":"default-8t7np"},"namespace":"","name":"default-8t7np","reconcileID":"08263c3f-5565-4916-8932-db4596bd1f40","provider-id":"aws:///eu-west-2c/i-0f4e940ab58541307","instance-type":"c6g.4xlarge","zone":"eu-west-2c","capacity-type":"on-demand","allocatable":{"cpu":"15890m","ephemeral-storage":"17Gi","memory":"27322Mi","pods":"234","vpc.amazonaws.com/pod-eni":"54"}} 76 | ``` 77 | 78 | The NGINX server and the 5-min job pods are rescheduled into the new c6g.2xlarge node, so **the job is restarted**, which will cause a disruption the job might not be prepared to handle like doing a checkpoint. 79 | 80 | After five more minutes, the job will finish, and Karpenter will replace the node with a **c6g.xlarge** instance (4 vCPU) for the NGINX server. You can repeat the previous steps to verify this behaviour. 81 | 82 | To clean up, execute: 83 | 84 | ```sh 85 | kubectl delete -f workloads-evicted.yaml 86 | ``` 87 | 88 | To learn how to avoid this behaviour and wait for the job to be finished before replacing the node, go to [Preventing jobs of being evicted](#preventing-jobs-of-being-evicted). 89 | 90 | ### Preventing jobs of being evicted 91 | 92 | If you executed the [optional](#optional-simulating-the-default-behaviour) part, make sure to delete the `workloads-evicted` deployment: 93 | 94 | ```sh 95 | kubectl delete -f workloads-evicted.yaml 96 | ``` 97 | 98 | Let's start by deploying the workloads defined in the [workloads-not-evicted yaml](/blueprints/batch-jobs/workloads-not-evicted.yaml): 99 | 100 | ```sh 101 | $> kubectl apply -f workloads-not-evicted.yaml 102 | deployment.apps/nginx created 103 | job.batch/2-min-job created 104 | job.batch/5-min-job created 105 | ``` 106 | 107 | This will create three pods that require **11 vCPU** in total: 108 | * NGINX server - 2 vCPU required 109 | * 2-minutes job - 7 vCPU required 110 | * 5-minutes job - 2 vCPU required 111 | 112 | If you explore the [workloads-not-evicted yaml](/blueprints/batch-jobs/workloads-not-evicted.yaml), the `karpenter.sh/do-not-disrupt: "true"` annotations have been added to both jobs specifications. 113 | 114 | Go to [Results section](#results) to check the behaviour. 
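For reference, this is roughly where the annotation sits in a Job manifest; a minimal sketch with made-up names and sizes (the actual jobs are defined in `workloads-not-evicted.yaml`). Note that the annotation goes on the Pod template's metadata, not on the Job object itself:

```yaml
apiVersion: batch/v1
kind: Job
metadata:
  name: example-job # hypothetical name, for illustration only
spec:
  template:
    metadata:
      annotations:
        karpenter.sh/do-not-disrupt: "true" # opts this Job's pod out of voluntary disruption
    spec:
      restartPolicy: Never
      nodeSelector:
        karpenter.sh/capacity-type: on-demand
      containers:
        - name: work
          image: public.ecr.aws/docker/library/busybox:stable
          command: ["sh", "-c", "sleep 300"] # simulates a 5-minute batch job
          resources:
            requests:
              cpu: "2"
```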
115 | 116 | ***NOTE:*** 117 | The sample deployment only allows scheduling pods on on-demand instances (`nodeSelector: karpenter.sh/capacity-type: on-demand`) to show the replace consolidation mechanism, as for spot nodes Karpenter only uses the deletion consolidation mechanism to avoid breaking the price-capacity-optimized strategy, as explained [here](https://karpenter.sh/preview/concepts/disruption/#consolidation). 118 | 119 | ## Results 120 | 121 | ### Deployment verification 122 | 123 | Karpenter launches the cheapest EC2 instance for the workloads with at least **11 vCPU**: a **c6g.4xlarge** on-demand instance (16 vCPU, 32 GiB). You can check this by executing: 124 | 125 | ```sh 126 | kubectl get nodes --label-columns node.kubernetes.io/instance-type 127 | ``` 128 | 129 | You should see something similar to this, where a new node just appeared: 130 | 131 | ```console 132 | NAME STATUS ROLES AGE VERSION INSTANCE-TYPE 133 | ip-10-0-125-209.eu-west-1.compute.internal Ready 16d v1.32.3-eks-473151a m4.large 134 | ip-10-0-46-139.eu-west-1.compute.internal Ready 16d v1.32.3-eks-473151a m4.large 135 | ip-10-0-47-60.eu-west-1.compute.internal Ready 44s v1.32.3-eks-473151a c6g.4xlarge 136 | ``` 137 | 138 | Check the three new pods are running by executing: 139 | 140 | ```sh 141 | $> kubectl get pods 142 | NAME READY STATUS RESTARTS AGE 143 | 2-min-job-ml6qj 1/1 Running 0 25s 144 | 5-min-job-9jc4b 1/1 Running 0 24s 145 | nginx-8467c776-bbl8w 1/1 Running 0 25s 146 | ``` 147 | 148 | You can check the jobs status by executing: 149 | 150 | ```sh 151 | $> kubectl get jobs 152 | NAME COMPLETIONS DURATION AGE 153 | 2-min-job 0/1 52s 52s 154 | 5-min-job 0/1 51s 51s 155 | $> kubectl get jobs 156 | NAME COMPLETIONS DURATION AGE 157 | 2-min-job 0/1 52s 52s 158 | 5-min-job 0/1 51s 51s 159 | ``` 160 | 161 | In a different terminal, execute the following command that will display the Karpenter logs in real time: 162 | 163 | ```sh 164 | kubectl -n karpenter logs -l app.kubernetes.io/name=karpenter --all-containers=true -f --tail=20 165 | ``` 166 | 167 | You should see the following events indicating that Karpenter identified the need of a new node, and that it selected an instance type and purchase option: 168 | 169 | ```json 170 | {"level":"INFO","time":"2024-08-16T10:10:47.683Z","logger":"controller","message":"found provisionable pod(s)","commit":"5bdf9c3","controller":"provisioner","namespace":"","name":"","reconcileID":"d8e8907d-5b93-46bb-893a-63520f3ec12f","Pods":"default/2-min-job-czp5x","duration":"39.859328ms"} 171 | 172 | {"level":"INFO","time":"2024-08-16T10:10:47.683Z","logger":"controller","message":"computed new nodeclaim(s) to fit pod(s)","commit":"5bdf9c3","controller":"provisioner","namespace":"","name":"","reconcileID":"d8e8907d-5b93-46bb-893a-63520f3ec12f","nodeclaims":1,"pods":1} 173 | 174 | {"level":"INFO","time":"2024-08-16T10:10:47.699Z","logger":"controller","message":"created nodeclaim","commit":"5bdf9c3","controller":"provisioner","namespace":"","name":"","reconcileID":"d8e8907d-5b93-46bb-893a-63520f3ec12f","NodePool":{"name":"default"},"NodeClaim":{"name":"default-g4kgp"},"requests":{"cpu":"7260m","memory":"290Mi","pods":"6"},"instance-types":"c4.2xlarge, c5.2xlarge, c5.4xlarge, c5a.2xlarge, c5a.4xlarge and 55 other(s)"} 175 | ... 
176 | {"level":"INFO","time":"2024-08-16T10:10:49.959Z","logger":"controller","message":"launched nodeclaim","commit":"5bdf9c3","controller":"nodeclaim.lifecycle","controllerGroup":"karpenter.sh","controllerKind":"NodeClaim","NodeClaim":{"name":"default-g4kgp"},"namespace":"","name":"default-g4kgp","reconcileID":"ff5b7f6e-c52e-495e-94b1-3a30385c3439","provider-id":"aws:///eu-west-2a/i-022a05d79bceda579","instance-type":"c6g.2xlarge","zone":"eu-west-2a","capacity-type":"on-demand","allocatable":{"cpu":"7910m","ephemeral-storage":"17Gi","memory":"14103Mi","pods":"58","vpc.amazonaws.com/pod-eni":"38"}} 177 | 178 | ``` 179 | 180 | ### Consolidation Replace blocked due to ongoing job 181 | 182 | Around two minutes after the deployment, the first job finishes: 183 | 184 | ```sh 185 | $> kubectl get jobs 186 | NAME COMPLETIONS DURATION AGE 187 | 2-min-job 1/1 2m41s 2m46s 188 | 5-min-job 0/1 2m45s 2m45s 189 | ``` 190 | 191 | The pod executing the job is terminated. Now you should just see two pods, one for the NGINX server and one for the other 5-minutes job: 192 | 193 | ```sh 194 | $> kubectl get pods 195 | NAME READY STATUS RESTARTS AGE 196 | 5-min-job-9jc4b 1/1 Running 0 2m56s 197 | nginx-8467c776-bbl8w 1/1 Running 0 2m57s 198 | ``` 199 | 200 | Now, the total number of vCPU required by the running pods are **4 vCPU**: 201 | * NGINX server - 2 vCPU required 202 | * 5-minutes job - 2 vCPU required 203 | 204 | In contrast to the default behaviour, even though a smaller and cheaper instance could be used, Karpenter reads the `karpenter.sh/do-not-disrupt: "true"` annotation on the 5-minutes job pod and **blocks the consolidation replace** process for that node: 205 | 206 | ```sh 207 | $> kubectl describe node 208 | ... 209 | Normal NodeReady 6m7s kubelet Node ip-10-0-97-15.eu-west-1.compute.internal status is now: NodeReady 210 | Normal DisruptionBlocked 4m12s karpenter Cannot disrupt Node: pod "default/2-min-job-2fssd" has "karpenter.sh/do-not-disrupt" annotation 211 | Normal DisruptionBlocked 2m12s karpenter Cannot disrupt Node: pod "default/5-min-job-7pqdt" has "karpenter.sh/do-not-disrupt" annotation 212 | ``` 213 | 214 | ### Consolidation Replace allowed after last job finishes 215 | 216 | Around five minutes after the deployment, the other job finishes: 217 | 218 | ```sh 219 | $> kubectl get jobs 220 | NAME COMPLETIONS DURATION AGE 221 | 5-min-job 1/1 5m40s 5m46s 222 | ``` 223 | 224 | Now, **it is possible to replace the node** by a cheaper and smaller instance because the the NGINX server can be disrupted as it does't contain the `karpenter.sh/do-not-disrupt: "true"` annotation. You can check this in the Karpenter logs terminal: 225 | 226 | ```json 227 | {"level":"INFO","time":"2024-08-16T10:17:21.322Z","logger":"controller","message":"created nodeclaim","commit":"5bdf9c3","controller":"disruption","namespace":"","name":"","reconcileID":"1135db0e-45ef-4529-9492-63789a9837c6","NodePool":{"name":"default"},"NodeClaim":{"name":"default-9m4bv"},"requests":{"cpu":"2260m","memory":"290Mi","pods":"6"},"instance-types":"c4.xlarge, c5.xlarge, c5a.xlarge, c5d.xlarge, c5n.xlarge and 32 other(s)"} 228 | ... 
229 | {"level":"INFO","time":"2024-08-16T10:17:23.452Z","logger":"controller","message":"launched nodeclaim","commit":"5bdf9c3","controller":"nodeclaim.lifecycle","controllerGroup":"karpenter.sh","controllerKind":"NodeClaim","NodeClaim":{"name":"default-9m4bv"},"namespace":"","name":"default-9m4bv","reconcileID":"f0e0cc47-45a9-479c-a1c7-b5f0f0341026","provider-id":"aws:///eu-west-2a/i-0a4fa068af5550afa","instance-type":"c6g.xlarge","zone":"eu-west-2a","capacity-type":"on-demand","allocatable":{"cpu":"3920m","ephemeral-storage":"17Gi","memory":"6525Mi","pods":"58","vpc.amazonaws.com/pod-eni":"18"}} 230 | ... 231 | {"level":"INFO","time":"2024-08-16T10:18:07.430Z","logger":"controller","message":"tainted node","commit":"5bdf9c3","controller":"node.termination","controllerGroup":"","controllerKind":"Node","Node":{"name":"ip-10-0-42-175.eu-west-2.compute.internal"},"namespace":"","name":"ip-10-0-42-175.eu-west-2.compute.internal","reconcileID":"a57044a6-f00f-41e5-a1ab-31e4b19dd838","taint.Key":"karpenter.sh/disrupted","taint.Value":"","taint.Effect":"NoSchedule"} 232 | 233 | {"level":"INFO","time":"2024-08-16T10:18:50.331Z","logger":"controller","message":"deleted node","commit":"5bdf9c3","controller":"node.termination","controllerGroup":"","controllerKind":"Node","Node":{"name":"ip-10-0-42-175.eu-west-2.compute.internal"},"namespace":"","name":"ip-10-0-42-175.eu-west-2.compute.internal","reconcileID":"2a51acf8-702f-4c75-988d-92052d690b01"} 234 | ``` 235 | 236 | Karpenter replaces the **c6g.4xlarge** (16 vCPU, 32 GiB) with a **c6g.xlarge** node (4 vCPU, 8 GiB), enough for the NGINX server: 237 | 238 | ```sh 239 | $> kubectl get nodes --label-columns node.kubernetes.io/instance-type 240 | NAME STATUS ROLES AGE VERSION INSTANCE-TYPE 241 | ip-10-0-105-122.eu-west-2.compute.internal Ready 10m v1.32.3-eks-473151a m4.large 242 | ip-10-0-34-49.eu-west-2.compute.internal Ready 10m v1.32.3-eks-473151a m4.large 243 | ip-10-0-85-30.eu-west-1.compute.internal Ready 10m v1.32.3-eks-473151a c6g.xlarge 244 | ``` 245 | 246 | Finally, you can check that the NGINX server pod has been re-scheduled onto the new node: 247 | 248 | ```sh 249 | $> kubectl get pods 250 | NAME READY STATUS RESTARTS AGE 251 | nginx-8467c776-vjwgv 1/1 Running 0 22s 252 | ``` 253 | 254 | ## Cleanup 255 | 256 | ```sh 257 | kubectl delete -f . 258 | ``` 259 | -------------------------------------------------------------------------------- /blueprints/disruption-budgets/README.md: -------------------------------------------------------------------------------- 1 | # Karpenter Blueprint: NodePool Disruption Budgets 2 | 3 | ## Purpose 4 | 5 | Karpenter's actions, like consolidation, drift detection, and `expireAfter`, allow users to optimize for cost (in the case of consolidation), keep up with the latest security patches and desired configuration, or ensure governance best practices, like refreshing instances every N days. As a trade-off, these actions cause some level of expected disruption in the cluster. To control the trade-off between, for example, being on the latest AMI (drift detection) and nodes restarting when that happens, we can use disruption controls and configure `disruption budgets` in the Karpenter `NodePool` configuration. If no disruption budget is configured, there is a default budget with `nodes: 10%`. When calculating if a budget will block nodes from disruption, Karpenter checks if the number of nodes being deleted is greater than the number of allowed disruptions.
Budgets take into consideration voluntary disruptions through expiration, drift, emptiness and consolidation. If there are multiple budgets defined in the `NodePool`, Karpenter will honour the most restrictive of the budgets. 6 | 7 | By applying a combination of disruptions budgets and Pod Disruptions Budgets (PDBs) you get both application and platform voluntary disruption controls, this can help you move towards continually operations to protect workload availability. You can learn more about Karpenter NodePool disruption budgets and how the Kapenter disruption controller works in the [Karpenter documentation](https://karpenter.sh/docs/concepts/disruption/#disruption-controller). 8 | 9 | ## Examples 10 | 11 | The following provides a set of example disruption budgets: 12 | 13 | ### Limit Disruptions to a Percentage of Nodes 14 | 15 | To prevent disruptions from affecting more than a certain percentage of nodes in a NodePool 16 | 17 | The following Disruption Budgets says, at any-point in time only disrupt 20% of the Nodes managed by the NodePool. For instance, if there were 19 nodes owned by the NodePool, 4 disruptions would be allowed, rounding up from 19 * .2 = 3.8. 18 | 19 | ```yaml 20 | apiVersion: karpenter.sh/v1 21 | kind: NodePool 22 | metadata: 23 | name: default 24 | spec: 25 | ... 26 | disruption: 27 | consolidationPolicy: WhenEmptyOrUnderutilized 28 | budgets: 29 | - nodes: "20%" 30 | template: 31 | spec: 32 | expireAfter: 720h # 30 days 33 | ``` 34 | 35 | ### No Disruptions During Peak Hours 36 | 37 | This configuration ensures that Karpenter avoids disrupting workloads during peak traffic periods. Specifically, it prevents disruptions from UTC 9:00 for an 8-hour window and limits disruptions to 20% outside of this window. 38 | 39 | ```yaml 40 | apiVersion: karpenter.sh/v1 41 | kind: NodePool 42 | metadata: 43 | name: default 44 | spec: 45 | disruption: 46 | consolidationPolicy: WhenEmptyOrUnderutilized 47 | consolidateAfter: 1m 48 | budgets: 49 | - nodes: "0" 50 | schedule: "0 9 * * *" 51 | duration: 8h 52 | - nodes: "20%" 53 | schedule: "0 17 * * *" 54 | duration: 16h 55 | ``` 56 | 57 | ### Allow 20% disruptions during a maintenance window from UTC 22:00 to 2:00, but only 10% disruptions outside of a maintenance window 58 | 59 | By setting multiple disruption budgets, you can gain precise control over node disruptions. Karpenter will use the most restrictive budget applicable at any given time. 60 | 61 | In the following example, disruptions are limited to 20% of nodes during a 4-hour period starting from UTC 22:00. During the remaining hours (UTC 2:00 - 22:00), disruptions are limited to 10% of nodes. 62 | 63 | ```yaml 64 | apiVersion: karpenter.sh/v1 65 | kind: NodePool 66 | metadata: 67 | name: default 68 | spec: 69 | disruption: 70 | consolidationPolicy: WhenEmptyOrUnderutilized 71 | consolidateAfter: 1m 72 | budgets: 73 | - nodes: "20%" 74 | schedule: "0 22 * * *" 75 | duration: 4h 76 | - nodes: "10%" 77 | schedule: "0 2 * * *" 78 | duration: 20h 79 | ``` 80 | 81 | ### Multiple Budgets Defined 82 | 83 | The following configuration illustrates a NodePool with three disruption budgets: 84 | 85 | The first budget allows up to 20% of nodes to be disrupted at any time. 86 | The second budget imposes a maximum of 5 disruptions. 87 | The third budget blocks all disruptions during the first 10 minutes of each day. 88 | 89 | While the first and second budgets are always in effect, they work together to limit disruptions to a maximum of 5 nodes at any given time. 
Karpenter will apply the most restrictive budget when multiple budgets overlap, enabling flexible disruption policies for different scenarios, such as during maintenance windows. 90 | 91 | > **Note:** If multiple budgets are active at the same time, Karpenter will consider the most restrictive budget. You might consider using multiple disruption budgets to establish a default policy while providing an alternative policy for specific times, such as allowing more disruptions during maintenance windows to roll out new Amazon Machine Images faster. 92 | 93 | ```yaml 94 | apiVersion: karpenter.sh/v1 95 | kind: NodePool 96 | metadata: 97 | name: default 98 | spec: 99 | disruption: 100 | consolidationPolicy: WhenEmptyOrUnderutilized 101 | consolidateAfter: 1m 102 | budgets: 103 | - nodes: "20%" 104 | - nodes: "5" 105 | - nodes: "0" 106 | schedule: "@daily" 107 | duration: 10m 108 | ``` 109 | 110 | ### Disrupting by Reasons 111 | 112 | Karpenter allows specifying if a budget applies to any of `Drifted`, `Underutilized`, or `Empty`. When a budget has no reasons, it’s assumed that it applies to all reasons. When calculating allowed disruptions for a given reason, Karpenter will take the minimum of the budgets that have listed the reason or have left reasons undefined. 113 | 114 | #### Only Drifted Nodes 115 | 116 | This example sets a budget that applies only to nodes classified as Drifted. During times when nodes are identified as Drifted, Karpenter will only disrupt up to 20% of those nodes. 117 | 118 | ```yaml 119 | apiVersion: karpenter.sh/v1 120 | kind: NodePool 121 | metadata: 122 | name: example-drifted 123 | spec: 124 | disruption: 125 | consolidationPolicy: WhenEmptyOrUnderutilized 126 | budgets: 127 | - nodes: "20%" 128 | reasons: 129 | - "Drifted" 130 | ``` 131 | 132 | #### Only Underutilized Nodes 133 | 134 | This example sets a budget that applies only to nodes classified as Underutilized. During times when nodes are identified as Underutilized, Karpenter will only disrupt up to 30% of those nodes. 135 | 136 | ```yaml 137 | apiVersion: karpenter.sh/v1 138 | kind: NodePool 139 | metadata: 140 | name: example-underutilized 141 | spec: 142 | disruption: 143 | consolidationPolicy: WhenEmptyOrUnderutilized 144 | budgets: 145 | - nodes: "30%" 146 | reasons: 147 | - "Underutilized" 148 | ``` 149 | 150 | #### Only Empty Nodes 151 | 152 | This example sets a budget that applies only to nodes classified as Empty. During times when nodes are identified as Empty, Karpenter will only disrupt up to 10% of those nodes. 153 | 154 | ```yaml 155 | apiVersion: karpenter.sh/v1 156 | kind: NodePool 157 | metadata: 158 | name: example-empty 159 | spec: 160 | disruption: 161 | consolidationPolicy: WhenEmptyOrUnderutilized 162 | budgets: 163 | - nodes: "10%" 164 | reasons: 165 | - "Empty" 166 | ``` 167 | 168 | ## Requirements 169 | 170 | * A Kubernetes cluster with Karpenter installed. You can use the blueprint we've used to test this pattern at the cluster folder in the root of this repository. 171 | 172 | ## Deploy 173 | 174 | Let's say you want to control how nodes are upgraded when switching to Bottlerocket via Karpenter Drift, in this example we deploy a disruption budget, that prevents disruptions 24 hours a day 7 days a week. You can use the schedule and duration of the budget to control when disruptions via Drift can take place. 
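For instance, a sketch of how you could adapt this blueprint's budget so Drift is only allowed during a nightly maintenance window, following the examples above (illustrative only; the blueprint's own `disruption-budgets.yaml` ships with the fully blocking budget shown in the Results section):

```yaml
disruption:
  consolidationPolicy: WhenEmptyOrUnderutilized
  budgets:
    # Block all voluntary disruptions from UTC 02:00 for 20 hours...
    - nodes: "0"
      schedule: "0 2 * * *"
      duration: 20h
    # ...so outside that window (UTC 22:00-02:00) only this budget applies and
    # up to 20% of nodes can be disrupted, e.g. to roll out a new AMI via Drift.
    - nodes: "20%"
```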
If you're using the Terraform template provided in this repo, run the following commands to get the EKS cluster name and the IAM Role name for the Karpenter nodes:

```sh
export CLUSTER_NAME=$(terraform -chdir="../../cluster/terraform" output -raw cluster_name)
export KARPENTER_NODE_IAM_ROLE_NAME=$(terraform -chdir="../../cluster/terraform" output -raw node_instance_role_name)
```

> ***NOTE***: If you're not using Terraform, you need to get those values manually. `CLUSTER_NAME` is the name of your EKS cluster (not the ARN). Karpenter auto-generates the [instance profile](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use_switch-role-ec2_instance-profiles) in your `EC2NodeClass` given the role that you specify in [spec.role](https://karpenter.sh/preview/concepts/nodeclasses/) with the placeholder `KARPENTER_NODE_IAM_ROLE_NAME`, which is a way to pass a single IAM role to the EC2 instances launched by the Karpenter `NodePool`. Typically, the instance profile name is the same as the IAM role name (not the ARN).

To deploy the Karpenter NodePool and the sample workload, simply run these commands:

```sh
sed -i '' "s/<<CLUSTER_NAME>>/$CLUSTER_NAME/g" disruption-budgets.yaml
sed -i '' "s/<<KARPENTER_NODE_IAM_ROLE_NAME>>/$KARPENTER_NODE_IAM_ROLE_NAME/g" disruption-budgets.yaml
kubectl apply -f .
```

You should see the following output:

```console
nodepool.karpenter.sh/disruption-budget created
ec2nodeclass.karpenter.k8s.aws/disruption-budget created
deployment.apps/disruption-budget created
```

You should now see new nodes provisioned in your Amazon EKS cluster:

```sh
> kubectl get nodes
NAME                                         STATUS   ROLES    AGE     VERSION
ip-10-0-103-232.eu-west-2.compute.internal   Ready    <none>   2m8s    v1.32.2-eks-677bac1
ip-10-0-120-141.eu-west-2.compute.internal   Ready    <none>   2m44s   v1.32.2-eks-677bac1
ip-10-0-38-179.eu-west-2.compute.internal    Ready    <none>   2m8s    v1.32.2-eks-677bac1
ip-10-0-39-106.eu-west-2.compute.internal    Ready    <none>   2m18s   v1.32.2-eks-677bac1
ip-10-0-50-60.eu-west-2.compute.internal     Ready    <none>   17m     v1.32.2-eks-677bac1
ip-10-0-55-94.eu-west-2.compute.internal     Ready    <none>   2m47s   v1.32.2-eks-677bac1
ip-10-0-63-247.eu-west-2.compute.internal    Ready    <none>   2m40s   v1.32.2-eks-677bac1
ip-10-0-66-70.eu-west-2.compute.internal     Ready    <none>   17m     v1.32.2-eks-677bac1
ip-10-0-72-85.eu-west-2.compute.internal     Ready    <none>   2m50s   v1.32.2-eks-677bac1
ip-10-0-82-100.eu-west-2.compute.internal    Ready    <none>   2m29s   v1.32.2-eks-677bac1
ip-10-0-95-228.eu-west-2.compute.internal    Ready    <none>   2m19s   v1.32.2-eks-677bac1
ip-10-0-96-121.eu-west-2.compute.internal    Ready    <none>   2m26s   v1.32.2-eks-677bac1
```

Now, use the `kubectl patch` command to change the `spec.amiSelectorTerms` alias from `al20232023.0.20230222` to `bottlerocket@v1.39.1`:

```sh
kubectl patch ec2nodeclass disruption-budget --type='json' -p='[
  {"op": "replace", "path": "/spec/amiSelectorTerms/0/alias", "value": "bottlerocket@v1.39.1"}
]'
```

## Results

This is an example of an overly restrictive budget for demo purposes, as it prevents any voluntary disruptions via expiration, drift, emptiness, and consolidation. We learn from this that the schedule states when the budget first becomes active and the duration specifies how long it stays active; a duration must be specified if a schedule is set, otherwise the budget is always active.
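To illustrate that rule, the two hypothetical budgets below show the difference: the first has neither `schedule` nor `duration`, so it is always active, while the second is only active for the four hours following UTC 22:00 each day.

```yaml
budgets:
# Always active: no schedule or duration specified
- nodes: "10%"
# Only active from UTC 22:00 for 4 hours each day;
# a duration is required whenever a schedule is set
- nodes: "0"
  schedule: "0 22 * * *"
  duration: 4h
```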
Karpenter will try to replace nodes via the Drift mechanism when the AMI changes. However, if you watch the nodes, you'll notice that they're not being replaced with new instances provisioned from the Bottlerocket Amazon EKS optimized AMI.

```sh
> kubectl get nodes -o wide -w

NAME                                         STATUS   ROLES    AGE     VERSION               INTERNAL-IP    EXTERNAL-IP   OS-IMAGE                       KERNEL-VERSION                    CONTAINER-RUNTIME
ip-10-0-103-232.eu-west-2.compute.internal   Ready    <none>   3m22s   v1.32.2-eks-677bac1   10.0.103.232   <none>        Amazon Linux 2                 5.10.223-211.872.amzn2.aarch64    containerd://1.7.20
ip-10-0-120-141.eu-west-2.compute.internal   Ready    <none>   3m58s   v1.32.2-eks-677bac1   10.0.120.141   <none>        Amazon Linux 2                 5.10.223-211.872.amzn2.aarch64    containerd://1.7.20
ip-10-0-38-179.eu-west-2.compute.internal    Ready    <none>   3m22s   v1.32.2-eks-677bac1   10.0.38.179    <none>        Amazon Linux 2                 5.10.223-211.872.amzn2.aarch64    containerd://1.7.20
ip-10-0-39-106.eu-west-2.compute.internal    Ready    <none>   3m32s   v1.32.2-eks-677bac1   10.0.39.106    <none>        Amazon Linux 2                 5.10.223-211.872.amzn2.aarch64    containerd://1.7.20
ip-10-0-50-60.eu-west-2.compute.internal     Ready    <none>   18m     v1.32.2-eks-677bac1   10.0.50.60     <none>        Amazon Linux 2023.5.20240805   6.1.102-108.177.amzn2023.x86_64   containerd://1.7.20
ip-10-0-55-94.eu-west-2.compute.internal     Ready    <none>   4m1s    v1.32.2-eks-677bac1   10.0.55.94     <none>        Amazon Linux 2                 5.10.223-211.872.amzn2.aarch64    containerd://1.7.20
ip-10-0-63-247.eu-west-2.compute.internal    Ready    <none>   3m54s   v1.32.2-eks-677bac1   10.0.63.247    <none>        Amazon Linux 2                 5.10.223-211.872.amzn2.aarch64    containerd://1.7.20
ip-10-0-66-70.eu-west-2.compute.internal     Ready    <none>   18m     v1.32.2-eks-677bac1   10.0.66.70     <none>        Amazon Linux 2023.5.20240805   6.1.102-108.177.amzn2023.x86_64   containerd://1.7.20
ip-10-0-72-85.eu-west-2.compute.internal     Ready    <none>   4m4s    v1.32.2-eks-677bac1   10.0.72.85     <none>        Amazon Linux 2                 5.10.223-211.872.amzn2.x86_64     containerd://1.7.20
ip-10-0-82-100.eu-west-2.compute.internal    Ready    <none>   3m43s   v1.32.2-eks-677bac1   10.0.82.100    <none>        Amazon Linux 2                 5.10.223-211.872.amzn2.aarch64    containerd://1.7.20
ip-10-0-95-228.eu-west-2.compute.internal    Ready    <none>   3m33s   v1.32.2-eks-677bac1   10.0.95.228    <none>        Amazon Linux 2                 5.10.223-211.872.amzn2.aarch64    containerd://1.7.20
ip-10-0-96-121.eu-west-2.compute.internal    Ready    <none>   3m40s   v1.32.2-eks-677bac1   10.0.96.121    <none>        Amazon Linux 2                 5.10.223-211.872.amzn2.aarch64    containerd://1.7.20
```

You will also see the following messages in the Kubernetes events stating that disruptions are blocked:

```sh
> kubectl get events -w

0s          Normal    DisruptionBlocked   nodepool/disruption-budget   No allowed disruptions for disruption reason Drifted due to blocking budget
0s          Normal    DisruptionBlocked   nodepool/disruption-budget   No allowed disruptions for disruption reason Underutilized due to blocking budget
0s          Normal    DisruptionBlocked   nodepool/disruption-budget   No allowed disruptions for disruption reason Empty due to blocking budget
0s          Normal    DisruptionBlocked   nodepool/disruption-budget   No allowed disruptions due to blocking budget
```

This is because the NodePool defines the following budget, which states that, starting at UTC 00:00 every day and for a period of 24 hours, no nodes can be voluntarily disrupted. This is a great fit when you want consolidation but might not want to apply it all the time.

```yaml
budgets:
- nodes: "0"
  schedule: "0 0 * * *"
  duration: 24h
```

If you edit the NodePool and relax the budget as shown below, Karpenter will be able to drift up to 20% of the nodes at a time.
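Assuming only the `nodes` value changes and the schedule and duration stay as they were deployed, the relaxed budget would look roughly like this:

```yaml
budgets:
- nodes: "20%"           # now allow up to 20% of nodes to be disrupted at a time
  schedule: "0 0 * * *"  # still becomes active at UTC 00:00 every day...
  duration: 24h          # ...and stays active around the clock
```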
You can make this change with the `kubectl patch` command:

```sh
kubectl patch nodepool disruption-budget --type='json' -p='[
  {"op": "replace", "path": "/spec/disruption/budgets/0/nodes", "value": "20%"}
]'
```

After modifying the NodePool's budget, you should observe the nodes drifting and new nodes being provisioned with the Bottlerocket Amazon EKS optimized AMI.

```sh
> kubectl get nodes -o custom-columns=NAME:.metadata.name,OS-IMAGE:.status.nodeInfo.osImage

NAME                                         OS-IMAGE
ip-10-0-103-176.eu-west-2.compute.internal   Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-106-25.eu-west-2.compute.internal    Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-108-51.eu-west-2.compute.internal    Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-115-104.eu-west-2.compute.internal   Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-116-220.eu-west-2.compute.internal   Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-121-123.eu-west-2.compute.internal   Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-43-37.eu-west-2.compute.internal     Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-50-60.eu-west-2.compute.internal     Amazon Linux 2023.5.20240805
ip-10-0-57-199.eu-west-2.compute.internal    Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-59-82.eu-west-2.compute.internal     Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-62-198.eu-west-2.compute.internal    Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-62-228.eu-west-2.compute.internal    Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-66-249.eu-west-2.compute.internal    Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-66-70.eu-west-2.compute.internal     Amazon Linux 2023.5.20240805
ip-10-0-67-142.eu-west-2.compute.internal    Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-67-203.eu-west-2.compute.internal    Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-67-255.eu-west-2.compute.internal    Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-68-97.eu-west-2.compute.internal     Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-70-55.eu-west-2.compute.internal     Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-73-112.eu-west-2.compute.internal    Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-75-130.eu-west-2.compute.internal    Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-77-110.eu-west-2.compute.internal    Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-78-43.eu-west-2.compute.internal     Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-91-17.eu-west-2.compute.internal     Bottlerocket OS 1.39.1 (aws-k8s-1.32)
ip-10-0-97-201.eu-west-2.compute.internal    Bottlerocket OS 1.39.1 (aws-k8s-1.32)
```

You will also see messages like the following in the Kubernetes events, stating that a node has been drifted:

```console
0s          Normal    DisruptionTerminating   node/ip-10-0-96-121.eu-west-2.compute.internal   Disrupting Node: Drifted/Delete
0s          Warning   InstanceTerminating     node/ip-10-0-96-121.eu-west-2.compute.internal   Instance is terminating
0s          Normal    RemovingNode            node/ip-10-0-96-121.eu-west-2.compute.internal   Node ip-10-0-96-121.eu-west-2.compute.internal event: Removing Node ip-10-0-96-121.eu-west-2.compute.internal from Controller
```

## Clean-up

To remove all objects created, simply run the following command:

```sh
kubectl delete -f .
```
--------------------------------------------------------------------------------