├── .gitignore ├── CODEOWNERS ├── NOTICE ├── infrastructure ├── .npmignore ├── k8s-config │ └── clusters │ │ ├── kit-infrastructure │ │ ├── monitoring │ │ │ ├── monitoring.yaml │ │ │ ├── grafana.yaml │ │ │ └── prometheus.yaml │ │ ├── metrics-server │ │ │ └── metrics-server.yaml │ │ ├── kit-provisioner.yaml │ │ ├── provisioner.yaml │ │ ├── node-problem-detector │ │ │ └── node-problem-detector.yaml │ │ ├── aws-node-termination-handler │ │ │ └── aws-node-termination-handler.yaml │ │ └── tekton-pipelines │ │ │ └── tekton.yaml │ │ └── addons │ │ └── perfdash │ │ ├── kustomization.yaml │ │ └── deployment-patch.yaml ├── .gitignore ├── bin │ └── kit-infrastructure.ts ├── cdk.json ├── tsconfig.json ├── package.json ├── cache-iam-policies.sh └── lib │ └── addons │ ├── aws-lbc.ts │ ├── perfdash.ts │ ├── aws-ebs-csi-driver.ts │ ├── fluent-bit-for-aws.ts │ ├── kit.ts │ ├── cached │ └── aws-ebs-csi-driver-iam-policy-v1.9.0.json │ ├── karpenter.ts │ └── crossplane.ts ├── tests ├── tekton-resources │ ├── tasks │ │ ├── teardown │ │ │ ├── .DS_Store │ │ │ ├── eksctl.yaml │ │ │ ├── kitctl.yaml │ │ │ ├── awscli-eks-fargate.yaml │ │ │ ├── kit-cluster.yaml │ │ │ ├── awscli-vpc-delete.yaml │ │ │ └── karpenter │ │ │ │ └── kubectl-get-karpenter-logs.yaml │ │ ├── notifications │ │ │ └── slack.yaml │ │ ├── setup │ │ │ ├── eks │ │ │ │ ├── awscli-role.yaml │ │ │ │ ├── awscli-vpc.yaml │ │ │ │ ├── eksctl.yaml │ │ │ │ ├── awscli-cfn-lt.yaml │ │ │ │ ├── awscli-cp.yaml │ │ │ │ ├── awscli-pod-identity-association.yaml │ │ │ │ └── awscli-fargate.yaml │ │ │ ├── karpenter │ │ │ │ ├── awscli-instanceprofiles.yaml │ │ │ │ ├── awscli-mng.yaml │ │ │ │ ├── kubectl-nodeclass.yaml │ │ │ │ ├── awscli-karpenter-cfn-stack.yaml │ │ │ │ ├── kubectl-nodepools.yaml │ │ │ │ └── awscli-node-role.yaml │ │ │ └── kitctl │ │ │ │ ├── dataplane.yaml │ │ │ │ └── controlplane.yaml │ │ ├── addons │ │ │ ├── cw-metric.yaml │ │ │ ├── fluentbit.yaml │ │ │ └── cwagent.yaml │ │ └── generators │ │ │ ├── manual-deployment │ │ │ └── deploy-pods-with-size.yaml │ │ │ ├── clusterloader │ │ │ └── pod-density.yaml │ │ │ └── karpenter │ │ │ ├── kubectl-scale.yaml │ │ │ ├── kubectl-drift.yaml │ │ │ └── kubectl-nodepool-replicas-wait.yaml │ ├── pipelines │ │ ├── cleanup │ │ │ ├── binding.yaml │ │ │ ├── eventlistener.yaml │ │ │ ├── serviceaccount.yaml │ │ │ ├── cronjob.yaml │ │ │ ├── README.md │ │ │ └── cleanup-template.yaml │ │ ├── kitctl │ │ │ ├── run.yaml │ │ │ └── template.yaml │ │ ├── eks │ │ │ ├── awscli-eks-fargate.yaml │ │ │ └── upstream-load.yaml │ │ └── kit-cluster │ │ │ └── upstream-load.yaml │ └── triggers │ │ └── rbac.yaml ├── assets │ ├── karpenter │ │ ├── node-role-policy-document.json │ │ ├── controller-role-trust-policy-document.json │ │ ├── nodepool.yaml │ │ ├── nodeclass.yaml │ │ └── controller-role-policy-document.json │ ├── eks-pod-identity │ │ ├── pia-trust-policy.json │ │ ├── pod-image-preload.yaml │ │ ├── pod-default.yaml │ │ └── config.yaml │ ├── eks-networking │ │ ├── test-svc.yaml │ │ └── config-eks-networking.yaml │ ├── eks_service_role.json │ ├── eks_node_group_launch_template.json │ ├── asg_node_group.yaml │ ├── aiml-workload │ │ ├── medium-batch-jobs │ │ │ ├── job-w-fsx.yaml │ │ │ └── config.yaml │ │ └── large-sts │ │ │ ├── config.yaml │ │ │ └── sts.yaml │ └── eks_node_group_launch_template_al2023.yaml └── images │ ├── clusterloader2 │ └── Dockerfile │ └── toolkit-base │ ├── get_versions_matrix.sh │ └── Dockerfile ├── .github ├── PULL_REQUEST_TEMPLATE.md ├── actions │ └── install-go-and-dependencies │ │ └── action.yml ├── dependabot.yml 
└── workflows │ ├── cdk-ci.yaml │ └── toolkit-base-build-push.yaml ├── kitctl.rb └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | operator/bin 2 | .vscode/ 3 | node_modules 4 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | @mengqiy @hakuna-matatah @rcrozean @adityavenneti @dheeraj-coding 2 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | kit 2 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | -------------------------------------------------------------------------------- /infrastructure/.npmignore: -------------------------------------------------------------------------------- 1 | *.ts 2 | !*.d.ts 3 | 4 | # CDK asset staging directory 5 | .cdk.staging 6 | cdk.out 7 | -------------------------------------------------------------------------------- /infrastructure/k8s-config/clusters/kit-infrastructure/monitoring/monitoring.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: monitoring 5 | -------------------------------------------------------------------------------- /tests/tekton-resources/tasks/teardown/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/HEAD/tests/tekton-resources/tasks/teardown/.DS_Store -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Issue #, if available: 2 | 3 | Description of changes: 4 | 5 | By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license. 
6 | -------------------------------------------------------------------------------- /infrastructure/.gitignore: -------------------------------------------------------------------------------- 1 | # CDK asset staging directory 2 | *.js 3 | !jest.config.js 4 | *.d.ts 5 | bin/*.d.ts 6 | bin/*.js 7 | node_modules 8 | cdk.context.json 9 | jest.config.js 10 | .cdk.staging 11 | cdk.out 12 | -------------------------------------------------------------------------------- /infrastructure/bin/kit-infrastructure.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | import 'source-map-support/register'; 3 | import { App } from 'aws-cdk-lib'; 4 | import { KITInfrastructure } from '../lib/kit-infrastructure'; 5 | 6 | const app = new App(); 7 | new KITInfrastructure(app, 'KITInfrastructure'); 8 | -------------------------------------------------------------------------------- /tests/assets/karpenter/node-role-policy-document.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "ec2.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } -------------------------------------------------------------------------------- /tests/tekton-resources/pipelines/cleanup/binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: triggers.tekton.dev/v1alpha1 2 | kind: TriggerBinding 3 | metadata: 4 | name: cleanup-details 5 | namespace: scalability 6 | spec: 7 | params: 8 | - name: keep 9 | value: $(body.params.cleanup.keep) 10 | - name: namespace 11 | value: $(body.params.target.namespace) -------------------------------------------------------------------------------- /tests/assets/eks-pod-identity/pia-trust-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "beta.pods.eks.aws.internal" 8 | }, 9 | "Action": [ 10 | "sts:AssumeRole", 11 | "sts:TagSession" 12 | ] 13 | } 14 | ] 15 | } 16 | -------------------------------------------------------------------------------- /tests/tekton-resources/pipelines/kitctl/run.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: tekton.dev/v1beta1 2 | kind: PipelineRun 3 | metadata: 4 | name: pipeline-template 5 | namespace: scalability 6 | spec: 7 | params: 8 | - name: name 9 | value: "example" 10 | podTemplate: 11 | nodeSelector: 12 | kubernetes.io/arch: amd64 13 | serviceAccountName: tekton-pipelines-executor 14 | pipelineRef: 15 | name: pipeline-template 16 | -------------------------------------------------------------------------------- /infrastructure/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "npx ts-node --prefer-ts-exts bin/kit-infrastructure.ts", 3 | "context": { 4 | "aws-cdk:enableDiffNoFail": "true", 5 | "@aws-cdk/core:stackRelativeExports": "true", 6 | "@aws-cdk/aws-ecr-assets:dockerIgnoreSupport": true, 7 | "@aws-cdk/aws-secretsmanager:parseOwnedSecretName": true, 8 | "@aws-cdk/aws-kms:defaultKeyPolicies": true, 9 | "@aws-cdk/aws-s3:grantWriteWithoutAcl": true, 10 | "@aws-cdk/aws-ecs-patterns:removeDefaultDesiredCount": true 11 | } 12 | } 13 | -------------------------------------------------------------------------------- 
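The cdk.json above wires the `cdk` CLI to run bin/kit-infrastructure.ts through ts-node, and that entry point synthesizes a single stack named KITInfrastructure. As a rough sketch (not taken from this repo's docs), deploying it locally would look roughly like the following, assuming Node.js and AWS credentials are already configured and ignoring any stack-specific context values your environment may require:

```bash
# Sketch: deploy the KITInfrastructure stack defined in bin/kit-infrastructure.ts.
# Assumes Node.js and AWS credentials are configured; any context values the stack
# expects would be passed with `-c key=value` and are not shown here.
cd infrastructure/
npm ci                           # install the pinned dependencies from package.json
npx cdk bootstrap                # one-time per account/region: provision CDK staging resources
npx cdk diff KITInfrastructure   # preview the changes
npx cdk deploy KITInfrastructure
```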
/tests/images/clusterloader2/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.16.4 AS builder 2 | WORKDIR /go/src/k8s.io 3 | RUN git clone https://github.com/kubernetes/perf-tests 4 | WORKDIR perf-tests 5 | RUN git fetch origin --verbose --tags 6 | # declare the build arg used below; override with --build-arg branch=<ref> 7 | ARG branch=master 8 | RUN git checkout $branch 9 | WORKDIR /go/src/k8s.io/perf-tests/clusterloader2 10 | RUN GOPROXY=direct GOOS=linux CGO_ENABLED=0 go build -o ./clusterloader ./cmd 11 | 12 | FROM amazon/aws-cli 13 | WORKDIR / 14 | COPY --from=builder /go/src/k8s.io/perf-tests/clusterloader2/clusterloader /clusterloader 15 | ENTRYPOINT ["/clusterloader"] -------------------------------------------------------------------------------- /.github/actions/install-go-and-dependencies/action.yml: -------------------------------------------------------------------------------- 1 | name: setup-go-and-cache 2 | description: 'Set up go and cache' 3 | runs: 4 | using: "composite" 5 | steps: 6 | - uses: actions/checkout@v3 7 | - uses: actions/cache@v3 8 | with: 9 | path: | 10 | ~/.cache/go-build 11 | ~/go/pkg/mod 12 | ~/go/bin/ 13 | key: ${{ runner.os }}-go-${{ hashFiles('**/go.sum') }} 14 | restore-keys: | 15 | ${{ runner.os }}-go- 16 | - uses: actions/setup-go@v3 17 | with: 18 | go-version: '1.17.1' -------------------------------------------------------------------------------- /infrastructure/k8s-config/clusters/addons/perfdash/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | # These two files pin to an upstream config. To upgrade, simply use a newer commit ID from upstream. 4 | resources: 5 | - https://raw.githubusercontent.com/kubernetes/perf-tests/4c08d581d7196071891095a0d442f7cd3e9d3d3d/perfdash/deployment.yaml 6 | - https://raw.githubusercontent.com/kubernetes/perf-tests/4c08d581d7196071891095a0d442f7cd3e9d3d3d/perfdash/perfdash-service.yaml 7 | patchesStrategicMerge: 8 | - deployment-patch.yaml 9 | -------------------------------------------------------------------------------- /kitctl.rb: -------------------------------------------------------------------------------- 1 | # Update the version and SHA256 for the CLI when a new version is released 2 | require 'formula' 3 | class Kitctl < Formula 4 | homepage 'https://github.com/awslabs/kubernetes-iteration-toolkit/substrate' 5 | version '0.0.22' 6 | if OS.mac? && Hardware::CPU.is_64_bit?
7 | url 'https://github.com/awslabs/kubernetes-iteration-toolkit/releases/download/v0.0.22/kitctl_v0.0.22_darwin_amd64.zip' 8 | sha256 '8e7fd7a6466f97037788b498a2a98d3f5eb980a7db5c3b678c664b66fef6d007' 9 | else 10 | echo "Hardware not supported" 11 | exit 1 12 | end 13 | def install 14 | bin.install 'kitctl' 15 | end 16 | end -------------------------------------------------------------------------------- /tests/tekton-resources/pipelines/cleanup/eventlistener.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: triggers.tekton.dev/v1beta1 2 | kind: EventListener 3 | metadata: 4 | name: tekton-cd 5 | namespace: scalability 6 | spec: 7 | serviceAccountName: tekton-cleaner 8 | triggers: 9 | - name: cleanup 10 | interceptors: 11 | - ref: 12 | name: "cel" 13 | params: 14 | - name: "filter" 15 | value: | 16 | 'trigger-template' in body && body['trigger-template'] == 'cleanup' 17 | bindings: 18 | - ref: cleanup-details 19 | template: 20 | ref: cleanup-runs -------------------------------------------------------------------------------- /tests/assets/karpenter/controller-role-trust-policy-document.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Federated": "arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:oidc-provider/${OIDC_ENDPOINT}" 8 | }, 9 | "Action": "sts:AssumeRoleWithWebIdentity", 10 | "Condition": { 11 | "StringEquals": { 12 | "${OIDC_ENDPOINT}:aud": "sts.amazonaws.com", 13 | "${OIDC_ENDPOINT}:sub": "system:serviceaccount:karpenter:karpenter" 14 | } 15 | } 16 | } 17 | ] 18 | } -------------------------------------------------------------------------------- /infrastructure/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2018", 4 | "module": "commonjs", 5 | "lib": ["es2018", "dom"], 6 | "declaration": true, 7 | "strict": true, 8 | "noImplicitAny": true, 9 | "strictNullChecks": true, 10 | "noImplicitThis": true, 11 | "alwaysStrict": true, 12 | "noUnusedLocals": false, 13 | "noUnusedParameters": false, 14 | "noImplicitReturns": true, 15 | "noFallthroughCasesInSwitch": false, 16 | "inlineSourceMap": true, 17 | "inlineSources": true, 18 | "experimentalDecorators": true, 19 | "strictPropertyInitialization": false, 20 | "typeRoots": ["./node_modules/@types"] 21 | }, 22 | "exclude": ["cdk.out"] 23 | } 24 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # See https://docs.github.com/en/github/administering-a-repository/configuration-options-for-dependency-updates#package-ecosystem 2 | version: 2 3 | updates: 4 | - package-ecosystem: "gomod" 5 | directory: "/" 6 | schedule: 7 | interval: "monthly" 8 | ignore: 9 | - dependency-name: "k8s.io/api" 10 | - dependency-name: "k8s.io/apimachinery" 11 | - dependency-name: "k8s.io/client-go" 12 | - dependency-name: "sigs.k8s.io/controller-runtime" 13 | - dependency-name: "github.com/containerd/containerd" 14 | - dependency-name: "github.com/google/ko" 15 | - package-ecosystem: "github-actions" 16 | directory: "/.github" 17 | schedule: 18 | interval: "monthly" -------------------------------------------------------------------------------- /tests/tekton-resources/tasks/notifications/slack.yaml: 
-------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: tekton.dev/v1beta1 3 | kind: Task 4 | metadata: 5 | name: slack-notification 6 | namespace: scalability 7 | spec: 8 | description: | 9 | Sends a Slack notification to a given Slack webhook. 10 | params: 11 | - name: slack-hook 12 | description: The Slack webhook URL to post the notification to. 13 | - name: slack-message 14 | default: 'default' 15 | description: The message you want to post. 16 | steps: 17 | - name: send-notification 18 | image: alpine/k8s:1.23.7 19 | script: | 20 | if [ -n "$(params.slack-hook)" ]; then 21 | curl -H "Content-type: application/json" --data '{"Message": "$(params.slack-message)"}' -X POST $(params.slack-hook) 22 | fi -------------------------------------------------------------------------------- /.github/workflows/cdk-ci.yaml: -------------------------------------------------------------------------------- 1 | name: Build and test CDK module 2 | on: 3 | pull_request: 4 | branches: [main] 5 | push: 6 | branches: [main] 7 | workflow_dispatch: {} 8 | jobs: 9 | cdk-ci: 10 | strategy: 11 | matrix: 12 | platform: [ubuntu-latest, macos-latest] 13 | node-version: [16, 18] 14 | runs-on: ${{ matrix.platform }} 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Use Node.js ${{ matrix.node-version }} 18 | uses: actions/setup-node@v3 19 | with: 20 | node-version: ${{ matrix.node-version }} 21 | - name: Build and test NPM packages 22 | run: | 23 | cd infrastructure/ 24 | npm ci 25 | npm run build 26 | # TODO: enable it when we have at least one test. 27 | # npm test 28 | -------------------------------------------------------------------------------- /tests/tekton-resources/tasks/teardown/eksctl.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: tekton.dev/v1beta1 3 | kind: Task 4 | metadata: 5 | name: eks-cluster-teardown 6 | namespace: scalability 7 | spec: 8 | description: | 9 | Tear down an EKS cluster. 10 | This Task can be used to tear down an EKS cluster in an AWS account. 11 | params: 12 | - name: cluster-name 13 | description: The name of the EKS cluster to be torn down. 14 | - name: region 15 | default: us-west-2 16 | description: The region the cluster is in. 17 | steps: 18 | - name: delete-cluster 19 | image: docker.io/weaveworks/eksctl:0.35.0@sha256:48c1fa508970a01fd87a73ac7932a7160479d678cd019a3c84533d911fc54327 20 | script: | 21 | eksctl delete cluster \ 22 | --name $(params.cluster-name) \ 23 | --region $(params.region) -------------------------------------------------------------------------------- /tests/tekton-resources/pipelines/kitctl/template.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: tekton.dev/v1beta1 3 | kind: Pipeline 4 | metadata: 5 | name: pipeline-template 6 | namespace: scalability 7 | spec: 8 | params: 9 | - name: name 10 | description: The name of the test cluster.
11 | default: "guest" 12 | tasks: 13 | - name: setup-control-plane 14 | taskRef: 15 | name: control-plane-setup 16 | params: 17 | - name: name 18 | value: '$(params.name)' 19 | - name: setup-data-plane 20 | runAfter: [setup-control-plane] 21 | taskRef: 22 | name: data-plane-setup 23 | params: 24 | - name: name 25 | value: '$(params.name)' 26 | finally: 27 | - name: teardown 28 | taskRef: 29 | name: teardown 30 | params: 31 | - name: name 32 | value: '$(params.name)' 33 | -------------------------------------------------------------------------------- /infrastructure/k8s-config/clusters/kit-infrastructure/monitoring/grafana.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: source.toolkit.fluxcd.io/v1beta2 2 | kind: HelmRepository 3 | metadata: 4 | name: grafana 5 | namespace: monitoring 6 | spec: 7 | interval: 5m0s 8 | url: https://grafana.github.io/helm-charts 9 | --- 10 | apiVersion: helm.toolkit.fluxcd.io/v2beta1 11 | kind: HelmRelease 12 | metadata: 13 | name: grafana 14 | namespace: monitoring 15 | spec: 16 | interval: 2m 17 | targetNamespace: monitoring 18 | chart: 19 | spec: 20 | chart: grafana 21 | sourceRef: 22 | kind: HelmRepository 23 | name: grafana 24 | namespace: monitoring 25 | interval: 1m 26 | upgrade: 27 | remediation: 28 | remediateLastFailure: true 29 | values: 30 | tolerations: 31 | - key: CriticalAddonsOnly 32 | operator: Exists 33 | 34 | -------------------------------------------------------------------------------- /tests/assets/eks-networking/test-svc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: test-svc 5 | --- 6 | apiVersion: apps/v1 7 | kind: Deployment 8 | metadata: 9 | name: test-svc-deployment 10 | namespace: test-svc 11 | spec: 12 | replicas: 5000 13 | selector: 14 | matchLabels: 15 | app: test-svc 16 | template: 17 | metadata: 18 | labels: 19 | app: test-svc 20 | spec: 21 | containers: 22 | - name: pause 23 | image: public.ecr.aws/eks-distro/kubernetes/pause:3.9 24 | ports: 25 | - containerPort: 8080 26 | name: http 27 | --- 28 | apiVersion: v1 29 | kind: Service 30 | metadata: 31 | name: test-svc 32 | namespace: test-svc 33 | spec: 34 | type: ClusterIP 35 | selector: 36 | app: test-svc 37 | ports: 38 | - name: http 39 | port: 80 40 | targetPort: http 41 | -------------------------------------------------------------------------------- /tests/assets/eks-pod-identity/pod-image-preload.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: image-preload 5 | labels: 6 | group: image-preload 7 | spec: 8 | selector: 9 | matchLabels: 10 | name: image-preload 11 | template: 12 | metadata: 13 | labels: 14 | group: image-preload 15 | name: image-preload 16 | spec: 17 | containers: 18 | - name: image-preload 19 | image: {{.PodImage}} 20 | resources: 21 | requests: 22 | cpu: "10m" 23 | memory: "10Mi" 24 | limits: 25 | cpu: "15m" 26 | memory: "15Mi" 27 | command: 28 | - sh 29 | - -c 30 | - | 31 | while true; do 32 | echo "Sleeping for 1 hour..." 
33 | sleep 3600 34 | done 35 | -------------------------------------------------------------------------------- /tests/tekton-resources/tasks/teardown/kitctl.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: tekton.dev/v1beta1 3 | kind: Task 4 | metadata: 5 | name: teardown 6 | namespace: scalability 7 | annotations: 8 | tekton.dev/pipelines.minVersion: "0.17.0" 9 | tekton.dev/categories: Kubernetes 10 | tekton.dev/tags: CLI, kubectl 11 | tekton.dev/displayName: "kubernetes actions" 12 | tekton.dev/platforms: "linux/amd64" 13 | spec: 14 | description: | 15 | Tear down the guest cluster. 16 | params: 17 | - name: name 18 | default: "guest" 19 | description: Name of the guess cluster 20 | steps: 21 | - name: teardown 22 | image: bitnami/kubectl 23 | script: | 24 | #!/bin/bash 25 | echo "Tear down guest cluster" 26 | kubectl delete controlplane -n $(params.name) $(params.name) 27 | echo "Delete namespace" 28 | kubectl delete namespace $(params.name) 29 | -------------------------------------------------------------------------------- /infrastructure/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "kit-infrastructure", 3 | "license": "Apache-2", 4 | "version": "0.1.0", 5 | "bin": { 6 | "infra": "bin/kit-infrastructure.ts" 7 | }, 8 | "scripts": { 9 | "build": "tsc", 10 | "watch": "tsc -w", 11 | "test": "jest", 12 | "cdk": "cdk" 13 | }, 14 | "devDependencies": { 15 | "@types/jest": "^27.4.1", 16 | "@types/node": "17.0.38", 17 | "jest": "^27.5.1", 18 | "ts-jest": "^27.1.4", 19 | "ts-node": "^10.8.0", 20 | "typescript": "~4.7.4" 21 | }, 22 | "dependencies": { 23 | "@types/js-yaml": "^4.0.5", 24 | "aws-cdk": "^2.94.0", 25 | "aws-cdk-lib": "^2.85.0", 26 | "constructs": "^10.1.78", 27 | "js-yaml": "^4.1.0", 28 | "source-map-support": "^0.5.21", 29 | "sync-request": "^6.1.0", 30 | "temp-write": "^5.0.0", 31 | "tempy": "^3.0.0", 32 | "@aws-cdk/lambda-layer-kubectl-v26": "2.0.1" 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /infrastructure/k8s-config/clusters/kit-infrastructure/metrics-server/metrics-server.yaml: -------------------------------------------------------------------------------- 1 | 2 | apiVersion: v1 3 | kind: Namespace 4 | metadata: 5 | name: metrics-server 6 | --- 7 | apiVersion: source.toolkit.fluxcd.io/v1beta2 8 | kind: HelmRepository 9 | metadata: 10 | name: metrics-server 11 | namespace: metrics-server 12 | spec: 13 | interval: 5m0s 14 | url: https://kubernetes-sigs.github.io/metrics-server/ 15 | --- 16 | apiVersion: helm.toolkit.fluxcd.io/v2beta1 17 | kind: HelmRelease 18 | metadata: 19 | name: metrics-server 20 | namespace: metrics-server 21 | spec: 22 | interval: 5m 23 | targetNamespace: metrics-server 24 | chart: 25 | spec: 26 | chart: metrics-server 27 | sourceRef: 28 | kind: HelmRepository 29 | name: metrics-server 30 | namespace: metrics-server 31 | interval: 1m 32 | upgrade: 33 | remediation: 34 | remediateLastFailure: true 35 | values: 36 | tolerations: 37 | - key: CriticalAddonsOnly 38 | operator: Exists -------------------------------------------------------------------------------- /tests/tekton-resources/pipelines/eks/awscli-eks-fargate.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: tekton.dev/v1beta1 3 | kind: Pipeline 4 | metadata: 5 | name: awscli-eks-fargate 6 | namespace: scalability 7 | spec: 8 | workspaces: 9 | - name: 
config 10 | params: 11 | - name: cluster-name 12 | - name: desired-nodes 13 | - name: host-cluster-node-role-name 14 | tasks: 15 | - name: setup-control-plane 16 | taskRef: 17 | name: awscli-eks-cluster-create 18 | params: 19 | - name: cluster-name 20 | value: '$(params.cluster-name)' 21 | workspaces: 22 | - name: config 23 | workspace: config 24 | - name: setup-data-plane 25 | runAfter: [setup-control-plane] 26 | taskRef: 27 | name: awscli-eks-fargate-create 28 | params: 29 | - name: cluster-name 30 | value: '$(params.cluster-name)' 31 | - name: desired-nodes 32 | value: '$(params.desired-nodes)' 33 | - name: host-cluster-node-role-name 34 | value: '$(params.host-cluster-node-role-name)' -------------------------------------------------------------------------------- /infrastructure/k8s-config/clusters/kit-infrastructure/kit-provisioner.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: karpenter.k8s.aws/v1alpha1 2 | kind: AWSNodeTemplate 3 | metadata: 4 | name: kit 5 | spec: 6 | amiFamily: AL2 7 | subnetSelector: 8 | kit.sh/stack: KITInfrastructure 9 | aws-cdk:subnet-type: Private 10 | securityGroupSelector: 11 | kit.sh/stack: KITInfrastructure 12 | --- 13 | apiVersion: karpenter.sh/v1alpha5 14 | kind: Provisioner 15 | metadata: 16 | name: kit 17 | spec: 18 | requirements: 19 | - key: karpenter.sh/capacity-type 20 | operator: In 21 | values: 22 | - on-demand 23 | - key: kubernetes.io/arch 24 | operator: In 25 | values: 26 | - amd64 27 | - key: kit.k8s.sh/app 28 | operator: Exists 29 | - key: "karpenter.k8s.aws/instance-cpu" 30 | operator: In 31 | values: ["16", "32", "48", "64"] 32 | - key: kit.k8s.sh/control-plane-name 33 | operator: Exists 34 | limits: 35 | resources: 36 | cpu: 1000 37 | memory: 1000Gi 38 | providerRef: 39 | name: default 40 | ttlSecondsAfterEmpty: 300 41 | -------------------------------------------------------------------------------- /infrastructure/k8s-config/clusters/kit-infrastructure/provisioner.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: karpenter.k8s.aws/v1alpha1 2 | kind: AWSNodeTemplate 3 | metadata: 4 | name: default 5 | spec: 6 | amiFamily: AL2 7 | subnetSelector: 8 | kit.sh/stack: KITInfrastructure 9 | aws-cdk:subnet-type: Private 10 | securityGroupSelector: 11 | kit.sh/stack: KITInfrastructure 12 | blockDeviceMappings: 13 | - deviceName: /dev/xvda 14 | ebs: 15 | volumeSize: 100Gi 16 | volumeType: gp3 17 | --- 18 | apiVersion: karpenter.sh/v1alpha5 19 | kind: Provisioner 20 | metadata: 21 | name: default 22 | spec: 23 | requirements: 24 | - key: karpenter.sh/capacity-type 25 | operator: In 26 | values: 27 | - on-demand 28 | - key: kubernetes.io/arch 29 | operator: In 30 | values: 31 | - amd64 32 | - key: "karpenter.k8s.aws/instance-cpu" 33 | operator: In 34 | values: ["16", "32", "48", "64"] 35 | limits: 36 | resources: 37 | cpu: 8000 38 | memory: 2000Gi 39 | providerRef: 40 | name: default 41 | ttlSecondsAfterEmpty: 864000 42 | -------------------------------------------------------------------------------- /infrastructure/cache-iam-policies.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | 4 | # Downloads and saves cached versions of IAM policies for aws-load-balancer-controller and aws-ebs-csi-driver 5 | # This file must be executed to update the cache when these two dependencies are upgraded 6 | 7 | CACHE_DIR="./lib/addons/cached" 8 | 9 | 
LOAD_BALANCER_CONTROLLER_VERSION="v2.4.2" 10 | LOAD_BALANCER_CACHED_FILE="${CACHE_DIR}/aws-load-balancer-controller-iam-policy-${LOAD_BALANCER_CONTROLLER_VERSION}.json" 11 | LOAD_BALANCER_CACHED_URL="https://raw.githubusercontent.com/kubernetes-sigs/aws-load-balancer-controller/${LOAD_BALANCER_CONTROLLER_VERSION}/docs/install/iam_policy.json" 12 | 13 | EBS_CSI_DRIVER_VERSION="v1.9.0" 14 | EBS_CSI_DRIVER_FILE="${CACHE_DIR}/aws-ebs-csi-driver-iam-policy-${EBS_CSI_DRIVER_VERSION}.json" 15 | EBS_CSI_DRIVER_URL="https://raw.githubusercontent.com/kubernetes-sigs/aws-ebs-csi-driver/${EBS_CSI_DRIVER_VERSION}/docs/example-iam-policy.json" 16 | 17 | curl -o "${LOAD_BALANCER_CACHED_FILE}" "${LOAD_BALANCER_CACHED_URL}" 18 | curl -o "${EBS_CSI_DRIVER_FILE}" "${EBS_CSI_DRIVER_URL}" 19 | -------------------------------------------------------------------------------- /infrastructure/k8s-config/clusters/kit-infrastructure/node-problem-detector/node-problem-detector.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: node-problem-detector 5 | --- 6 | apiVersion: source.toolkit.fluxcd.io/v1beta2 7 | kind: HelmRepository 8 | metadata: 9 | name: node-problem-detector 10 | namespace: node-problem-detector 11 | spec: 12 | interval: 5m0s 13 | url: https://charts.deliveryhero.io/ 14 | --- 15 | apiVersion: helm.toolkit.fluxcd.io/v2beta1 16 | kind: HelmRelease 17 | metadata: 18 | name: node-problem-detector 19 | namespace: node-problem-detector 20 | spec: 21 | interval: 2m 22 | targetNamespace: node-problem-detector 23 | chart: 24 | spec: 25 | chart: node-problem-detector 26 | version: 2.2.2 27 | sourceRef: 28 | kind: HelmRepository 29 | name: node-problem-detector 30 | namespace: node-problem-detector 31 | interval: 1m 32 | upgrade: 33 | remediation: 34 | remediateLastFailure: true 35 | values: 36 | tolerations: 37 | - key: CriticalAddonsOnly 38 | operator: Exists 39 | 40 | -------------------------------------------------------------------------------- /tests/tekton-resources/tasks/teardown/awscli-eks-fargate.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: tekton.dev/v1beta1 3 | kind: Task 4 | metadata: 5 | name: awscli-eks-fargate-profile-teardown 6 | namespace: scalability 7 | spec: 8 | description: | 9 | Teardown an EKS fargate profile for a cluster. 10 | This Task can be used to teardown an EKS fargate profile in an AWS account. 11 | params: 12 | - name: cluster-name 13 | description: The name of the EKS cluster which will be teared down. 14 | - name: region 15 | default: us-west-2 16 | description: The region where the cluster is in. 
17 | - name: endpoint 18 | default: "" 19 | - name: profile-name 20 | description: The name of the Fargate profile that needs to be deleted. 21 | steps: 22 | - name: delete-profile 23 | image: alpine/k8s:1.23.7 24 | script: | 25 | ENDPOINT_FLAG="" 26 | if [ -n "$(params.endpoint)" ]; then 27 | ENDPOINT_FLAG="--endpoint $(params.endpoint)" 28 | fi 29 | aws eks delete-fargate-profile --cluster-name $(params.cluster-name) --region $(params.region) $ENDPOINT_FLAG --fargate-profile-name $(params.profile-name) -------------------------------------------------------------------------------- /tests/assets/karpenter/nodepool.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: karpenter.sh/v1 2 | kind: NodePool 3 | metadata: 4 | name: ${CLUSTER_NAME}-${AZ} 5 | spec: 6 | disruption: 7 | budgets: 8 | - nodes: 10% 9 | consolidateAfter: 0s 10 | consolidationPolicy: WhenEmptyOrUnderutilized 11 | replicas: 0 12 | template: 13 | spec: 14 | expireAfter: 720h 15 | nodeClassRef: 16 | group: karpenter.k8s.aws 17 | kind: EC2NodeClass 18 | name: default 19 | requirements: 20 | - key: topology.kubernetes.io/zone 21 | operator: In 22 | values: 23 | - ${AZ} 24 | - key: kubernetes.io/arch 25 | operator: In 26 | values: 27 | - amd64 28 | - key: kubernetes.io/os 29 | operator: In 30 | values: 31 | - linux 32 | - key: karpenter.sh/capacity-type 33 | operator: In 34 | values: 35 | - on-demand 36 | - key: node.kubernetes.io/instance-category 37 | operator: In 38 | values: 39 | - c 40 | - m 41 | - r 42 | - t 43 | - key: karpenter.k8s.aws/instance-size 44 | operator: In 45 | values: 46 | - medium -------------------------------------------------------------------------------- /tests/tekton-resources/tasks/setup/eks/awscli-role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: tekton.dev/v1beta1 2 | kind: Task 3 | metadata: 4 | name: awscli-role-create 5 | namespace: scalability 6 | spec: 7 | description: | 8 | Creates IAM Roles from a CFN JSON stack. 9 | This Task can be used to create IAM Roles using a CFN JSON/YAML stack. 10 | results: 11 | - name: role-arn 12 | description: Stores the role arn created by the task 13 | params: 14 | - name: stack-name 15 | description: The name of the CloudFormation stack used to create the role. 16 | - name: role-cfn-url 17 | description: The URL of the CFN YAML/JSON template used to create IAM Roles 18 | - name: region 19 | default: "us-west-2" 20 | - name: role-name 21 | description: The name of the role that needs to be created.
22 | steps: 23 | - name: create-role 24 | image: alpine/k8s:1.23.7 25 | script: | 26 | curl -s $(params.role-cfn-url) -o ./amazon-role-eks 27 | aws cloudformation deploy --stack-name $(params.stack-name) --region $(params.region) --template-file ./amazon-role-eks --parameter-overrides Name=$(params.role-name) --capabilities CAPABILITY_NAMED_IAM || true 28 | aws iam get-role --role-name $(params.role-name) --query 'Role.[Arn]' --output text > $(results.role-arn.path) -------------------------------------------------------------------------------- /infrastructure/k8s-config/clusters/kit-infrastructure/aws-node-termination-handler/aws-node-termination-handler.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: aws-node-termination-handler 5 | --- 6 | apiVersion: source.toolkit.fluxcd.io/v1beta2 7 | kind: HelmRepository 8 | metadata: 9 | name: eks-charts 10 | namespace: aws-node-termination-handler 11 | spec: 12 | interval: 5m0s 13 | url: https://aws.github.io/eks-charts 14 | --- 15 | apiVersion: helm.toolkit.fluxcd.io/v2beta1 16 | kind: HelmRelease 17 | metadata: 18 | name: nth 19 | namespace: aws-node-termination-handler 20 | spec: 21 | interval: 2m 22 | targetNamespace: aws-node-termination-handler 23 | chart: 24 | spec: 25 | chart: aws-node-termination-handler 26 | sourceRef: 27 | kind: HelmRepository 28 | name: eks-charts 29 | namespace: aws-node-termination-handler 30 | interval: 1m 31 | upgrade: 32 | remediation: 33 | remediateLastFailure: true 34 | values: 35 | tolerations: 36 | - key: CriticalAddonsOnly 37 | operator: Exists 38 | enableSpotInterruptionDraining: true 39 | enableRebalanceMonitoring: true 40 | enableRebalanceDraining: false 41 | enableScheduledEventDraining: true 42 | 43 | -------------------------------------------------------------------------------- /tests/tekton-resources/tasks/teardown/kit-cluster.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: tekton.dev/v1beta1 3 | kind: Task 4 | metadata: 5 | name: kit-cluster-teardown 6 | namespace: scalability 7 | spec: 8 | description: | 9 | Teardown an KIT cluster. 10 | This Task can be used to teardown an KIT cluster in an AWS account. 11 | params: 12 | - name: cluster-name 13 | description: The name of the kit cluster which will be teared down. 14 | - name: region 15 | default: us-west-2 16 | description: The region where the EKS/host cluster is in. 17 | - name: host-cluster-name 18 | description: The name of the Host cluster on which you spin up KIT Guest cluster. 
19 | default: "testbed" 20 | steps: 21 | - name: delete-cluster 22 | image: amazon/aws-cli 23 | script: | 24 | curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" 25 | install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl 26 | kubectl version 27 | aws eks update-kubeconfig --name $(params.host-cluster-name) --region $(params.region) 28 | kubectl config current-context 29 | #delete kit controlplane spec and dataplane spec crds 30 | kubectl delete controlplane $(params.cluster-name) 31 | -------------------------------------------------------------------------------- /tests/tekton-resources/tasks/addons/cw-metric.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: tekton.dev/v1beta1 2 | kind: Task 3 | metadata: 4 | name: cloudwatch 5 | namespace: scalability 6 | spec: 7 | description: "Cloudwatch task publishes the provided metric data points to Amazon CloudWatch. More details can be found in the aws cli reference doc: https://awscli.amazonaws.com/v2/documentation/api/2.1.29/reference/cloudwatch/put-metric-data.html" 8 | params: 9 | - name: region 10 | default: "us-west-2" 11 | description: "The region to use for publishing the metrics" 12 | - name: metric-name 13 | default: "cl2-loadtest" 14 | description: "The name of the metric you want to pass" 15 | - name: namespace 16 | description: "The namespace for the metric data" 17 | - name: dimensions 18 | description: "Dimensions to associate with the metrics" 19 | - name: unit 20 | default: "Count" 21 | description: "The unit of the metrics" 22 | - name: value 23 | description: "The value for the metric" 24 | steps: 25 | - name: cw-emit 26 | image: amazon/aws-cli 27 | script: | 28 | aws sts get-caller-identity 29 | aws cloudwatch --region $(params.region) put-metric-data --metric-name $(params.metric-name) --namespace $(params.namespace) --dimensions Nodes=$(params.dimensions) --unit $(params.unit) --value $(params.value) -------------------------------------------------------------------------------- /tests/tekton-resources/tasks/teardown/awscli-vpc-delete.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: tekton.dev/v1beta1 2 | kind: Task 3 | metadata: 4 | name: awscli-delete-vpc 5 | namespace: scalability 6 | spec: 7 | description: | 8 | This Task can be used to delete CloudFormation stack containing VPC resources that was used for EKS clusters. 9 | params: 10 | - name: stack-name 11 | description: The name of the VPC name you want to delete. 12 | - name: region 13 | default: "us-west-2" 14 | steps: 15 | - name: awscli-delete-vpc 16 | image: alpine/k8s:1.23.7 17 | script: | 18 | #!/bin/bash 19 | aws sts get-caller-identity 20 | # Check if the stack exists 21 | aws cloudformation --region $(params.region) describe-stacks --stack-name $(params.stack-name) 22 | if [ $? -ne 0 ]; then 23 | echo "Stack $(params.stack-name) not found. Exiting..." 24 | exit 1 25 | else 26 | echo "Deleting stack $(params.stack-name)..." 27 | fi 28 | #Deletes the CFN stack 29 | aws cloudformation delete-stack --region $(params.region) --stack-name $(params.stack-name) 30 | # Wait for the stack to be deleted 31 | aws cloudformation wait stack-delete-complete --region $(params.region) --stack-name $(params.stack-name) 32 | echo "Stack deleted successfully!" 
33 | -------------------------------------------------------------------------------- /tests/tekton-resources/pipelines/cleanup/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: tekton-cleaner 5 | namespace: scalability 6 | --- 7 | apiVersion: rbac.authorization.k8s.io/v1 8 | kind: ClusterRole 9 | metadata: 10 | name: tekton-cleaner-roles 11 | rules: 12 | - apiGroups: [""] 13 | resources: ["namespaces", "configmaps"] 14 | verbs: ["get", "list", "watch"] 15 | - apiGroups: ["tekton.dev"] 16 | resources: ["pipelineruns", "taskruns", "pipelineresources"] 17 | verbs: ["get", "list", "delete", "create"] 18 | - apiGroups: ["triggers.tekton.dev"] 19 | resources: ["eventlisteners", "triggerbindings", "triggertemplates", "interceptors","clusterinterceptors"] 20 | verbs: ["get", "list", "watch"] 21 | --- 22 | apiVersion: rbac.authorization.k8s.io/v1 23 | kind: ClusterRole 24 | metadata: 25 | name: tekton-cleaner-clusterroles 26 | rules: 27 | - apiGroups: ["triggers.tekton.dev"] 28 | resources: ["clustertriggerbindings", "clusterinterceptors"] 29 | verbs: ["get", "list", "watch"] 30 | --- 31 | apiVersion: rbac.authorization.k8s.io/v1 32 | kind: RoleBinding 33 | metadata: 34 | name: tektoncd-cleaner-delete-pr-tr-rolebinding 35 | namespace: scalability 36 | subjects: 37 | - kind: ServiceAccount 38 | name: tekton-cleaner 39 | namespace: scalability 40 | roleRef: 41 | apiGroup: rbac.authorization.k8s.io 42 | kind: ClusterRole 43 | name: tekton-cleaner-roles 44 | -------------------------------------------------------------------------------- /tests/assets/eks_service_role.json: -------------------------------------------------------------------------------- 1 | { 2 | "AWSTemplateFormatVersion": "2010-09-09", 3 | "Parameters": { 4 | "Name": { 5 | "Type": "String", 6 | "Default": "eks-service-role", 7 | "Description": "Names of the role." 
8 | } 9 | }, 10 | "Resources": { 11 | "RootRole": { 12 | "Type": "AWS::IAM::Role", 13 | "Properties": { 14 | "RoleName" : { 15 | "Ref": "Name" 16 | }, 17 | "AssumeRolePolicyDocument": { 18 | "Version": "2012-10-17", 19 | "Statement": [ 20 | { 21 | "Effect": "Allow", 22 | "Principal": { 23 | "Service": [ 24 | "eks-gamma.aws.internal", 25 | "eks.amazonaws.com", 26 | "eks-beta.aws.internal" 27 | ] 28 | }, 29 | "Action": "sts:AssumeRole" 30 | } 31 | ] 32 | }, 33 | "ManagedPolicyArns": [ 34 | "arn:aws:iam::aws:policy/AmazonEKSClusterPolicy", 35 | "arn:aws:iam::aws:policy/AmazonEKSServicePolicy" 36 | ] 37 | } 38 | } 39 | } 40 | } -------------------------------------------------------------------------------- /tests/tekton-resources/pipelines/cleanup/cronjob.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: CronJob 3 | metadata: 4 | name: cleanup-trigger 5 | namespace: scalability 6 | spec: 7 | schedule: "0 * * * *" 8 | jobTemplate: 9 | spec: 10 | template: 11 | spec: 12 | volumes: 13 | - name: workspace 14 | emptyDir: {} 15 | containers: 16 | - name: trigger 17 | image: curlimages/curl 18 | command: 19 | - /bin/sh 20 | args: 21 | - -ce 22 | - | 23 | cat <<EOF > /workspace/post-body.json 24 | { 25 | "trigger-template": "cleanup", 26 | "params": { 27 | "target": { 28 | "namespace": "$NAMESPACE" 29 | }, 30 | "cleanup": { 31 | "keep": "$CLEANUP_KEEP" 32 | } 33 | } 34 | } 35 | EOF 36 | curl -d @/workspace/post-body.json $SINK_URL 37 | volumeMounts: 38 | - mountPath: /workspace 39 | name: workspace 40 | env: 41 | - name: SINK_URL 42 | value: "http://el-tekton-cd.scalability.svc.cluster.local:8080" 43 | - name: NAMESPACE 44 | value: "scalability" 45 | - name: CLEANUP_KEEP 46 | value: "50" 47 | restartPolicy: Never -------------------------------------------------------------------------------- /tests/tekton-resources/pipelines/cleanup/README.md: -------------------------------------------------------------------------------- 1 | # Cleanup old TaskRuns and PipelineRuns 2 | 3 | Here is how users can clean up old TaskRuns and PipelineRuns. 4 | 5 | The general method is to use a CronJob to trigger a Task that deletes all but the `n` most recent PipelineRuns and `2*n` most recent TaskRuns. 6 | 7 | ## Prerequisites 8 | 9 | * A Kubernetes cluster with Tekton Pipelines installed 10 | * Several old TaskRuns and/or PipelineRuns you wish to delete 11 | 12 | ## Scheduling the cleanup job 13 | 14 | You'll need to install all the files in this directory to run the cleanup task. 15 | 16 | * [serviceaccount.yaml](serviceaccount.yaml): this creates the service account needed to run the job, along with the associated ClusterRole and RoleBinding. 17 | 18 | * [cleanup-template.yaml](cleanup-template.yaml): this creates the TriggerTemplate that spawns the TaskRun that does the deleting. It uses the `tkn` CLI to do the deleting. 19 | 20 | * [binding.yaml](binding.yaml): this creates the TriggerBinding that is used to pass parameters to the TaskRun. 21 | 22 | * [eventlistener.yaml](eventlistener.yaml): this creates the sink that receives the incoming event that triggers the creation of the cleanup job. 23 | 24 | * [cronjob.yaml](cronjob.yaml): this is used to run the cleanup job on a schedule. There are two environment variables that need to be set in the job: `NAMESPACE` for the namespace you wish to clean up, and `CLEANUP_KEEP` for the number of PipelineRuns to keep.
The schedule for the job running can be set in the `.spec.schedule` field using [crontab format](https://crontab.guru/) -------------------------------------------------------------------------------- /tests/tekton-resources/pipelines/cleanup/cleanup-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: triggers.tekton.dev/v1alpha1 2 | kind: TriggerTemplate 3 | metadata: 4 | name: cleanup-runs 5 | namespace: scalability 6 | spec: 7 | params: 8 | - name: namespace 9 | description: Namespace to cleanup to in the target cluster 10 | - name: clusterResource 11 | description: Name of the cluster resource that points to the target cluster 12 | - name: keep 13 | description: Amount of old resources to keep 14 | default: "200" 15 | resourcetemplates: 16 | - apiVersion: tekton.dev/v1beta1 17 | kind: TaskRun 18 | metadata: 19 | name: cleanupruns-$(uid) 20 | spec: 21 | serviceAccountName: tekton-cleaner 22 | taskSpec: 23 | params: 24 | - name: keep 25 | - name: namespace 26 | steps: 27 | - name: cleanup-pr-tr 28 | image: gcr.io/tekton-releases/dogfooding/tkn 29 | script: | 30 | #!/bin/sh 31 | set -ex 32 | # A safety check, to avoid deleting too much! 33 | if [[ $(params.keep) -eq 0 || $(params.keep) == "" ]]; then 34 | echo "This task cannot be used to delete *all* resources from a cluster" >&2 35 | echo "Please specifcy a value for keep > 0" 36 | exit 1 37 | fi 38 | # Cleanup pipelineruns first, as this will delete tasksruns too 39 | tkn pr delete -n $(params.namespace) --keep $(params.keep) 40 | params: 41 | - name: keep 42 | value: $(tt.params.keep) 43 | - name: namespace 44 | value: $(tt.params.namespace) -------------------------------------------------------------------------------- /tests/tekton-resources/tasks/setup/karpenter/awscli-instanceprofiles.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: tekton.dev/v1beta1 2 | kind: Task 3 | metadata: 4 | name: awscli-instanceprofiles 5 | namespace: scalability 6 | spec: 7 | description: | 8 | Creates the karpenter instance profile 9 | params: 10 | - name: cluster-name 11 | description: The name of the cluster 12 | steps: 13 | - name: create-role 14 | image: alpine/k8s:1.30.2 15 | script: | 16 | # Check if the instance profile already exists 17 | if aws iam get-instance-profile --instance-profile-name "KarpenterNodeInstanceProfile-$(params.cluster-name)" >/dev/null 2>&1; then 18 | echo "Instance profile KarpenterNodeInstanceProfile-$(params.cluster-name) already exists. Skipping creation..." 19 | else 20 | echo "Creating instance profile KarpenterNodeInstanceProfile-$(params.cluster-name)..." 21 | aws iam create-instance-profile --instance-profile-name "KarpenterNodeInstanceProfile-$(params.cluster-name)" 22 | fi 23 | 24 | # Check if the role is already added to the instance profile 25 | EXISTING_ROLES=$(aws iam get-instance-profile --instance-profile-name "KarpenterNodeInstanceProfile-$(params.cluster-name)" --query 'InstanceProfile.Roles[?RoleName==`KarpenterNodeRole-$(params.cluster-name)`].RoleName' --output text) 26 | if [ -n "$EXISTING_ROLES" ]; then 27 | echo "Role KarpenterNodeRole-$(params.cluster-name) is already attached to instance profile. Skipping..." 28 | else 29 | echo "Adding role KarpenterNodeRole-$(params.cluster-name) to instance profile..." 
30 | aws iam add-role-to-instance-profile --instance-profile-name "KarpenterNodeInstanceProfile-$(params.cluster-name)" --role-name "KarpenterNodeRole-$(params.cluster-name)" 31 | fi 32 | -------------------------------------------------------------------------------- /tests/assets/eks_node_group_launch_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "AWSTemplateFormatVersion": "2010-09-09", 3 | "Description": "Create an EKS Node Group Launch Template", 4 | "Parameters": { 5 | "LaunchTemplateName": { 6 | "Type": "String", 7 | "Description": "Name of the Launch Template" 8 | }, 9 | "ClusterName": { 10 | "Type": "String", 11 | "Description": "Name of the Cluster" 12 | }, 13 | "SSHKeyName": { 14 | "Type": "String", 15 | "Description": "SSH Key Name for EC2 instances" 16 | } 17 | }, 18 | "Resources": { 19 | "NodeGroupLaunchTemplate": { 20 | "Type": "AWS::EC2::LaunchTemplate", 21 | "Properties": { 22 | "LaunchTemplateName": { "Ref": "LaunchTemplateName" }, 23 | "LaunchTemplateData": { 24 | "KeyName": { "Ref": "SSHKeyName" }, 25 | "BlockDeviceMappings": [ 26 | { 27 | "DeviceName": "/dev/xvda", 28 | "Ebs": { 29 | "VolumeSize": 20, 30 | "VolumeType": "gp2" 31 | } 32 | } 33 | ], 34 | "MetadataOptions": { 35 | "HttpPutResponseHopLimit": 2, 36 | "HttpEndpoint": "enabled", 37 | "HttpTokens": "required" 38 | } 39 | } 40 | } 41 | } 42 | }, 43 | "Outputs": { 44 | "NodeGroupLaunchTemplateName": { 45 | "Description": "Name of the Node Group Launch Template", 46 | "Value": { "Ref": "NodeGroupLaunchTemplate" } 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /tests/assets/asg_node_group.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | AWSTemplateFormatVersion: '2010-09-09' 3 | Description: 'Unmanaged EKS nodegroup using EC2 AutoScaling' 4 | Parameters: 5 | ClusterName: 6 | Type: String 7 | Description: Name of EKS cluster. 8 | AutoScalingGroupName: 9 | Description: Name of ASG. 10 | Type: String 11 | VpcId: 12 | Type: AWS::EC2::VPC::Id 13 | SubnetIds: 14 | Type: List 15 | SecurityGroup: 16 | Type: AWS::EC2::SecurityGroup::Id 17 | LaunchTemplateName: 18 | Type: String 19 | Description: Launch template name. 20 | LaunchTemplateVersion: 21 | Type: String 22 | Description: Launch template version. Default is 1, since our launch templates are generally ephemeral/single-use. 
23 | Default: "1" 24 | NodeCount: 25 | Type: Number 26 | Resources: 27 | AutoScalingGroup: 28 | Type: AWS::AutoScaling::AutoScalingGroup 29 | UpdatePolicy: 30 | AutoScalingRollingUpdate: 31 | WaitOnResourceSignals: true 32 | PauseTime: PT15M 33 | Properties: 34 | AutoScalingGroupName: !Ref AutoScalingGroupName 35 | DesiredCapacity: !Ref NodeCount 36 | MinSize: !Ref NodeCount 37 | MaxSize: !Ref NodeCount 38 | MixedInstancesPolicy: 39 | LaunchTemplate: 40 | LaunchTemplateSpecification: 41 | LaunchTemplateName: !Ref LaunchTemplateName 42 | Version: !Ref LaunchTemplateVersion 43 | # this will be replaced out-of-band, CFN really doesn't want you to pass in sub-structs as JSON 44 | Overrides: PLACEHOLDER_LAUNCH_TEMPLATE_OVERRIDES 45 | VPCZoneIdentifier: 46 | !Ref SubnetIds 47 | Tags: 48 | # necessary for kubelet's legacy, in-tree cloud provider 49 | - Key: !Sub kubernetes.io/cluster/${ClusterName} 50 | Value: owned 51 | PropagateAtLaunch: true 52 | -------------------------------------------------------------------------------- /tests/assets/karpenter/nodeclass.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: karpenter.k8s.aws/v1 2 | kind: EC2NodeClass 3 | metadata: 4 | name: default 5 | spec: 6 | amiFamily: Custom 7 | instanceProfile: "KarpenterNodeInstanceProfile-${CLUSTER_NAME}" 8 | amiSelectorTerms: 9 | - alias: "al2023@${ALIAS_VERSION}" 10 | subnetSelectorTerms: 11 | - tags: 12 | karpenter.sh/discovery: "${CLUSTER_NAME}" 13 | - tags: 14 | aws:cloudformation:stack-name: "${CLUSTER_NAME}" 15 | securityGroupSelectorTerms: 16 | - tags: 17 | karpenter.sh/discovery: "${CLUSTER_NAME}" 18 | - tags: 19 | aws:cloudformation:stack-name: "${CLUSTER_NAME}" 20 | - tags: 21 | kubernetes.io/cluster/${CLUSTER_NAME}: owned 22 | kubelet: 23 | maxPods: 110 24 | systemReserved: 25 | cpu: 100m 26 | memory: 100Mi 27 | ephemeral-storage: 1Gi 28 | kubeReserved: 29 | cpu: 100m 30 | memory: 100Mi 31 | ephemeral-storage: 1Gi 32 | evictionHard: 33 | memory.available: 5% 34 | nodefs.available: 10% 35 | nodefs.inodesFree: 10% 36 | userData: | 37 | MIME-Version: 1.0 38 | Content-Type: multipart/mixed; boundary="BOUNDARY" 39 | 40 | --BOUNDARY 41 | Content-Type: application/node.eks.aws 42 | 43 | apiVersion: node.eks.aws/v1alpha1 44 | kind: NodeConfig 45 | spec: 46 | cluster: 47 | name: ${CLUSTER_NAME} 48 | apiServerEndpoint: ${CLUSTER_ENDPOINT} # Using the actual cluster endpoint 49 | certificateAuthority: ${CLUSTER_CA} 50 | cidr: "172.20.0.0/16" 51 | kubelet: 52 | config: 53 | nodeStatusReportFrequency: "60m" 54 | nodeLeaseDurationSeconds: 60 55 | maxPods: 110 56 | clusterDNS: ["172.20.0.10"] 57 | flags: 58 | - --node-labels=karpenter.sh/capacity-type=on-demand,karpenter.sh/nodepool=titan-pool 59 | - --register-with-taints=karpenter.sh/unregistered:NoExecute 60 | --BOUNDARY-- -------------------------------------------------------------------------------- /tests/tekton-resources/tasks/setup/kitctl/dataplane.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: tekton.dev/v1beta1 3 | kind: Task 4 | metadata: 5 | name: data-plane-setup 6 | namespace: scalability 7 | annotations: 8 | tekton.dev/pipelines.minVersion: "0.17.0" 9 | tekton.dev/categories: Kubernetes 10 | tekton.dev/tags: CLI, kubectl 11 | tekton.dev/displayName: "kubernetes actions" 12 | tekton.dev/platforms: "linux/amd64" 13 | spec: 14 | description: | 15 | Setup a kubernetes data plane in the guest cluster. 
16 | params: 17 | - name: name 18 | default: "guest" 19 | description: Name of the guest cluster 20 | - name: node-count 21 | default: "5" 22 | description: Number of worker nodes 23 | steps: 24 | - name: setup-data-plane 25 | image: bitnami/kubectl 26 | script: | 27 | #!/bin/bash 28 | echo "Getting subnet and security group tags" 29 | TAG=$(kubectl get provisioner -oyaml | grep karpenter.sh/discovery | awk 'NR==1{ print $2}') 30 | echo "Setting up data plane" 31 | cat < /tmp/kubeconfig 45 | echo "Waiting for the worker nodes to be ready" 46 | while true; do 47 | ready_node=$(kubectl --kubeconfig=/tmp/kubeconfig get nodes 2>/dev/null | grep -w Ready | wc -l) 48 | if [[ "$ready_node" -eq $(params.node-count) ]]; then break; fi 49 | sleep 5 50 | done 51 | kubectl --kubeconfig=/tmp/kubeconfig get nodes 52 | -------------------------------------------------------------------------------- /infrastructure/k8s-config/clusters/kit-infrastructure/monitoring/prometheus.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: source.toolkit.fluxcd.io/v1beta2 2 | kind: HelmRepository 3 | metadata: 4 | name: prometheus 5 | namespace: monitoring 6 | spec: 7 | interval: 5m0s 8 | url: https://prometheus-community.github.io/helm-charts 9 | --- 10 | apiVersion: helm.toolkit.fluxcd.io/v2beta1 11 | kind: HelmRelease 12 | metadata: 13 | name: prometheus 14 | namespace: monitoring 15 | spec: 16 | interval: 2m 17 | targetNamespace: monitoring 18 | chart: 19 | spec: 20 | chart: prometheus 21 | sourceRef: 22 | kind: HelmRepository 23 | name: prometheus 24 | namespace: monitoring 25 | interval: 1m 26 | upgrade: 27 | remediation: 28 | remediateLastFailure: true 29 | values: 30 | kube-state-metrics: 31 | tolerations: 32 | - key: CriticalAddonsOnly 33 | operator: Exists 34 | alertmanager: 35 | tolerations: 36 | - key: CriticalAddonsOnly 37 | operator: Exists 38 | nodeExporter: 39 | tolerations: 40 | - key: CriticalAddonsOnly 41 | operator: Exists 42 | server: 43 | tolerations: 44 | - key: CriticalAddonsOnly 45 | operator: Exists 46 | pushgateway: 47 | tolerations: 48 | - key: CriticalAddonsOnly 49 | operator: Exists 50 | --- 51 | apiVersion: source.toolkit.fluxcd.io/v1beta2 52 | kind: GitRepository 53 | metadata: 54 | name: prometheus-crds 55 | namespace: monitoring 56 | spec: 57 | interval: 5m0s 58 | url: https://github.com/prometheus-community/helm-charts 59 | ref: 60 | commit: 0e74668531467f47050f2980b3fcb62f859cf062 61 | --- 62 | apiVersion: kustomize.toolkit.fluxcd.io/v1beta2 63 | kind: Kustomization 64 | metadata: 65 | name: prometheus-crds 66 | namespace: monitoring 67 | spec: 68 | interval: 10m 69 | sourceRef: 70 | kind: GitRepository 71 | name: prometheus-crds 72 | path: "./charts/kube-prometheus-stack/crds" 73 | prune: true -------------------------------------------------------------------------------- /infrastructure/k8s-config/clusters/addons/perfdash/deployment-patch.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: perfdash 5 | namespace: perfdash 6 | spec: 7 | template: 8 | spec: 9 | containers: 10 | - name: perfdash 11 | command: 12 | - /perfdash 13 | - --www=true 14 | - --dir=/www/ 15 | - --address=0.0.0.0:8080 16 | - --builds=100 17 | - --force-builds 18 | - --mode=s3 19 | - --aws-region=us-west-2 20 | - --configPath=/etc/config/jobs.yaml 21 | - --logsBucket=$(PERFDASH_LOG_BUCKET) 22 | - --logsPath= 23 | - --storageURL=NotSupported 24 | - 
--allow-parsers-matching-all-tests=true 25 | env: 26 | - name: PERFDASH_LOG_BUCKET 27 | valueFrom: 28 | configMapKeyRef: 29 | name: perfdash-config 30 | key: PERFDASH_LOG_BUCKET 31 | - name: AWS_DEFAULT_REGION 32 | valueFrom: 33 | configMapKeyRef: 34 | name: perfdash-config 35 | key: AWS_DEFAULT_REGION 36 | - name: AWS_REGION 37 | valueFrom: 38 | configMapKeyRef: 39 | name: perfdash-config 40 | key: AWS_REGION 41 | - name: AWS_WEB_IDENTITY_TOKEN_FILE 42 | value: /var/run/secrets/eks.amazonaws.com/serviceaccount/token 43 | # remove the resource request since it requests a lot. 44 | resources: null 45 | volumeMounts: 46 | - name: config-volume 47 | mountPath: /etc/config 48 | livenessProbe: 49 | initialDelaySeconds: 300 50 | timeoutSeconds: 5 51 | serviceAccountName: perfdash-log-fetcher 52 | tolerations: 53 | - key: CriticalAddonsOnly 54 | operator: Exists 55 | volumes: 56 | - name: config-volume 57 | configMap: 58 | name: perfdash-config 59 | -------------------------------------------------------------------------------- /tests/assets/eks-networking/config-eks-networking.yaml: -------------------------------------------------------------------------------- 1 | {{$ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST := DefaultParam .CL2_ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST false}} 2 | {{$defaultQps := DefaultParam .CL2_DEFAULT_QPS (IfThenElse (le .Nodes 500) 10 100)}} 3 | {{$uniformQps := DefaultParam .CL2_UNIFORM_QPS 500}} 4 | 5 | name: load-eks-networking 6 | tuningSets: 7 | - name: Sequence 8 | parallelismLimitedLoad: 9 | parallelismLimit: 1 10 | - name: UniformQPS 11 | qpsLoad: 12 | qps: {{$uniformQps}} 13 | - name: default 14 | globalQPSLoad: 15 | qps: {{$defaultQps}} 16 | burst: 1 17 | steps: 18 | - name: sanity-check 19 | measurements: 20 | - Identifier: Print 21 | Method: Sleep 22 | Params: 23 | duration: 1s 24 | - module: 25 | path: /modules/measurements.yaml 26 | params: 27 | action: start 28 | {{if $ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST}} 29 | - module: 30 | path: modules/network-policy/net-policy-enforcement-latency.yaml 31 | params: 32 | setup: true 33 | run: true 34 | testType: "pod-creation" 35 | {{end}} 36 | - module: 37 | path: modules/dns-k8s-hostnames.yaml 38 | {{if $ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST}} 39 | - module: 40 | path: modules/network-policy/net-policy-metrics.yaml 41 | params: 42 | action: gather 43 | usePolicyCreationMetrics: false 44 | - module: 45 | path: modules/network-policy/net-policy-enforcement-latency.yaml 46 | params: 47 | complete: true 48 | testType: "pod-creation" 49 | - module: 50 | path: modules/network-policy/net-policy-enforcement-latency.yaml 51 | params: 52 | run: true 53 | testType: "policy-creation" 54 | {{end}} 55 | - module: 56 | path: /modules/measurements.yaml 57 | params: 58 | action: gather 59 | {{if $ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST}} 60 | - module: 61 | path: modules/network-policy/net-policy-enforcement-latency.yaml 62 | params: 63 | complete: true 64 | testType: "policy-creation" 65 | {{end}} 66 | -------------------------------------------------------------------------------- /tests/tekton-resources/tasks/setup/eks/awscli-vpc.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: tekton.dev/v1beta1 3 | kind: Task 4 | metadata: 5 | name: awscli-vpc-create 6 | namespace: scalability 7 | spec: 8 | description: | 9 | Creates VPC. 10 | This Task can be used to create VPC resources that could be used for EKS clusters. 
This stack outputs resources SubnetIds, SecurityGroups, VpcId. 11 | params: 12 | - name: stack-name 13 | description: The name of the VPC name you want to spin. 14 | - name: vpc-cfn-url 15 | description: The url of the CFN YAML/JSON to create VPC resources 16 | - name: region 17 | default: "us-west-2" 18 | steps: 19 | - name: create-vpc 20 | image: alpine/k8s:1.23.7 21 | script: | 22 | curl -s $(params.vpc-cfn-url) -o ./amazon-vpc-eks 23 | aws cloudformation --region $(params.region) deploy --stack-name $(params.stack-name) --template-file ./amazon-vpc-eks 24 | 25 | VPC_ID=$(aws cloudformation describe-stacks --stack-name $(params.stack-name) --query "Stacks[0].Outputs[?OutputKey=='VpcId'].OutputValue" --output text) 26 | 27 | echo "VPC_ID: $(VPC_ID)" 28 | 29 | # Get all subnets with /13 CIDR blocks from the VPC 30 | SUBNETS=$(aws ec2 describe-subnets \ 31 | --filters "Name=vpc-id,Values=$VPC_ID" \ 32 | --query "Subnets[?CidrBlock!=null] | [?contains(CidrBlock, '/13')].{SubnetId:SubnetId,CidrBlock:CidrBlock}" \ 33 | --output json) 34 | 35 | # Create CIDR reservations for each subnet 36 | echo "$SUBNETS" | jq -c '.[]' | while read -r subnet; do 37 | CIDR_BLOCK=$(echo "$subnet" | jq -r '.CidrBlock') 38 | SUBNET_ID=$(echo "$subnet" | jq -r '.SubnetId') 39 | # Calculate the first /14 subnet within the /13 CIDR 40 | BASE_IP=$(echo "$CIDR_BLOCK" | cut -d'/' -f1) 41 | NEW_CIDR="${BASE_IP}/14" 42 | echo "Creating CIDR reservation for subnet $SUBNET_ID with CIDR $NEW_CIDR" 43 | aws ec2 create-subnet-cidr-reservation \ 44 | --subnet-id "$SUBNET_ID" \ 45 | --cidr "$NEW_CIDR" \ 46 | --reservation-type prefix 47 | done 48 | -------------------------------------------------------------------------------- /tests/tekton-resources/triggers/rbac.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: Role 4 | metadata: 5 | name: tekton-triggers 6 | namespace: tekton-pipelines 7 | rules: 8 | # EventListeners need to be able to fetch all namespaced resources 9 | - apiGroups: ["triggers.tekton.dev"] 10 | resources: ["eventlisteners", "triggerbindings", "triggertemplates", "triggers"] 11 | verbs: ["get", "list", "watch"] 12 | - apiGroups: [""] 13 | # configmaps is needed for updating logging config 14 | resources: ["configmaps"] 15 | verbs: ["get", "list", "watch"] 16 | # Permissions to create resources in associated TriggerTemplates 17 | - apiGroups: ["tekton.dev"] 18 | resources: ["pipelineruns", "pipelineresources", "taskruns"] 19 | verbs: ["create"] 20 | - apiGroups: [""] 21 | resources: ["serviceaccounts"] 22 | verbs: ["impersonate"] 23 | - apiGroups: ["policy"] 24 | resources: ["podsecuritypolicies"] 25 | resourceNames: ["tekton-triggers"] 26 | verbs: ["use"] 27 | --- 28 | apiVersion: rbac.authorization.k8s.io/v1 29 | kind: RoleBinding 30 | metadata: 31 | name: tekton-triggers 32 | namespace: tekton-pipelines 33 | subjects: 34 | - kind: ServiceAccount 35 | name: tekton-triggers 36 | roleRef: 37 | apiGroup: rbac.authorization.k8s.io 38 | kind: Role 39 | name: tekton-triggers 40 | 41 | --- 42 | kind: ClusterRole 43 | apiVersion: rbac.authorization.k8s.io/v1 44 | metadata: 45 | name: tekton-triggers 46 | namespace: tekton-pipelines 47 | rules: 48 | # EventListeners need to be able to fetch any clustertriggerbindings 49 | - apiGroups: ["triggers.tekton.dev"] 50 | resources: ["clustertriggerbindings", "clusterinterceptors"] 51 | verbs: ["get", "list", "watch"] 52 | --- 53 | apiVersion: rbac.authorization.k8s.io/v1 54 
| kind: ClusterRoleBinding 55 | metadata: 56 | name: tekton-triggers 57 | namespace: tekton-pipelines 58 | subjects: 59 | - kind: ServiceAccount 60 | name: tekton-triggers 61 | namespace: default 62 | roleRef: 63 | apiGroup: rbac.authorization.k8s.io 64 | kind: ClusterRole 65 | name: tekton-triggers 66 | --- 67 | apiVersion: v1 68 | kind: ServiceAccount 69 | metadata: 70 | name: tekton-triggers 71 | namespace: tekton-pipelines -------------------------------------------------------------------------------- /tests/tekton-resources/tasks/addons/fluentbit.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: tekton.dev/v1beta1 3 | kind: Task 4 | metadata: 5 | name: eks-addon-fluentbit 6 | namespace: scalability 7 | spec: 8 | description: | 9 | This task installs the FluentBit addon on an EKS cluster. 10 | params: 11 | - name: cluster-name 12 | description: The name of the EKS cluster you want to add addons for. 13 | - name: region 14 | default: "us-west-2" 15 | description: The region where the cluster is in. 16 | workspaces: 17 | - name: config 18 | mountPath: /config/ 19 | stepTemplate: 20 | env: 21 | - name: KUBECONFIG 22 | value: /config/kubeconfig 23 | steps: 24 | - name: create-fluentbit-addon 25 | image: alpine/k8s:1.23.7 26 | script: | 27 | #kubectl commands are purely for knowing state of cluster before kicking off the test. 28 | kubectl version 29 | kubectl config current-context 30 | #install fluent bit addon 31 | kubectl apply -f https://raw.githubusercontent.com/aws-samples/amazon-cloudwatch-container-insights/latest/k8s-deployment-manifest-templates/deployment-mode/daemonset/container-insights-monitoring/cloudwatch-namespace.yaml 32 | cat > "./fluentbit-configmap.yaml" < "./cwagent-configmap.yaml" < /tmp/kubeconfig 32 | echo "Deploy dummy pods" 33 | cat </tmp/deploy.yaml 34 | apiVersion: apps/v1 35 | kind: Deployment 36 | metadata: 37 | name: workload 38 | namespace: default 39 | spec: 40 | replicas: $(params.number-of-pods) 41 | selector: 42 | matchLabels: 43 | app: pod_size_workload 44 | template: 45 | metadata: 46 | labels: 47 | app: pod_size_workload 48 | spec: 49 | nodeSelector: 50 | kubernetes.io/arch: amd64 51 | containers: 52 | - name: dummy 53 | image: busybox:latest 54 | command: 55 | - "/bin/sh" 56 | - "-ec" 57 | - "echo -n __LOADSTRING__ >> /config/output.txt && while true; do sleep 100; done" 58 | volumeMounts: 59 | - mountPath: /config 60 | name: config 61 | volumes: 62 | - name: config 63 | emptyDir: {} 64 | EOF 65 | # Replace __LOADSTRING__ with a random string. This will create a pod object with the given size. 
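            # tr filters /dev/urandom down to alphanumeric characters and dd truncates the
            # stream to $(params.pod-size) bytes, so each pod object stored by the API server
            # ends up roughly the requested size once the manifest below is applied.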
66 | sed -i "s/__LOADSTRING__/$(tr -dc A-Za-z0-9 < /dev/urandom | dd bs=$(params.pod-size) count=1 2>/dev/null)/g" /tmp/deploy.yaml 67 | kubectl --kubeconfig=/tmp/kubeconfig apply -f /tmp/deploy.yaml 68 | echo "Get pods" 69 | kubectl --kubeconfig=/tmp/kubeconfig get pod -A 70 | -------------------------------------------------------------------------------- /infrastructure/lib/addons/perfdash.ts: -------------------------------------------------------------------------------- 1 | import { Construct } from 'constructs'; 2 | import { StackProps } from 'aws-cdk-lib'; 3 | import { aws_eks as eks } from 'aws-cdk-lib'; 4 | import { aws_iam as iam } from 'aws-cdk-lib'; 5 | 6 | export interface PerfDashProps extends StackProps { 7 | cluster: eks.Cluster; 8 | namespace: string; 9 | } 10 | 11 | export class PerfDash extends Construct { 12 | constructor(scope: Construct, id: string, props: PerfDashProps) { 13 | super(scope, id); 14 | 15 | const ns = props.cluster.addManifest('perfdash-namespace', { 16 | apiVersion: 'v1', 17 | kind: 'Namespace', 18 | metadata: { 19 | name: props.namespace 20 | } 21 | }) 22 | 23 | const sa = props.cluster.addServiceAccount('perfdash-sa', { 24 | name: 'perfdash-log-fetcher', 25 | namespace: props.namespace 26 | }); 27 | sa.node.addDependency(ns) 28 | sa.role.attachInlinePolicy(new iam.Policy(this, 'perfdash-policy', { 29 | statements: [ 30 | new iam.PolicyStatement({ 31 | resources: ['*'], 32 | actions: [ 33 | // S3 readonly access 34 | "s3:Get*", 35 | "s3:List*", 36 | "s3-object-lambda:Get*", 37 | "s3-object-lambda:List*", 38 | ], 39 | }), 40 | ], 41 | })); 42 | 43 | const perfdashKustomizationManifest = props.cluster.addManifest('PerfdashKustomizationSelf', { 44 | apiVersion: 'kustomize.toolkit.fluxcd.io/v1beta1', 45 | kind: 'Kustomization', 46 | metadata: { 47 | name: 'flux-addon-perfdash', 48 | namespace: props.namespace, 49 | }, 50 | spec: { 51 | interval: '5m0s', 52 | path: "./infrastructure/k8s-config/clusters/addons/perfdash", 53 | prune: true, 54 | sourceRef: { 55 | kind: 'GitRepository', 56 | name: 'flux-system', 57 | namespace: 'flux-system' 58 | }, 59 | validation: 'client', 60 | patches: [ 61 | { 62 | target: { 63 | kind: "Deployment", 64 | name: "perfdash", 65 | namespace: "perfdash", 66 | }, 67 | patch: `apiVersion: apps/v1 68 | kind: Deployment 69 | metadata: 70 | name: perfdash 71 | namespace: perfdash 72 | spec: 73 | template: 74 | spec: 75 | containers: 76 | - name: perfdash 77 | env: 78 | - name: AWS_ROLE_ARN 79 | value: `+sa.role.roleArn 80 | }, 81 | ] 82 | } 83 | }); 84 | perfdashKustomizationManifest.node.addDependency(ns); 85 | perfdashKustomizationManifest.node.addDependency(sa); 86 | } 87 | } -------------------------------------------------------------------------------- /infrastructure/lib/addons/aws-ebs-csi-driver.ts: -------------------------------------------------------------------------------- 1 | import { Construct } from 'constructs'; 2 | import { aws_iam as iam, StackProps } from 'aws-cdk-lib'; 3 | import { aws_eks as eks } from 'aws-cdk-lib'; 4 | import * as request from 'sync-request'; 5 | import * as fs from 'fs'; 6 | 7 | export interface AWSEBSCSIDriverProps extends StackProps { 8 | cluster: eks.Cluster 9 | namespace: string 10 | version: string 11 | chartVersion: string 12 | } 13 | 14 | export class AWSEBSCSIDriver extends Construct { 15 | constructor(scope: Construct, id: string, props: AWSEBSCSIDriverProps) { 16 | super(scope, id) 17 | const ns = props.cluster.addManifest('aws-ebs-csi-namespace', { 18 | apiVersion: 'v1', 19 | 
kind: 'Namespace', 20 | metadata: { 21 | name: props.namespace 22 | } 23 | }) 24 | 25 | // Controller Role 26 | const sa = props.cluster.addServiceAccount('aws-ebs-csi-driver-sa', { 27 | name: "aws-ebs-csi-driver", 28 | namespace: props.namespace 29 | }) 30 | sa.node.addDependency(ns) 31 | sa.role.attachInlinePolicy(new iam.Policy(this, 'aws-ebs-csi-driver-policy', {document: iam.PolicyDocument.fromJson(this.getIAMPolicy(props.version))})) 32 | 33 | const chart = props.cluster.addHelmChart('aws-ebs-csi-driver-chart', { 34 | chart: 'aws-ebs-csi-driver', 35 | release: 'aws-ebs-csi-driver', 36 | version: props.chartVersion, 37 | repository: 'https://kubernetes-sigs.github.io/aws-ebs-csi-driver', 38 | namespace: props.namespace, 39 | createNamespace: false, 40 | values: { 41 | 'controller': { 42 | 'replicaCount': 1, 43 | 'serviceAccount': { 44 | 'create': false, 45 | 'name': sa.serviceAccountName, 46 | 'annotations': { 47 | 'eks.amazonaws.com/role-arn': sa.role.roleArn 48 | }, 49 | }, 50 | }, 51 | tolerations: [ 52 | { 53 | key: 'CriticalAddonsOnly', 54 | operator: 'Exists', 55 | }, 56 | ], 57 | } 58 | }) 59 | chart.node.addDependency(ns) 60 | } 61 | private getIAMPolicy(version: string): any { 62 | // Update and run REPO_DIR/cache-iam-policies.sh to download and cache this policy 63 | return JSON.parse( 64 | fs.readFileSync(`lib/addons/cached/aws-ebs-csi-driver-iam-policy-${version}.json`,'utf8') 65 | ); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /tests/tekton-resources/tasks/generators/clusterloader/pod-density.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: tekton.dev/v1beta1 3 | kind: Task 4 | metadata: 5 | name: pod-density 6 | namespace: scalability 7 | spec: 8 | description: "clusterloader2 pod density task to run various types of cl2 tests on a given cluster." 9 | params: 10 | - name: giturl 11 | description: "git url to clone the package" 12 | default: https://github.com/kubernetes/perf-tests.git 13 | - name: nodes-per-namespace 14 | description: "nodes per namespace to get created for load test " 15 | - name: pods-per-node 16 | description: "pod density" 17 | - name: nodes 18 | description: "number of dataplane nodes to run the load test against" 19 | workspaces: 20 | - name: source 21 | - name: config 22 | - name: results 23 | steps: 24 | - name: git-clone 25 | image: alpine/git 26 | workingDir: $(workspaces.source.path) 27 | args: ["clone", "$(params.giturl)"] 28 | - name: prepare-loadtest 29 | image: amazon/aws-cli 30 | workingDir: $(workspaces.source.path) 31 | script: | 32 | cat > "$(workspaces.source.path)/overrides.yaml" <> $GITHUB_OUTPUT 37 | 38 | # Optional: Keep the echo for logging, but ONLY to stdout, not GITHUB_OUTPUT 39 | echo "Found static tools: $LATEST_TOOLS_JSON" 40 | 41 | 42 | # --- 2. Find the top 4 latest K8s minor versions (Output as a JSON Array) --- 43 | 44 | RELEASES=$(curl -s https://api.github.com/repos/kubernetes/kubernetes/releases | jq -r '.[].tag_name | select(test("alpha|beta|rc") | not)') 45 | 46 | MINOR_VERSIONS=() 47 | for RELEASE in $RELEASES; do 48 | MINOR_VERSION=$(echo $RELEASE | awk -F'.' '{print $1"."$2}') 49 | if [[ ! 
" ${MINOR_VERSIONS[@]} " =~ " ${MINOR_VERSION} " ]]; then 50 | MINOR_VERSIONS+=($MINOR_VERSION) 51 | fi 52 | done 53 | 54 | SORTED_MINOR_VERSIONS=($(echo "${MINOR_VERSIONS[@]}" | tr ' ' '\n' | sort -rV)) 55 | 56 | K8S_TAGS=() 57 | for i in $(seq 0 3); do 58 | MINOR_VERSION="${SORTED_MINOR_VERSIONS[$i]}" 59 | LATEST_VERSION=$(echo "$RELEASES" | grep "^$MINOR_VERSION\." | sort -rV | head -1 | sed 's/v//') 60 | K8S_TAGS+=("$LATEST_VERSION") 61 | done 62 | 63 | # Convert the bash array into a single-line JSON array string (using -c flag for compact output) 64 | K8S_TAGS_JSON=$(printf '%s\n' "${K8S_TAGS[@]}" | jq -R . | jq -s -c .) 65 | 66 | echo "k8s_versions=$K8S_TAGS_JSON" >> $GITHUB_OUTPUT 67 | echo "Found K8s versions: ${K8S_TAGS[*]}" 68 | -------------------------------------------------------------------------------- /.github/workflows/toolkit-base-build-push.yaml: -------------------------------------------------------------------------------- 1 | name: Create and publish toolkit base image 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | env: 7 | REGISTRY: ghcr.io 8 | IMAGE_NAME: ${{ github.repository_owner }}/eks-toolkit-base 9 | BUILD_CONTEXT: tests/images/toolkit-base/ 10 | 11 | jobs: 12 | get_versions_job: 13 | runs-on: ubuntu-latest 14 | outputs: 15 | k8s_versions: ${{ steps.determine_versions.outputs.k8s_versions }} 16 | latest_tools: ${{ steps.determine_versions.outputs.latest_tools }} 17 | 18 | steps: 19 | - name: Checkout repository 20 | uses: actions/checkout@v4 21 | - name: Install jq 22 | run: sudo apt-get update && sudo apt-get install -y jq 23 | 24 | - name: Determine K8s Versions and Tool Versions 25 | id: determine_versions 26 | working-directory: ${{ env.BUILD_CONTEXT }} 27 | run: | 28 | chmod +x ./get_versions_matrix.sh # We need a new version of the script 29 | ./get_versions_matrix.sh 30 | 31 | build_and_push_image: 32 | needs: get_versions_job 33 | runs-on: ubuntu-latest 34 | permissions: 35 | contents: read 36 | packages: write 37 | attestations: write 38 | id-token: write 39 | 40 | strategy: 41 | fail-fast: false # Optional: Set to false if you want other builds to finish even if one fails 42 | matrix: 43 | k8s_tag: ${{ fromJson(needs.get_versions_job.outputs.k8s_versions) }} 44 | steps: 45 | - name: Checkout repository 46 | uses: actions/checkout@v4 47 | 48 | - name: Setup QEMU 49 | uses: docker/setup-qemu-action@v3 50 | 51 | - name: Setup docker buildx 52 | uses: docker/setup-buildx-action@v3 53 | 54 | - name: Log in to the Container registry 55 | uses: docker/login-action@v3 56 | with: 57 | registry: ${{ env.REGISTRY }} 58 | username: ${{ github.actor }} 59 | password: ${{ secrets.GITHUB_TOKEN }} 60 | 61 | - name: Set Image Tag for Matrix Run 62 | id: tags 63 | run: | 64 | # Use the K8s version as the primary tag 65 | echo "tag=${{ matrix.k8s_tag }}" >> $GITHUB_OUTPUT 66 | 67 | - name: Build and push Docker image 68 | uses: docker/build-push-action@v6 69 | with: 70 | context: ${{ env.BUILD_CONTEXT }} 71 | platforms: linux/amd64,linux/arm64 72 | push: true 73 | # The tags are set dynamically by the 'Set Image Tag' step 74 | tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ matrix.k8s_tag }} 75 | 76 | # Extract the static tool versions from the needs output 77 | build-args: | 78 | KUBECTL_VERSION=${{ matrix.k8s_tag }} 79 | HELM_VERSION=${{ fromJson(needs.get_versions_job.outputs.latest_tools).helm_version }} 80 | KUSTOMIZE_VERSION=${{ fromJson(needs.get_versions_job.outputs.latest_tools).kustomize_version }} 81 | KUBESEAL_VERSION=${{ 
fromJson(needs.get_versions_job.outputs.latest_tools).kubeseal_version }} 82 | KREW_VERSION=${{ fromJson(needs.get_versions_job.outputs.latest_tools).krew_version }} 83 | VALS_VERSION=${{ fromJson(needs.get_versions_job.outputs.latest_tools).vals_version }} 84 | KUBECONFORM_VERSION=${{ fromJson(needs.get_versions_job.outputs.latest_tools).kubeconform_version }} 85 | -------------------------------------------------------------------------------- /tests/tekton-resources/tasks/setup/eks/awscli-pod-identity-association.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: tekton.dev/v1beta1 3 | kind: Task 4 | metadata: 5 | name: awscli-eks-pia-create 6 | namespace: scalability 7 | spec: 8 | description: | 9 | Create an EKS Pod Identity Association for a given cluster. 10 | This Task can be used to create an EKS Pod Identity Association for namespace default and service account default. 11 | params: 12 | - name: cluster-name 13 | description: The name of the EKS cluster you want to create an EKS Pod Identity Association for. 14 | - name: region 15 | default: "us-west-2" 16 | description: The region where the cluster is in. 17 | - name: endpoint 18 | default: "" 19 | - name: namespace-prefix 20 | default: "default" 21 | description: "The prefix of namespaces for EKS Pod Identity test." 22 | - name: namespace-count 23 | default: "1" 24 | description: "The number of namespaces for EKS Pod Identity test." 25 | - name: pia-trust-policy-url 26 | default: "https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/eks-pod-identity/pia-trust-policy.json" 27 | workspaces: 28 | - name: config 29 | mountPath: /config/ 30 | stepTemplate: 31 | env: 32 | - name: KUBECONFIG 33 | value: /config/kubeconfig 34 | steps: 35 | - name: write-kubeconfig 36 | image: alpine/k8s:1.31.5 37 | script: | 38 | ENDPOINT_FLAG="" 39 | if [ -n "$(params.endpoint)" ]; then 40 | ENDPOINT_FLAG="--endpoint $(params.endpoint)" 41 | fi 42 | aws eks $ENDPOINT_FLAG update-kubeconfig --name $(params.cluster-name) --region $(params.region) 43 | - name: create-pia 44 | image: alpine/k8s:1.31.5 45 | script: | 46 | ENDPOINT_FLAG="" 47 | if [ -n "$(params.endpoint)" ]; then 48 | ENDPOINT_FLAG="--endpoint $(params.endpoint)" 49 | fi 50 | 51 | S3_MANAGED_POLICY_ARN="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess" 52 | CLOUDWATCH_MANAGED_POLICY_ARN="arn:aws:iam::aws:policy/CloudWatchFullAccess" 53 | TRUST_POLICY_FILE="pia-trust-policy.json" 54 | # create a trust policy json file 55 | curl -s $(params.pia-trust-policy-url) -o ./$TRUST_POLICY_FILE 56 | for i in $(seq 1 $(params.namespace-count)); do 57 | kubectl create namespace $(params.namespace-prefix)-$i 58 | 59 | PIA_ROLE_NAME=$(params.cluster-name)-pia-role-$i 60 | aws iam create-role --role-name $PIA_ROLE_NAME --assume-role-policy-document file://$TRUST_POLICY_FILE 61 | aws iam attach-role-policy --role-name $PIA_ROLE_NAME --policy-arn $S3_MANAGED_POLICY_ARN 62 | aws iam attach-role-policy --role-name $PIA_ROLE_NAME --policy-arn $CLOUDWATCH_MANAGED_POLICY_ARN 63 | PIA_ROLE_ARN=$(aws iam get-role --role-name $PIA_ROLE_NAME --query 'Role.Arn' --output text) 64 | echo "$PIA_ROLE_ARN is created" 65 | 66 | aws eks $ENDPOINT_FLAG --region $(params.region) create-pod-identity-association \ 67 | --cluster-name $(params.cluster-name) \ 68 | --namespace $(params.namespace-prefix)-$i \ 69 | --service-account default \ 70 | --role-arn $PIA_ROLE_ARN 71 | done 72 | 73 | aws eks $ENDPOINT_FLAG --region $(params.region) 
list-pod-identity-associations --cluster-name $(params.cluster-name) --query 'associations' 74 | 75 | echo "waiting for 30 seconds..." 76 | sleep 30 77 | echo "resuming execution..." 78 | -------------------------------------------------------------------------------- /tests/assets/eks_node_group_launch_template_al2023.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | AWSTemplateFormatVersion: '2010-09-09' 3 | Description: Create a launch template for use in an autoscaling group of EKS nodes 4 | (Amazon Linux 2023) 5 | Parameters: 6 | LaunchTemplateName: 7 | Type: String 8 | Description: Name of the Launch Template 9 | ClusterName: 10 | Type: String 11 | Description: Name of the Cluster 12 | SSHKeyName: 13 | Type: String 14 | Description: SSH Key Name for EC2 instances 15 | APIServerEndpoint: 16 | Type: String 17 | Description: Kubernetes API Server Endpoint 18 | CertificateAuthority: 19 | Type: String 20 | Description: Certificate Authority data (base64 encoded) 21 | ClusterCIDR: 22 | Type: String 23 | Description: CIDR for cluster (IP range for pods) 24 | KubeletConfig: 25 | Type: String 26 | Description: Kubelet config JSON (will be merged with default config) 27 | Default: '{}' 28 | AMI: 29 | Type: String 30 | Description: Launch template ImageId value, which may be an AMI ID or resolve:ssm reference. 31 | Default: '' 32 | NodeRoleName: 33 | Type: String 34 | Description: Name of the IAM Role for the node instances. 35 | SecurityGroup: 36 | Type: AWS::EC2::SecurityGroup::Id 37 | Description: EKS-created cluster security group that allows node communication with the control plane. 38 | Conditions: 39 | AMIProvided: 40 | !Not [!Equals [!Ref AMI, '']] 41 | Resources: 42 | NodeInstanceProfile: 43 | Type: AWS::IAM::InstanceProfile 44 | Properties: 45 | Path: "/" 46 | Roles: 47 | - !Ref NodeRoleName 48 | LaunchTemplate: 49 | Type: AWS::EC2::LaunchTemplate 50 | Properties: 51 | LaunchTemplateName: 52 | Ref: LaunchTemplateName 53 | LaunchTemplateData: 54 | KeyName: 55 | Ref: SSHKeyName 56 | BlockDeviceMappings: 57 | - DeviceName: "/dev/xvda" 58 | Ebs: 59 | VolumeSize: 40 60 | VolumeType: gp3 61 | MetadataOptions: 62 | HttpPutResponseHopLimit: 2 63 | HttpEndpoint: enabled 64 | HttpTokens: required 65 | IamInstanceProfile: 66 | Arn: !GetAtt NodeInstanceProfile.Arn 67 | SecurityGroupIds: 68 | - !Ref SecurityGroup 69 | ImageId: 70 | !If 71 | - AMIProvided 72 | - !Ref AMI 73 | - !Ref "AWS::NoValue" 74 | UserData: 75 | Fn::Base64: 76 | Fn::Sub: | 77 | Content-Type: multipart/mixed; boundary="BOUNDARY" 78 | MIME-Version: 1.0 79 | 80 | --BOUNDARY 81 | Content-Type: application/node.eks.aws 82 | MIME-Version: 1.0 83 | 84 | --- 85 | apiVersion: node.eks.aws/v1alpha1 86 | kind: NodeConfig 87 | spec: 88 | cluster: 89 | name: ${ClusterName} 90 | apiServerEndpoint: ${APIServerEndpoint} 91 | certificateAuthority: ${CertificateAuthority} 92 | cidr: ${ClusterCIDR} 93 | kubelet: 94 | config: ${KubeletConfig} 95 | 96 | --BOUNDARY-- 97 | Outputs: 98 | LaunchTemplateName: 99 | Description: Name of the Node Group Launch Template 100 | Value: 101 | Ref: LaunchTemplate 102 | -------------------------------------------------------------------------------- /infrastructure/lib/addons/kit.ts: -------------------------------------------------------------------------------- 1 | import { Construct } from 'constructs'; 2 | import { Aws, CfnStack, StackProps } from 'aws-cdk-lib'; 3 | import { aws_eks as eks } from 'aws-cdk-lib'; 4 | import { aws_iam as iam } from 'aws-cdk-lib'; 
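// KIT addon: creates the kit namespace, an IRSA-backed service account whose role carries
// the inline EC2/Auto Scaling/IAM policy defined below, and installs the kit-operator Helm
// chart with CriticalAddonsOnly tolerations on the controller and webhook.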
5 | 6 | export interface KITProps extends StackProps { 7 | cluster: eks.Cluster; 8 | namespace: string; 9 | version: string; 10 | } 11 | 12 | export class KIT extends Construct { 13 | constructor(scope: Construct, id: string, props: KITProps) { 14 | super(scope, id); 15 | 16 | const ns = props.cluster.addManifest('kit-namespace', { 17 | apiVersion: 'v1', 18 | kind: 'Namespace', 19 | metadata: { 20 | name: props.namespace 21 | } 22 | }) 23 | 24 | const sa = props.cluster.addServiceAccount('kit-sa', { 25 | name: 'kit', 26 | namespace: props.namespace 27 | }); 28 | 29 | sa.node.addDependency(ns) 30 | sa.role.attachInlinePolicy(new iam.Policy(this, 'kit-controller-policy', { 31 | statements: [ 32 | new iam.PolicyStatement({ 33 | resources: ['*'], 34 | actions: [ 35 | // Write Operations 36 | "ec2:CreateTags", 37 | "ec2:CreateLaunchTemplate", 38 | "ec2:CreateLaunchTemplateVersion", 39 | "ec2:DeleteLaunchTemplate", 40 | "ec2:RunInstances", 41 | "iam:passRole", 42 | "autoscaling:CreateOrUpdateTags", 43 | "autoscaling:CreateAutoScalingGroup", 44 | "autoscaling:DeleteAutoScalingGroup", 45 | "autoscaling:UpdateAutoScalingGroup", 46 | "autoscaling:SetDesiredCapacity", 47 | "iam:CreateRole", 48 | "iam:AddRoleToInstanceProfile", 49 | "iam:CreateInstanceProfile", 50 | "iam:AttachRolePolicy", 51 | "iam:RemoveRoleFromInstanceProfile", 52 | "iam:DeleteInstanceProfile", 53 | "iam:DetachRolePolicy", 54 | "iam:DeleteRole", 55 | "iam:TagRole", 56 | // Read Operations 57 | "ec2:DescribeInstances", 58 | "ec2:DescribeLaunchTemplates", 59 | "ec2:DescribeLaunchTemplateVersions", 60 | "ec2:DescribeSubnets", 61 | "ssm:GetParameter", 62 | "autoscaling:DescribeAutoScalingGroups", 63 | "iam:GetRole", 64 | "iam:GetInstanceProfile", 65 | ], 66 | }), 67 | ], 68 | })); 69 | 70 | const chart = props.cluster.addHelmChart('kit-chart', { 71 | chart: 'kit-operator', 72 | release: 'kit', 73 | repository: 'https://awslabs.github.io/kubernetes-iteration-toolkit', 74 | namespace: props.namespace, 75 | createNamespace: false, 76 | values: { 77 | 'serviceAccount': { 78 | 'create': false, 79 | 'name': sa.serviceAccountName, 80 | 'annotations': { 81 | 'eks.amazonaws.com/role-arn': sa.role.roleArn 82 | } 83 | }, 84 | controller: { 85 | tolerations: [ 86 | { 87 | key: 'CriticalAddonsOnly', 88 | operator: 'Exists', 89 | }, 90 | ], 91 | }, 92 | webhook: { 93 | tolerations: [ 94 | { 95 | key: 'CriticalAddonsOnly', 96 | operator: 'Exists', 97 | }, 98 | ], 99 | }, 100 | } 101 | }); 102 | chart.node.addDependency(sa) 103 | } 104 | } -------------------------------------------------------------------------------- /infrastructure/lib/addons/cached/aws-ebs-csi-driver-iam-policy-v1.9.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "ec2:CreateSnapshot", 8 | "ec2:AttachVolume", 9 | "ec2:DetachVolume", 10 | "ec2:ModifyVolume", 11 | "ec2:DescribeAvailabilityZones", 12 | "ec2:DescribeInstances", 13 | "ec2:DescribeSnapshots", 14 | "ec2:DescribeTags", 15 | "ec2:DescribeVolumes", 16 | "ec2:DescribeVolumesModifications" 17 | ], 18 | "Resource": "*" 19 | }, 20 | { 21 | "Effect": "Allow", 22 | "Action": [ 23 | "ec2:CreateTags" 24 | ], 25 | "Resource": [ 26 | "arn:aws:ec2:*:*:volume/*", 27 | "arn:aws:ec2:*:*:snapshot/*" 28 | ], 29 | "Condition": { 30 | "StringEquals": { 31 | "ec2:CreateAction": [ 32 | "CreateVolume", 33 | "CreateSnapshot" 34 | ] 35 | } 36 | } 37 | }, 38 | { 39 | "Effect": "Allow", 40 | "Action": [ 41 | 
"ec2:DeleteTags" 42 | ], 43 | "Resource": [ 44 | "arn:aws:ec2:*:*:volume/*", 45 | "arn:aws:ec2:*:*:snapshot/*" 46 | ] 47 | }, 48 | { 49 | "Effect": "Allow", 50 | "Action": [ 51 | "ec2:CreateVolume" 52 | ], 53 | "Resource": "*", 54 | "Condition": { 55 | "StringLike": { 56 | "aws:RequestTag/ebs.csi.aws.com/cluster": "true" 57 | } 58 | } 59 | }, 60 | { 61 | "Effect": "Allow", 62 | "Action": [ 63 | "ec2:CreateVolume" 64 | ], 65 | "Resource": "*", 66 | "Condition": { 67 | "StringLike": { 68 | "aws:RequestTag/CSIVolumeName": "*" 69 | } 70 | } 71 | }, 72 | { 73 | "Effect": "Allow", 74 | "Action": [ 75 | "ec2:CreateVolume" 76 | ], 77 | "Resource": "*", 78 | "Condition": { 79 | "StringLike": { 80 | "aws:RequestTag/kubernetes.io/cluster/*": "owned" 81 | } 82 | } 83 | }, 84 | { 85 | "Effect": "Allow", 86 | "Action": [ 87 | "ec2:DeleteVolume" 88 | ], 89 | "Resource": "*", 90 | "Condition": { 91 | "StringLike": { 92 | "ec2:ResourceTag/ebs.csi.aws.com/cluster": "true" 93 | } 94 | } 95 | }, 96 | { 97 | "Effect": "Allow", 98 | "Action": [ 99 | "ec2:DeleteVolume" 100 | ], 101 | "Resource": "*", 102 | "Condition": { 103 | "StringLike": { 104 | "ec2:ResourceTag/CSIVolumeName": "*" 105 | } 106 | } 107 | }, 108 | { 109 | "Effect": "Allow", 110 | "Action": [ 111 | "ec2:DeleteVolume" 112 | ], 113 | "Resource": "*", 114 | "Condition": { 115 | "StringLike": { 116 | "ec2:ResourceTag/kubernetes.io/cluster/*": "owned" 117 | } 118 | } 119 | }, 120 | { 121 | "Effect": "Allow", 122 | "Action": [ 123 | "ec2:DeleteSnapshot" 124 | ], 125 | "Resource": "*", 126 | "Condition": { 127 | "StringLike": { 128 | "ec2:ResourceTag/CSIVolumeSnapshotName": "*" 129 | } 130 | } 131 | }, 132 | { 133 | "Effect": "Allow", 134 | "Action": [ 135 | "ec2:DeleteSnapshot" 136 | ], 137 | "Resource": "*", 138 | "Condition": { 139 | "StringLike": { 140 | "ec2:ResourceTag/ebs.csi.aws.com/cluster": "true" 141 | } 142 | } 143 | } 144 | ] 145 | } 146 | -------------------------------------------------------------------------------- /tests/assets/eks-pod-identity/pod-default.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | generateName: eks-pod-identity-pod-churn- 5 | labels: 6 | group: {{.Group}} 7 | spec: 8 | containers: 9 | - name: app-with-awsapi 10 | image: {{.PodImage}} 11 | imagePullPolicy: IfNotPresent 12 | resources: 13 | requests: 14 | cpu: "120m" 15 | memory: "100Mi" 16 | limits: 17 | cpu: "150m" 18 | memory: "150Mi" 19 | env: 20 | - name: CLUSTER_NAME 21 | value: "{{.ClusterName}}" 22 | - name: DIMENSION_NAME 23 | value: "{{.MetricDimensionName}}" 24 | - name: NAMESPACE 25 | value: "{{.MetricNamespace}}" 26 | - name: METRIC_LATENCY_NAME 27 | value: "{{.MetricLatencyName}}" 28 | - name: PERIOD 29 | value: "{{.MetricPeriod}}" 30 | command: 31 | - sh 32 | - -c 33 | - | 34 | AUTH_TOKEN=$(cat $AWS_CONTAINER_AUTHORIZATION_TOKEN_FILE) 35 | 36 | DIMENSION_VALUE=$CLUSTER_NAME 37 | METRIC_MAX_RETRIES=5 38 | METRIC_RETRY_DELAY=1 39 | 40 | # make an attempt on credential fetching, and calculate the time taken 41 | # push metrics on time taken on credential fetching 42 | # to minimize failure from cloudwatch metrics, add retries on put-metric-data 43 | start_epoch=$(date +%s%3N) 44 | # fetch credentials 45 | status_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 2 -H "Authorization: $AUTH_TOKEN" http://169.254.170.23/v1/credentials) 46 | end_epoch=$(date +%s%3N) 47 | if [ "$status_code" -eq 200 ]; then 48 | printf "Endpoint is reachable at try %d\n" "$i" 49 
| 50 | latency_ms=$((end_epoch - start_epoch)) 51 | latency_sec=$(awk "BEGIN { print $latency_ms / 1000 }") 52 | 53 | # send CredentialFetchLatency metric 54 | for ((j=1; j<=METRIC_MAX_RETRIES; j++)); do 55 | aws cloudwatch put-metric-data \ 56 | --namespace "$NAMESPACE" \ 57 | --metric-name "$METRIC_LATENCY_NAME" \ 58 | --dimensions "$DIMENSION_NAME=$DIMENSION_VALUE" \ 59 | --value "$latency_sec" \ 60 | --unit Seconds && { 61 | echo "Metric CredentialFetchLatency sent successfully." 62 | break 63 | } 64 | 65 | if [ "$j" -lt "$METRIC_MAX_RETRIES" ]; then 66 | echo "Attempt $j failed. Retrying in $METRIC_RETRY_DELAY seconds..." >&2 67 | sleep $METRIC_RETRY_DELAY 68 | METRIC_RETRY_DELAY=$((METRIC_RETRY_DELAY * 2)) # exponential backoff 69 | else 70 | echo "Failed to send metric CredentialFetchLatency after $METRIC_MAX_RETRIES attempts." >&2 71 | exit 1 72 | fi 73 | done 74 | else 75 | echo "Failed to fetch credential with status code: $status_code" 76 | exit 1 77 | fi 78 | 79 | # it is noted that a Pod with host network will fallback to Node role permissions that includes this s3 access 80 | # however, in our test case, we are not using host network 81 | # https://github.com/awslabs/kubernetes-iteration-toolkit/blob/main/tests/assets/eks_node_role.json 82 | # the main reason we are not doing an STS get identity verification is about the quota of STS APIs with scale tests 83 | 84 | # s3 api call 85 | while ! aws s3 ls; do 86 | echo "Waiting for S3 bucket access..." 87 | done 88 | echo "S3 bucket is accessible, proceeding." 89 | 90 | # pause 91 | while true; do 92 | echo "Sleeping for 1 hour..." 93 | sleep 3600 94 | done 95 | -------------------------------------------------------------------------------- /infrastructure/lib/addons/karpenter.ts: -------------------------------------------------------------------------------- 1 | import { aws_eks as eks, aws_iam as iam, Duration, StackProps } from 'aws-cdk-lib' 2 | import { Construct } from 'constructs' 3 | 4 | export interface KarpenterProps extends StackProps { 5 | cluster: eks.Cluster 6 | namespace: string 7 | nodeRoleName: string 8 | } 9 | 10 | export class Karpenter extends Construct { 11 | constructor(scope: Construct, id: string, props: KarpenterProps) { 12 | super(scope, id) 13 | const ns = props.cluster.addManifest('karpenter-namespace', { 14 | apiVersion: 'v1', 15 | kind: 'Namespace', 16 | metadata: { 17 | name: props.namespace 18 | } 19 | }) 20 | 21 | // Controller Role 22 | const sa = props.cluster.addServiceAccount('karpenter-controller-sa', { 23 | name: "karpenter", 24 | namespace: props.namespace 25 | }) 26 | sa.node.addDependency(ns) 27 | const karpenterControllerPolicy = new iam.PolicyDocument({ 28 | statements: [ 29 | new iam.PolicyStatement({ 30 | resources: ['*'], 31 | actions: [ 32 | // Write Operations 33 | "ec2:CreateLaunchTemplate", 34 | "ec2:CreateFleet", 35 | "ec2:RunInstances", 36 | "ec2:CreateTags", 37 | "iam:PassRole", 38 | "ec2:TerminateInstances", 39 | "ec2:DeleteLaunchTemplate", 40 | // Read Operations 41 | "ec2:DescribeAvailabilityZones", 42 | "ec2:DescribeImages", 43 | "ec2:DescribeInstances", 44 | "ec2:DescribeInstanceTypeOfferings", 45 | "ec2:DescribeInstanceTypes", 46 | "ec2:DescribeLaunchTemplates", 47 | "ec2:DescribeSecurityGroups", 48 | "ec2:DescribeSpotPriceHistory", 49 | "ec2:DescribeSubnets", 50 | "pricing:GetProducts", 51 | "ssm:GetParameter", 52 | ], 53 | }), 54 | ], 55 | }); 56 | sa.role.attachInlinePolicy(new iam.Policy(this, 'karpenter-controller-policy', { 57 | document: 
karpenterControllerPolicy, 58 | })); 59 | 60 | const nodeInstanceProfile = new iam.CfnInstanceProfile(this, 'karpenter-instance-profile', { 61 | roles: [props.nodeRoleName], 62 | instanceProfileName: `KarpenterNodeInstanceProfile-${props.cluster.clusterName}` 63 | }); 64 | 65 | // Install Karpenter 66 | const chart = props.cluster.addHelmChart('karpenter-chart', { 67 | chart: 'karpenter', 68 | release: 'karpenter', 69 | version: 'v0.24.0', 70 | repository: 'oci://public.ecr.aws/karpenter/karpenter', 71 | namespace: props.namespace, 72 | createNamespace: false, 73 | timeout: Duration.minutes(10), 74 | wait: true, 75 | values: { 76 | 'settings': { 77 | 'aws': { 78 | 'clusterName': props.cluster.clusterName, 79 | 'clusterEndpoint': props.cluster.clusterEndpoint, 80 | 'defaultInstanceProfile': nodeInstanceProfile.instanceProfileName, 81 | }, 82 | 'featureGates': { 83 | "driftEnabled": true, 84 | }, 85 | }, 86 | 'serviceAccount': { 87 | 'create': false, 88 | 'name': sa.serviceAccountName, 89 | }, 90 | }, 91 | }) 92 | chart.node.addDependency(sa) 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /tests/tekton-resources/tasks/generators/karpenter/kubectl-scale.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: tekton.dev/v1beta1 2 | kind: Task 3 | metadata: 4 | name: scale-nodepool 5 | namespace: scalability 6 | spec: 7 | description: | 8 | Scales a Karpenter nodepool by modifying the number of replicas. 9 | This task configures kubectl access to the EKS cluster, captures the current 10 | cluster state for monitoring purposes, performs the scaling operation, 11 | and verifies the scaling request was applied successfully. 12 | DOES NOT CHECK TO SEE IF ALL NODES HAVE GONE READY. Use kubectl-nodepool-replicas-wait.yaml for that 13 | params: 14 | - name: replicas 15 | description: Number of replicas to scale the nodepool to (target replica count) 16 | - name: nodepool 17 | description: Name of the Karpenter nodepool resource to scale 18 | - name: cluster-name 19 | description: The name of the EKS cluster containing the nodepool 20 | - name: endpoint 21 | description: EKS cluster endpoint URL for kubectl configuration 22 | - name: aws-region 23 | description: AWS region where the cluster is located (used for AZ discovery) 24 | default: us-west-2 25 | steps: 26 | - name: scale-nodepool 27 | image: alpine/k8s:1.30.2 28 | script: | 29 | echo "Starting Nodepool Scaling Task" 30 | echo "==============================" 31 | 32 | # Configure kubeconfig to connect to the EKS cluster 33 | echo "[INFO] Configuring kubeconfig for cluster access..." 34 | aws eks update-kubeconfig --name $(params.cluster-name) --endpoint $(params.endpoint) 35 | echo "[SUCCESS] Successfully configured kubeconfig" 36 | echo "" 37 | 38 | # Discover availability zones and scale nodepools 39 | echo "" 40 | echo "[INFO] Discovering availability zones in region: $(params.aws-region)" 41 | 42 | # Get list of availability zones 43 | AZ_LIST=$(aws ec2 describe-availability-zones --region $(params.aws-region) --query 'AvailabilityZones[].ZoneName' --output json | jq -r '.[]') 44 | AZ_COUNT=$(echo "$AZ_LIST" | wc -l) 45 | 46 | echo "[INFO] Found $AZ_COUNT availability zones:" 47 | echo "$AZ_LIST" | sed 's/^/ - /' 48 | echo "" 49 | 50 | # Capture current cluster state before scaling operation 51 | echo "[INFO] Capturing cluster state before scaling operation..." 
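        # The nodepools are assumed to be sharded per availability zone and named
        # "$(params.nodepool)-<az>"; the loop below scales each per-AZ nodepool to the
        # same target replica count.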
52 | echo "--------------------------------------------------------" 53 | 54 | echo "[INFO] Current nodepool status:" 55 | kubectl get nodepools -o wide 56 | echo "" 57 | 58 | # Process each availability zone 59 | NODEPOOL_COUNT=0 60 | echo "$AZ_LIST" | while read -r az; do 61 | export AZ=$az 62 | 63 | echo "[INFO] Current nodepool $(params.nodepool) detailed status:" 64 | kubectl get nodepool $(params.nodepool)-${az} -o yaml 65 | echo "" 66 | 67 | echo "[INFO] Current nodepool nodes:" 68 | kubectl get nodes -l karpenter.sh/nodepool=$(params.nodepool)-${az} -o wide 69 | echo "" 70 | 71 | # Perform the scaling operation 72 | echo "[INFO] Scaling nodepool $(params.nodepool)-${az} to $(params.replicas) replicas..." 73 | kubectl scale nodepool $(params.nodepool)-${az} --replicas $(params.replicas) 74 | echo "[SUCCESS] Scaling command executed successfully" 75 | echo "" 76 | 77 | echo "[INFO] Updated nodepool $(params.nodepool) detailed status:" 78 | kubectl get nodepool $(params.nodepool)-${az} -o yaml 79 | echo "" 80 | done 81 | 82 | 83 | # Verify the scaling operation was applied 84 | echo "[INFO] Verifying scaling operation results..." 85 | echo "=============================================" 86 | 87 | echo "[INFO] Updated nodepool status:" 88 | kubectl get nodepools -o wide 89 | echo "" 90 | 91 | echo "=============================================" 92 | echo "Nodepool Scaling Operation Begun" 93 | echo "=============================================" -------------------------------------------------------------------------------- /tests/tekton-resources/tasks/setup/karpenter/awscli-mng.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: tekton.dev/v1beta1 2 | kind: Task 3 | metadata: 4 | name: awscli-mng 5 | namespace: scalability 6 | spec: 7 | description: | 8 | Creates a dedicated Karpenter managed node group (MNG) for the EKS cluster. 9 | This task creates a large-capacity node group specifically designed to host Karpenter 10 | system components with appropriate taints and labels to ensure proper scheduling. 11 | The node group uses r5.24xlarge instances with dedicated=karpenter taints. 12 | params: 13 | - name: cluster-name 14 | description: The name of the EKS cluster where the Karpenter MNG will be created 15 | - name: aws-account-id 16 | description: AWS account ID used to construct the node role ARN 17 | - name: endpoint 18 | description: EKS cluster endpoint URL for AWS EKS CLI operations 19 | - name: region 20 | default: "us-west-2" 21 | description: AWS region where the EKS cluster is located 22 | steps: 23 | - name: create-mng 24 | image: alpine/k8s:1.30.2 25 | script: | 26 | echo "Starting Karpenter Managed Node Group Creation" 27 | 28 | # Discover subnets associated with the cluster 29 | echo "[INFO] Discovering subnets for cluster $(params.cluster-name)..." 30 | SUBNET_IDS=$(aws ec2 describe-subnets \ 31 | --filters "Name=tag:aws:cloudformation:stack-name,Values=$(params.cluster-name)" \ 32 | --query 'Subnets[*].SubnetId' \ 33 | --output text \ 34 | --region $(params.region)) 35 | echo "[INFO] Discovered Subnets: $SUBNET_IDS" 36 | 37 | # Create the Karpenter managed node group 38 | echo "[INFO] Creating Karpenter managed node group..." 
39 | echo "" 40 | 41 | aws eks create-nodegroup \ 42 | --cluster-name $(params.cluster-name) \ 43 | --nodegroup-name karpenter-system-large \ 44 | --node-role arn:aws:iam::$(params.aws-account-id):role/$(params.cluster-name)-node-role \ 45 | --instance-types r5.24xlarge \ 46 | --scaling-config minSize=2,maxSize=3,desiredSize=2 \ 47 | --subnets ${SUBNET_IDS} \ 48 | --labels dedicated=karpenter \ 49 | --region $(params.region) \ 50 | --endpoint-url $(params.endpoint) \ 51 | --taints key=dedicated,value=karpenter,effect=NO_SCHEDULE 52 | 53 | # Verify the node group was created and list all node groups 54 | echo "[INFO] Verifying node group creation..." 55 | echo "======================================" 56 | 57 | NODE_GROUPS=$(aws eks list-nodegroups \ 58 | --endpoint-url $(params.endpoint) \ 59 | --cluster-name $(params.cluster-name) \ 60 | --region $(params.region) \ 61 | --query 'nodegroups' \ 62 | --output text) 63 | 64 | if [ -z "$NODE_GROUPS" ]; then 65 | echo "[WARNING] No node groups found in cluster" 66 | else 67 | NODE_GROUP_COUNT=$(echo $NODE_GROUPS | wc -w) 68 | echo "[SUCCESS] Found $NODE_GROUP_COUNT node group(s) in cluster:" 69 | echo "$NODE_GROUPS" | tr ' ' '\n' | sed 's/^/ - /' 70 | fi 71 | echo "" 72 | 73 | # Display detailed information about the Karpenter node group 74 | echo "[INFO] Retrieving Karpenter node group details..." 75 | aws eks describe-nodegroup \ 76 | --cluster-name $(params.cluster-name) \ 77 | --nodegroup-name karpenter-system-large \ 78 | --region $(params.region) \ 79 | --endpoint-url $(params.endpoint) \ 80 | --query '{ 81 | Status: nodegroup.status, 82 | InstanceTypes: nodegroup.instanceTypes, 83 | ScalingConfig: nodegroup.scalingConfig, 84 | Labels: nodegroup.labels, 85 | Taints: nodegroup.taints, 86 | SubnetIds: nodegroup.subnets 87 | }' \ 88 | --output table 89 | echo "" 90 | 91 | echo "==============================================" 92 | echo "Karpenter Managed Node Group Creation Complete" 93 | echo "==============================================" 94 | -------------------------------------------------------------------------------- /tests/tekton-resources/tasks/generators/karpenter/kubectl-drift.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: tekton.dev/v1beta1 2 | kind: Task 3 | metadata: 4 | name: drift-nodepool 5 | namespace: scalability 6 | spec: 7 | description: | 8 | Triggers Karpenter nodepool drift by modifying nodepool template labels. 9 | This task connects to an EKS cluster, captures the current nodepool state, 10 | applies a label change to force node replacement, and verifies the drift operation. 11 | The drift process causes Karpenter to replace existing nodes with new ones 12 | that match the updated nodepool template specification. 13 | DOES NOT CHECK TO SEE IF ALL NODES SUCESSFULLY DRIFT. 
Use kubectl-nodepool-condition-wait.yaml for that 14 | params: 15 | - name: nodepool 16 | description: Name of the Karpenter nodepool to drift (must exist in cluster) 17 | - name: cluster-name 18 | description: The name of the EKS cluster containing the target nodepool 19 | - name: endpoint 20 | description: EKS cluster endpoint URL for kubectl configuration 21 | - name: aws-region 22 | description: AWS region where the cluster is located (used for AZ discovery) 23 | default: us-west-2 24 | - name: label-key 25 | description: Label key to add/modify in the nodepool template 26 | default: myLabel 27 | - name: label-val 28 | description: Label value to set for the specified label key 29 | default: myValue 30 | steps: 31 | - name: drift-nodepool 32 | image: alpine/k8s:1.30.2 33 | script: | 34 | echo "Starting Nodepool Drift Operation" 35 | echo "=================================" 36 | 37 | # Configure kubeconfig to connect to the EKS cluster 38 | echo "[INFO] Configuring kubeconfig for cluster access..." 39 | aws eks update-kubeconfig --name $(params.cluster-name) --endpoint $(params.endpoint) 40 | echo "[SUCCESS] Successfully configured kubeconfig" 41 | echo "" 42 | 43 | # Get list of availability zones 44 | AZ_LIST=$(aws ec2 describe-availability-zones --region $(params.aws-region) --query 'AvailabilityZones[].ZoneName' --output json | jq -r '.[]') 45 | AZ_COUNT=$(echo "$AZ_LIST" | wc -l) 46 | 47 | echo "[INFO] Found $AZ_COUNT availability zones:" 48 | echo "$AZ_LIST" | sed 's/^/ - /' 49 | echo "" 50 | 51 | # Capture cluster state before drift operation 52 | echo "[INFO] Capturing cluster state before nodepool drift..." 53 | echo "-----------------------------------------------------" 54 | 55 | echo "[INFO] Current cluster nodes:" 56 | kubectl get nodes -l karpenter.sh/nodepool=$(params.nodepool) -o wide --show-labels 57 | echo "" 58 | 59 | echo "[INFO] Current nodepool configuration:" 60 | kubectl get nodepool -o yaml 61 | echo "" 62 | 63 | echo "$AZ_LIST" | while read -r az; do 64 | export AZ=$az 65 | # Apply the drift-inducing label change to the nodepool 66 | echo "[INFO] Applying label change to trigger nodepool drift..." 67 | echo "[INFO] Patching nodepool $(params.nodepool)-${az} with label $(params.label-key)=$(params.label-val)" 68 | 69 | kubectl patch nodepool $(params.nodepool)-${az} --type='merge' --patch='{"spec": {"template": {"metadata": {"labels": {"$(params.label-key)": "$(params.label-val)"}}}}}' 70 | 71 | echo "[SUCCESS] Successfully patched nodepool $(params.nodepool)-${az}" 72 | echo "" 73 | 74 | # Verify the drift operation was applied 75 | echo "[INFO] Verifying nodepool drift configuration..." 
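        # Changing spec.template.metadata.labels alters the nodepool template, so Karpenter
        # should flag the existing nodes of this nodepool as Drifted and progressively
        # replace them with nodes carrying the new label.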
76 | echo "===============================================" 77 | 78 | echo "[INFO] Updated nodepool configuration:" 79 | kubectl get nodepool $(params.nodepool)-${az} -o yaml 80 | echo "" 81 | done 82 | 83 | echo "===============================================" 84 | echo "[SUCCESS] Nodepool drift operation completed" 85 | echo "[INFO] Karpenter will now begin replacing nodes to match the new template" 86 | echo "===============================================" 87 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![kitctl](https://github.com/awslabs/kubernetes-iteration-toolkit/actions/workflows/kitctl-test.yml/badge.svg)](https://github.com/awslabs/kubernetes-iteration-toolkit/actions/workflows/kitctl-test.yml) **`kitctl`** test coverage when new modifications are committed to this repository 2 | 3 | # What is Kubernetes Iteration Toolkit? 4 | 5 | ## What is KIT? 6 | 7 | [KIT](https://github.com/awslabs/kubernetes-iteration-toolkit) is a set of decoupled tools designed to accelerate the development of Kubernetes through testing. It combines a variety of open source projects to define an opinionated way to rapidly configure and test Kubernetes components on AWS. 8 | 9 | ## Why did we build KIT? 10 | 11 | The EKS Scalability team is responsible for improving performance across the Kubernetes stack. We started our journey by manually running tests against modified dev clusters. This helped us to identify some bottlenecks, but results were difficult to demonstrate and reproduce. We wanted to increase the velocity of our discoveries, as well as our confidence in our results. We set out to build automation to help us configure cluster components, execute well known test workloads, and analyze the results. This evolved into KIT, and we’re ready to share it to help accelerate testing in other teams. 12 | 13 | ## What can I do with KIT? 14 | 15 | KIT can help you run scale tests against a KIT cluster or an EKS cluster, collect logs and metrics from the cluster control plane and nodes to help analyze the performance for a Kubernetes cluster. KIT comes with a set of tools like Karpenter, ELB controller, Prometheus, Grafana and Tekton etc. installed and configured to manage cluster lifecycle, run tests and collect results. 16 | 17 | ## What are KIT Environments? 18 | 19 | KIT Environments provide an opinionated testing environment with support for test workflow execution, analysis, and observability. Developers can use `kitctl` cli to create a personal or shared testing environment for oneshot or periodic tests. KIT Environments consists of a management Kubernetes cluster that come preinstalled with a suite of Kubernetes operators that enable the execution of the tests, help analyse the test results easily, and persists logs and control plane metrics for the test runs. 20 | 21 | Additionally, KIT Environments provide a library of predefined [Tasks](https://github.com/awslabs/kubernetes-iteration-toolkit/tree/c6925e3db92ae909cafb2751b153dd8221d6fd55/tests/tasks) to configure clusters, generate load, and analyze results. For example, you can combine the “MegaXL KIT cluster” task and “upstream pod density load generator” task to reproduce the scalability team’s MegaXL test results. You can then swap in the “EKS Cluster” task and verify the results as improvements are merged into EKS. You can also parameterize existing tasks or define your own to meet your use cases. 
22 | 23 | ## What are KIT clusters? 24 | 25 | KIT clusters enables developers to declaratively configure eks-like clusters with arbitrary modifications. Using a Kubernetes CRD, you can modify the EC2 instance types, container image, environment variables, or command line arguments of any cluster component. These configurations can be [checked into git](https://github.com/awslabs/kubernetes-iteration-toolkit/blob/main/operator/docs/examples/cluster-1.21.yaml) and reproduced for periodic regression testing or against new test scenarios. 26 | 27 | KIT clusters are implemented using Kubernetes primitives like deployments, statefulsets, and services. More advanced use cases can be achieved by implementing a new feature in the [KIT cluster Operator](https://github.com/awslabs/kubernetes-iteration-toolkit/tree/main/operator) and exposing it as a new parameter in the CRD. You can install the KIT cluster Operator on any Kubernetes cluster or with `kitctl bootstrap`. 28 | 29 | ## How do I get started with KIT? 30 | 31 | KIT-v0.1 (alpha) is available now. You can get started with kitctl by following these instructions [How To get started with KIT](/docs/how-to-use-kit.md) 32 | 33 | > Note: KIT is an alpha project things are changing and evolving rapidly. If you run into any issues, feel free to open an issue. 34 | -------------------------------------------------------------------------------- /tests/tekton-resources/tasks/setup/karpenter/kubectl-nodeclass.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: tekton.dev/v1beta1 2 | kind: Task 3 | metadata: 4 | name: create-ec2nodeclass 5 | namespace: scalability 6 | spec: 7 | description: | 8 | Creates and applies an EC2NodeClass resource for Karpenter node provisioning. 9 | This task retrieves cluster configuration, downloads a nodeclass template, 10 | substitutes environment variables, and applies the configuration to the cluster. 11 | params: 12 | - name: cluster-name 13 | description: The name of the EKS cluster where the EC2NodeClass will be created 14 | - name: endpoint 15 | description: The AWS EKS API endpoint URL to use for cluster operations 16 | - name: karpenter-nodeclass-url 17 | description: The URL of the EC2NodeClass YAML template to download and apply 18 | workspaces: 19 | - name: source 20 | mountPath: /src/karpenter/ 21 | description: Workspace for storing downloaded and processed nodeclass files 22 | steps: 23 | - name: create-ec2nodeclass 24 | image: alpine/k8s:1.30.2 25 | script: | 26 | echo "Starting EC2NodeClass creation process for cluster: $(params.cluster-name)" 27 | 28 | # Retrieve cluster certificate authority data for node authentication 29 | echo "Fetching cluster certificate authority data..." 30 | export CLUSTER_CA=$(aws eks describe-cluster \ 31 | --name $(params.cluster-name) \ 32 | --endpoint-url $(params.endpoint) \ 33 | --query 'cluster.certificateAuthority.data' \ 34 | --output text) 35 | echo "Successfully retrieved cluster CA data" 36 | 37 | # Retrieve cluster API endpoint for node communication 38 | echo "Fetching cluster API endpoint..." 
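      # CLUSTER_ENDPOINT is exported alongside CLUSTER_CA, CLUSTER_NAME and ALIAS_VERSION so
      # that envsubst can fill the matching ${...} placeholders in the downloaded EC2NodeClass
      # template (see tests/assets/karpenter/nodeclass.yaml).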
39 | export CLUSTER_ENDPOINT=$(aws eks describe-cluster \ 40 | --name $(params.cluster-name) \ 41 | --endpoint-url $(params.endpoint) \ 42 | --query 'cluster.endpoint' \ 43 | --output text) 44 | echo "Cluster endpoint retrieved: ${CLUSTER_ENDPOINT}" 45 | 46 | # Set cluster name for template substitution 47 | export CLUSTER_NAME=$(params.cluster-name) 48 | echo "Using cluster name: ${CLUSTER_NAME}" 49 | 50 | # Set AMI alias version for node instances 51 | export ALIAS_VERSION=latest 52 | echo "Using AMI alias version: ${ALIAS_VERSION}" 53 | 54 | # Download the EC2NodeClass template from the specified URL 55 | echo "Downloading EC2NodeClass template from: $(params.karpenter-nodeclass-url)" 56 | curl -fsSL $(params.karpenter-nodeclass-url) -o $(workspaces.source.path)ec2nodeclass.yaml 57 | echo "Template downloaded successfully to $(workspaces.source.path)ec2nodeclass.yaml" 58 | 59 | # Display the original template for verification 60 | echo "Original EC2NodeClass template content:" 61 | cat $(workspaces.source.path)ec2nodeclass.yaml 62 | 63 | # Substitute environment variables in the template 64 | echo "Performing environment variable substitution in template..." 65 | envsubst < $(workspaces.source.path)ec2nodeclass.yaml > $(workspaces.source.path)ec2nodeclass-modified.yaml 66 | echo "Environment variable substitution completed" 67 | 68 | # Display the processed template with substituted values 69 | echo "Processed EC2NodeClass configuration:" 70 | cat $(workspaces.source.path)ec2nodeclass-modified.yaml 71 | 72 | # Update kubeconfig to authenticate with the target cluster 73 | echo "Updating kubeconfig for cluster access..." 74 | aws eks update-kubeconfig --name $(params.cluster-name) --endpoint $(params.endpoint) 75 | echo "Kubeconfig updated successfully" 76 | 77 | # Apply the EC2NodeClass configuration to the cluster 78 | echo "Applying EC2NodeClass configuration to cluster..." 79 | kubectl apply -f $(workspaces.source.path)ec2nodeclass-modified.yaml 80 | echo "EC2NodeClass applied successfully" 81 | 82 | # Verify the EC2NodeClass was created and display its configuration 83 | echo "Retrieving and displaying created EC2NodeClass resources:" 84 | kubectl get ec2nodeclass -o yaml 85 | echo "EC2NodeClass creation process completed successfully" 86 | -------------------------------------------------------------------------------- /tests/tekton-resources/pipelines/eks/upstream-load.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: tekton.dev/v1beta1 3 | kind: Pipeline 4 | metadata: 5 | name: loadtest 6 | namespace: scalability 7 | spec: 8 | workspaces: 9 | - name: config 10 | - name: source 11 | - name: results 12 | params: 13 | - name: cluster-name 14 | description: The name of the EKS cluster you want to spin. 15 | - name: eks-version 16 | default: "1.20" 17 | description: The EKS version to install. 18 | - name: region 19 | description: The region where the cluster is in. 20 | default: "us-west-2" 21 | - name: instance-selector-vcpus 22 | default: "2" 23 | description: The vcpus of the EC2 instaces for the nodegroup of the cluster. 24 | - name: instance-selector-memory 25 | default: "4" 26 | description: The memory of the EC2 instaces for the nodegroup of the cluster. 27 | - name: desired-nodes 28 | description: The desired number of nodes in the cluster. 29 | default: "100" 30 | - name: min-nodes 31 | default: "1" 32 | description: The minimum number of nodes in the cluster. 
33 | - name: max-nodes 34 | default: "100" 35 | description: The maximum number of nodes in the cluster. 36 | - name: giturl 37 | description: "git url to clone the package" 38 | default: https://github.com/kubernetes/perf-tests.git 39 | - name: pods-per-node 40 | description: "pod density" 41 | default: "10" 42 | - name: nodes-per-namespace 43 | description: "nodes per namespace to get created for load test " 44 | default: "100" 45 | - name: cl2-load-test-throughput 46 | description: "throughput used for mutate operations" 47 | default: "15" 48 | - name: results-bucket 49 | description: "Results bucket with path of s3 to upload results" 50 | - name: amp-workspace-id 51 | description: The AMP workspace ID where remote write needs to happen. 52 | default: "" 53 | tasks: 54 | - name: create-eks-cluster 55 | taskRef: 56 | name: eks-cluster-create 57 | params: 58 | - name: cluster-name 59 | value: '$(params.cluster-name)' 60 | - name: eks-version 61 | value: '$(params.eks-version)' 62 | - name: region 63 | value: '$(params.region)' 64 | - name: instance-selector-vcpus 65 | value: '$(params.instance-selector-vcpus)' 66 | - name: instance-selector-memory 67 | value: '$(params.instance-selector-memory)' 68 | - name: desired-nodes 69 | value: '$(params.desired-nodes)' 70 | - name: min-nodes 71 | value: '$(params.min-nodes)' 72 | - name: max-nodes 73 | value: '$(params.max-nodes)' 74 | workspaces: 75 | - name: config 76 | workspace: config 77 | - name: install-fluentbit-addon 78 | params: 79 | - name: cluster-name 80 | value: $(params.cluster-name) 81 | runAfter: 82 | - create-eks-cluster 83 | taskRef: 84 | kind: Task 85 | name: eks-addon-fluentbit 86 | - name: generate 87 | runAfter: [install-fluentbit-addon] 88 | taskRef: 89 | name: load 90 | params: 91 | - name: giturl 92 | value: '$(params.giturl)' 93 | - name: pods-per-node 94 | value: '$(params.pods-per-node)' 95 | - name: nodes-per-namespace 96 | value: '$(params.nodes-per-namespace)' 97 | - name: cl2-load-test-throughput 98 | value: '$(params.cl2-load-test-throughput)' 99 | - name: results-bucket 100 | value: '$(params.results-bucket)' 101 | - name: nodes 102 | value: '$(params.desired-nodes)' 103 | - name: amp-workspace-id 104 | value: '$(params.amp-workspace-id)' 105 | workspaces: 106 | - name: source 107 | workspace: source 108 | - name: config 109 | workspace: config 110 | - name: results 111 | workspace: results 112 | - name: teardown 113 | runAfter: [generate] 114 | taskRef: 115 | name: eks-cluster-teardown 116 | params: 117 | - name: cluster-name 118 | value: '$(params.cluster-name)' 119 | - name: region 120 | value: '$(params.region)' 121 | -------------------------------------------------------------------------------- /tests/assets/aiml-workload/large-sts/config.yaml: -------------------------------------------------------------------------------- 1 | # AI/ML Workload test configuration 2 | # This test creates n number of statefulsets with x replicas each. 
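# For reference, a minimal overrides file for this test might look like the following
# (illustrative values, assuming the config is run through clusterloader2 with its --testoverrides flag):
#   NAMESPACE: aiml-test2
#   STATEFULSETS_COUNT: 2
#   REPLICAS_PER_STATEFULSET: 1000
#   BATCH_SIZE: 500
#   SLEEP_DURATION: 5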
3 | 4 | 5 | {{$NAMESPACE := DefaultParam .NAMESPACE "aiml-test2"}} 6 | {{$STATEFULSETS_COUNT := DefaultParam .STATEFULSETS_COUNT 1}} 7 | {{$REPLICAS_PER_STATEFULSET := DefaultParam .REPLICAS_PER_STATEFULSET 100000}} 8 | {{$STATEFULSET_CREATION_QPS := DefaultParam .STATEFULSET_CREATION_QPS 5}} 9 | {{$SCALING_QPS := DefaultParam .SCALING_QPS 10}} 10 | {{$BATCH_SIZE := DefaultParam .BATCH_SIZE 500}} 11 | {{$CREATION_TIMEOUT := DefaultParam .CREATION_TIMEOUT "15m"}} 12 | {{$REQUEST_CPU := DefaultParam .REQUEST_CPU "100m"}} 13 | {{$REQUEST_MEMORY := DefaultParam .REQUEST_MEMORY "128Mi"}} 14 | {{$LIMIT_CPU := DefaultParam .LIMIT_CPU "1000m"}} 15 | {{$LIMIT_MEMORY := DefaultParam .LIMIT_MEMORY "512Mi"}} 16 | {{$POD_STARTUP_THRESHOLD := DefaultParam .POD_STARTUP_THRESHOLD "10s"}} 17 | {{$CLUSTER_NAME := DefaultParam .CLUSTER_NAME "perflab-titan-1"}} 18 | {{$defaultQps := DefaultParam .CL2_DEFAULT_QPS 1000}} 19 | {{$defaultBurst := DefaultParam .CL2_DEFAULT_BURST 1000}} 20 | {{$SLEEP_DURATION := DefaultParam .SLEEP_DURATION 1}} 21 | 22 | name: aws-mock-aiml-application-test 23 | 24 | tuningSets: 25 | - name: StatefulSetCreationQPS 26 | qpsLoad: 27 | qps: {{$STATEFULSET_CREATION_QPS}} 28 | - name: ScalingQPS 29 | qpsLoad: 30 | qps: {{$SCALING_QPS}} 31 | - name: default 32 | globalQPSLoad: 33 | qps: {{$defaultQps}} 34 | burst: {{$defaultBurst}} 35 | 36 | steps: 37 | - name: Starting measurements 38 | measurements: 39 | - Identifier: PodStartupLatency 40 | Method: PodStartupLatency 41 | Params: 42 | action: start 43 | labelSelector: app = aiml-training-job 44 | threshold: {{$POD_STARTUP_THRESHOLD}} 45 | - Identifier: WaitForRunningStatefulSets 46 | Method: WaitForControlledPodsRunning 47 | Params: 48 | action: start 49 | apiVersion: apps/v1 50 | kind: StatefulSet 51 | labelSelector: app = aiml-training-job 52 | operationTimeout: {{$CREATION_TIMEOUT}} 53 | 54 | - name: Creating StatefulSets with full replica count 55 | phases: 56 | - namespaceRange: 57 | min: 1 58 | max: 1 59 | basename: {{$NAMESPACE}} 60 | replicasPerNamespace: {{$STATEFULSETS_COUNT}} 61 | tuningSet: StatefulSetCreationQPS 62 | objectBundle: 63 | - basename: aiml-training-job 64 | objectTemplatePath: sts.yaml 65 | templateFillMap: 66 | Group: aiml-training-job 67 | Replicas: {{$BATCH_SIZE}} 68 | RequestCPU: "{{$REQUEST_CPU}}" 69 | RequestMemory: "{{$REQUEST_MEMORY}}" 70 | LimitCPU: "{{$LIMIT_CPU}}" 71 | LimitMemory: "{{$LIMIT_MEMORY}}" 72 | ClusterName: "{{$CLUSTER_NAME}}" 73 | 74 | {{range $batch := Loop (SubtractInt (DivideInt $REPLICAS_PER_STATEFULSET $BATCH_SIZE) 1)}} 75 | - name: Scaling StatefulSets batch {{AddInt $batch 2}} 76 | phases: 77 | - namespaceRange: 78 | min: 1 79 | max: 1 80 | basename: {{$NAMESPACE}} 81 | replicasPerNamespace: {{$STATEFULSETS_COUNT}} 82 | tuningSet: ScalingQPS 83 | objectBundle: 84 | - basename: aiml-training-job 85 | objectTemplatePath: sts.yaml 86 | templateFillMap: 87 | Group: aiml-training-job 88 | Replicas: {{MultiplyInt $BATCH_SIZE (AddInt $batch 2)}} 89 | RequestCPU: "{{$REQUEST_CPU}}" 90 | RequestMemory: "{{$REQUEST_MEMORY}}" 91 | LimitCPU: "{{$LIMIT_CPU}}" 92 | LimitMemory: "{{$LIMIT_MEMORY}}" 93 | ClusterName: "{{$CLUSTER_NAME}}" 94 | - name: Sleep after scaling batch {{AddInt $batch 2}} 95 | measurements: 96 | - Identifier: Wait 97 | Method: Sleep 98 | Params: 99 | duration: "{{$SLEEP_DURATION}}s" 100 | {{end}} 101 | 102 | - name: Waiting for StatefulSets to be ready 103 | measurements: 104 | - Identifier: WaitForRunningStatefulSets 105 | Method: WaitForControlledPodsRunning 106 | Params: 
107 | action: gather 108 | 109 | - name: Gathering measurements 110 | measurements: 111 | - Identifier: PodStartupLatency 112 | Method: PodStartupLatency 113 | Params: 114 | action: gather 115 | labelSelector: app = aiml-training-job 116 | threshold: {{$POD_STARTUP_THRESHOLD}} -------------------------------------------------------------------------------- /tests/assets/aiml-workload/medium-batch-jobs/config.yaml: -------------------------------------------------------------------------------- 1 | # AI/ML Workload test configuration using Kubernetes Jobs 2 | # This test creates X number of Jobs with AI/ML workload pods, scaling in batches by updating parallelism. 3 | 4 | {{$NAMESPACE := DefaultParam .NAMESPACE "aiml-test"}} 5 | {{$JOBS_COUNT := DefaultParam .JOBS_COUNT 10}} 6 | {{$COMPLETIONS_PER_JOB := DefaultParam .COMPLETIONS_PER_JOB 10100}} 7 | {{$BATCH_SIZE := DefaultParam .BATCH_SIZE 200}} 8 | {{$JOB_CREATION_QPS := DefaultParam .JOB_CREATION_QPS 100}} 9 | {{$SCALING_QPS := DefaultParam .SCALING_QPS 100}} 10 | {{$JOB_COMPLETION_TIMEOUT := DefaultParam .JOB_COMPLETION_TIMEOUT "45m"}} 11 | {{$CREATION_TIMEOUT := DefaultParam .CREATION_TIMEOUT "51m"}} 12 | {{$REQUEST_CPU := DefaultParam .REQUEST_CPU "1000m"}} 13 | {{$REQUEST_MEMORY := DefaultParam .REQUEST_MEMORY "128Mi"}} 14 | {{$LIMIT_CPU := DefaultParam .LIMIT_CPU "1500m"}} 15 | {{$LIMIT_MEMORY := DefaultParam .LIMIT_MEMORY "512Mi"}} 16 | {{$CLUSTER_NAME := DefaultParam .CLUSTER_NAME "perflab-titan-1"}} 17 | {{$defaultQps := DefaultParam .CL2_DEFAULT_QPS 500}} 18 | {{$defaultBurst := DefaultParam .CL2_DEFAULT_BURST 500}} 19 | {{$SLEEP_DURATION := DefaultParam .SLEEP_DURATION 2}} 20 | 21 | name: aws-mock-aiml-application-large-jobs 22 | 23 | tuningSets: 24 | - name: JobCreationQPS 25 | qpsLoad: 26 | qps: {{$JOB_CREATION_QPS}} 27 | - name: ScalingQPS 28 | qpsLoad: 29 | qps: {{$SCALING_QPS}} 30 | - name: default 31 | globalQPSLoad: 32 | qps: {{$defaultQps}} 33 | burst: {{$defaultBurst}} 34 | 35 | steps: 36 | - name: Starting measurements 37 | measurements: 38 | - Identifier: PodStartupLatency 39 | Method: PodStartupLatency 40 | Params: 41 | action: start 42 | labelSelector: app = aiml-training-job 43 | threshold: 5s 44 | - Identifier: WaitForRunningJobs 45 | Method: WaitForControlledPodsRunning 46 | Params: 47 | action: start 48 | apiVersion: batch/v1 49 | kind: Job 50 | labelSelector: app = aiml-training-job 51 | operationTimeout: {{$CREATION_TIMEOUT}} 52 | 53 | - name: Creating Jobs with initial parallelism 54 | phases: 55 | - namespaceRange: 56 | min: 1 57 | max: 1 58 | basename: {{$NAMESPACE}} 59 | replicasPerNamespace: {{$JOBS_COUNT}} 60 | tuningSet: JobCreationQPS 61 | objectBundle: 62 | - basename: aiml-training-job 63 | objectTemplatePath: job-with-fsx.yaml 64 | templateFillMap: 65 | Group: aiml-training-job 66 | Completions: {{$COMPLETIONS_PER_JOB}} 67 | Parallelism: {{$BATCH_SIZE}} 68 | RequestCPU: "{{$REQUEST_CPU}}" 69 | RequestMemory: "{{$REQUEST_MEMORY}}" 70 | LimitCPU: "{{$LIMIT_CPU}}" 71 | LimitMemory: "{{$LIMIT_MEMORY}}" 72 | ClusterName: "{{$CLUSTER_NAME}}" 73 | 74 | {{ $numBatches := SubtractInt (DivideInt $COMPLETIONS_PER_JOB $BATCH_SIZE) 1 }} 75 | {{range $batch := Loop $numBatches}} 76 | - name: Scaling Jobs batch {{AddInt $batch 2}} 77 | phases: 78 | - namespaceRange: 79 | min: 1 80 | max: 1 81 | basename: {{$NAMESPACE}} 82 | replicasPerNamespace: {{$JOBS_COUNT}} 83 | tuningSet: ScalingQPS 84 | objectBundle: 85 | - basename: aiml-training-job 86 | objectTemplatePath: job-with-fsx.yaml 87 | updateFromTemplate: true 
88 | templateFillMap: 89 | Group: aiml-training-job 90 | Completions: {{$COMPLETIONS_PER_JOB}} 91 | Parallelism: {{MultiplyInt $BATCH_SIZE (AddInt $batch 2)}} 92 | RequestCPU: "{{$REQUEST_CPU}}" 93 | RequestMemory: "{{$REQUEST_MEMORY}}" 94 | LimitCPU: "{{$LIMIT_CPU}}" 95 | LimitMemory: "{{$LIMIT_MEMORY}}" 96 | ClusterName: "{{$CLUSTER_NAME}}" 97 | - name: Sleep after scaling batch {{AddInt $batch 2}} 98 | measurements: 99 | - Identifier: Wait 100 | Method: Sleep 101 | Params: 102 | duration: "{{$SLEEP_DURATION}}s" 103 | {{end}} 104 | 105 | - name: Waiting for Jobs to be ready 106 | measurements: 107 | - Identifier: WaitForRunningJobs 108 | Method: WaitForControlledPodsRunning 109 | Params: 110 | action: gather 111 | - Identifier: PodStartupLatency 112 | Method: PodStartupLatency 113 | Params: 114 | action: gather 115 | labelSelector: app = aiml-training-job 116 | -------------------------------------------------------------------------------- /tests/tekton-resources/tasks/generators/karpenter/kubectl-nodepool-replicas-wait.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: tekton.dev/v1beta1 3 | kind: Task 4 | metadata: 5 | name: nodepool-replicas-wait 6 | namespace: scalability 7 | spec: 8 | description: "waits for the number of ready nodes in a nodepool to equal the specified replicas count" 9 | results: 10 | - name: datapoint 11 | description: Stores the result that can be consumed by other tasks (1 for success, 0 for failure) 12 | params: 13 | - name: cluster-name 14 | description: The name of the cluster 15 | - name: endpoint 16 | description: eks endpoint to use 17 | - name: aws-region 18 | default: us-west-2 19 | - name: initial-delay 20 | default: 1m 21 | - name: replicas 22 | description: number of ready replicas in the nodepool to wait for 23 | - name: nodepool 24 | description: nodepool to check nodes in. 25 | - name: check-interval 26 | description: interval in seconds between checks 27 | default: "60" 28 | - name: timeout 29 | description: total time to wait before timing out 30 | default: 3000 31 | steps: 32 | - name: wait-for-replicas 33 | image: alpine/k8s:1.30.2 34 | script: | 35 | sleep $(params.initial-delay) 36 | CHECK_INTERVAL=$(params.check-interval) 37 | TARGET_REPLICAS=$(params.replicas) 38 | TIMEOUT=$(params.timeout) 39 | START_TIME=$(date +%s) 40 | 41 | while true; do 42 | # Check if timeout has been reached 43 | CURRENT_TIME=$(date +%s) 44 | ELAPSED_TIME=$((CURRENT_TIME - START_TIME)) 45 | 46 | if [ $ELAPSED_TIME -ge $TIMEOUT ]; then 47 | echo "$(date): Timeout reached after ${ELAPSED_TIME} seconds. Nodepools did not complete within the specified timeout." 48 | for az in $AZ_LIST; do 49 | echo "Nodes:" 50 | echo $(kubectl get nodes -l karpenter.sh/nodepool=$(params.nodepool)-${az} -o yaml) 51 | echo "Nodeclaims:" 52 | echo $(kubectl get nodeclaims -l karpenter.sh/nodepool=$(params.nodepool)-${az} -o yaml) 53 | done 54 | echo "0" | tee $(results.datapoint.path) 55 | exit 1 56 | fi 57 | 58 | aws eks update-kubeconfig --name $(params.cluster-name) --endpoint $(params.endpoint) 59 | echo "$(date): Checking ready nodes in nodepool $(params.nodepool)..." 
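# This check assumes one NodePool per availability zone, named "$(params.nodepool)-<az>"
# (the convention implied by the label selectors below); the loop counts Ready nodes for each AZ-scoped NodePool.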
60 | 61 | # Get list of availability zones 62 | AZ_LIST=$(aws ec2 describe-availability-zones --region $(params.aws-region) --query 'AvailabilityZones[].ZoneName' --output json | jq -r '.[]') 63 | AZ_COUNT=$(echo "$AZ_LIST" | wc -l) 64 | 65 | echo "[INFO] Found $AZ_COUNT availability zones:" 66 | echo "$AZ_LIST" | sed 's/^/ - /' 67 | echo "" 68 | 69 | # Track if all availability zones have reached target replicas 70 | ALL_AZ_READY=true 71 | 72 | # Check each availability zone 73 | for az in $AZ_LIST; do 74 | ready_nodes_count=$(kubectl get nodes -l karpenter.sh/nodepool=$(params.nodepool)-${az} -o json | jq -r ' 75 | [.items[] | 76 | select(.status.conditions[] | select(.type == "Ready" and .status == "True"))] | 77 | length 78 | ') 79 | 80 | echo "$(date): AZ ${az} - Ready nodes: $ready_nodes_count, Target replicas: $TARGET_REPLICAS" 81 | 82 | if [ "$ready_nodes_count" -ne "$TARGET_REPLICAS" ]; then 83 | echo "$(date): AZ ${az} - Ready nodes count ($ready_nodes_count) does not match target replicas ($TARGET_REPLICAS)" 84 | ALL_AZ_READY=false 85 | else 86 | echo "$(date): AZ ${az} - Success! Ready nodes count matches target replicas ($TARGET_REPLICAS)" 87 | fi 88 | done 89 | 90 | # Exit if all availability zones have reached target replicas 91 | if [ "$ALL_AZ_READY" = "true" ]; then 92 | echo "$(date): All availability zones have reached target replica count. Exiting successfully." 93 | echo "1" | tee $(results.datapoint.path) 94 | exit 0 95 | fi 96 | 97 | echo "$(date): Not all availability zones have reached target replicas. Waiting ${CHECK_INTERVAL} seconds before next check..." 98 | 99 | sleep $CHECK_INTERVAL 100 | done 101 | 102 | exit 1 103 | -------------------------------------------------------------------------------- /tests/tekton-resources/tasks/setup/karpenter/awscli-karpenter-cfn-stack.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: tekton.dev/v1beta1 2 | kind: Task 3 | metadata: 4 | name: awscli-karpenter-cfn-stack 5 | namespace: scalability 6 | spec: 7 | description: | 8 | Creates the karpenter instance roles and sqs interruption queue 9 | params: 10 | - name: cluster-name 11 | description: The name of the cluster 12 | - name: karpenter-version 13 | description: Version of Karpenter to deploy 14 | - name: endpoint 15 | description: Endpoint to use with EKS 16 | - name: region 17 | default: us-west-2 18 | description: The region where the cluster is in. 
19 | - name: account-id 20 | description: The aws account the cluster is running in 21 | workspaces: 22 | - name: source 23 | mountPath: /src/karpenter/ 24 | steps: 25 | - name: create-stack 26 | image: alpine/k8s:1.30.2 27 | script: | 28 | STACK_NAME=Karpenter-$(params.cluster-name) 29 | STACK_STATUS=$(aws cloudformation describe-stacks --query 'Stacks[?StackName==`'${STACK_NAME}'`].StackStatus' --output text --region $(params.region)) 30 | curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/$(params.karpenter-version)/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml -o $(workspaces.source.path)cloudformation.yaml 31 | 32 | cat /src/karpenter/cloudformation.yaml 33 | 34 | UPDATE_OUTPUT=$(aws eks update-cluster-config --name $(params.cluster-name) --access-config authenticationMode=API_AND_CONFIG_MAP --endpoint $(params.endpoint)) 35 | 36 | echo $UPDATE_OUTPUT 37 | 38 | # Extract the update ID from the output 39 | UPDATE_ID=$(echo "$UPDATE_OUTPUT" | jq -r '.update.id // empty') 40 | 41 | echo "Waiting for cluster config update $UPDATE_ID to complete..." 42 | 43 | # Wait for the update to complete 44 | while true; do 45 | UPDATE_STATUS=$(aws eks describe-update --name $(params.cluster-name) --update-id "$UPDATE_ID" --endpoint $(params.endpoint) --query 'update.status' --output text) 46 | 47 | case "$UPDATE_STATUS" in 48 | "Successful") 49 | echo "Cluster config update completed successfully" 50 | break 51 | ;; 52 | "Failed"|"Cancelled") 53 | echo "Cluster config update failed with status: $UPDATE_STATUS" 54 | exit 1 55 | ;; 56 | "InProgress") 57 | echo "Update still in progress, waiting 30 seconds..." 58 | sleep 30 59 | ;; 60 | *) 61 | echo "Unknown update status: $UPDATE_STATUS" 62 | sleep 30 63 | ;; 64 | esac 65 | done 66 | 67 | if [[ "$STACK_STATUS" == "" ]]; then 68 | aws cloudformation deploy \ 69 | --stack-name "Karpenter-$(params.cluster-name)" \ 70 | --template-file $(workspaces.source.path)cloudformation.yaml \ 71 | --capabilities CAPABILITY_NAMED_IAM \ 72 | --parameter-overrides "ClusterName=$(params.cluster-name)" 73 | 74 | aws cloudformation wait stack-create-complete --stack-name $STACK_NAME --region $(params.region) 75 | echo "CREATED_CFN_STACK=$STACK_NAME" 76 | else 77 | echo "$STACK_NAME Already exists" 78 | fi 79 | 80 | aws eks describe-cluster --name "$(params.cluster-name)" --output text --endpoint $(params.endpoint) 81 | 82 | export AWS_EKS_ENDPOINT=$(params.endpoint) 83 | # Check if OIDC provider is already associated 84 | echo "Associating OIDC provider with cluster..." 85 | eksctl utils associate-iam-oidc-provider --cluster "$(params.cluster-name)" --approve 86 | 87 | # Check if access entry already exists 88 | if aws eks describe-access-entry --cluster-name "$(params.cluster-name)" --principal-arn "arn:aws:iam::$(params.account-id):role/KarpenterNodeRole-$(params.cluster-name)" --endpoint $(params.endpoint) >/dev/null 2>&1; then 89 | echo "Access entry for KarpenterNodeRole already exists. Skipping creation..." 90 | else 91 | echo "Creating access entry for KarpenterNodeRole..." 
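# An access entry of type EC2_LINUX lets instances that assume KarpenterNodeRole-<cluster-name> join the
# cluster through the EKS access-entry API (enabled above via API_AND_CONFIG_MAP), instead of an aws-auth ConfigMap mapping.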
92 | aws eks create-access-entry \ 93 | --cluster-name "$(params.cluster-name)" \ 94 | --principal-arn "arn:aws:iam::$(params.account-id):role/KarpenterNodeRole-$(params.cluster-name)" \ 95 | --endpoint $(params.endpoint) \ 96 | --type EC2_LINUX 97 | fi 98 | -------------------------------------------------------------------------------- /tests/tekton-resources/pipelines/kit-cluster/upstream-load.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: tekton.dev/v1beta1 3 | kind: Pipeline 4 | metadata: 5 | name: kitloadtest 6 | namespace: scalability 7 | spec: 8 | workspaces: 9 | - name: config 10 | - name: source 11 | - name: results 12 | params: 13 | - name: host-cluster-name 14 | description: The name of the Host cluster on which you spin up KIT Guest cluster. 15 | default: "testbed" 16 | - name: cluster-name 17 | description: The name of the kit cluster you want to spin. 18 | - name: host-cluster-region 19 | default: "us-west-2" 20 | description: The region where the Host EKS cluster is in. 21 | - name: guest-cluster-region 22 | default: "us-west-2" 23 | description: The region where the kit cluster is created. 24 | - name: version 25 | default: "1.21" 26 | description: kubernetes version. 27 | - name: cp-instance-type 28 | default: "m5.8xlarge" 29 | description: control plane instance type. 30 | - name: max-requests-inflight 31 | default: "400" 32 | description: maximum number of inflight read request that apiserver could allow 33 | - name: max-mutating-requests 34 | default: "200" 35 | description: maximum number of mutating requests in flight that apiserver could allow 36 | - name: kcm-qps 37 | default: "20" 38 | description: Kubernetes-Controller-Manager QPS setting 39 | - name: scheduler-qps 40 | default: "20" 41 | description: Kubernetes-Scheduler QPS setting 42 | - name: node_count 43 | default: "1000" 44 | description: desired node count for Dataplane, min is 1000 to create DP nodes. 
45 | - name: giturl 46 | description: "git url to clone the package" 47 | default: https://github.com/kubernetes/perf-tests.git 48 | - name: pods-per-node 49 | description: "pod density" 50 | default: "10" 51 | - name: nodes-per-namespace 52 | description: "nodes per namespace to get created for load test " 53 | default: "100" 54 | - name: cl2-load-test-throughput 55 | description: "throughput used for mutate operations" 56 | default: "15" 57 | - name: results-bucket 58 | description: "Results bucket with path of s3 to upload results" 59 | tasks: 60 | - name: create-kit-cluster 61 | taskRef: 62 | name: kit-cluster-create 63 | params: 64 | - name: host-cluster-name 65 | value: '$(params.host-cluster-name)' 66 | - name: cluster-name 67 | value: '$(params.cluster-name)' 68 | - name: host-cluster-region 69 | value: '$(params.host-cluster-region)' 70 | - name: guest-cluster-region 71 | value: '$(params.guest-cluster-region)' 72 | - name: version 73 | value: '$(params.version)' 74 | - name: cp-instance-type 75 | value: '$(params.cp-instance-type)' 76 | - name: max-requests-inflight 77 | value: '$(params.max-requests-inflight)' 78 | - name: max-mutating-requests 79 | value: '$(params.max-mutating-requests)' 80 | - name: kcm-qps 81 | value: '$(params.kcm-qps)' 82 | - name: scheduler-qps 83 | value: '$(params.scheduler-qps)' 84 | - name: node_count 85 | value: '$(params.node_count)' 86 | workspaces: 87 | - name: config 88 | workspace: config 89 | - name: generate 90 | runAfter: [create-kit-cluster] 91 | taskRef: 92 | name: load 93 | params: 94 | - name: giturl 95 | value: '$(params.giturl)' 96 | - name: pods-per-node 97 | value: '$(params.pods-per-node)' 98 | - name: nodes-per-namespace 99 | value: '$(params.nodes-per-namespace)' 100 | - name: cl2-load-test-throughput 101 | value: '$(params.cl2-load-test-throughput)' 102 | - name: results-bucket 103 | value: '$(params.results-bucket)' 104 | - name: nodes 105 | value: '$(params.node_count)' 106 | workspaces: 107 | - name: source 108 | workspace: source 109 | - name: config 110 | workspace: config 111 | - name: results 112 | workspace: results 113 | finally: 114 | - name: teardown 115 | taskRef: 116 | name: kit-cluster-teardown 117 | params: 118 | - name: host-cluster-name 119 | value: '$(params.host-cluster-name)' 120 | - name: cluster-name 121 | value: '$(params.cluster-name)' 122 | -------------------------------------------------------------------------------- /tests/assets/karpenter/controller-role-policy-document.json: -------------------------------------------------------------------------------- 1 | { 2 | "Statement": [ 3 | { 4 | "Action": [ 5 | "ssm:GetParameter", 6 | "ec2:DescribeImages", 7 | "ec2:RunInstances", 8 | "ec2:DescribeSubnets", 9 | "ec2:DescribeSecurityGroups", 10 | "ec2:DescribeLaunchTemplates", 11 | "ec2:DescribeInstances", 12 | "ec2:DescribeInstanceTypes", 13 | "ec2:DescribeInstanceTypeOfferings", 14 | "ec2:DeleteLaunchTemplate", 15 | "ec2:CreateTags", 16 | "ec2:CreateLaunchTemplate", 17 | "ec2:CreateFleet", 18 | "ec2:DescribeSpotPriceHistory", 19 | "pricing:GetProducts" 20 | ], 21 | "Effect": "Allow", 22 | "Resource": "*", 23 | "Sid": "Karpenter" 24 | }, 25 | { 26 | "Action": "ec2:TerminateInstances", 27 | "Condition": { 28 | "StringLike": { 29 | "ec2:ResourceTag/karpenter.sh/nodepool": "*" 30 | } 31 | }, 32 | "Effect": "Allow", 33 | "Resource": "*", 34 | "Sid": "ConditionalEC2Termination" 35 | }, 36 | { 37 | "Effect": "Allow", 38 | "Action": "iam:PassRole", 39 | "Resource": 
"arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/KarpenterNodeRole-${CLUSTER_NAME}", 40 | "Sid": "PassNodeIAMRole" 41 | }, 42 | { 43 | "Effect": "Allow", 44 | "Action": "eks:DescribeCluster", 45 | "Resource": "arn:${AWS_PARTITION}:eks:${AWS_REGION}:${AWS_ACCOUNT_ID}:cluster/${CLUSTER_NAME}", 46 | "Sid": "EKSClusterEndpointLookup" 47 | }, 48 | { 49 | "Sid": "AllowScopedInstanceProfileCreationActions", 50 | "Effect": "Allow", 51 | "Resource": "*", 52 | "Action": [ 53 | "iam:CreateInstanceProfile" 54 | ], 55 | "Condition": { 56 | "StringEquals": { 57 | "aws:RequestTag/kubernetes.io/cluster/${CLUSTER_NAME}": "owned", 58 | "aws:RequestTag/topology.kubernetes.io/region": "${AWS_REGION}" 59 | }, 60 | "StringLike": { 61 | "aws:RequestTag/karpenter.k8s.aws/ec2nodeclass": "*" 62 | } 63 | } 64 | }, 65 | { 66 | "Sid": "AllowScopedInstanceProfileTagActions", 67 | "Effect": "Allow", 68 | "Resource": "*", 69 | "Action": [ 70 | "iam:TagInstanceProfile" 71 | ], 72 | "Condition": { 73 | "StringEquals": { 74 | "aws:ResourceTag/kubernetes.io/cluster/${CLUSTER_NAME}": "owned", 75 | "aws:ResourceTag/topology.kubernetes.io/region": "${AWS_REGION}", 76 | "aws:RequestTag/kubernetes.io/cluster/${CLUSTER_NAME}": "owned", 77 | "aws:RequestTag/topology.kubernetes.io/region": "${AWS_REGION}" 78 | }, 79 | "StringLike": { 80 | "aws:ResourceTag/karpenter.k8s.aws/ec2nodeclass": "*", 81 | "aws:RequestTag/karpenter.k8s.aws/ec2nodeclass": "*" 82 | } 83 | } 84 | }, 85 | { 86 | "Sid": "AllowScopedInstanceProfileActions", 87 | "Effect": "Allow", 88 | "Resource": "*", 89 | "Action": [ 90 | "iam:AddRoleToInstanceProfile", 91 | "iam:RemoveRoleFromInstanceProfile", 92 | "iam:DeleteInstanceProfile" 93 | ], 94 | "Condition": { 95 | "StringEquals": { 96 | "aws:ResourceTag/kubernetes.io/cluster/${CLUSTER_NAME}": "owned", 97 | "aws:ResourceTag/topology.kubernetes.io/region": "${AWS_REGION}" 98 | }, 99 | "StringLike": { 100 | "aws:ResourceTag/karpenter.k8s.aws/ec2nodeclass": "*" 101 | } 102 | } 103 | }, 104 | { 105 | "Sid": "AllowInstanceProfileReadActions", 106 | "Effect": "Allow", 107 | "Resource": "*", 108 | "Action": "iam:GetInstanceProfile" 109 | } 110 | ], 111 | "Version": "2012-10-17" 112 | } -------------------------------------------------------------------------------- /tests/tekton-resources/tasks/setup/eks/awscli-fargate.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: tekton.dev/v1beta1 3 | kind: Task 4 | metadata: 5 | name: awscli-eks-fargate-create 6 | namespace: scalability 7 | spec: 8 | description: | 9 | Create an EKS fargate pods for a given cluster and label selectors. 10 | params: 11 | - name: cluster-name 12 | description: The name of the EKS cluster you want to spin. 13 | - name: region 14 | default: "us-west-2" 15 | description: The region where the cluster is in. 16 | - name: namespace 17 | default: "kube-system" 18 | - name: desired-nodes 19 | default: "1" 20 | description: The desired number of nodes in the cluster. 21 | - name: labels 22 | default: "{k8s-app=kube-dns}" 23 | description: The label selector used for fargate profile. 24 | - name: profile-name 25 | description: Fargate profile name that you want to use 26 | default: nodescale 27 | - name: endpoint 28 | default: "" 29 | - name: host-cluster-node-role-name 30 | description: name of the hostcluster node role. 
This is tightly coupled to the code here - https://github.com/awslabs/kubernetes-iteration-toolkit/blob/3ed1bbd47f7b8f111208e977acaa3edfa1834ca8/substrate/pkg/controller/substrate/cluster/addons/karpenter.go#L52 so if it's changed there, it should be changed here. This helps us avoid creating a separate node role for nodegroups/fargate. 31 | - name: fargate-subnets 32 | description: only provide private subnets of your cluster VPC 33 | workspaces: 34 | - name: config 35 | mountPath: /config/ 36 | stepTemplate: 37 | env: 38 | - name: KUBECONFIG 39 | value: /config/kubeconfig 40 | steps: 41 | - name: create-fargate-profile 42 | image: alpine/k8s:1.23.7 43 | script: | 44 | ENDPOINT_FLAG="" 45 | if [ -n "$(params.endpoint)" ]; then 46 | ENDPOINT_FLAG="--endpoint $(params.endpoint)" 47 | fi 48 | NODE_ROLE_NAME=$(params.host-cluster-node-role-name) 49 | NODE_ROLE_ARN=$(aws iam get-role --role-name $NODE_ROLE_NAME --query 'Role.[Arn]' --output text) 50 | 51 | if [ -n "$(params.endpoint)" ]; then 52 | ENDPOINT_FLAG="--endpoint $(params.endpoint)" 53 | fi 54 | CREATED_PROFILE=$(aws eks $ENDPOINT_FLAG list-fargate-profiles --region $(params.region) --cluster-name $(params.cluster-name) --query 'fargateProfileNames[?@==`'$(params.profile-name)'`]' --output text) 55 | echo "CREATED_PROFILE=$CREATED_PROFILE" 56 | TAG=$(kubectl get provisioner -oyaml | grep karpenter.sh/discovery | awk 'NR==1{ print $2}') 57 | subnets=$(aws ec2 describe-subnets --filters Name=tag:kit.aws/substrate,Values=$TAG --query 'Subnets[?MapPublicIpOnLaunch==`false`].SubnetId' | jq -r ' [.[]] | join(",")') 58 | echo "private-subnets=$subnets" 59 | 60 | if [ "$CREATED_PROFILE" == "" ]; then 61 | aws eks create-fargate-profile --region $(params.region) --fargate-profile-name $(params.profile-name) --cluster-name $(params.cluster-name) --pod-execution-role-arn $NODE_ROLE_ARN --selectors namespace=$(params.namespace),labels=$(params.labels) --subnets $(params.fargate-subnets) 62 | fi 63 | 64 | while [[ "$(aws eks $ENDPOINT_FLAG describe-fargate-profile --region $(params.region) --cluster-name $(params.cluster-name) --fargate-profile-name $(params.profile-name) --query fargateProfile.status --output text)" == "CREATING" ]]; do 65 | echo "$(params.profile-name) is "CREATING" at $(date)" 66 | sleep 2 67 | done 68 | - name: write-kubeconfig 69 | image: alpine/k8s:1.23.7 70 | script: | 71 | ENDPOINT_FLAG="" 72 | if [ -n "$(params.endpoint)" ]; then 73 | ENDPOINT_FLAG="--endpoint $(params.endpoint)" 74 | fi 75 | aws eks $ENDPOINT_FLAG update-kubeconfig --name $(params.cluster-name) --region $(params.region) 76 | - name: validate-nodes 77 | image: alpine/k8s:1.23.7 78 | script: | 79 | echo "validate fargate nodes" 80 | kubectl patch deployment coredns -n kube-system --type json -p='[{"op": "remove", "path": "/spec/template/metadata/annotations/eks.amazonaws.com~1compute-type"}]' || true 81 | kubectl version 82 | kubectl config current-context 83 | kubectl get pods -n kube-system -o wide 84 | kubectl get ns 85 | kubectl get nodes 86 | kubectl scale deployment/coredns --replicas=$(params.desired-nodes) -n kube-system 87 | echo "Waiting for the fargate nodes to be ready" 88 | while true; do 89 | ready_node=$(kubectl get nodes 2>/dev/null | grep -w Ready | wc -l) 90 | echo "ready-nodes=$ready_node" 91 | if [[ "$ready_node" -eq $(params.desired-nodes) ]]; then break; fi 92 | sleep 5 93 | done 94 | echo "test is passed" 95 | kubectl scale deployment/coredns --replicas=5 -n kube-system
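For reference, a TaskRun along the following lines could exercise this task; the cluster name, role name, and subnet IDs are placeholders rather than values taken from this repository:

```yaml
apiVersion: tekton.dev/v1beta1
kind: TaskRun
metadata:
  generateName: awscli-eks-fargate-create-run-
  namespace: scalability
spec:
  taskRef:
    name: awscli-eks-fargate-create
  params:
    - name: cluster-name
      value: my-test-cluster                     # placeholder
    - name: desired-nodes
      value: "2"
    - name: host-cluster-node-role-name
      value: my-host-cluster-node-role           # placeholder, see the param description above
    - name: fargate-subnets
      value: "subnet-aaaa1111 subnet-bbbb2222"   # placeholder private subnet IDs
  workspaces:
    - name: config
      emptyDir: {}
```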
-------------------------------------------------------------------------------- /tests/assets/aiml-workload/large-sts/sts.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: StatefulSet 3 | metadata: 4 | name: aiml-training-job-{{.Group}} 5 | labels: 6 | app: aiml-training-job 7 | group: {{.Group}} 8 | spec: 9 | serviceName: aiml-training-service 10 | replicas: {{.Replicas}} 11 | podManagementPolicy: Parallel 12 | selector: 13 | matchLabels: 14 | app: aiml-training-job 15 | group: {{.Group}} 16 | template: 17 | metadata: 18 | labels: 19 | app: aiml-training-job 20 | group: {{.Group}} 21 | spec: 22 | nodeSelector: 23 | purpose: ml-large 24 | containers: 25 | - name: app-with-awsapi 26 | image: 953421922360.dkr.ecr.us-west-2.amazonaws.com/aws-cli:2.27.49 27 | imagePullPolicy: IfNotPresent 28 | resources: 29 | requests: 30 | cpu: {{.RequestCPU}} 31 | memory: {{.RequestMemory}} 32 | limits: 33 | cpu: {{.LimitCPU}} 34 | memory: {{.LimitMemory}} 35 | env: 36 | - name: CLUSTER_NAME 37 | value: "{{.ClusterName}}" 38 | command: 39 | - sh 40 | - -c 41 | - | 42 | #ToDo remove this once PIA is compliant with exit criteria 43 | sleep 3600 44 | AUTH_TOKEN=$(cat $AWS_CONTAINER_AUTHORIZATION_TOKEN_FILE) 45 | MAX_ATTEMPTS=7 46 | INITIAL_DELAY=0.2 # 200ms 47 | start_epoch=$(date +%s%3N) 48 | METRIC_MAX_RETRIES=3 49 | METRIC_RETRY_DELAY=1 50 | NAMESPACE=TitanApplicationLatencyForLargeSTS 51 | DIMENSION_NAME=ClusterName 52 | DIMENSION_VALUE={{.ClusterName}} 53 | METRIC_LATENCY_NAME=TitanApplicationLatencyForLargeSTS 54 | 55 | echo "Starting credential fetch and S3 verification process..." 56 | 57 | # Fetch credentials from EKS Pod Identity agent with exponential backoff 58 | for i in $(seq 0 $((MAX_ATTEMPTS - 1))); do 59 | status_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 2 -H "Authorization: $AUTH_TOKEN" http://169.254.170.23/v1/credentials) 60 | if [ "$status_code" -eq 200 ]; then 61 | echo "Successfully fetched credentials at attempt $i" 62 | break 63 | fi 64 | 65 | if [ "$i" -eq $((MAX_ATTEMPTS - 1)) ]; then 66 | echo "Failed to fetch credentials after $MAX_ATTEMPTS attempts. Exiting." 67 | exit 1 68 | fi 69 | 70 | SLEEP_TIME=$(echo "$INITIAL_DELAY * (2 ^ $i)" | bc -l) 71 | echo "Credential fetch failed. Sleeping $SLEEP_TIME seconds before retry..." 72 | sleep "$SLEEP_TIME" 73 | done 74 | 75 | # Verify S3 access 76 | echo "Verifying S3 access..." 77 | while ! aws s3 ls; do 78 | echo "Waiting for S3 bucket access..." 79 | sleep 5 80 | done 81 | echo "S3 bucket is accessible, proceeding." 82 | 83 | # Calculate total latency for credential fetch + S3 verification 84 | end_epoch=$(date +%s%3N) 85 | latency_ms=$((end_epoch - start_epoch)) 86 | latency_sec=$(awk "BEGIN { print $latency_ms / 1000 }") 87 | 88 | echo "Total operation latency: ${latency_sec} seconds (credential fetch + S3 verification)" 89 | 90 | # Send combined operation latency metric 91 | for ((j=1; j<=METRIC_MAX_RETRIES; j++)); do 92 | aws cloudwatch put-metric-data \ 93 | --namespace "$NAMESPACE" \ 94 | --metric-name "$METRIC_LATENCY_NAME" \ 95 | --dimensions "$DIMENSION_NAME=$DIMENSION_VALUE" \ 96 | --value "$latency_sec" \ 97 | --unit Seconds && { 98 | echo "Metric $METRIC_LATENCY_NAME sent successfully with value: ${latency_sec}s" 99 | break 100 | } 101 | 102 | if [ "$j" -lt "$METRIC_MAX_RETRIES" ]; then 103 | echo "Attempt $j failed. Retrying in $METRIC_RETRY_DELAY seconds..." 
>&2 104 | sleep $METRIC_RETRY_DELAY 105 | METRIC_RETRY_DELAY=$((METRIC_RETRY_DELAY * 2)) # exponential backoff 106 | else 107 | echo "Failed to send metric $METRIC_LATENCY_NAME after $METRIC_MAX_RETRIES attempts." >&2 108 | exit 1 109 | fi 110 | done 111 | 112 | echo "Operation completed successfully. Keeping pod alive..." 113 | # Keep pod alive 114 | while true; do 115 | echo "Sleeping for 1 hour..." 116 | sleep 3600 117 | done -------------------------------------------------------------------------------- /tests/tekton-resources/tasks/setup/karpenter/kubectl-nodepools.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: tekton.dev/v1beta1 2 | kind: Task 3 | metadata: 4 | name: create-nodepool 5 | namespace: scalability 6 | spec: 7 | description: | 8 | Creates Karpenter NodePool resources for each availability zone in the specified AWS region. 9 | This task downloads a nodepool template, customizes it for each AZ, and applies it to the cluster. 10 | params: 11 | - name: cluster-name 12 | description: The name of the EKS cluster where nodepools will be created 13 | - name: endpoint 14 | description: EKS cluster endpoint URL for kubectl configuration 15 | - name: aws-region 16 | description: AWS region where the cluster is located (used for AZ discovery) 17 | default: us-west-2 18 | - name: karpenter-nodepool-url 19 | description: URL of the nodepool YAML template to download and customize 20 | workspaces: 21 | - name: source 22 | mountPath: /src/karpenter/ 23 | steps: 24 | - name: create-nodepools 25 | image: alpine/k8s:1.30.2 26 | script: | 27 | echo "Starting Karpenter NodePool Creation Task" 28 | 29 | # Configure kubeconfig to connect to the EKS cluster 30 | echo "[INFO] Configuring kubeconfig for cluster access..." 31 | aws eks update-kubeconfig --name $(params.cluster-name) --endpoint $(params.endpoint) 32 | if [ $? 
-eq 0 ]; then 33 | echo "[SUCCESS] Successfully configured kubeconfig" 34 | else 35 | echo "[ERROR] Failed to configure kubeconfig" 36 | exit 1 37 | fi 38 | 39 | # Set cluster name environment variable for template substitution 40 | export CLUSTER_NAME=$(params.cluster-name) 41 | 42 | # Download the nodepool template 43 | echo "" 44 | echo "[INFO] Downloading nodepool template from: $(params.karpenter-nodepool-url)" 45 | curl -fsSL $(params.karpenter-nodepool-url) -o $(workspaces.source.path)nodepool.yaml 46 | 47 | # Display the downloaded template for verification 48 | echo "" 49 | echo "[INFO] Downloaded nodepool template content:" 50 | echo "----------------------------------------" 51 | cat $(workspaces.source.path)nodepool.yaml 52 | echo "----------------------------------------" 53 | 54 | # Discover availability zones and create nodepools 55 | echo "" 56 | echo "[INFO] Discovering availability zones in region: $(params.aws-region)" 57 | 58 | # Get list of availability zones 59 | AZ_LIST=$(aws ec2 describe-availability-zones --region $(params.aws-region) --query 'AvailabilityZones[].ZoneName' --output json | jq -r '.[]') 60 | AZ_COUNT=$(echo "$AZ_LIST" | wc -l) 61 | 62 | echo "[INFO] Found $AZ_COUNT availability zones:" 63 | echo "$AZ_LIST" | sed 's/^/ - /' 64 | echo "" 65 | 66 | # Process each availability zone 67 | NODEPOOL_COUNT=0 68 | echo "$AZ_LIST" | while read -r az; do 69 | if [ -z "$az" ]; then 70 | continue 71 | fi 72 | 73 | NODEPOOL_COUNT=$((NODEPOOL_COUNT + 1)) 74 | export AZ=$az 75 | 76 | echo "[INFO] Creating nodepool for availability zone: $az" 77 | 78 | # Generate AZ-specific nodepool configuration 79 | echo "[INFO] Generating nodepool configuration for $az..." 80 | envsubst < $(workspaces.source.path)nodepool.yaml > $(workspaces.source.path)nodepool-${az}.yaml 81 | 82 | # Display the generated configuration 83 | echo "[INFO] Generated nodepool configuration for $az:" 84 | echo "----------------------------------------" 85 | cat $(workspaces.source.path)nodepool-${az}.yaml | sed 's/^/ /' 86 | echo "----------------------------------------" 87 | 88 | # Apply the nodepool configuration 89 | echo "[INFO] Applying nodepool configuration for $az..." 90 | kubectl apply -f $(workspaces.source.path)nodepool-${az}.yaml 91 | echo "" 92 | done 93 | 94 | # Verify the created nodepools 95 | echo "[INFO] Verifying created nodepools..." 
96 | echo "==================================" 97 | 98 | NODEPOOL_LIST=$(kubectl get nodepool --no-headers 2>/dev/null | wc -l) 99 | echo "[SUCCESS] Found $NODEPOOL_LIST nodepool(s) in the cluster" 100 | echo "" 101 | echo "[INFO] Current nodepool status:" 102 | kubectl get nodepool -o wide 103 | echo "" 104 | echo "----------------------------------------" 105 | echo "[INFO] Detailed nodepool configuration:" 106 | kubectl get nodepool -o yaml 107 | echo "----------------------------------------" 108 | 109 | echo "" 110 | echo "==========================================" 111 | echo "Karpenter NodePool Creation Completed" 112 | echo "==========================================" 113 | -------------------------------------------------------------------------------- /tests/assets/eks-pod-identity/config.yaml: -------------------------------------------------------------------------------- 1 | {{$clusterName := DefaultParam .CL2_CLUSTER_NAME "default-cluster-name"}} 2 | {{$metricDimensionName := DefaultParam .CL2_METRIC_DIMENSION_NAME "ClusterName"}} 3 | {{$metricNamespace := DefaultParam .CL2_METRIC_NAMESPACE "EKSPodIdentityScalabilityTests"}} 4 | {{$metricLatencyName := DefaultParam .CL2_METRIC_LATENCY_NAME "CredentialFetchLatency"}} 5 | {{$metricPeriod := DefaultParam .CL2_METRIC_PERIOD 300}} 6 | {{$namespacePrefix := DefaultParam .CL2_NAMESPACE_PREFIX "default"}} 7 | {{$namespaceCount := DefaultParam .CL2_NAMESPACE_COUNT 1}} 8 | {{$totalEksPodIdentityPods := DefaultParam .CL2_EKS_POD_IDENTITY_PODS 5000}} 9 | {{$podImage := DefaultParam .CL2_POD_IMAGE "public.ecr.aws/aws-cli/aws-cli:2.27.56"}} 10 | {{$timeoutEksPodIdentityPodCreation := DefaultParam .CL2_TIMEOUT_EKS_POD_IDENTITY_POD_CREATION "20m"}} 11 | {{$timeoutEksPodIdentityPodStartup := DefaultParam .CL2_TIMEOUT_EKS_POD_IDENTITY_POD_STARTUP "5m"}} 12 | {{$timeoutImagePreload := DefaultParam .CL2_TIMEOUT_IMAGE_PRELOAD "15m"}} 13 | {{$defaultQps := DefaultParam .CL2_DEFAULT_QPS 500}} 14 | {{$defaultBurst := DefaultParam .CL2_DEFAULT_BURST 1000}} 15 | {{$uniformQps := DefaultParam .CL2_UNIFORM_QPS 500}} 16 | 17 | {{$SCHEDULER_THROUGHPUT_THRESHOLD := DefaultParam .CL2_SCHEDULER_THROUGHPUT_THRESHOLD 100}} 18 | 19 | name: eks-pod-identity 20 | tuningSets: 21 | # default is a tuningset that is meant to be used when we don't have any specific requirements on pace of operations. 
22 | - name: default 23 | globalQPSLoad: 24 | qps: {{$defaultQps}} 25 | burst: {{$defaultBurst}} 26 | - name: UniformQPS 27 | qpsLoad: 28 | qps: {{$uniformQps}} 29 | steps: 30 | - name: Creating image preload measurements 31 | measurements: 32 | - Method: WaitForControlledPodsRunning 33 | Instances: 34 | - Identifier: WaitForRunningDaemonSets 35 | Params: 36 | apiVersion: apps/v1 37 | kind: DaemonSet 38 | Params: 39 | action: start 40 | labelSelector: group = image-preload 41 | operationTimeout: {{$timeoutImagePreload}} 42 | - name: Create image preload daemonset 43 | phases: 44 | - namespaceRange: 45 | min: 1 46 | max: 1 47 | replicasPerNamespace: 1 48 | tuningSet: default 49 | objectBundle: 50 | - basename: daemonset 51 | objectTemplatePath: pod-image-preload.yaml 52 | templateFillMap: 53 | PodImage: {{$podImage}} 54 | - name: Waiting for image preload daemonset to be completed 55 | measurements: 56 | - Method: WaitForControlledPodsRunning 57 | Instances: 58 | - Identifier: WaitForRunningDaemonSets 59 | Params: 60 | action: gather 61 | - name: Creating eks pod identity measurements 62 | measurements: 63 | - Identifier: EksPodIdentityPodStartupLatency 64 | Method: PodStartupLatency 65 | Params: 66 | action: start 67 | labelSelector: group = eks-pod-identity 68 | threshold: {{$timeoutEksPodIdentityPodStartup}} 69 | # a pod identity association with (namespace: default, sa: default) is created as prerequisite 70 | - name: Create eks pod identity pods 71 | phases: 72 | - namespaceRange: 73 | min: 1 74 | max: {{$namespaceCount}} 75 | baseName: {{$namespacePrefix}} 76 | replicasPerNamespace: {{$totalEksPodIdentityPods}} 77 | tuningSet: UniformQPS 78 | objectBundle: 79 | - basename: eks-pod-identity 80 | objectTemplatePath: pod-default.yaml 81 | templateFillMap: 82 | Group: eks-pod-identity 83 | ClusterName: {{$clusterName}} 84 | PodImage: {{$podImage}} 85 | MetricDimensionName: {{$metricDimensionName}} 86 | MetricNamespace: {{$metricNamespace}} 87 | MetricLatencyName: {{$metricLatencyName}} 88 | MetricPeriod: {{$metricPeriod}} 89 | - name: Waiting for eks pod identity pods to be created 90 | measurements: 91 | - Identifier: WaitForEksPodIdentityPods 92 | Method: WaitForRunningPods 93 | Params: 94 | action: gather 95 | timeout: {{$timeoutEksPodIdentityPodCreation}} 96 | desiredPodCount: {{$totalEksPodIdentityPods}} 97 | labelSelector: group = eks-pod-identity 98 | - name: Collecting eks pod identity measurements 99 | measurements: 100 | - Identifier: EksPodIdentityPodStartupLatency 101 | Method: PodStartupLatency 102 | Params: 103 | action: gather 104 | - name: Delete eks pod identity pods 105 | phases: 106 | - namespaceRange: 107 | min: 1 108 | max: {{$namespaceCount}} 109 | baseName: {{$namespacePrefix}} 110 | replicasPerNamespace: 0 111 | tuningSet: default 112 | objectBundle: 113 | - basename: eks-pod-identity 114 | objectTemplatePath: pod-default.yaml 115 | templateFillMap: 116 | Group: eks-pod-identity 117 | ClusterName: {{$clusterName}} 118 | MetricDimensionName: {{$metricDimensionName}} 119 | MetricNamespace: {{$metricNamespace}} 120 | MetricLatencyName: {{$metricLatencyName}} 121 | MetricPeriod: {{$metricPeriod}} 122 | -------------------------------------------------------------------------------- /tests/tekton-resources/tasks/setup/karpenter/awscli-node-role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: tekton.dev/v1beta1 2 | kind: Task 3 | metadata: 4 | name: awscli-node-role 5 | namespace: scalability 6 | spec: 7 | 
description: | 8 | Creates the Karpenter Node IAM Role with required policies for EKS worker nodes. 9 | This task creates an IAM role that allows EC2 instances to assume the role and attaches 10 | the necessary AWS managed policies for EKS worker node functionality including container 11 | registry access, CNI networking, and Systems Manager access. 12 | results: 13 | - name: node-role-arn 14 | description: The ARN of the created Karpenter node IAM role 15 | params: 16 | - name: cluster-name 17 | description: The name of the EKS cluster (used to create unique role name) 18 | steps: 19 | - name: create-role 20 | image: alpine/k8s:1.30.2 21 | script: | 22 | echo "Starting Karpenter Node IAM Role Creation Task" 23 | echo "==============================================" 24 | 25 | # Set role name variable for consistency 26 | ROLE_NAME="KarpenterNodeRole-$(params.cluster-name)" 27 | 28 | echo "[INFO] Target role name: $ROLE_NAME" 29 | echo "" 30 | 31 | # Check if the IAM role already exists 32 | echo "[INFO] Checking if IAM role already exists..." 33 | if aws iam get-role --role-name "$ROLE_NAME" >/dev/null 2>&1; then 34 | echo "[INFO] IAM role $ROLE_NAME already exists, skipping creation" 35 | else 36 | echo "[INFO] Creating new IAM role: $ROLE_NAME" 37 | echo "[INFO] Configuring trust policy for EC2 service..." 38 | 39 | # Create the IAM role with trust policy for EC2 40 | aws iam create-role --role-name "$ROLE_NAME" \ 41 | --assume-role-policy-document '{ 42 | "Version": "2012-10-17", 43 | "Statement": [ 44 | { 45 | "Effect": "Allow", 46 | "Principal": { 47 | "Service": "ec2.amazonaws.com" 48 | }, 49 | "Action": "sts:AssumeRole" 50 | } 51 | ] 52 | }' 53 | 54 | echo "[SUCCESS] Successfully created IAM role: $ROLE_NAME" 55 | fi 56 | echo "" 57 | 58 | # Define required AWS managed policies for EKS worker nodes 59 | echo "[INFO] Preparing to attach required AWS managed policies..." 60 | POLICIES=( 61 | "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy" 62 | "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy" 63 | "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly" 64 | "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" 65 | ) 66 | 67 | echo "[INFO] Required policies to attach:" 68 | for policy in "${POLICIES[@]}"; do 69 | echo " - $policy" 70 | done 71 | echo "" 72 | 73 | # Attach required policies to the role 74 | POLICY_COUNT=0 75 | for policy in "${POLICIES[@]}"; do 76 | POLICY_COUNT=$((POLICY_COUNT + 1)) 77 | echo "[INFO] Processing policy $POLICY_COUNT of ${#POLICIES[@]}: $policy" 78 | 79 | # Check if policy is already attached to avoid duplicate attachments 80 | if aws iam list-attached-role-policies --role-name "$ROLE_NAME" --query "AttachedPolicies[?PolicyArn=='$policy'].PolicyArn" --output text | grep -q "$policy"; then 81 | echo "[INFO] Policy already attached, skipping: $policy" 82 | else 83 | echo "[INFO] Attaching policy to role..." 84 | aws iam attach-role-policy --role-name "$ROLE_NAME" --policy-arn "$policy" 85 | echo "[SUCCESS] Successfully attached policy: $policy" 86 | fi 87 | echo "" 88 | done 89 | 90 | # Retrieve and store the role ARN for use by other tasks 91 | echo "[INFO] Retrieving role ARN for task output..." 
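# In a Pipeline, a downstream task could consume this result as $(tasks.awscli-node-role.results.node-role-arn),
# assuming the pipeline task that runs this Task is named awscli-node-role.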
92 | ROLE_ARN=$(aws iam get-role --role-name "$ROLE_NAME" --query 'Role.Arn' --output text) 93 | echo "[INFO] Role ARN: $ROLE_ARN" 94 | 95 | # Write ARN to results file for pipeline consumption 96 | echo "$ROLE_ARN" > $(results.node-role-arn) 97 | echo "[SUCCESS] Role ARN saved to task results" 98 | echo "" 99 | 100 | # Verify final role configuration 101 | echo "[INFO] Verifying final role configuration..." 102 | echo "==========================================" 103 | echo "[INFO] Role details:" 104 | aws iam get-role --role-name "$ROLE_NAME" --query 'Role.{RoleName:RoleName,Arn:Arn,CreateDate:CreateDate}' --output table 105 | echo "" 106 | echo "[INFO] Attached policies:" 107 | aws iam list-attached-role-policies --role-name "$ROLE_NAME" --query 'AttachedPolicies[].{PolicyName:PolicyName,PolicyArn:PolicyArn}' --output table 108 | echo "" 109 | echo "==========================================" 110 | echo "Karpenter Node IAM Role Creation Completed" 111 | echo "==========================================" 112 | -------------------------------------------------------------------------------- /infrastructure/lib/addons/crossplane.ts: -------------------------------------------------------------------------------- 1 | import { Construct } from 'constructs'; 2 | import { aws_iam as iam, Duration, StackProps } from 'aws-cdk-lib'; 3 | import { aws_eks as eks } from 'aws-cdk-lib'; 4 | 5 | export interface CrossplaneProps extends StackProps { 6 | cluster: eks.Cluster 7 | namespace: string 8 | version: string 9 | } 10 | 11 | export class Crossplane extends Construct { 12 | constructor(scope: Construct, id: string, props: CrossplaneProps) { 13 | super(scope, id) 14 | const ns = props.cluster.addManifest('crossplane-namespace', { 15 | apiVersion: 'v1', 16 | kind: 'Namespace', 17 | metadata: { 18 | name: props.namespace 19 | } 20 | }) 21 | 22 | // Controller Role 23 | const sa = props.cluster.addServiceAccount('crossplane-controller-sa', { 24 | name: "crossplane-aws-irsa", 25 | namespace: props.namespace 26 | }) 27 | sa.node.addDependency(ns) 28 | sa.role.attachInlinePolicy(new iam.Policy(this, 'crossplane-aws-policy', { 29 | statements: [ 30 | new iam.PolicyStatement({ 31 | resources: ['*'], 32 | actions: [ 33 | // Write Operations 34 | "iam:*", 35 | "sts:*", 36 | ], 37 | }), 38 | ], 39 | })) 40 | 41 | const chart = props.cluster.addHelmChart('crossplane-chart', { 42 | chart: 'crossplane', 43 | release: 'crossplane', 44 | version: props.version, 45 | repository: 'https://charts.crossplane.io/stable', 46 | namespace: props.namespace, 47 | createNamespace: false, 48 | timeout: Duration.minutes(10), 49 | wait: true, 50 | values: { 51 | tolerations: [ 52 | { 53 | key: 'CriticalAddonsOnly', 54 | operator: 'Exists', 55 | }, 56 | ], 57 | rbacManager: { 58 | tolerations: [ 59 | { 60 | key: 'CriticalAddonsOnly', 61 | operator: 'Exists', 62 | }, 63 | ], 64 | } 65 | } 66 | }) 67 | chart.node.addDependency(ns) 68 | 69 | const controllerConfig = props.cluster.addManifest("crossplane-controller-config", { 70 | apiVersion: 'pkg.crossplane.io/v1alpha1', 71 | kind: 'ControllerConfig', 72 | metadata: { 73 | name: 'aws-config', 74 | annotations: { 75 | 'eks.amazonaws.com/role-arn': sa.role.roleArn 76 | } 77 | }, 78 | spec: { 79 | podSecurityContext: { 80 | 'fsGroup': 2000 81 | }, 82 | tolerations: [ 83 | { 84 | key: 'CriticalAddonsOnly', 85 | operator: 'Exists', 86 | }, 87 | ], 88 | }, 89 | }); 90 | controllerConfig.node.addDependency(chart) 91 | 92 | 93 | const providerManifest = 
props.cluster.addManifest("crossplane-aws-provider", { 94 | apiVersion: 'pkg.crossplane.io/v1', 95 | kind: 'Provider', 96 | metadata: { 97 | name: 'provider-aws', 98 | }, 99 | spec: { 100 | package: 'crossplane/provider-aws:v0.15.0', 101 | controllerConfigRef: { 102 | name: 'aws-config', 103 | }, 104 | }, 105 | }); 106 | providerManifest.node.addDependency(chart) 107 | 108 | // TODO: need to wait for the provider to come up, but can probably do this in flux 109 | // const awsProviderCRDs = new eks.KubernetesObjectValue(this, 'crossplane-aws-provider-crds', { 110 | // cluster: props.cluster, 111 | // objectType: 'crd', 112 | // objectName: 'providerconfigs.aws.crossplane.io', 113 | // jsonPath: '.', 114 | // timeout: Duration.minutes(5), 115 | // }) 116 | // awsProviderCRDs.node.addDependency(providerManifest) 117 | 118 | // const providerConfig = props.cluster.addManifest("crossplane-provider-config", { 119 | // apiVersion: 'aws.crossplane.io/v1beta1', 120 | // kind: 'ProviderConfig', 121 | // metadata: { 122 | // name: 'crossplane-provider-config', 123 | // }, 124 | // spec: { 125 | // credentials: { 126 | // source: 'InjectedIdentity', 127 | // }, 128 | // }, 129 | // }); 130 | // providerConfig.node.addDependency(awsProviderCRDs) 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /tests/tekton-resources/tasks/setup/kitctl/controlplane.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: tekton.dev/v1beta1 3 | kind: Task 4 | metadata: 5 | name: control-plane-setup 6 | namespace: scalability 7 | annotations: 8 | tekton.dev/pipelines.minVersion: "0.17.0" 9 | tekton.dev/categories: Kubernetes 10 | tekton.dev/tags: CLI, kubectl 11 | tekton.dev/displayName: "kubernetes actions" 12 | tekton.dev/platforms: "linux/amd64" 13 | spec: 14 | description: | 15 | Setup a kubernetes control plane in the guest cluster. 
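# Note: apiserver-parameters and etcd-parameters are spliced verbatim into the generated ControlPlane
# manifest as the container "args" value, so they are expected to be a YAML list in flow form,
# e.g. '["--max-requests-inflight=400", "--max-mutating-requests-inflight=200"]' (illustrative flags).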
16 | params: 17 | - name: name 18 | default: "guest" 19 | description: Name of the guest cluster 20 | - name: apiserver-replicas 21 | default: "1" 22 | description: Number of APIserver replicas 23 | - name: apiserver-image 24 | default: "" 25 | description: Image of apiserver 26 | - name: apiserver-parameters 27 | default: "" 28 | description: Parameters of the apiserver container 29 | - name: apiserver-instance-type 30 | default: "m5.16xlarge" 31 | description: Instance type for the apiserver 32 | - name: etcd-replicas 33 | default: "3" 34 | description: Number of ETCD replicas 35 | - name: etcd-image 36 | default: "" 37 | description: Image of ETCD 38 | - name: etcd-parameters 39 | default: "" 40 | description: Parameters of the ETCD container 41 | - name: etcd-instance-type 42 | default: "m5.16xlarge" 43 | description: Instance type for the ETCD 44 | - name: kubernetes-version 45 | default: "1.19" 46 | description: Kubernetes version for the guest cluster 47 | steps: 48 | - name: setup-control-plane 49 | image: bitnami/kubectl:1.24.5 # curl was removed in more recent versions 50 | script: | 51 | #!/bin/bash 52 | echo "Approving KCM requests" 53 | kubectl certificate approve $(kubectl get csr | grep "Pending" | awk '{print $1}') 2>/dev/null || true 54 | namespace=$(kubectl get ns $(params.name) -o yaml 2>/dev/null | grep phase | awk '{print $2}') 55 | if [[ $namespace != "Active" ]] 56 | then 57 | echo "Create namespace" 58 | kubectl create namespace $(params.name) 59 | fi 60 | echo "Setting up control plane" 61 | cat <<EOF > /tmp/controlplane.yaml 62 | apiVersion: kit.k8s.sh/v1alpha1 63 | kind: ControlPlane 64 | metadata: 65 | name: $(params.name) # Desired Cluster name 66 | namespace: $(params.name) 67 | spec: 68 | etcd: 69 | replicas: $(params.etcd-replicas) 70 | spec: 71 | nodeSelector: 72 | node.kubernetes.io/instance-type: $(params.etcd-instance-type) 73 | containers: 74 | - name: etcd 75 | EOF 76 | if [ -n "$(params.etcd-image)" ]; then 77 | cat <<EOF >> /tmp/controlplane.yaml 78 | image: $(params.etcd-image) 79 | EOF 80 | fi 81 | if [ -n "$(params.etcd-parameters)" ]; then 82 | cat <<EOF >> /tmp/controlplane.yaml 83 | args: $(params.etcd-parameters) 84 | EOF 85 | fi 86 | cat <<EOF >> /tmp/controlplane.yaml 87 | master: 88 | apiServer: 89 | replicas: $(params.apiserver-replicas) 90 | spec: 91 | nodeSelector: 92 | node.kubernetes.io/instance-type: $(params.apiserver-instance-type) 93 | containers: 94 | - name: apiserver 95 | EOF 96 | if [ -n "$(params.apiserver-image)" ]; then 97 | cat <<EOF >> /tmp/controlplane.yaml 98 | image: $(params.apiserver-image) 99 | EOF 100 | fi 101 | if [ -n "$(params.apiserver-parameters)" ]; then 102 | cat <<EOF >> /tmp/controlplane.yaml 103 | args: $(params.apiserver-parameters) 104 | EOF 105 | fi 106 | if [ -n "$(params.kubernetes-version)" ]; then 107 | cat <<EOF >> /tmp/controlplane.yaml 108 | kubernetesVersion: "$(params.kubernetes-version)" 109 | EOF 110 | fi 111 | kubectl apply -f /tmp/controlplane.yaml 112 | echo "Getting kube admin config" 113 | while true; do 114 | data=$(kubectl get secret -n $(params.name) $(params.name)-kube-admin-config -ojsonpath='{.data.config}' 2>/dev/null) || true 115 | if [[ ! -z "$data" ]]; then 116 | echo $data | base64 -d > /tmp/kubeconfig 117 | break 118 | fi 119 | sleep 1 120 | done 121 | echo "Waiting for the APIserver endpoint to be ready" 122 | endpoint=$(cat /tmp/kubeconfig | grep server | awk '{print $2}') 123 | while true; do 124 | state=$(curl -k --connect-timeout 5 -s $endpoint/healthz) 2>/dev/null || true 125 | if [[ !
-z "$state" ]]; then 126 | break 127 | fi 128 | sleep 5 129 | done 130 | echo "Installing CNI" 131 | kubectl --kubeconfig=/tmp/kubeconfig apply -f https://raw.githubusercontent.com/aws/amazon-vpc-cni-k8s/release-1.10/config/master/aws-k8s-cni.yaml 132 | echo "Approving KCM requests" 133 | kubectl certificate approve $(kubectl get csr | grep "Pending" | awk '{print $1}') 2>/dev/null || true 134 | -------------------------------------------------------------------------------- /tests/images/toolkit-base/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM alpine 2 | 3 | ARG ARCH 4 | 5 | # Ignore to update versions here 6 | # docker build --no-cache --build-arg KUBECTL_VERSION=${tag} --build-arg HELM_VERSION=${helm} --build-arg KUSTOMIZE_VERSION=${kustomize_version} -t ${image}:${tag} . 7 | ARG HELM_VERSION=3.2.1 8 | ARG KUBECTL_VERSION=1.17.5 9 | ARG KUSTOMIZE_VERSION=v3.8.1 10 | ARG KUBESEAL_VERSION=0.18.1 11 | ARG KREW_VERSION=v0.4.4 12 | ARG VALS_VERSION=0.28.1 13 | ARG KUBECONFORM_VERSION=0.6.3 14 | 15 | # Install helm (latest release) 16 | # ENV BASE_URL="https://storage.googleapis.com/kubernetes-helm" 17 | RUN case `uname -m` in \ 18 | x86_64) ARCH=amd64; ;; \ 19 | armv7l) ARCH=arm; ;; \ 20 | aarch64) ARCH=arm64; ;; \ 21 | ppc64le) ARCH=ppc64le; ;; \ 22 | s390x) ARCH=s390x; ;; \ 23 | *) echo "un-supported arch, exit ..."; exit 1; ;; \ 24 | esac && \ 25 | echo "export ARCH=$ARCH" > /envfile && \ 26 | cat /envfile 27 | 28 | RUN . /envfile && echo $ARCH && \ 29 | apk add --update --no-cache curl ca-certificates bash git && \ 30 | curl -sL https://get.helm.sh/helm-v${HELM_VERSION}-linux-${ARCH}.tar.gz | tar -xvz && \ 31 | mv linux-${ARCH}/helm /usr/bin/helm && \ 32 | chmod +x /usr/bin/helm && \ 33 | rm -rf linux-${ARCH} 34 | 35 | # add helm-diff 36 | RUN helm plugin install https://github.com/databus23/helm-diff --verify=false && rm -rf /tmp/helm-* && \ 37 | rm -rf ~/.cache/helm/plugins/https-github.com-databus23-helm-diff/.git 38 | 39 | # add helm-unittest 40 | RUN helm plugin install https://github.com/helm-unittest/helm-unittest --verify=false && rm -rf /tmp/helm-* 41 | 42 | # add helm-push 43 | RUN helm plugin install https://github.com/chartmuseum/helm-push --verify=false && \ 44 | rm -rf /tmp/helm-* \ 45 | /root/.local/share/helm/plugins/helm-push/testdata \ 46 | /root/.cache/helm/plugins/https-github.com-chartmuseum-helm-push/testdata 47 | 48 | # Install kubectl 49 | RUN . /envfile && echo $ARCH && \ 50 | curl -sLO "https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/${ARCH}/kubectl" && \ 51 | mv kubectl /usr/bin/kubectl && \ 52 | chmod +x /usr/bin/kubectl 53 | 54 | # Install kustomize (latest release) 55 | RUN . /envfile && echo $ARCH && \ 56 | curl -sLO https://github.com/kubernetes-sigs/kustomize/releases/download/kustomize%2F${KUSTOMIZE_VERSION}/kustomize_${KUSTOMIZE_VERSION}_linux_${ARCH}.tar.gz && \ 57 | tar xvzf kustomize_${KUSTOMIZE_VERSION}_linux_${ARCH}.tar.gz && \ 58 | mv kustomize /usr/bin/kustomize && \ 59 | chmod +x /usr/bin/kustomize && \ 60 | rm kustomize_${KUSTOMIZE_VERSION}_linux_${ARCH}.tar.gz 61 | 62 | # Install eksctl (latest version) 63 | RUN . 
/envfile && echo $ARCH && \ 64 | curl -sL "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_$(uname -s)_${ARCH}.tar.gz" | tar xz -C /tmp && \ 65 | mv /tmp/eksctl /usr/bin && \ 66 | chmod +x /usr/bin/eksctl 67 | 68 | # Install awscli 69 | # Temp fix to allow system-wide package installation: 70 | # https://stackoverflow.com/a/76540031/3671801 71 | RUN apk add --update --no-cache py3-pip && \ 72 | pip3 install --break-system-packages --upgrade pip setuptools && \ 73 | pip3 install --break-system-packages awscli && \ 74 | pip3 cache purge 75 | 76 | # Install jq and yq 77 | RUN apk add --update --no-cache jq yq 78 | 79 | # https://docs.aws.amazon.com/eks/latest/userguide/install-aws-iam-authenticator.html 80 | # Install aws-iam-authenticator (latest version) 81 | RUN . /envfile && echo $ARCH && \ 82 | authenticator=$(curl -fs https://api.github.com/repos/kubernetes-sigs/aws-iam-authenticator/releases/latest | jq --raw-output '.name' | sed 's/^v//') && \ 83 | curl -fL https://github.com/kubernetes-sigs/aws-iam-authenticator/releases/download/v${authenticator}/aws-iam-authenticator_${authenticator}_linux_${ARCH} -o /usr/bin/aws-iam-authenticator && \ 84 | chmod +x /usr/bin/aws-iam-authenticator 85 | 86 | # Install gettext for envsubst 87 | RUN apk add --update --no-cache gettext 88 | 89 | # Install kubeseal 90 | RUN . /envfile && echo $ARCH && \ 91 | curl -L https://github.com/bitnami-labs/sealed-secrets/releases/download/v${KUBESEAL_VERSION}/kubeseal-${KUBESEAL_VERSION}-linux-${ARCH}.tar.gz -o - | tar xz -C /usr/bin/ && \ 92 | chmod +x /usr/bin/kubeseal 93 | 94 | # Install vals 95 | RUN . /envfile && echo $ARCH && \ 96 | curl -L https://github.com/helmfile/vals/releases/download/v${VALS_VERSION}/vals_${VALS_VERSION}_linux_${ARCH}.tar.gz -o - | tar xz -C /usr/bin/ && \ 97 | chmod +x /usr/bin/vals 98 | 99 | # Install krew (note: KREW_VERSION already carries the leading "v") 100 | RUN . /envfile && echo $ARCH && \ 101 | curl -fsSLO "https://github.com/kubernetes-sigs/krew/releases/download/${KREW_VERSION}/krew-linux_${ARCH}.tar.gz" && \ 102 | tar zxvf krew-linux_${ARCH}.tar.gz && \ 103 | ./krew-linux_${ARCH} install krew && \ 104 | echo 'export PATH=/root/.krew/bin:$PATH' >> ~/.bashrc && \ 105 | rm krew-linux_${ARCH}.tar.gz 106 | 107 | # Install kubeconform 108 | RUN .
/envfile && echo $ARCH && \ 109 | curl -L https://github.com/yannh/kubeconform/releases/download/v${KUBECONFORM_VERSION}/kubeconform-linux-${ARCH}.tar.gz -o - | tar xz -C /usr/bin/ && \ 110 | chmod +x /usr/bin/kubeconform 111 | 112 | WORKDIR /apps 113 | -------------------------------------------------------------------------------- /infrastructure/k8s-config/clusters/kit-infrastructure/tekton-pipelines/tekton.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: tekton-pipelines 5 | --- 6 | apiVersion: source.toolkit.fluxcd.io/v1beta2 7 | kind: Bucket 8 | metadata: 9 | name: tekton 10 | namespace: tekton-pipelines 11 | spec: 12 | bucketName: tekton-releases 13 | endpoint: storage.googleapis.com 14 | interval: 1m0s 15 | provider: generic 16 | ignore: | 17 | # exclude all 18 | /* 19 | # include releases 20 | !/pipeline/previous/v0.65.6/release.yaml 21 | !/triggers/previous/v0.30.1/release.yaml 22 | !/triggers/previous/v0.30.1/interceptors.yaml 23 | !/dashboard/previous/v0.53.0/release.yaml 24 | --- 25 | apiVersion: kustomize.toolkit.fluxcd.io/v1beta2 26 | kind: Kustomization 27 | metadata: 28 | name: tekton-pipeline 29 | namespace: tekton-pipelines 30 | spec: 31 | interval: 2m0s 32 | path: ./pipeline/previous/v0.65.6 33 | prune: true 34 | sourceRef: 35 | kind: Bucket 36 | name: tekton 37 | patches: 38 | - target: 39 | kind: Deployment 40 | name: tekton-pipelines-controller 41 | namespace: tekton-pipelines 42 | patch: |- 43 | apiVersion: apps/v1 44 | kind: Deployment 45 | metadata: 46 | name: tekton-pipelines-controller 47 | namespace: tekton-pipelines 48 | spec: 49 | template: 50 | spec: 51 | tolerations: 52 | - key: CriticalAddonsOnly 53 | operator: Exists 54 | - target: 55 | kind: Deployment 56 | name: tekton-pipelines-webhook 57 | namespace: tekton-pipelines 58 | patch: |- 59 | apiVersion: apps/v1 60 | kind: Deployment 61 | metadata: 62 | name: tekton-pipelines-webhook 63 | namespace: tekton-pipelines 64 | spec: 65 | template: 66 | spec: 67 | tolerations: 68 | - key: CriticalAddonsOnly 69 | operator: Exists 70 | - target: 71 | kind: ConfigMap 72 | name: config-defaults 73 | namespace: tekton-pipelines 74 | patch: |- 75 | apiVersion: v1 76 | kind: ConfigMap 77 | metadata: 78 | name: config-defaults 79 | namespace: tekton-pipelines 80 | data: 81 | default-task-run-workspace-binding: | 82 | emptyDir: {} 83 | default-timeout-minutes: "240" 84 | --- 85 | apiVersion: kustomize.toolkit.fluxcd.io/v1beta2 86 | kind: Kustomization 87 | metadata: 88 | name: tekton-triggers 89 | namespace: tekton-pipelines 90 | spec: 91 | interval: 2m0s 92 | path: ./triggers/previous/v0.30.1 93 | prune: true 94 | sourceRef: 95 | kind: Bucket 96 | name: tekton 97 | patches: 98 | - target: 99 | kind: Deployment 100 | name: tekton-triggers-controller 101 | namespace: tekton-pipelines 102 | patch: |- 103 | apiVersion: apps/v1 104 | kind: Deployment 105 | metadata: 106 | name: tekton-triggers-controller 107 | namespace: tekton-pipelines 108 | spec: 109 | template: 110 | spec: 111 | tolerations: 112 | - key: CriticalAddonsOnly 113 | operator: Exists 114 | - target: 115 | kind: Deployment 116 | name: tekton-triggers-webhook 117 | namespace: tekton-pipelines 118 | patch: |- 119 | apiVersion: apps/v1 120 | kind: Deployment 121 | metadata: 122 | name: tekton-triggers-webhook 123 | namespace: tekton-pipelines 124 | spec: 125 | template: 126 | spec: 127 | tolerations: 128 | - key: CriticalAddonsOnly 129 | operator: Exists 130 | --- 
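# Each Kustomization in this file follows the same pattern: reconcile a pinned Tekton release
# fetched through the generic "tekton" Bucket source defined above, and patch the resulting
# Deployments with a CriticalAddonsOnly toleration so they can schedule onto nodes carrying
# that taint. A quick reconciliation check (assuming the flux CLI is installed) would be:
#   flux get kustomizations -n tekton-pipelines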
131 | apiVersion: kustomize.toolkit.fluxcd.io/v1beta2 132 | kind: Kustomization 133 | metadata: 134 | name: tekton-interceptors 135 | namespace: tekton-pipelines 136 | spec: 137 | interval: 2m0s 138 | path: ./triggers/previous/v0.30.1 139 | prune: true 140 | sourceRef: 141 | kind: Bucket 142 | name: tekton 143 | patches: 144 | - target: 145 | kind: Deployment 146 | name: tekton-triggers-core-interceptors 147 | namespace: tekton-pipelines 148 | patch: |- 149 | apiVersion: apps/v1 150 | kind: Deployment 151 | metadata: 152 | name: tekton-triggers-core-interceptors 153 | namespace: tekton-pipelines 154 | spec: 155 | template: 156 | spec: 157 | tolerations: 158 | - key: CriticalAddonsOnly 159 | operator: Exists 160 | --- 161 | apiVersion: kustomize.toolkit.fluxcd.io/v1beta2 162 | kind: Kustomization 163 | metadata: 164 | name: tekton-dashboard 165 | namespace: tekton-pipelines 166 | spec: 167 | interval: 2m0s 168 | path: ./dashboard/previous/v0.53.0 169 | prune: true 170 | sourceRef: 171 | kind: Bucket 172 | name: tekton 173 | patches: 174 | - target: 175 | kind: Deployment 176 | name: tekton-dashboard 177 | namespace: tekton-pipelines 178 | patch: |- 179 | apiVersion: apps/v1 180 | kind: Deployment 181 | metadata: 182 | name: tekton-dashboard 183 | namespace: tekton-pipelines 184 | spec: 185 | template: 186 | spec: 187 | tolerations: 188 | - key: CriticalAddonsOnly 189 | operator: Exists 190 | - target: 191 | kind: Service 192 | name: tekton-dashboard 193 | namespace: tekton-pipelines 194 | patch: |- 195 | apiVersion: v1 196 | kind: Service 197 | metadata: 198 | name: tekton-dashboard 199 | namespace: tekton-pipelines 200 | spec: 201 | type: NodePort 202 | --------------------------------------------------------------------------------
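As a usage sketch for the control-plane-setup Task shown earlier: the Task name, namespace, and parameter names below come from the Task definition, while the run name and parameter values are illustrative assumptions; a TaskRun along these lines (created with kubectl create -f because of generateName) would exercise it:

apiVersion: tekton.dev/v1beta1
kind: TaskRun
metadata:
  generateName: control-plane-setup-run-
  namespace: scalability
spec:
  taskRef:
    name: control-plane-setup
  params:
  - name: name
    value: "guest"
  - name: apiserver-replicas
    value: "2"
  - name: etcd-replicas
    value: "3"
  - name: kubernetes-version
    value: "1.19"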