├── bmg.json
├── .vscode
    └── settings.json
├── internal
    ├── version.go
    ├── e2e
    │   ├── doc.go
    │   ├── mpijobs
    │   │   ├── types.go
    │   │   ├── conditions_test.go
    │   │   └── conditions.go
    │   ├── resources.go
    │   ├── ec2.go
    │   ├── health.go
    │   ├── logs.go
    │   └── conditions.go
    ├── deployers
    │   ├── eksctl
    │   │   ├── build.go
    │   │   ├── down.go
    │   │   ├── cluster_config.go
    │   │   └── deployer.go
    │   └── eksapi
    │   │   ├── templates
    │   │       ├── auth_map_role.yaml.template
    │   │       ├── userdata_bootstrap.sh.mimepart.template
    │   │       ├── userdata_bottlerocket.toml.template
    │   │       ├── templates_test.go
    │   │       ├── userdata_nodeadm.yaml.mimepart.template
    │   │       ├── busybox_deployment.yaml.template
    │   │       ├── nvidia_static_cluster_nodepool.yaml.template
    │   │       ├── cloudwatch_agent_infra.yaml
    │   │       ├── cloudwatch-infra.yaml.template
    │   │       └── templates.go
    │   │   ├── vpccni_test.go
    │   │   ├── common_test.go
    │   │   ├── auth_map_role.go
    │   │   ├── common.go
    │   │   ├── logs_ssm_doc.json
    │   │   ├── metrics.go
    │   │   ├── vpccni.go
    │   │   ├── auth_map_role_test.go
    │   │   ├── ami_resolver_test.go
    │   │   ├── aws.go
    │   │   ├── kubeconfig.go
    │   │   ├── userdata.go
    │   │   ├── addons.go
    │   │   └── ami_resolver.go
    ├── util
    │   ├── lang.go
    │   ├── exec.go
    │   ├── version.go
    │   ├── path.go
    │   ├── http.go
    │   ├── http_test.go
    │   └── cloudformation.go
    ├── metrics
    │   ├── noop.go
    │   ├── registry.go
    │   └── cloudwatch.go
    ├── testers
    │   └── ginkgov1
    │   │   ├── README.md
    │   │   └── kubectl
    │   │       └── kubectl.go
    └── awssdk
    │   └── config.go
├── test
    ├── images
    │   ├── nvidia-inference
    │   │   ├── requirements.txt
    │   │   └── Dockerfile
    │   ├── nvidia-training
    │   │   └── requirements.txt
    │   ├── nvidia
    │   │   └── gpu_unit_tests
    │   │   │   ├── tests
    │   │   │       ├── test_sysinfo.sh.data
    │   │   │       │   ├── g6f.2xlarge
    │   │   │       │   │   ├── efa_count.txt
    │   │   │       │   │   ├── gpu_count.txt
    │   │   │       │   │   ├── nvidia_smi_topo.txt
    │   │   │       │   │   ├── numa_topo.txt
    │   │   │       │   │   ├── nvidia_persistence_status.txt
    │   │   │       │   │   └── nvidia_vgpu_license_status.txt
    │   │   │       │   ├── g6f.4xlarge
    │   │   │       │   │   ├── efa_count.txt
    │   │   │       │   │   ├── gpu_count.txt
    │   │   │       │   │   ├── nvidia_smi_topo.txt
    │   │   │       │   │   ├── numa_topo.txt
    │   │   │       │   │   ├── nvidia_persistence_status.txt
    │   │   │       │   │   └── nvidia_vgpu_license_status.txt
    │   │   │       │   ├── g6f.large
    │   │   │       │   │   ├── efa_count.txt
    │   │   │       │   │   ├── gpu_count.txt
    │   │   │       │   │   ├── nvidia_smi_topo.txt
    │   │   │       │   │   ├── numa_topo.txt
    │   │   │       │   │   ├── nvidia_persistence_status.txt
    │   │   │       │   │   └── nvidia_vgpu_license_status.txt
    │   │   │       │   ├── g6f.xlarge
    │   │   │       │   │   ├── efa_count.txt
    │   │   │       │   │   ├── gpu_count.txt
    │   │   │       │   │   ├── nvidia_smi_topo.txt
    │   │   │       │   │   ├── numa_topo.txt
    │   │   │       │   │   ├── nvidia_persistence_status.txt
    │   │   │       │   │   └── nvidia_vgpu_license_status.txt
    │   │   │       │   ├── g5.8xlarge
    │   │   │       │   │   ├── gpu_count.txt
    │   │   │       │   │   ├── nvidia_smi_topo.txt
    │   │   │       │   │   ├── numa_topo.txt
    │   │   │       │   │   └── nvidia_persistence_status.txt
    │   │   │       │   ├── g5g.2xlarge
    │   │   │       │   │   ├── gpu_count.txt
    │   │   │       │   │   ├── nvidia_smi_topo.txt
    │   │   │       │   │   ├── numa_topo.txt
    │   │   │       │   │   └── nvidia_persistence_status.txt
    │   │   │       │   ├── p3.2xlarge
    │   │   │       │   │   ├── gpu_count.txt
    │   │   │       │   │   ├── nvidia_smi_topo.txt
    │   │   │       │   │   ├── numa_topo.txt
    │   │   │       │   │   └── nvidia_persistence_status.txt
    │   │   │       │   ├── g5.48xlarge
    │   │   │       │   │   ├── numa_topo.txt
    │   │   │       │   │   ├── gpu_count.txt
    │   │   │       │   │   ├── nvidia_persistence_status.txt
    │   │   │       │   │   └── nvidia_smi_topo.txt
    │   │   │       │   ├── p4d.24xlarge
    │   │   │       │   │   ├── numa_topo.txt
    │   │   │       │   │   ├── gpu_count.txt
    │   │   │       │   │   ├── nvidia_persistence_status.txt
    │   │   │       │   │   └── nvidia_smi_topo.txt
    │   │   │       │   ├── p4de.24xlarge
    │   │   │       │   │   ├── numa_topo.txt
    │   │   │       │   │   ├── gpu_count.txt
    │   │   │       │   │   ├── nvidia_persistence_status.txt
    │   │   │       │   │   └── nvidia_smi_topo.txt
    │   │   │       │   └── p5.48xlarge
    │   │   │       │   │   ├── numa_topo.txt
    │   │   │       │   │   ├── gpu_count.txt
    │   │   │       │   │   ├── nvidia_persistence_status.txt
    │   │   │       │   │   └── nvidia_smi_topo.txt
    │   │   │       ├── test_basic.sh
    │   │   │       ├── common.sh
    │   │   │       └── test_sysinfo.sh
    │   │   │   ├── unit_test
    │   │   │   └── README.md
    │   ├── neuron
    │   │   └── tests
    │   │   │   ├── singleNodeTest.sh
    │   │   │   ├── testNeuronSingleAllReduce.py
    │   │   │   └── testNeuronMlp.py
    │   └── efa
    │   │   ├── scripts
    │   │       └── unit-test.sh
    │   │   └── Dockerfile
    ├── cases
    │   ├── neuron-training
    │   │   ├── manifests
    │   │   │   ├── training-comm-service.yaml
    │   │   │   └── bert-training.yaml
    │   │   └── vars.go
    │   ├── quick
    │   │   ├── manifests
    │   │   │   └── ulimit.yaml
    │   │   ├── main_test.go
    │   │   ├── node_topology_test.go
    │   │   ├── io_uring_test.go
    │   │   └── limit_test.go
    │   ├── dra
    │   │   └── main_test.go
    │   ├── nvidia-training
    │   │   ├── vars.go
    │   │   └── manifests
    │   │   │   └── bert-training.yaml
    │   ├── disruptive
    │   │   └── main_test.go
    │   ├── nvidia-inference
    │   │   └── manifests
    │   │   │   └── bert-inference.yaml
    │   ├── neuron
    │   │   └── manifests
    │   │   │   ├── single-node-test-neuronx.yaml
    │   │   │   └── multi-node-test-neuron.yaml
    │   ├── neuron-inference
    │   │   ├── vars.go
    │   │   └── manifests
    │   │   │   └── neuron-bert-inference.yaml
    │   ├── nvidia
    │   │   ├── manifests
    │   │   │   ├── job-unit-test-single-node.yaml
    │   │   │   ├── nvidia-driver-capabilities-check.yaml
    │   │   │   ├── job-hpc-benchmarks.yaml
    │   │   │   ├── mpi-job-pytorch-training-single-node.yaml
    │   │   │   ├── mpi-job-nccl-test-multi-node.yaml
    │   │   │   └── daemonset-containerd-check.yaml
    │   │   ├── containerd_test.go
    │   │   └── capabilities_test.go
    │   ├── workload
    │   │   └── main_test.go
    │   └── efa
    │   │   ├── commons.go
    │   │   └── main_test.go
    ├── manifests
    │   ├── raw.go
    │   ├── rendered.go
    │   └── assets
    │   │   ├── k8s-neuron-device-plugin-rbac.yml
    │   │   ├── dcgm-exporter.yaml
    │   │   ├── efa-device-plugin.yaml
    │   │   ├── nvidia-device-plugin.yaml
    │   │   ├── cloudwatch-agent.yaml
    │   │   └── k8s-neuron-device-plugin.yml
    └── common
    │   ├── flags.go
    │   └── resources.go
├── NOTICE
├── .dockerignore
├── .gitignore
├── cmd
    ├── kubetest2-tester-multi
    │   └── main.go
    ├── kubetest2-tester-ginkgo-v1
    │   └── main.go
    ├── kubetest2-eksapi
    │   └── main.go
    ├── kubetest2-eksctl
    │   └── main.go
    └── kubetest2-eksapi-janitor
    │   └── main.go
├── .github
    ├── PULL_REQUEST_TEMPLATE.md
    └── workflows
    │   ├── update-image-tags.yaml
    │   ├── update-nvidia-dependencies.yaml
    │   ├── update-go-dependencies.yaml
    │   ├── update-neuron-dependencies.yaml
    │   └── ci.yaml
├── CODE_OF_CONDUCT.md
├── Makefile
├── external
    └── tools.go
├── Config
├── hack
    ├── update-go-dependencies.sh
    ├── free-disk-space.sh
    ├── update-nvidia-dependencies.sh
    ├── download-kubernetes-binaries.sh
    ├── update-neuron-dependencies.sh
    └── update-image-tags.sh
├── Dockerfile
├── README.md
└── CONTRIBUTING.md


/bmg.json:
--------------------------------------------------------------------------------
1 | {
2 |   "binary_artifacts_only": true
3 | }
4 | 


--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 |     "git.ignoreLimitWarning": true
3 | }


--------------------------------------------------------------------------------
/internal/version.go:
--------------------------------------------------------------------------------
1 | package internal
2 | 
3 | var Version string
4 | 


--------------------------------------------------------------------------------
/test/images/nvidia-inference/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers==4.53.0
2 | numpy==1.26
3 | 


--------------------------------------------------------------------------------
/test/images/nvidia-training/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers==4.53.0
2 | numpy==1.26
3 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/efa_count.txt:
--------------------------------------------------------------------------------
1 | 0
2 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/efa_count.txt:
--------------------------------------------------------------------------------
1 | 0
2 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/efa_count.txt:
--------------------------------------------------------------------------------
1 | 0
2 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/efa_count.txt:
--------------------------------------------------------------------------------
1 | 0
2 | 


--------------------------------------------------------------------------------
/NOTICE:
--------------------------------------------------------------------------------
1 | Awstester
2 | Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
3 | 


--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | .git/
2 | .github/
3 | bin/
4 | CHANGELOG/
5 | Dockerfile
6 | Makefile
7 | aws-k8s-tester
8 | 


--------------------------------------------------------------------------------
/internal/e2e/doc.go:
--------------------------------------------------------------------------------
1 | // Package frameworkext contains extensions to sigs.k8s.io/e2e-framework
2 | package e2e
3 | 


--------------------------------------------------------------------------------
/internal/deployers/eksctl/build.go:
--------------------------------------------------------------------------------
1 | package eksctl
2 | 
3 | // Build is a no-op
4 | func (d *deployer) Build() error {
5 | 	return nil
6 | }
7 | 


--------------------------------------------------------------------------------
/internal/util/lang.go:
--------------------------------------------------------------------------------
1 | package util
2 | 
3 | func Must[T any](t T, err error) T {
4 | 	if err != nil {
5 | 		panic(err)
6 | 	}
7 | 	return t
8 | }
9 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.8xlarge/gpu_count.txt:
--------------------------------------------------------------------------------
1 | name, index, pci.bus_id
2 | NVIDIA A10G, 0, 00000000:00:1E.0
3 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5g.2xlarge/gpu_count.txt:
--------------------------------------------------------------------------------
1 | name, index, pci.bus_id
2 | NVIDIA T4G, 0, 00000000:00:1F.0
3 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/gpu_count.txt:
--------------------------------------------------------------------------------
1 | name, index, pci.bus_id
2 | NVIDIA L4-6Q, 0, 00000000:31:00.0
3 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/gpu_count.txt:
--------------------------------------------------------------------------------
1 | name, index, pci.bus_id
2 | NVIDIA L4-12Q, 0, 00000000:35:00.0
3 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/gpu_count.txt:
--------------------------------------------------------------------------------
1 | name, index, pci.bus_id
2 | NVIDIA L4-3Q, 0, 00000000:31:00.0
3 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/gpu_count.txt:
--------------------------------------------------------------------------------
1 | name, index, pci.bus_id
2 | NVIDIA L4-3Q, 0, 00000000:31:00.0
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | /.DS_Store
 2 | /bin
 3 | /_tmp
 4 | .idea
 5 | *.swp
 6 | /aws-k8s-tester
 7 | */*/.DS_Store
 8 | */.DS_Store
 9 | /_artifacts
10 | /_rundir
11 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p3.2xlarge/gpu_count.txt:
--------------------------------------------------------------------------------
1 | name, index, pci.bus_id
2 | Tesla V100-SXM2-16GB, 0, 00000000:00:1E.0
3 | 


--------------------------------------------------------------------------------
/cmd/kubetest2-tester-multi/main.go:
--------------------------------------------------------------------------------
1 | package main
2 | 
3 | import "github.com/aws/aws-k8s-tester/internal/testers/multi"
4 | 
5 | func main() {
6 | 	multi.Main()
7 | }
8 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.8xlarge/nvidia_smi_topo.txt:
--------------------------------------------------------------------------------
1 | 	[4mGPU0	CPU Affinity	NUMA Affinity	GPU NUMA ID[0m
2 | GPU0	 X 	0-31	0		N/A
3 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5g.2xlarge/nvidia_smi_topo.txt:
--------------------------------------------------------------------------------
1 | 	[4mGPU0	CPU Affinity	NUMA Affinity	GPU NUMA ID[0m
2 | GPU0	 X 	0-7	0		N/A
3 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/nvidia_smi_topo.txt:
--------------------------------------------------------------------------------
1 | 	[4mGPU0	CPU Affinity	NUMA Affinity	GPU NUMA ID[0m
2 | GPU0	 X 	0-7	0		N/A
3 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/nvidia_smi_topo.txt:
--------------------------------------------------------------------------------
1 | 	[4mGPU0	CPU Affinity	NUMA Affinity	GPU NUMA ID[0m
2 | GPU0	 X 	0-1	0		N/A
3 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/nvidia_smi_topo.txt:
--------------------------------------------------------------------------------
1 | 	[4mGPU0	CPU Affinity	NUMA Affinity	GPU NUMA ID[0m
2 | GPU0	 X 	0-3	0		N/A
3 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p3.2xlarge/nvidia_smi_topo.txt:
--------------------------------------------------------------------------------
1 | 	[4mGPU0	CPU Affinity	NUMA Affinity	GPU NUMA ID[0m
2 | GPU0	 X 	0-7	0		N/A
3 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/nvidia_smi_topo.txt:
--------------------------------------------------------------------------------
1 | 	[4mGPU0	CPU Affinity	NUMA Affinity	GPU NUMA ID[0m
2 | GPU0	 X 	0-15	0		N/A
3 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/numa_topo.txt:
--------------------------------------------------------------------------------
1 | /sys/devices/system/node/node0/cpulist:0-1
2 | /sys/devices/system/node/node0/distance:10
3 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.8xlarge/numa_topo.txt:
--------------------------------------------------------------------------------
1 | /sys/devices/system/node/node0/cpulist:0-31
2 | /sys/devices/system/node/node0/distance:10
3 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.8xlarge/nvidia_persistence_status.txt:
--------------------------------------------------------------------------------
1 | name, pci.bus_id, persistence_mode
2 | NVIDIA A10G, 00000000:00:1E.0, Enabled
3 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5g.2xlarge/numa_topo.txt:
--------------------------------------------------------------------------------
1 | /sys/devices/system/node/node0/cpulist:0-7
2 | /sys/devices/system/node/node0/distance:10
3 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5g.2xlarge/nvidia_persistence_status.txt:
--------------------------------------------------------------------------------
1 | name, pci.bus_id, persistence_mode
2 | NVIDIA T4G, 00000000:00:1F.0, Enabled
3 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/numa_topo.txt:
--------------------------------------------------------------------------------
1 | /sys/devices/system/node/node0/cpulist:0-7
2 | /sys/devices/system/node/node0/distance:10
3 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/numa_topo.txt:
--------------------------------------------------------------------------------
1 | /sys/devices/system/node/node0/cpulist:0-15
2 | /sys/devices/system/node/node0/distance:10
3 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/nvidia_persistence_status.txt:
--------------------------------------------------------------------------------
1 | name, pci.bus_id, persistence_mode
2 | NVIDIA L4-3Q, 00000000:31:00.0, Enabled
3 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/numa_topo.txt:
--------------------------------------------------------------------------------
1 | /sys/devices/system/node/node0/cpulist:0-3
2 | /sys/devices/system/node/node0/distance:10
3 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p3.2xlarge/numa_topo.txt:
--------------------------------------------------------------------------------
1 | /sys/devices/system/node/node0/cpulist:0-7
2 | /sys/devices/system/node/node0/distance:10
3 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/nvidia_persistence_status.txt:
--------------------------------------------------------------------------------
1 | name, pci.bus_id, persistence_mode
2 | NVIDIA L4-6Q, 00000000:31:00.0, Enabled
3 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/nvidia_persistence_status.txt:
--------------------------------------------------------------------------------
1 | name, pci.bus_id, persistence_mode
2 | NVIDIA L4-12Q, 00000000:35:00.0, Enabled
3 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/nvidia_persistence_status.txt:
--------------------------------------------------------------------------------
1 | name, pci.bus_id, persistence_mode
2 | NVIDIA L4-3Q, 00000000:31:00.0, Enabled
3 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p3.2xlarge/nvidia_persistence_status.txt:
--------------------------------------------------------------------------------
1 | name, pci.bus_id, persistence_mode
2 | Tesla V100-SXM2-16GB, 00000000:00:1E.0, Enabled
3 | 


--------------------------------------------------------------------------------
/cmd/kubetest2-tester-ginkgo-v1/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"github.com/aws/aws-k8s-tester/internal/testers/ginkgov1"
 5 | )
 6 | 
 7 | func main() {
 8 | 	ginkgov1.Main()
 9 | }
10 | 


--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | *Issue #, if available:*
2 | 
3 | *Description of changes:*
4 | 
5 | 
6 | By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license.
7 | 


--------------------------------------------------------------------------------
/internal/deployers/eksapi/templates/auth_map_role.yaml.template:
--------------------------------------------------------------------------------
1 | 
2 | - username: system:node:{{"{{"}}{{.NodeNameStrategy}}{{"}}"}} 
3 |   groups:
4 |     - system:bootstrappers
5 |     - system:nodes
6 |   rolearn: {{.Rolearn}}


--------------------------------------------------------------------------------
/test/cases/neuron-training/manifests/training-comm-service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   name: training
 5 |   labels:
 6 |     app: training
 7 | spec:
 8 |   clusterIP: None
 9 |   selector:
10 |     job-name: bert-training
11 | 


--------------------------------------------------------------------------------
/cmd/kubetest2-eksapi/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"github.com/aws/aws-k8s-tester/internal/deployers/eksapi"
 5 | 	"sigs.k8s.io/kubetest2/pkg/app"
 6 | )
 7 | 
 8 | func main() {
 9 | 	app.Main(eksapi.DeployerName, eksapi.NewDeployer)
10 | }
11 | 


--------------------------------------------------------------------------------
/cmd/kubetest2-eksctl/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"github.com/aws/aws-k8s-tester/internal/deployers/eksctl"
 5 | 	"sigs.k8s.io/kubetest2/pkg/app"
 6 | )
 7 | 
 8 | func main() {
 9 | 	app.Main(eksctl.DeployerName, eksctl.NewDeployer)
10 | }
11 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/nvidia_vgpu_license_status.txt:
--------------------------------------------------------------------------------
1 |     vGPU Software Licensed Product
2 |         Product Name                      : NVIDIA RTX Virtual Workstation
3 |         License Status                    : Licensed (Expiry: N/A)
4 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/nvidia_vgpu_license_status.txt:
--------------------------------------------------------------------------------
1 |     vGPU Software Licensed Product
2 |         Product Name                      : NVIDIA RTX Virtual Workstation
3 |         License Status                    : Licensed (Expiry: N/A)
4 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/nvidia_vgpu_license_status.txt:
--------------------------------------------------------------------------------
1 |     vGPU Software Licensed Product
2 |         Product Name                      : NVIDIA RTX Virtual Workstation
3 |         License Status                    : Licensed (Expiry: N/A)
4 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/nvidia_vgpu_license_status.txt:
--------------------------------------------------------------------------------
1 |     vGPU Software Licensed Product
2 |         Product Name                      : NVIDIA RTX Virtual Workstation
3 |         License Status                    : Licensed (Expiry: N/A)
4 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.48xlarge/numa_topo.txt:
--------------------------------------------------------------------------------
1 | /sys/devices/system/node/node0/cpulist:0-47,96-143
2 | /sys/devices/system/node/node1/cpulist:48-95,144-191
3 | /sys/devices/system/node/node0/distance:10 32
4 | /sys/devices/system/node/node1/distance:32 10
5 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4d.24xlarge/numa_topo.txt:
--------------------------------------------------------------------------------
1 | /sys/devices/system/node/node0/cpulist:0-23,48-71
2 | /sys/devices/system/node/node1/cpulist:24-47,72-95
3 | /sys/devices/system/node/node0/distance:10 21
4 | /sys/devices/system/node/node1/distance:21 10
5 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4de.24xlarge/numa_topo.txt:
--------------------------------------------------------------------------------
1 | /sys/devices/system/node/node0/cpulist:0-23,48-71
2 | /sys/devices/system/node/node1/cpulist:24-47,72-95
3 | /sys/devices/system/node/node0/distance:10 21
4 | /sys/devices/system/node/node1/distance:21 10
5 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p5.48xlarge/numa_topo.txt:
--------------------------------------------------------------------------------
1 | /sys/devices/system/node/node0/cpulist:0-47,96-143
2 | /sys/devices/system/node/node1/cpulist:48-95,144-191
3 | /sys/devices/system/node/node0/distance:10 32
4 | /sys/devices/system/node/node1/distance:32 10
5 | 


--------------------------------------------------------------------------------
/test/images/neuron/tests/singleNodeTest.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | torchrun --nproc_per_node=2 --nnodes=1 tests/testNeuronSingleAllReduce.py
6 | torchrun --nproc_per_node=2 --nnodes=1 tests/testNeuronParallelState.py
7 | torchrun --nproc_per_node=2 --nnodes=1 tests/testNeuronMlp.py


--------------------------------------------------------------------------------
/internal/util/exec.go:
--------------------------------------------------------------------------------
 1 | package util
 2 | 
 3 | import (
 4 | 	"os"
 5 | 	"os/exec"
 6 | )
 7 | 
 8 | func ExecuteCommand(name string, args ...string) error {
 9 | 	command := exec.Command(name, args...)
10 | 	command.Stdout = os.Stdout
11 | 	command.Stderr = os.Stderr
12 | 	return command.Run()
13 | }
14 | 


--------------------------------------------------------------------------------
/test/cases/quick/manifests/ulimit.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Pod
 3 | metadata:
 4 |   name: ulimit
 5 | spec:
 6 |   restartPolicy: Never
 7 |   containers:
 8 |   - name: al2023
 9 |     image: public.ecr.aws/amazonlinux/amazonlinux:2023
10 |     command: ["ulimit"]
11 |     args:
12 |       - -a
13 | 


--------------------------------------------------------------------------------
/internal/deployers/eksapi/templates/userdata_bootstrap.sh.mimepart.template:
--------------------------------------------------------------------------------
1 | Content-Type: text/x-shellscript; charset="us-ascii"
2 | MIME-Version: 1.0
3 | 
4 | #!/usr/bin/env bash
5 | /etc/eks/bootstrap.sh {{.Name}} \
6 |   --b64-cluster-ca {{.CertificateAuthority}} \
7 |   --apiserver-endpoint {{.APIServerEndpoint}}
8 | 


--------------------------------------------------------------------------------
/internal/deployers/eksapi/vpccni_test.go:
--------------------------------------------------------------------------------
 1 | package eksapi
 2 | 
 3 | import (
 4 | 	"encoding/json"
 5 | 	"testing"
 6 | )
 7 | 
 8 | func Test_validVPCCNIDaemonSetPatch(t *testing.T) {
 9 | 	var j json.RawMessage
10 | 	if err := json.Unmarshal([]byte(vpcCNIDaemonSetPatch), &j); err != nil {
11 | 		t.Error(err)
12 | 	}
13 | }
14 | 


--------------------------------------------------------------------------------
/internal/deployers/eksapi/templates/userdata_bottlerocket.toml.template:
--------------------------------------------------------------------------------
1 | [settings.kubernetes]
2 | "cluster-name" = "{{.Name}}"
3 | "api-server" = "{{.APIServerEndpoint}}"
4 | "cluster-certificate" = "{{.CertificateAuthority}}"
5 | device-ownership-from-security-context = true
6 | 
7 | [settings.host-containers.admin]
8 | "enabled" = true
9 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | ## Code of Conduct
2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 
3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 
4 | opensource-codeofconduct@amazon.com with any additional questions or comments.
5 | 


--------------------------------------------------------------------------------
/internal/metrics/noop.go:
--------------------------------------------------------------------------------
 1 | package metrics
 2 | 
 3 | func NewNoopMetricRegistry() MetricRegistry {
 4 | 	return &noopRegistry{}
 5 | }
 6 | 
 7 | type noopRegistry struct{}
 8 | 
 9 | func (r *noopRegistry) Record(spec *MetricSpec, value float64, dimensions map[string]string) {}
10 | 
11 | func (r *noopRegistry) Emit() error {
12 | 	return nil
13 | }
14 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | include ${BGO_MAKEFILE}
 2 | 
 3 | pre-release::
 4 | 	go test -c -tags=e2e ./test/... -o $(GOBIN)
 5 | 	go install sigs.k8s.io/kubetest2/...@latest
 6 | 
 7 | update-deps:
 8 | 	for SCRIPT in ./hack/update-*.sh; do \
 9 |     	"$$SCRIPT" ; \
10 | 	done
11 | 
12 | .PHONY: test-integration
13 | test-integration: ## Run unit and integration tests
14 | 	go test -v -tags=integration ./...
15 | 


--------------------------------------------------------------------------------
/external/tools.go:
--------------------------------------------------------------------------------
 1 | //go:build tools
 2 | // +build tools
 3 | 
 4 | package external
 5 | 
 6 | // this file allows us to declare direct dependencies on our required external tools.
 7 | // this file will not compile! that's expected.
 8 | 
 9 | import (
10 | 	_ "sigs.k8s.io/kubetest2"
11 | 	_ "sigs.k8s.io/kubetest2/kubetest2-tester-exec"
12 | 	_ "sigs.k8s.io/kubetest2/kubetest2-tester-ginkgo"
13 | )
14 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.48xlarge/gpu_count.txt:
--------------------------------------------------------------------------------
 1 | name, index, pci.bus_id
 2 | NVIDIA A10G, 0, 00000000:00:16.0
 3 | NVIDIA A10G, 1, 00000000:00:17.0
 4 | NVIDIA A10G, 2, 00000000:00:18.0
 5 | NVIDIA A10G, 3, 00000000:00:19.0
 6 | NVIDIA A10G, 4, 00000000:00:1A.0
 7 | NVIDIA A10G, 5, 00000000:00:1B.0
 8 | NVIDIA A10G, 6, 00000000:00:1C.0
 9 | NVIDIA A10G, 7, 00000000:00:1D.0
10 | 


--------------------------------------------------------------------------------
/Config:
--------------------------------------------------------------------------------
 1 | # This file is for Amazon internal build processes
 2 | 
 3 | # Copyright 2025 Amazon.com, Inc. or its affiliates.
 4 | # SPDX-License-Identifier: Apache-2.0
 5 | 
 6 | package.Aws-k8s-tester-mirror = {
 7 |     interfaces = (1.0);
 8 | 
 9 |     build-system = bgo-wrap-make;
10 |     build-tools = {
11 |         1.0 = {
12 |             BrazilMakeGo = 3.0;
13 |             GoLang = 1.x;
14 |         };
15 |     };
16 | };
17 | 


--------------------------------------------------------------------------------
/internal/deployers/eksapi/common_test.go:
--------------------------------------------------------------------------------
 1 | package eksapi
 2 | 
 3 | import (
 4 | 	"testing"
 5 | 
 6 | 	"github.com/stretchr/testify/assert"
 7 | )
 8 | 
 9 | func Test_AZ_PRIORITY(t *testing.T) {
10 | 	t.Setenv(AvailabilityZonePriorityEnv, "us-west-2d")
11 | 	assert.Equal(t,
12 | 		[]string{"us-west-2d", "us-west-2b", "us-west-2c"},
13 | 		availabilityZoneHintedOrder([]string{"us-west-2b", "us-west-2c", "us-west-2d"}),
14 | 	)
15 | }
16 | 


--------------------------------------------------------------------------------
/internal/testers/ginkgov1/README.md:
--------------------------------------------------------------------------------
 1 | This tester supports ginkgo 1.x versions, which were used for Kubernetes versions prior to 1.25.
 2 | 
 3 | ---
 4 | 
 5 | This is a fork of the `ginkgo` tester: https://github.com/kubernetes-sigs/kubetest2/tree/master/pkg/testers/ginkgo
 6 | 
 7 | The fork originated at commit `d7fcb799ce84ceda66c8b9b1ec8eefcbe226f293`.
 8 | 
 9 | A copy of the original license is provided in the file named `LICENSE.original`.
10 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/unit_test:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -o errexit
 4 | set -o nounset
 5 | set -o pipefail
 6 | 
 7 | TRACE_LOG=trace.log
 8 | TEST_TIMEOUT=1800
 9 | BASH="/usr/bin/bash"
10 | CURRENT_DIR=$(pwd)
11 | SKIP_TESTS_SUBCOMMAND=${SKIP_TESTS_SUBCOMMAND:-""}
12 | 
13 | timeout -k 10 ${TEST_TIMEOUT} ${BASH} gpu_unit_tests/bash_unit -f tap ${SKIP_TESTS_SUBCOMMAND} -t gpu_unit_tests/${TRACE_LOG} gpu_unit_tests/tests/*test*.sh
14 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.48xlarge/nvidia_persistence_status.txt:
--------------------------------------------------------------------------------
 1 | name, pci.bus_id, persistence_mode
 2 | NVIDIA A10G, 00000000:00:16.0, Enabled
 3 | NVIDIA A10G, 00000000:00:17.0, Enabled
 4 | NVIDIA A10G, 00000000:00:18.0, Enabled
 5 | NVIDIA A10G, 00000000:00:19.0, Enabled
 6 | NVIDIA A10G, 00000000:00:1A.0, Enabled
 7 | NVIDIA A10G, 00000000:00:1B.0, Enabled
 8 | NVIDIA A10G, 00000000:00:1C.0, Enabled
 9 | NVIDIA A10G, 00000000:00:1D.0, Enabled
10 | 


--------------------------------------------------------------------------------
/internal/deployers/eksapi/templates/templates_test.go:
--------------------------------------------------------------------------------
 1 | package templates
 2 | 
 3 | import (
 4 | 	"bytes"
 5 | 	"testing"
 6 | )
 7 | 
 8 | func Test_UnmanagedNodegroup(t *testing.T) {
 9 | 	buf := bytes.Buffer{}
10 | 	err := UnmanagedNodegroup.Execute(&buf, UnmanagedNodegroupTemplateData{
11 | 		KubernetesVersion: "1.28",
12 | 		InstanceTypes: []string{
13 | 			"t2.medium",
14 | 			"t2.large",
15 | 			"t2.xlarge",
16 | 		},
17 | 	})
18 | 	if err != nil {
19 | 		t.Error(err)
20 | 	}
21 | }
22 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4d.24xlarge/gpu_count.txt:
--------------------------------------------------------------------------------
 1 | name, index, pci.bus_id
 2 | NVIDIA A100-SXM4-40GB, 0, 00000000:10:1C.0
 3 | NVIDIA A100-SXM4-40GB, 1, 00000000:10:1D.0
 4 | NVIDIA A100-SXM4-40GB, 2, 00000000:20:1C.0
 5 | NVIDIA A100-SXM4-40GB, 3, 00000000:20:1D.0
 6 | NVIDIA A100-SXM4-40GB, 4, 00000000:90:1C.0
 7 | NVIDIA A100-SXM4-40GB, 5, 00000000:90:1D.0
 8 | NVIDIA A100-SXM4-40GB, 6, 00000000:A0:1C.0
 9 | NVIDIA A100-SXM4-40GB, 7, 00000000:A0:1D.0
10 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4de.24xlarge/gpu_count.txt:
--------------------------------------------------------------------------------
 1 | name, index, pci.bus_id
 2 | NVIDIA A100-SXM4-80GB, 0, 00000000:10:1C.0
 3 | NVIDIA A100-SXM4-80GB, 1, 00000000:10:1D.0
 4 | NVIDIA A100-SXM4-80GB, 2, 00000000:20:1C.0
 5 | NVIDIA A100-SXM4-80GB, 3, 00000000:20:1D.0
 6 | NVIDIA A100-SXM4-80GB, 4, 00000000:90:1C.0
 7 | NVIDIA A100-SXM4-80GB, 5, 00000000:90:1D.0
 8 | NVIDIA A100-SXM4-80GB, 6, 00000000:A0:1C.0
 9 | NVIDIA A100-SXM4-80GB, 7, 00000000:A0:1D.0
10 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p5.48xlarge/gpu_count.txt:
--------------------------------------------------------------------------------
 1 | name, index, pci.bus_id
 2 | NVIDIA H100 80GB HBM3, 0, 00000000:53:00.0
 3 | NVIDIA H100 80GB HBM3, 1, 00000000:64:00.0
 4 | NVIDIA H100 80GB HBM3, 2, 00000000:75:00.0
 5 | NVIDIA H100 80GB HBM3, 3, 00000000:86:00.0
 6 | NVIDIA H100 80GB HBM3, 4, 00000000:97:00.0
 7 | NVIDIA H100 80GB HBM3, 5, 00000000:A8:00.0
 8 | NVIDIA H100 80GB HBM3, 6, 00000000:B9:00.0
 9 | NVIDIA H100 80GB HBM3, 7, 00000000:CA:00.0
10 | 


--------------------------------------------------------------------------------
/internal/awssdk/config.go:
--------------------------------------------------------------------------------
 1 | package awssdk
 2 | 
 3 | import (
 4 | 	"context"
 5 | 
 6 | 	"github.com/aws/aws-sdk-go-v2/aws"
 7 | 	"github.com/aws/aws-sdk-go-v2/config"
 8 | 	"k8s.io/klog/v2"
 9 | )
10 | 
11 | // NewConfig returns an AWS SDK config
12 | // It will panic if the cnfig cannot be created
13 | func NewConfig() aws.Config {
14 | 	c, err := config.LoadDefaultConfig(context.TODO())
15 | 	if err != nil {
16 | 		klog.Fatalf("failed to create AWS SDK config: %v", err)
17 | 	}
18 | 	return c
19 | }
20 | 


--------------------------------------------------------------------------------
/hack/update-go-dependencies.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -o nounset
 4 | set -o errexit
 5 | set -o pipefail
 6 | 
 7 | echo "Updating go modules..."
 8 | go get $(go list -f '{{if not (or .Main .Indirect)}}{{.Path}}{{end}}' -mod=mod -m all) && go mod tidy
 9 | 
10 | echo "Updating kubetest2 image go version..."
11 | MODULE_GO_VERSION=$(go list -m -f "{{if .Main}}{{.GoVersion}}{{end}}" | cut -d'.' -f1-2)
12 | find . -type f -name Dockerfile -exec sed -i "s/\(GO_MINOR_VERSION\)=.*/\1=${MODULE_GO_VERSION}/g" {} +
13 | 


--------------------------------------------------------------------------------
/hack/free-disk-space.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -o nounset
 4 | set -o errexit
 5 | set -o pipefail
 6 | 
 7 | # hack to free up disk space for build
 8 | # ref: https://github.com/easimon/maximize-build-space/blob/master/action.yml
 9 | 
10 | # storage before
11 | sudo df -h
12 | 
13 | sudo rm -rf \
14 |   /usr/share/dotnet \
15 |   /usr/local/lib/android \
16 |   /opt/ghc \
17 |   /opt/hostedtoolcache/CodeQL
18 | 
19 | docker image prune --all --force
20 | docker builder prune -a
21 | 
22 | # storage after
23 | sudo df -h
24 | 


--------------------------------------------------------------------------------
/internal/metrics/registry.go:
--------------------------------------------------------------------------------
 1 | package metrics
 2 | 
 3 | import (
 4 | 	"github.com/aws/aws-sdk-go-v2/service/cloudwatch/types"
 5 | )
 6 | 
 7 | type MetricRegistry interface {
 8 | 	// Record adds a new metric value to the registry
 9 | 	Record(spec *MetricSpec, value float64, dimensions map[string]string)
10 | 	// Emit sends all registered metric values to cloudwatch, emptying the registry
11 | 	Emit() error
12 | }
13 | 
14 | type MetricSpec struct {
15 | 	Namespace string
16 | 	Metric    string
17 | 	Unit      types.StandardUnit
18 | }
19 | 


--------------------------------------------------------------------------------
/internal/e2e/mpijobs/types.go:
--------------------------------------------------------------------------------
 1 | package mpijobs
 2 | 
 3 | import (
 4 | 	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
 5 | 	"k8s.io/apimachinery/pkg/runtime/schema"
 6 | )
 7 | 
 8 | var MPIJobGVK = schema.GroupVersionKind{
 9 | 	Group:   "kubeflow.org",
10 | 	Version: "v2beta1",
11 | 	Kind:    "MPIJob",
12 | }
13 | 
14 | func NewUnstructured(name, namespace string) *unstructured.Unstructured {
15 | 	u := unstructured.Unstructured{}
16 | 	u.SetGroupVersionKind(MPIJobGVK)
17 | 	u.SetName(name)
18 | 	u.SetNamespace(namespace)
19 | 	return &u
20 | }
21 | 


--------------------------------------------------------------------------------
/internal/e2e/resources.go:
--------------------------------------------------------------------------------
 1 | package e2e
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 
 6 | 	v1 "k8s.io/api/core/v1"
 7 | )
 8 | 
 9 | func GetNonZeroResourceCapacity(node *v1.Node, resourceName string) (int, error) {
10 | 	capacity, ok := node.Status.Capacity[v1.ResourceName(resourceName)]
11 | 	if !ok {
12 | 		return 0, fmt.Errorf("node %q has no resource %q", node.Name, resourceName)
13 | 	}
14 | 	if capacity.Value() == 0 {
15 | 		return 0, fmt.Errorf("node %q has zero capacity for resource %q", node.Name, resourceName)
16 | 	}
17 | 	return int(capacity.Value()), nil
18 | }
19 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4d.24xlarge/nvidia_persistence_status.txt:
--------------------------------------------------------------------------------
 1 | name, pci.bus_id, persistence_mode
 2 | NVIDIA A100-SXM4-40GB, 00000000:10:1C.0, Enabled
 3 | NVIDIA A100-SXM4-40GB, 00000000:10:1D.0, Enabled
 4 | NVIDIA A100-SXM4-40GB, 00000000:20:1C.0, Enabled
 5 | NVIDIA A100-SXM4-40GB, 00000000:20:1D.0, Enabled
 6 | NVIDIA A100-SXM4-40GB, 00000000:90:1C.0, Enabled
 7 | NVIDIA A100-SXM4-40GB, 00000000:90:1D.0, Enabled
 8 | NVIDIA A100-SXM4-40GB, 00000000:A0:1C.0, Enabled
 9 | NVIDIA A100-SXM4-40GB, 00000000:A0:1D.0, Enabled
10 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4de.24xlarge/nvidia_persistence_status.txt:
--------------------------------------------------------------------------------
 1 | name, pci.bus_id, persistence_mode
 2 | NVIDIA A100-SXM4-80GB, 00000000:10:1C.0, Enabled
 3 | NVIDIA A100-SXM4-80GB, 00000000:10:1D.0, Enabled
 4 | NVIDIA A100-SXM4-80GB, 00000000:20:1C.0, Enabled
 5 | NVIDIA A100-SXM4-80GB, 00000000:20:1D.0, Enabled
 6 | NVIDIA A100-SXM4-80GB, 00000000:90:1C.0, Enabled
 7 | NVIDIA A100-SXM4-80GB, 00000000:90:1D.0, Enabled
 8 | NVIDIA A100-SXM4-80GB, 00000000:A0:1C.0, Enabled
 9 | NVIDIA A100-SXM4-80GB, 00000000:A0:1D.0, Enabled
10 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p5.48xlarge/nvidia_persistence_status.txt:
--------------------------------------------------------------------------------
 1 | name, pci.bus_id, persistence_mode
 2 | NVIDIA H100 80GB HBM3, 00000000:53:00.0, Enabled
 3 | NVIDIA H100 80GB HBM3, 00000000:64:00.0, Enabled
 4 | NVIDIA H100 80GB HBM3, 00000000:75:00.0, Enabled
 5 | NVIDIA H100 80GB HBM3, 00000000:86:00.0, Enabled
 6 | NVIDIA H100 80GB HBM3, 00000000:97:00.0, Enabled
 7 | NVIDIA H100 80GB HBM3, 00000000:A8:00.0, Enabled
 8 | NVIDIA H100 80GB HBM3, 00000000:B9:00.0, Enabled
 9 | NVIDIA H100 80GB HBM3, 00000000:CA:00.0, Enabled
10 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.48xlarge/nvidia_smi_topo.txt:
--------------------------------------------------------------------------------
 1 | 	[4mGPU0	GPU1	GPU2	GPU3	GPU4	GPU5	GPU6	GPU7	CPU Affinity	NUMA Affinity
 2 | GPU0	 X 	PHB	PHB	PHB	PHB	PHB	PHB	PHB	0-191	0-1
 3 | GPU1	PHB	 X 	PHB	PHB	PHB	PHB	PHB	PHB	0-191	0-1
 4 | GPU2	PHB	PHB	 X 	PHB	PHB	PHB	PHB	PHB	0-191	0-1
 5 | GPU3	PHB	PHB	PHB	 X 	PHB	PHB	PHB	PHB	0-191	0-1
 6 | GPU4	PHB	PHB	PHB	PHB	 X 	PHB	PHB	PHB	0-191	0-1
 7 | GPU5	PHB	PHB	PHB	PHB	PHB	 X 	PHB	PHB	0-191	0-1
 8 | GPU6	PHB	PHB	PHB	PHB	PHB	PHB	 X 	PHB	0-191	0-1
 9 | GPU7	PHB	PHB	PHB	PHB	PHB	PHB	PHB	 X 	0-191	0-1
10 | 


--------------------------------------------------------------------------------
/internal/deployers/eksapi/auth_map_role.go:
--------------------------------------------------------------------------------
 1 | package eksapi
 2 | 
 3 | import (
 4 | 	"bytes"
 5 | 
 6 | 	"github.com/aws/aws-k8s-tester/internal/deployers/eksapi/templates"
 7 | )
 8 | 
 9 | func generateAuthMapRole(nodeNameStrategy string, rolearn string) (string, error) {
10 | 	template := templates.AuthMapRole
11 | 	buf := bytes.Buffer{}
12 | 	if err := template.Execute(&buf, templates.AuthMapRoleTemplateData{
13 | 		NodeNameStrategy: nodeNameStrategy,
14 | 		Rolearn:          rolearn,
15 | 	}); err != nil {
16 | 		return "", err
17 | 	}
18 | 	return buf.String(), nil
19 | }
20 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4d.24xlarge/nvidia_smi_topo.txt:
--------------------------------------------------------------------------------
 1 | 	[4mGPU0	GPU1	GPU2	GPU3	GPU4	GPU5	GPU6	GPU7	CPU Affinity	NUMA Affinity
 2 | GPU0	 X 	NV12	NV12	NV12	NV12	NV12	NV12	NV12	0-23,48-71	0
 3 | GPU1	NV12	 X 	NV12	NV12	NV12	NV12	NV12	NV12	0-23,48-71	0
 4 | GPU2	NV12	NV12	 X 	NV12	NV12	NV12	NV12	NV12	0-23,48-71	0
 5 | GPU3	NV12	NV12	NV12	 X 	NV12	NV12	NV12	NV12	0-23,48-71	0
 6 | GPU4	NV12	NV12	NV12	NV12	 X 	NV12	NV12	NV12	24-47,72-95	1
 7 | GPU5	NV12	NV12	NV12	NV12	NV12	 X 	NV12	NV12	24-47,72-95	1
 8 | GPU6	NV12	NV12	NV12	NV12	NV12	NV12	 X 	NV12	24-47,72-95	1
 9 | GPU7	NV12	NV12	NV12	NV12	NV12	NV12	NV12	 X 	24-47,72-95	1
10 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4de.24xlarge/nvidia_smi_topo.txt:
--------------------------------------------------------------------------------
 1 | 	[4mGPU0	GPU1	GPU2	GPU3	GPU4	GPU5	GPU6	GPU7	CPU Affinity	NUMA Affinity
 2 | GPU0	 X 	NV12	NV12	NV12	NV12	NV12	NV12	NV12	0-23,48-71	0
 3 | GPU1	NV12	 X 	NV12	NV12	NV12	NV12	NV12	NV12	0-23,48-71	0
 4 | GPU2	NV12	NV12	 X 	NV12	NV12	NV12	NV12	NV12	0-23,48-71	0
 5 | GPU3	NV12	NV12	NV12	 X 	NV12	NV12	NV12	NV12	0-23,48-71	0
 6 | GPU4	NV12	NV12	NV12	NV12	 X 	NV12	NV12	NV12	24-47,72-95	1
 7 | GPU5	NV12	NV12	NV12	NV12	NV12	 X 	NV12	NV12	24-47,72-95	1
 8 | GPU6	NV12	NV12	NV12	NV12	NV12	NV12	 X 	NV12	24-47,72-95	1
 9 | GPU7	NV12	NV12	NV12	NV12	NV12	NV12	NV12	 X 	24-47,72-95	1
10 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p5.48xlarge/nvidia_smi_topo.txt:
--------------------------------------------------------------------------------
 1 | 	[4mGPU0	GPU1	GPU2	GPU3	GPU4	GPU5	GPU6	GPU7	CPU Affinity	NUMA Affinity
 2 | GPU0	 X 	NV18	NV18	NV18	NV18	NV18	NV18	NV18	0-47,96-143	0
 3 | GPU1	NV18	 X 	NV18	NV18	NV18	NV18	NV18	NV18	0-47,96-143	0
 4 | GPU2	NV18	NV18	 X 	NV18	NV18	NV18	NV18	NV18	0-47,96-143	0
 5 | GPU3	NV18	NV18	NV18	 X 	NV18	NV18	NV18	NV18	0-47,96-143	0
 6 | GPU4	NV18	NV18	NV18	NV18	 X 	NV18	NV18	NV18	48-95,144-191	1
 7 | GPU5	NV18	NV18	NV18	NV18	NV18	 X 	NV18	NV18	48-95,144-191	1
 8 | GPU6	NV18	NV18	NV18	NV18	NV18	NV18	 X 	NV18	48-95,144-191	1
 9 | GPU7	NV18	NV18	NV18	NV18	NV18	NV18	NV18	 X 	48-95,144-191	1
10 | 


--------------------------------------------------------------------------------
/test/cases/dra/main_test.go:
--------------------------------------------------------------------------------
 1 | //go:build e2e
 2 | 
 3 | package dra
 4 | 
 5 | import (
 6 | 	"context"
 7 | 	_ "embed"
 8 | 	"log"
 9 | 	"os"
10 | 	"os/signal"
11 | 	"testing"
12 | 
13 | 	"sigs.k8s.io/e2e-framework/pkg/env"
14 | 	"sigs.k8s.io/e2e-framework/pkg/envconf"
15 | )
16 | 
17 | var (
18 | 	testenv env.Environment
19 | )
20 | 
21 | func TestMain(m *testing.M) {
22 | 	ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt)
23 | 	defer cancel()
24 | 	cfg, err := envconf.NewFromFlags()
25 | 	if err != nil {
26 | 		log.Fatalf("failed to initialize test environment: %v", err)
27 | 	}
28 | 	testenv = env.NewWithConfig(cfg).WithContext(ctx)
29 | 	os.Exit(testenv.Run(m))
30 | }
31 | 


--------------------------------------------------------------------------------
/test/cases/nvidia-training/vars.go:
--------------------------------------------------------------------------------
 1 | //go:build e2e
 2 | 
 3 | package training
 4 | 
 5 | import (
 6 | 	"github.com/aws/aws-k8s-tester/test/common"
 7 | 	"sigs.k8s.io/e2e-framework/pkg/env"
 8 | )
 9 | 
10 | type Config struct {
11 | 	common.MetricOps
12 | 	BertTrainingImage string `flag:"bertTrainingImage" desc:"Docker image used for BERT training workload"`
13 | 	EfaEnabled        bool   `flag:"efaEnabled" desc:"Enable Elastic Fabric Adapter (EFA)"`
14 | 	NodeType          string `flag:"nodeType" desc:"Instance type for cluster nodes"`
15 | }
16 | 
17 | // Shared global variables
18 | var (
19 | 	testenv    env.Environment
20 | 	testConfig Config
21 | 
22 | 	nodeCount  int
23 | 	gpuPerNode int
24 | 	efaPerNode int
25 | )
26 | 


--------------------------------------------------------------------------------
/test/manifests/raw.go:
--------------------------------------------------------------------------------
 1 | package manifests
 2 | 
 3 | import (
 4 | 	_ "embed"
 5 | )
 6 | 
 7 | var (
 8 | 	//go:embed assets/nvidia-device-plugin.yaml
 9 | 	NvidiaDevicePluginManifest []byte
10 | 	//go:embed assets/mpi-operator.yaml
11 | 	MpiOperatorManifest []byte
12 | 
13 | 	//go:embed assets/efa-device-plugin.yaml
14 | 	EfaDevicePluginManifest []byte
15 | 
16 | 	//go:embed assets/k8s-neuron-device-plugin-rbac.yml
17 | 	NeuronDevicePluginRbacManifest []byte
18 | 	//go:embed assets/k8s-neuron-device-plugin.yml
19 | 	NeuronDevicePluginManifest []byte
20 | 
21 | 	//go:embed assets/dcgm-exporter.yaml
22 | 	DCGMExporterManifest []byte
23 | 
24 | 	//go:embed assets/cloudwatch-agent.yaml
25 | 	cloudWatchAgentManifestTemplate []byte
26 | )
27 | 


--------------------------------------------------------------------------------
/test/manifests/rendered.go:
--------------------------------------------------------------------------------
 1 | package manifests
 2 | 
 3 | import (
 4 | 	"html/template"
 5 | 	"strings"
 6 | 
 7 | 	fwext "github.com/aws/aws-k8s-tester/internal/e2e"
 8 | )
 9 | 
10 | // RenderCloudWatchAgentManifest renders the CloudWatch Agent manifest with dynamic dimensions
11 | func RenderCloudWatchAgentManifest(metricDimensions map[string]string) ([]byte, error) {
12 | 	var keys []string
13 | 	for key := range metricDimensions {
14 | 		keys = append(keys, `"`+key+`"`)
15 | 	}
16 | 	dimensionsStr := strings.Join(keys, ", ")
17 | 	return fwext.RenderManifests(cloudWatchAgentManifestTemplate, map[string]interface{}{
18 | 		"MetricDimensions": metricDimensions,
19 | 		"DimensionKeys":    template.HTML(dimensionsStr),
20 | 	})
21 | }
22 | 


--------------------------------------------------------------------------------
/internal/deployers/eksapi/common.go:
--------------------------------------------------------------------------------
 1 | package eksapi
 2 | 
 3 | import (
 4 | 	"os"
 5 | 	"slices"
 6 | 	"strings"
 7 | )
 8 | 
 9 | const AvailabilityZonePriorityEnv = "EKSAPI_AZ_PRIORITY"
10 | 
11 | func availabilityZoneHintedOrder(availabilityZones []string) []string {
12 | 	var priorityAZs []string
13 | 	if priorityAZsString, ok := os.LookupEnv(AvailabilityZonePriorityEnv); ok {
14 | 		priorityAZs = strings.Split(priorityAZsString, ",")
15 | 	}
16 | 	if len(priorityAZs) == 0 {
17 | 		return availabilityZones
18 | 	}
19 | 	return slices.SortedStableFunc(slices.Values(availabilityZones), func(az1, az2 string) int {
20 | 		if slices.Contains(priorityAZs, az1) {
21 | 			if slices.Contains(priorityAZs, az2) {
22 | 				return 0
23 | 			}
24 | 			return -1
25 | 		}
26 | 		return 0
27 | 	})
28 | }
29 | 


--------------------------------------------------------------------------------
/internal/deployers/eksapi/templates/userdata_nodeadm.yaml.mimepart.template:
--------------------------------------------------------------------------------
 1 | Content-Type: application/node.eks.aws
 2 | MIME-Version: 1.0
 3 | 
 4 | ---
 5 | apiVersion: node.eks.aws/v1alpha1
 6 | kind: NodeConfig
 7 | spec:
 8 | {{- if .NodeadmFeatureGates}}
 9 |   featureGates:
10 |     {{- range $gate, $value := .NodeadmFeatureGates }}
11 |     {{$gate}}: {{$value}}
12 |     {{- end }}
13 | {{- end }}
14 |   cluster:
15 |     name: {{.Name}}
16 |     apiServerEndpoint: {{.APIServerEndpoint}}
17 |     certificateAuthority: {{.CertificateAuthority}}
18 |     cidr: {{.CIDR}}
19 | {{- if .KubeletFeatureGates}}
20 |   kubelet:
21 |     config:
22 |       featureGates:
23 |         {{- range $gate, $value := .KubeletFeatureGates }}
24 |         {{$gate}}: {{$value}}
25 |         {{- end }}
26 | {{- end }}
27 | 


--------------------------------------------------------------------------------
/internal/deployers/eksapi/templates/busybox_deployment.yaml.template:
--------------------------------------------------------------------------------
 1 | apiVersion: apps/v1
 2 | kind: Deployment
 3 | metadata:
 4 |   name: busybox-deployment
 5 | spec:
 6 |   replicas: {{.Nodes}}
 7 |   selector:
 8 |     matchLabels:
 9 |       app: busybox
10 |   template:
11 |     metadata:
12 |       labels:
13 |         app: busybox
14 |     spec:
15 |       affinity:
16 |         podAntiAffinity:
17 |           requiredDuringSchedulingIgnoredDuringExecution:
18 |           - labelSelector:
19 |               matchExpressions:
20 |               - key: app
21 |                 operator: In
22 |                 values:
23 |                 - busybox
24 |             topologyKey: "kubernetes.io/hostname"
25 |       containers:
26 |       - name: busybox
27 |         image: busybox
28 |         command: ["sleep", "infinity"]
29 | 


--------------------------------------------------------------------------------
/internal/util/version.go:
--------------------------------------------------------------------------------
 1 | package util
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"os"
 6 | 	"strings"
 7 | )
 8 | 
 9 | const KubernetesVersionFile = "kubernetes-version.txt"
10 | 
11 | func DetectKubernetesVersion() (string, error) {
12 | 	versionFile, err := LookPath(KubernetesVersionFile)
13 | 	if err != nil {
14 | 		return "", err
15 | 	}
16 | 	bytes, err := os.ReadFile(versionFile)
17 | 	if err != nil {
18 | 		return "", err
19 | 	}
20 | 	// "v1.2.3"
21 | 	versionTag := string(bytes)
22 | 	return strings.ReplaceAll(versionTag, "v", ""), nil
23 | }
24 | 
25 | func ParseMinorVersion(semanticVersion string) (string, error) {
26 | 	parts := strings.Split(semanticVersion, ".")
27 | 	if len(parts) < 2 {
28 | 		return "", fmt.Errorf("malformed semantic version: '%s'", semanticVersion)
29 | 	}
30 | 	return strings.Join(parts[:2], "."), nil
31 | }
32 | 


--------------------------------------------------------------------------------
/internal/deployers/eksapi/logs_ssm_doc.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "schemaVersion": "2.2",
 3 |     "description": "Collect logs from an Amazon Linux EKS node",
 4 |     "parameters": {
 5 |         "s3Destination": {
 6 |             "type": "String"
 7 |         }
 8 |     },
 9 |     "mainSteps": [
10 |         {
11 |             "action": "aws:runShellScript",
12 |             "name": "collectAndUploadLogs",
13 |             "precondition": {
14 |                 "StringEquals": [
15 |                     "platformType",
16 |                     "Linux"
17 |                 ]
18 |             },
19 |             "inputs": {
20 |                 "runCommand": [
21 |                     "bash /etc/eks/log-collector-script/eks-log-collector.sh >/dev/null 2>&1",
22 |                     "aws s3 cp /var/log/eks_i* {{s3Destination}}"
23 |                 ]
24 |             }
25 |         }
26 |     ]
27 | }


--------------------------------------------------------------------------------
/test/cases/quick/main_test.go:
--------------------------------------------------------------------------------
 1 | //go:build e2e
 2 | 
 3 | package quick
 4 | 
 5 | import (
 6 | 	"context"
 7 | 	_ "embed"
 8 | 	"log"
 9 | 	"os"
10 | 	"os/signal"
11 | 	"testing"
12 | 
13 | 	"sigs.k8s.io/e2e-framework/pkg/env"
14 | 	"sigs.k8s.io/e2e-framework/pkg/envconf"
15 | )
16 | 
17 | var (
18 | 	testenv env.Environment
19 | )
20 | 
21 | func TestMain(m *testing.M) {
22 | 	cfg, err := envconf.NewFromFlags()
23 | 	if err != nil {
24 | 		log.Fatalf("failed to initialize test environment: %v", err)
25 | 	}
26 | 	testenv = env.NewWithConfig(cfg)
27 | 	ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt)
28 | 	defer cancel()
29 | 	testenv = testenv.WithContext(ctx)
30 | 
31 | 	testenv.Setup(func(ctx context.Context, config *envconf.Config) (context.Context, error) {
32 | 		log.Println("Starting quick test suite...")
33 | 		return ctx, nil
34 | 	})
35 | 
36 | 	os.Exit(testenv.Run(m))
37 | }
38 | 


--------------------------------------------------------------------------------
/test/cases/disruptive/main_test.go:
--------------------------------------------------------------------------------
 1 | //go:build e2e
 2 | 
 3 | package disruptive
 4 | 
 5 | import (
 6 | 	"context"
 7 | 	_ "embed"
 8 | 	"log"
 9 | 	"os"
10 | 	"os/signal"
11 | 	"testing"
12 | 
13 | 	"sigs.k8s.io/e2e-framework/pkg/env"
14 | 	"sigs.k8s.io/e2e-framework/pkg/envconf"
15 | )
16 | 
17 | var (
18 | 	testenv env.Environment
19 | )
20 | 
21 | func TestMain(m *testing.M) {
22 | 	cfg, err := envconf.NewFromFlags()
23 | 	if err != nil {
24 | 		log.Fatalf("failed to initialize test environment: %v", err)
25 | 	}
26 | 	testenv = env.NewWithConfig(cfg)
27 | 	ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt)
28 | 	defer cancel()
29 | 	testenv = testenv.WithContext(ctx)
30 | 
31 | 	testenv.Setup(func(ctx context.Context, config *envconf.Config) (context.Context, error) {
32 | 		log.Println("Starting quick test suite...")
33 | 		return ctx, nil
34 | 	})
35 | 
36 | 	os.Exit(testenv.Run(m))
37 | }
38 | 


--------------------------------------------------------------------------------
/test/cases/neuron-training/vars.go:
--------------------------------------------------------------------------------
 1 | package training
 2 | 
 3 | import (
 4 | 	"flag"
 5 | 
 6 | 	"sigs.k8s.io/e2e-framework/pkg/env"
 7 | )
 8 | 
 9 | // Shared global variables
10 | var (
11 | 	testenv env.Environment
12 | 
13 | 	bertTrainingImage *string
14 | 	efaEnabled        *bool
15 | 	nodeType          *string
16 | 	nodeCount         int
17 | 	efaPerNode        int
18 | 	neuronPerNode     int
19 | 	neuronCorePerNode int
20 | 	retries           *int
21 | )
22 | 
23 | func init() {
24 | 	// Define command-line flags
25 | 	bertTrainingImage = flag.String("bertTrainingImage", "", "Docker image used for BERT training workload")
26 | 	efaEnabled = flag.Bool("efaEnabled", false, "Enable Elastic Fabric Adapter (EFA)")
27 | 	nodeType = flag.String("nodeType", "", "Instance type for cluster nodes (e.g., inf1.24xlarge)")
28 | 	retries = flag.Int("retries", 2, "Number of retries to attempt before marking the test as failed.")
29 | }
30 | 


--------------------------------------------------------------------------------
/test/cases/nvidia-inference/manifests/bert-inference.yaml:
--------------------------------------------------------------------------------
 1 | # Single-node BERT inference job with GPU. Memory-backed volume for /dev/shm
 2 | apiVersion: batch/v1
 3 | kind: Job
 4 | metadata:
 5 |   name: bert-inference
 6 | spec:
 7 |   backoffLimit: 4
 8 |   template:
 9 |     spec:
10 |       restartPolicy: OnFailure
11 |       volumes:
12 |       - name: dshm
13 |         emptyDir:
14 |           medium: Memory
15 |       containers:
16 |       - name: bert-inference
17 |         image: {{.BertInferenceImage}}
18 |         imagePullPolicy: Always
19 |         command: ["python", "infer.py"]
20 |         env:
21 |         - name: INFERENCE_MODE
22 |           value: "{{.InferenceMode}}"
23 |         volumeMounts:
24 |         - mountPath: /dev/shm
25 |           name: dshm
26 |         resources:
27 |           requests:
28 |             nvidia.com/gpu: {{.GPUPerNode}}
29 |           limits:
30 |             nvidia.com/gpu: {{.GPUPerNode}}
31 | 


--------------------------------------------------------------------------------
/internal/deployers/eksapi/metrics.go:
--------------------------------------------------------------------------------
 1 | package eksapi
 2 | 
 3 | import (
 4 | 	"path"
 5 | 
 6 | 	"github.com/aws/aws-k8s-tester/internal/metrics"
 7 | 	cloudwatchtypes "github.com/aws/aws-sdk-go-v2/service/cloudwatch/types"
 8 | )
 9 | 
10 | var DeployerMetricNamespace = path.Join("kubetest2", DeployerName)
11 | 
12 | var (
13 | 	totalRuntimeSeconds = &metrics.MetricSpec{
14 | 		Namespace: DeployerMetricNamespace,
15 | 		Metric:    "TotalRuntimeSeconds",
16 | 		Unit:      cloudwatchtypes.StandardUnitSeconds,
17 | 	}
18 | 
19 | 	nodeTimeToRegistrationSeconds = &metrics.MetricSpec{
20 | 		Namespace: DeployerMetricNamespace,
21 | 		Metric:    "NodeTimeToRegistrationSeconds",
22 | 		Unit:      cloudwatchtypes.StandardUnitSeconds,
23 | 	}
24 | 
25 | 	nodeTimeToReadySeconds = &metrics.MetricSpec{
26 | 		Namespace: DeployerMetricNamespace,
27 | 		Metric:    "NodeTimeToReadySeconds",
28 | 		Unit:      cloudwatchtypes.StandardUnitSeconds,
29 | 	}
30 | )
31 | 


--------------------------------------------------------------------------------
/cmd/kubetest2-eksapi-janitor/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"context"
 5 | 	"flag"
 6 | 	"time"
 7 | 
 8 | 	"github.com/aws/aws-k8s-tester/internal/deployers/eksapi"
 9 | 	"k8s.io/klog/v2"
10 | )
11 | 
12 | func main() {
13 | 	var maxResourceAge time.Duration
14 | 	flag.DurationVar(&maxResourceAge, "max-resource-age", time.Hour*3, "Maximum resource age")
15 | 	var workers int
16 | 	flag.IntVar(&workers, "workers", 1, "number of workers to processes resources in parallel")
17 | 	var stackStatus string
18 | 	flag.StringVar(&stackStatus, "stack-status", "", "only process stacks with a specific status")
19 | 	var emitMetrics bool
20 | 	flag.BoolVar(&emitMetrics, "emit-metrics", false, "Send metrics to CloudWatch")
21 | 	flag.Parse()
22 | 	j := eksapi.NewJanitor(maxResourceAge, emitMetrics, workers, stackStatus)
23 | 	if err := j.Sweep(context.Background()); err != nil {
24 | 		klog.Fatalf("failed to sweep resources: %v", err)
25 | 	}
26 | }
27 | 


--------------------------------------------------------------------------------
/internal/e2e/mpijobs/conditions_test.go:
--------------------------------------------------------------------------------
 1 | package mpijobs
 2 | 
 3 | import (
 4 | 	"testing"
 5 | 
 6 | 	"github.com/stretchr/testify/assert"
 7 | 	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
 8 | )
 9 | 
10 | func Test_MPIJobSucceeded(t *testing.T) {
11 | 	u := unstructured.Unstructured{
12 | 		Object: map[string]interface{}{
13 | 			"status": map[string]interface{}{
14 | 				"conditions": []interface{}{
15 | 					map[string]interface{}{
16 | 						"type":   "Succeeded",
17 | 						"status": "True",
18 | 					},
19 | 				},
20 | 			},
21 | 		},
22 | 	}
23 | 	assert.True(t, MPIJobSucceeded(&u))
24 | 
25 | 	u = unstructured.Unstructured{
26 | 		Object: map[string]interface{}{
27 | 			"status": map[string]interface{}{
28 | 				"conditions": []interface{}{
29 | 					map[string]interface{}{
30 | 						"type":   "Succeeded",
31 | 						"status": "False",
32 | 					},
33 | 				},
34 | 			},
35 | 		},
36 | 	}
37 | 	assert.False(t, MPIJobSucceeded(&u))
38 | }
39 | 


--------------------------------------------------------------------------------
/internal/util/path.go:
--------------------------------------------------------------------------------
 1 | package util
 2 | 
 3 | import (
 4 | 	"errors"
 5 | 	"os"
 6 | 	"path/filepath"
 7 | 	"syscall"
 8 | )
 9 | 
10 | var ErrFileNotFoundInPath = errors.New("file not found in $PATH")
11 | 
12 | // LookPath finds a file on the PATH.
13 | // It uses a similar process to exec.LookPath, but can find regular files.
14 | func LookPath(file string) (string, error) {
15 | 	path := os.Getenv("PATH")
16 | 	for _, dir := range filepath.SplitList(path) {
17 | 		if dir == "" {
18 | 			// Unix shell semantics: path element "" means "."
19 | 			dir = "."
20 | 		}
21 | 		path := filepath.Join(dir, file)
22 | 		if err := checkFile(path); err == nil {
23 | 			return path, nil
24 | 		}
25 | 	}
26 | 	return "", ErrFileNotFoundInPath
27 | }
28 | 
29 | func checkFile(file string) error {
30 | 	d, err := os.Stat(file)
31 | 	if err != nil {
32 | 		return err
33 | 	}
34 | 	m := d.Mode()
35 | 	if m.IsDir() {
36 | 		return syscall.EISDIR
37 | 	}
38 | 	return nil
39 | }
40 | 


--------------------------------------------------------------------------------
/test/cases/neuron/manifests/single-node-test-neuronx.yaml:
--------------------------------------------------------------------------------
 1 | kind: Job
 2 | apiVersion: batch/v1
 3 | metadata:
 4 |   name: neuronx-single-node
 5 |   labels:
 6 |     app: neuronx-single-node
 7 | spec:
 8 |   template:
 9 |     metadata:
10 |       labels:
11 |         app: neuronx-single-node
12 |     spec:
13 |       containers:
14 |       - name: neuronx-single-node-test
15 |         image: {{.NeuronTestImage}}
16 |         command:
17 |         - /bin/bash
18 |         - ./tests/singleNodeTest.sh
19 |         imagePullPolicy: Always
20 |         resources:
21 |           limits:
22 |             cpu: "4"
23 |             memory: 4Gi
24 |             aws.amazon.com/neuron: "1"
25 |           requests:
26 |             cpu: "1"
27 |             memory: 1Gi
28 |             aws.amazon.com/neuron: "1"
29 |       restartPolicy: Never
30 |       securityContext:
31 |         runAsUser: 1000
32 |         runAsGroup: 2000
33 |         fsGroup: 3000
34 |   backoffLimit: 4
35 | 


--------------------------------------------------------------------------------
/test/cases/neuron-inference/vars.go:
--------------------------------------------------------------------------------
 1 | //go:build e2e
 2 | 
 3 | package inference
 4 | 
 5 | import (
 6 | 	"flag"
 7 | 
 8 | 	"sigs.k8s.io/e2e-framework/pkg/env"
 9 | )
10 | 
11 | // Shared global variables
12 | var (
13 | 	// The e2e-framework environment
14 | 	testenv env.Environment
15 | 
16 | 	// Passed in as flags
17 | 	bertInferenceImage *string
18 | 	nodeType           *string
19 | 	inferenceMode      *string
20 | 
21 | 	// Discovered in main_test.go
22 | 	neuronPerNode     int
23 | 	neuronCorePerNode int
24 | )
25 | 
26 | // init() runs before TestMain and sets up the flags
27 | func init() {
28 | 	bertInferenceImage = flag.String("bertInferenceImage", "",
29 | 		"[REQUIRED] Docker image used for Neuron-based BERT inference")
30 | 	nodeType = flag.String("nodeType", "",
31 | 		"Node type label for K8s nodes, e.g., trn1.32xlarge or inf2.xlarge")
32 | 	inferenceMode = flag.String("inferenceMode", "throughput",
33 | 		"Inference mode for BERT (throughput or latency)")
34 | }
35 | 


--------------------------------------------------------------------------------
/internal/util/http.go:
--------------------------------------------------------------------------------
 1 | package util
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"strings"
 6 | 
 7 | 	"github.com/aws/smithy-go/middleware"
 8 | 	smithyhttp "github.com/aws/smithy-go/transport/http"
 9 | )
10 | 
11 | const httpHeaderBoundary = ": "
12 | 
13 | // NewHTTPHeaderAPIOptions returns a slice of middleware options that adds the
14 | // specified HTTP headers to an API request.
15 | // Each header should be of the format `Header-Key: Header-Value`, in the same manner
16 | // as headers are passed with `curl`-s `-H` flag.
17 | func NewHTTPHeaderAPIOptions(headers []string) ([]func(*middleware.Stack) error, error) {
18 | 	var opts []func(*middleware.Stack) error
19 | 	for _, header := range headers {
20 | 		boundary := strings.Index(header, httpHeaderBoundary)
21 | 		if boundary == -1 {
22 | 			return nil, fmt.Errorf("malformed HTTP header: '%s'", header)
23 | 		}
24 | 		key := header[:boundary]
25 | 		val := header[boundary+len(httpHeaderBoundary):]
26 | 		opts = append(opts, smithyhttp.AddHeaderValue(key, val))
27 | 	}
28 | 	return opts, nil
29 | }
30 | 


--------------------------------------------------------------------------------
/test/cases/neuron-inference/manifests/neuron-bert-inference.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: batch/v1
 2 | kind: Job
 3 | metadata:
 4 |   name: neuron-inference
 5 | spec:
 6 |   backoffLimit: 4
 7 |   template:
 8 |     spec:
 9 |       restartPolicy: OnFailure
10 |       volumes:
11 |         - name: dshm
12 |           emptyDir:
13 |             medium: Memory
14 |       containers:
15 |         - name: neuron-inference
16 |           image: {{.BertInferenceImage}}
17 |           imagePullPolicy: Always
18 |           command: ["python", "/app/infer.py"]
19 |           env:
20 |             - name: INFERENCE_MODE
21 |               value: "{{.InferenceMode}}"
22 |           volumeMounts:
23 |             - mountPath: /dev/shm
24 |               name: dshm
25 |           resources:
26 |             requests:
27 |               aws.amazon.com/neuroncore: "{{.NeuronCorePerNode}}"
28 |             limits:
29 |               aws.amazon.com/neuroncore: "{{.NeuronCorePerNode}}"
30 |           nodeSelector:
31 |             node.kubernetes.io/instance-type: {{.NodeType}}
32 | 


--------------------------------------------------------------------------------
/.github/workflows/update-image-tags.yaml:
--------------------------------------------------------------------------------
 1 | name: "[CI] update-image-tags"
 2 | on:
 3 |   workflow_dispatch:
 4 |   schedule:
 5 |     # once a week
 6 |     - cron: "0 0 * * 0"
 7 | permissions:
 8 |   id-token: write
 9 |   contents: write
10 |   pull-requests: write
11 | jobs:
12 |   update-dependencies:
13 |     runs-on: ubuntu-latest
14 |     if: github.repository == 'aws/aws-k8s-tester'
15 |     steps:
16 |     - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2
17 |     - run: ./hack/update-image-tags.sh
18 |     - uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # 7.0.8
19 |       with:
20 |         branch: update-image-tags
21 |         base: main
22 |         add-paths: |
23 |           test/images/
24 |         commit-message: "chore: update image tags"
25 |         committer: "GitHub <noreply@github.com>"
26 |         author: "GitHub <noreply@github.com>"
27 |         title: "chore: update image tags"
28 |         body: |
29 |           Generated by:
30 |           ```
31 |           ./hack/update-image-tags.sh
32 |           ```
33 | 


--------------------------------------------------------------------------------
/internal/util/http_test.go:
--------------------------------------------------------------------------------
 1 | package util
 2 | 
 3 | import (
 4 | 	"testing"
 5 | )
 6 | 
 7 | func Test_NewHTTPHeaderAPIOptions(t *testing.T) {
 8 | 	testCases := []struct {
 9 | 		name        string
10 | 		headers     []string
11 | 		expectError bool
12 | 	}{
13 | 		{
14 | 			name:    "empty",
15 | 			headers: []string{},
16 | 		},
17 | 		{
18 | 			name:    "single valid header",
19 | 			headers: []string{"Content-Type: application/json"},
20 | 		},
21 | 		{
22 | 			name:    "multiple valid headers",
23 | 			headers: []string{"Content-Type: application/json", "Accept: application/json"},
24 | 		},
25 | 		{
26 | 			name:        "invalid header",
27 | 			headers:     []string{"Invalid header"},
28 | 			expectError: true,
29 | 		},
30 | 	}
31 | 	for _, tc := range testCases {
32 | 		t.Run(tc.name, func(t *testing.T) {
33 | 			_, err := NewHTTPHeaderAPIOptions(tc.headers)
34 | 			if err != nil && !tc.expectError {
35 | 				t.Errorf("unexpected error: %v", err)
36 | 			}
37 | 			if err == nil && tc.expectError {
38 | 				t.Error("expected error but got none")
39 | 			}
40 | 		})
41 | 	}
42 | }
43 | 


--------------------------------------------------------------------------------
/test/cases/nvidia/manifests/job-unit-test-single-node.yaml:
--------------------------------------------------------------------------------
 1 | kind: Job
 2 | apiVersion: batch/v1
 3 | metadata:
 4 |   name: unit-test-job
 5 |   labels:
 6 |     app: unit-test-job
 7 | spec:
 8 |   template:
 9 |     metadata:
10 |       labels:
11 |         app: unit-test-job
12 |     spec:
13 |       containers:
14 |       - name: unit-test-container
15 |         image: {{.NvidiaTestImage}}
16 |         command: 
17 |         - /bin/bash
18 |         - ./gpu_unit_tests/unit_test
19 |         env:
20 |           - name: SKIP_TESTS_SUBCOMMAND
21 |             value: {{.SkipTestSubcommand}}
22 |           # because we started building these from source, this is just a
23 |           # regular binary.
24 |           - name: DEMO_SUITE_DIR
25 |             value: /usr/bin
26 |           - name: EC2_INSTANCE_TYPE
27 |             value: {{.NodeType}}
28 |         imagePullPolicy: Always
29 |         resources:
30 |           limits:
31 |             nvidia.com/gpu: {{.GpuPerNode}}
32 |           requests:
33 |             cpu: "1"
34 |             memory: 1Gi
35 |       restartPolicy: Never
36 |   backoffLimit: 4
37 | 


--------------------------------------------------------------------------------
/.github/workflows/update-nvidia-dependencies.yaml:
--------------------------------------------------------------------------------
 1 | name: "[CI] update-nvidia-dependencies"
 2 | on:
 3 |   workflow_dispatch:
 4 |   schedule:
 5 |     # once a week
 6 |     - cron: "0 0 * * 0"
 7 | permissions:
 8 |   id-token: write
 9 |   contents: write
10 |   pull-requests: write
11 | jobs:
12 |   update-dependencies:
13 |     runs-on: ubuntu-latest
14 |     if: github.repository == 'aws/aws-k8s-tester'
15 |     steps:
16 |     - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2
17 |     - run: ./hack/update-nvidia-dependencies.sh
18 |     - uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # 7.0.8
19 |       with:
20 |         branch: update-nvidia-dependencies
21 |         base: main
22 |         add-paths: |
23 |           test/images/
24 |         commit-message: "chore: update nvidia test dependencies"
25 |         committer: "GitHub <noreply@github.com>"
26 |         author: "GitHub <noreply@github.com>"
27 |         title: "chore: update nvidia test dependencies"
28 |         body: |
29 |           Generated by:
30 |           ```
31 |           ./hack/update-nvidia-dependencies.sh
32 |           ```
33 | 


--------------------------------------------------------------------------------
/internal/deployers/eksapi/templates/nvidia_static_cluster_nodepool.yaml.template:
--------------------------------------------------------------------------------
 1 | apiVersion: karpenter.sh/v1
 2 | kind: NodePool
 3 | metadata:
 4 |   labels:
 5 |     app.kubernetes.io/managed-by: eks
 6 |   name: nvidia
 7 | spec:
 8 |   weight: 50
 9 |   template:
10 |     spec:
11 |       requirements:
12 |         - key: kubernetes.io/arch
13 |           operator: In
14 |           values: [{{.Arch}}]
15 |         - key: kubernetes.io/os
16 |           operator: In
17 |           values: ["linux"]
18 |         - key: karpenter.sh/capacity-type
19 |           operator: In
20 |           values: ["on-demand"]
21 |         - key: node.kubernetes.io/instance-type
22 |           operator: In
23 |           values: 
24 |             {{- range .InstanceTypes}}
25 |             - "{{.}}"
26 |             {{- end}}
27 |         - key: eks.amazonaws.com/instance-gpu-count
28 |           operator: Exists
29 |       nodeClassRef:
30 |         group: eks.amazonaws.com
31 |         kind: NodeClass
32 |         name: default
33 |       expireAfter: 336h 
34 |   disruption:
35 |     budgets:
36 |       - nodes: 10%
37 |     consolidationPolicy: WhenEmpty
38 |     consolidateAfter: 600s
39 | 


--------------------------------------------------------------------------------
/hack/update-nvidia-dependencies.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # following from the last updated dependency:
 4 | # 1. get the latest release of aws-ofi-nccl
 5 | # 2. get the supported version of libnccl
 6 | # 3. get the latest correct cuda version used for libnccl
 7 | 
 8 | set -o nounset
 9 | set -o errexit
10 | set -o pipefail
11 | 
12 | echo "Updating aws-ofi-nccl"
13 | AWS_OFI_NCCL_TAG=$(curl -s https://api.github.com/repos/aws/aws-ofi-nccl/releases/latest | jq -r .tag_name | sed 's/^v//')
14 | find . -type f -name Dockerfile -exec sed -i "s/AWS_OFI_NCCL_VERSION=.*/AWS_OFI_NCCL_VERSION=$AWS_OFI_NCCL_TAG/g" {} +
15 | 
16 | echo "Updating nccl"
17 | LIB_NCCL_TAG=$(curl -s https://api.github.com/repos/aws/aws-ofi-nccl/releases/latest | jq -r .body | grep -oP '\[NCCL \K(\S*)(?=\])' | head -n 1 | sed 's/^v//')
18 | find . -type f -name Dockerfile -exec sed -i "s/LIBNCCL_VERSION=.*/LIBNCCL_VERSION=$LIB_NCCL_TAG/g" {} +
19 | 
20 | echo "Updating nvbandwidth"
21 | NVBANDWIDTH_TAG=$(curl -s https://api.github.com/repos/NVIDIA/nvbandwidth/releases/latest | jq -r .tag_name)
22 | find . -type f -name Dockerfile -exec sed -i "s/NVBANDWIDTH_VERSION=.*/NVBANDWIDTH_VERSION=$NVBANDWIDTH_TAG/g" {} +
23 | 
24 | 


--------------------------------------------------------------------------------
/test/common/flags.go:
--------------------------------------------------------------------------------
 1 | //go:build e2e
 2 | 
 3 | package common
 4 | 
 5 | import (
 6 | 	"flag"
 7 | 	"fmt"
 8 | 	"github.com/urfave/sflags/gen/gpflag"
 9 | 	"github.com/spf13/pflag"
10 | 	"reflect"
11 | )
12 | 
13 | // For CloudWatch metric dimension flag
14 | type MetricOps struct {
15 | 	MetricDimensions map[string]string `flag:"metricDimensions" desc:"CloudWatch metric dimensions as comma-separated key=value pairs"`
16 | }
17 | 
18 | func ParseFlags(config interface{}) (*pflag.FlagSet, error) {
19 | 	flags, err := gpflag.Parse(config)
20 | 	if err != nil {
21 | 		return nil, fmt.Errorf("failed to parse flags: %w", err)
22 | 	}
23 | 
24 | 	// Handle MetricDimensions map that gpflag doesn't support
25 | 	if _, hasField := reflect.TypeOf(config).Elem().FieldByName("MetricDimensions"); hasField {
26 | 		field := reflect.ValueOf(config).Elem().FieldByName("MetricDimensions")
27 | 		metricDims := field.Addr().Interface().(*map[string]string)
28 | 		flags.StringToStringVar(metricDims, "metricDimensions", nil, "CloudWatch metric dimensions as comma-separated key=value pairs")
29 | 	}
30 | 
31 | 	flags.VisitAll(func(pf *pflag.Flag) {
32 | 		flag.CommandLine.Var(pf.Value, pf.Name, pf.Usage)
33 | 	})
34 | 
35 | 	return flags, nil
36 | }


--------------------------------------------------------------------------------
/internal/deployers/eksapi/templates/cloudwatch_agent_infra.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Namespace
 3 | metadata:
 4 |   name: amazon-cloudwatch
 5 |   labels:
 6 |     name: amazon-cloudwatch
 7 | 
 8 | ---
 9 | apiVersion: v1
10 | kind: ServiceAccount
11 | metadata:
12 |   name: cwagent
13 |   namespace: amazon-cloudwatch
14 | 
15 | ---
16 | # ClusterRole for cwagent
17 | apiVersion: rbac.authorization.k8s.io/v1
18 | kind: ClusterRole
19 | metadata:
20 |   name: cwagent-role
21 | rules:
22 |   - apiGroups: [""]
23 |     resources:
24 |       - nodes
25 |       - nodes/proxy
26 |       - services
27 |       - endpoints
28 |       - pods
29 |     verbs: ["get", "list", "watch"]
30 |   - apiGroups: ["extensions"]
31 |     resources:
32 |       - ingresses
33 |     verbs: ["get", "list", "watch"]
34 |   - nonResourceURLs: ["/metrics"]
35 |     verbs: ["get"]
36 | 
37 | ---
38 | # ClusterRoleBinding
39 | apiVersion: rbac.authorization.k8s.io/v1
40 | kind: ClusterRoleBinding
41 | metadata:
42 |   name: cwagent-role-binding
43 | subjects:
44 |   - kind: ServiceAccount
45 |     name: cwagent
46 |     namespace: amazon-cloudwatch
47 | roleRef:
48 |   kind: ClusterRole
49 |   name: cwagent-role
50 |   apiGroup: rbac.authorization.k8s.io


--------------------------------------------------------------------------------
/test/common/resources.go:
--------------------------------------------------------------------------------
 1 | //go:build e2e
 2 | 
 3 | package common
 4 | 
 5 | import (
 6 | 	"context"
 7 | 	"fmt"
 8 | 	"log"
 9 | 	"time"
10 | 
11 | 	fwext "github.com/aws/aws-k8s-tester/internal/e2e"
12 | 	appsv1 "k8s.io/api/apps/v1"
13 | 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
14 | 	"sigs.k8s.io/e2e-framework/klient/wait"
15 | 	"sigs.k8s.io/e2e-framework/pkg/env"
16 | 	"sigs.k8s.io/e2e-framework/pkg/envconf"
17 | )
18 | 
19 | // DeployDaemonSet returns a function to deploy and wait for a DaemonSet to be ready
20 | func DeployDaemonSet(name, namespace string) env.Func {
21 | 	return func(ctx context.Context, config *envconf.Config) (context.Context, error) {
22 | 		log.Printf("Waiting for %s daemonset to be ready.", name)
23 | 		daemonset := appsv1.DaemonSet{
24 | 			ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: namespace},
25 | 		}
26 | 		err := wait.For(
27 | 			fwext.NewConditionExtension(config.Client().Resources()).DaemonSetReady(&daemonset),
28 | 			wait.WithTimeout(5*time.Minute),
29 | 			wait.WithContext(ctx),
30 | 		)
31 | 		if err != nil {
32 | 			return ctx, fmt.Errorf("%s daemonset is not ready: %w", name, err)
33 | 		}
34 | 		log.Printf("%s daemonset is ready.", name)
35 | 		return ctx, nil
36 | 	}
37 | }
38 | 


--------------------------------------------------------------------------------
/internal/deployers/eksapi/vpccni.go:
--------------------------------------------------------------------------------
 1 | package eksapi
 2 | 
 3 | import (
 4 | 	"bytes"
 5 | 	"context"
 6 | 	"encoding/json"
 7 | 
 8 | 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 9 | 	"k8s.io/apimachinery/pkg/types"
10 | )
11 | 
12 | const vpcCNIDaemonSetPatch = `{
13 | 	"spec": {
14 | 		"template": {
15 | 			"spec": {
16 | 				"containers": [
17 | 					{
18 | 						"name": "aws-node",
19 | 						"env": [
20 | 							{
21 | 								"name": "ENABLE_PREFIX_DELEGATION",
22 | 								"value": "true"
23 | 							},
24 | 							{
25 | 								"name": "MINIMUM_IP_TARGET",
26 | 								"value": "80"
27 | 							},
28 | 							{
29 | 								"name": "WARM_IP_TARGET",
30 | 								"value": "10"
31 | 							}
32 | 						]
33 | 					}
34 | 				]
35 | 			}
36 | 		}
37 | 	}
38 | }`
39 | 
40 | // tuneVPCCNI applies configuration to the VPC CNI DaemonSet that helps prevent test flakiness
41 | func (k *k8sClient) tuneVPCCNI() error {
42 | 	var patch bytes.Buffer
43 | 	if err := json.Compact(&patch, []byte(vpcCNIDaemonSetPatch)); err != nil {
44 | 		return err
45 | 	}
46 | 	_, err := k.clientset.AppsV1().DaemonSets("kube-system").Patch(context.TODO(), "aws-node", types.StrategicMergePatchType, patch.Bytes(), metav1.PatchOptions{})
47 | 	return err
48 | }
49 | 


--------------------------------------------------------------------------------
/.github/workflows/update-go-dependencies.yaml:
--------------------------------------------------------------------------------
 1 | name: "[CI] update-go-dependencies"
 2 | on:
 3 |   workflow_dispatch:
 4 |   schedule:
 5 |     # once a week
 6 |     - cron: "0 0 * * 0"
 7 | permissions:
 8 |   id-token: write
 9 |   contents: write
10 |   pull-requests: write
11 | jobs:
12 |   update-dependencies:
13 |     runs-on: ubuntu-latest
14 |     if: github.repository == 'aws/aws-k8s-tester'
15 |     steps:
16 |       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2
17 |       - uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # 5.5.0
18 |       - run: |
19 |           ./hack/update-go-dependencies.sh
20 |       - uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # 7.0.8
21 |         with:
22 |           branch: update-go-dependencies
23 |           base: main
24 |           add-paths: |
25 |             .
26 |           commit-message: "chore: update go dependencies"
27 |           committer: "GitHub <noreply@github.com>"
28 |           author: "GitHub <noreply@github.com>"
29 |           title: "chore: update go dependencies"
30 |           body: |
31 |             Generated by:
32 |               ```
33 |               ./hack/update-go-dependencies.sh
34 |               ```
35 | 


--------------------------------------------------------------------------------
/internal/deployers/eksctl/down.go:
--------------------------------------------------------------------------------
 1 | package eksctl
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 
 6 | 	"github.com/aws/aws-k8s-tester/internal/util"
 7 | 	"k8s.io/klog"
 8 | )
 9 | 
10 | func (d *deployer) Down() error {
11 | 	d.initClusterName()
12 | 
13 | 	var err error
14 | 
15 | 	if d.DeployTarget == "nodegroup" {
16 | 		klog.Infof("deleting nodegroup %s from cluster %s", d.NodegroupName, d.clusterName)
17 | 		err = util.ExecuteCommand("eksctl", "delete", "nodegroup", "--cluster", d.clusterName, "--name", d.NodegroupName, "--drain=false", "--wait")
18 | 		if err != nil {
19 | 			return fmt.Errorf("failed to delete nodegroup: %v", err)
20 | 		}
21 | 		klog.Infof("Successfully deleted nodegroup: %s from cluster: %s", d.NodegroupName, d.clusterName)
22 | 	} else if d.DeployTarget == "cluster" {
23 | 		klog.Infof("deleting cluster %s", d.clusterName)
24 | 		err = util.ExecuteCommand("eksctl", "delete", "cluster", "--name", d.clusterName, "--wait")
25 | 		if err != nil {
26 | 			return fmt.Errorf("failed to delete cluster: %v", err)
27 | 		}
28 | 		klog.Infof("Successfully deleted cluster: %s", d.clusterName)
29 | 	} else {
30 | 		return fmt.Errorf("Unsupported deploy target: %s, supported options: `cluster`, `nodegroup`.", d.DeployTarget)
31 | 	}
32 | 	return nil
33 | }
34 | 


--------------------------------------------------------------------------------
/internal/deployers/eksapi/auth_map_role_test.go:
--------------------------------------------------------------------------------
 1 | package eksapi
 2 | 
 3 | import (
 4 | 	"testing"
 5 | 
 6 | 	"github.com/stretchr/testify/assert"
 7 | )
 8 | 
 9 | const rolearn = "mock-role-arn"
10 | 
11 | const sessionNamedAuthMapRole = `
12 | - username: system:node:{{SessionName}} 
13 |   groups:
14 |     - system:bootstrappers
15 |     - system:nodes
16 |   rolearn: mock-role-arn`
17 | 
18 | const privateDNSNamedAuthMapRole = `
19 | - username: system:node:{{EC2PrivateDNSName}} 
20 |   groups:
21 |     - system:bootstrappers
22 |     - system:nodes
23 |   rolearn: mock-role-arn`
24 | 
25 | func Test_generateAuthRoleMap(t *testing.T) {
26 | 	cases := []struct {
27 | 		nodeNameStrategy string
28 | 		expected         string
29 | 	}{
30 | 		{
31 | 			nodeNameStrategy: "SessionName",
32 | 			expected:         sessionNamedAuthMapRole,
33 | 		},
34 | 		{
35 | 			nodeNameStrategy: "EC2PrivateDNSName",
36 | 			expected:         privateDNSNamedAuthMapRole,
37 | 		},
38 | 	}
39 | 	for _, c := range cases {
40 | 		t.Run(c.nodeNameStrategy, func(t *testing.T) {
41 | 			actual, err := generateAuthMapRole(c.nodeNameStrategy, rolearn)
42 | 			if err != nil {
43 | 				t.Log(err)
44 | 				t.Error(err)
45 | 			}
46 | 			assert.Equal(t, c.expected, actual)
47 | 		})
48 | 	}
49 | }
50 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_basic.sh:
--------------------------------------------------------------------------------
 1 | # Trivial cuda tests to validate that GPU it functional
 2 | # Use demu-suite binaries https://docs.nvidia.com/cuda/demo-suite/index.html 
 3 | # and DCGM Diagnostics https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/dcgm-diagnostics.html#run-levels-and-tests
 4 | 
 5 | setup_suite()
 6 | {
 7 |     source common.sh
 8 |     assert_gpu_unused
 9 |     DEMO_SUITE_DIR=${DEMO_SUITE_DIR:-$(realpath /usr/local/cuda/extras/demo_suite)}
10 | }
11 | 
12 | teardown_suite()
13 | {
14 |     assert_gpu_unused
15 | }
16 | 
17 | test_01_device_query()
18 | {
19 |     assert_status_code 0 "$DEMO_SUITE_DIR/deviceQuery"
20 | }
21 | 
22 | test_02_vector_add()
23 | {
24 |     assert_status_code 0 "$DEMO_SUITE_DIR/vectorAdd"
25 | }
26 | 
27 | test_03_nvbandwidth()
28 | {
29 |     assert_status_code 0 "$DEMO_SUITE_DIR/nvbandwidth"
30 | }
31 | 
32 | test_04_dcgm_diagnostics()
33 | {
34 |     # This test is not applicable for vGPU instance types.
35 |     if is_vgpu; then
36 |         skip "This test does not apply to vGPU instances (g6f.*, gr6f.*)"
37 |     fi
38 | 
39 |     # https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/dcgm-diagnostics.html#run-levels-and-tests
40 |     assert_status_code 0 "dcgmi diag -r 2"
41 | }
42 | 


--------------------------------------------------------------------------------
/test/cases/nvidia/manifests/nvidia-driver-capabilities-check.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Pod
 3 | metadata:
 4 |   name: moderngl-pod
 5 | spec:
 6 |   restartPolicy: Never
 7 |   tolerations:
 8 |   - key: "nvidia.com/gpu"
 9 |     operator: "Exists"
10 |     effect: "NoSchedule"
11 |   containers:
12 |   - name: moderngl-container
13 |     env:
14 |     - name: NVIDIA_DRIVER_CAPABILITIES
15 |       value: "all"
16 |     image: public.ecr.aws/ubuntu/ubuntu:22.04
17 |     command: ["/bin/bash"]
18 |     args:
19 |       - -c
20 |       - |
21 |         set -e
22 |         apt-get update
23 |         apt-get install -y \
24 |           python3 \
25 |           python3-pip \
26 |           libgl1-mesa-glx \
27 |           libegl1-mesa-dev \
28 |           libgles2-mesa-dev \
29 |           mesa-utils \
30 |           xvfb
31 |         pip3 install moderngl
32 |         sleep 60
33 |         cat <<'EOF' > moderngl-script.py
34 |         import moderngl
35 |         moderngl.create_standalone_context(backend='egl')
36 |         EOF
37 |         python3 moderngl-script.py
38 |     resources:
39 |       requests:
40 |         memory: "50Gi"
41 |         cpu: "15"
42 |         "nvidia.com/gpu": "1"
43 |       limits:
44 |         memory: "50Gi"
45 |         "nvidia.com/gpu": "1"
46 | 


--------------------------------------------------------------------------------
/test/manifests/assets/k8s-neuron-device-plugin-rbac.yml:
--------------------------------------------------------------------------------
 1 | # Source: https://github.com/aws-neuron/aws-neuron-sdk/blob/master/src/k8/k8s-neuron-device-plugin-rbac.yml
 2 | kind: ClusterRole
 3 | apiVersion: rbac.authorization.k8s.io/v1
 4 | metadata:
 5 |   name: neuron-device-plugin
 6 | rules:
 7 | - apiGroups:
 8 |   - ""
 9 |   resources:
10 |   - nodes
11 |   verbs:
12 |   - get
13 |   - list
14 |   - watch
15 | - apiGroups:
16 |   - ""
17 |   resources:
18 |   - events
19 |   verbs:
20 |   - create
21 |   - patch
22 | - apiGroups:
23 |   - ""
24 |   resources:
25 |   - pods
26 |   verbs:
27 |   - update
28 |   - patch
29 |   - get
30 |   - list
31 |   - watch
32 | - apiGroups:
33 |   - ""
34 |   resources:
35 |   - nodes/status
36 |   verbs:
37 |   - patch
38 |   - update
39 | ---
40 | apiVersion: v1
41 | kind: ServiceAccount
42 | metadata:
43 |   name: neuron-device-plugin
44 |   namespace: kube-system
45 | ---
46 | kind: ClusterRoleBinding
47 | apiVersion: rbac.authorization.k8s.io/v1
48 | metadata:
49 |   name: neuron-device-plugin
50 |   namespace: kube-system
51 | roleRef:
52 |   apiGroup: rbac.authorization.k8s.io
53 |   kind: ClusterRole
54 |   name: neuron-device-plugin
55 | subjects:
56 | - kind: ServiceAccount
57 |   name: neuron-device-plugin
58 |   namespace: kube-system
59 | 


--------------------------------------------------------------------------------
/internal/e2e/mpijobs/conditions.go:
--------------------------------------------------------------------------------
 1 | package mpijobs
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 
 6 | 	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
 7 | 	"sigs.k8s.io/e2e-framework/klient/k8s"
 8 | )
 9 | 
10 | // MPIJobSucceeded returns true if the specified k8s.Object is an unstructured.Unstructured
11 | // with .status.conditions["Succeeded"] = "True"
12 | func MPIJobSucceeded(obj k8s.Object) bool {
13 | 	u := obj.(*unstructured.Unstructured)
14 | 	conditions, found, err := unstructured.NestedSlice(u.Object, "status", "conditions")
15 | 	if err != nil {
16 | 		panic(fmt.Errorf("MPIJob does not match expected schema: %v", err))
17 | 	}
18 | 	if !found {
19 | 		return false
20 | 	}
21 | 	for _, condition := range conditions {
22 | 		c := condition.(map[string]interface{})
23 | 		cType, found, err := unstructured.NestedString(c, "type")
24 | 		if err != nil {
25 | 			panic(fmt.Errorf("MPIJob does not match expected schema: %v", err))
26 | 		}
27 | 		if !found {
28 | 			continue
29 | 		}
30 | 		if cType == "Succeeded" {
31 | 			cStatus, found, err := unstructured.NestedString(c, "status")
32 | 			if err != nil {
33 | 				panic(fmt.Errorf("MPIJob does not match expected schema: %v", err))
34 | 			}
35 | 			if !found {
36 | 				continue
37 | 			}
38 | 			return cStatus == "True"
39 | 		}
40 | 	}
41 | 	return false
42 | }
43 | 


--------------------------------------------------------------------------------
/test/images/neuron/tests/testNeuronSingleAllReduce.py:
--------------------------------------------------------------------------------
 1 | # Source: https://github.com/aws/deep-learning-containers/blob/master/test/dlc_tests/container_tests/bin/pytorch_tests/testNeuronSingleAllReduce
 2 | import os
 3 | import torch
 4 | import torch_xla.core.xla_model as xm
 5 | import torch_xla.distributed.xla_backend
 6 | import torch_xla.runtime as xr
 7 | torch.distributed.init_process_group('xla')
 8 | import torch_xla.distributed.xla_multiprocessing as xmp
 9 | os.environ["NEURON_RT_EXEC_TIMEOUT"] = "20"
10 | os.environ["NCCL_DEBUG"] = "WARN"
11 | os.environ["NCCL_DEBUG_SUBSYS"] = "ALL"
12 | def _mp_fn():
13 |   world_size = xr.world_size()
14 |   device = xm.xla_device()
15 |   rank = xr.global_ordinal()
16 |   ones = torch.ones((2, 3))
17 |   xones = ones.to(device)
18 |   if world_size > 0:
19 |     print("running all reduce")
20 |     for i in range(0, 5):
21 |         print(f'at iteration {i}, with local rank {rank}', flush=True)
22 |         result = xm.all_reduce(xm.REDUCE_SUM, xones)
23 |         result_cpu = result.cpu()
24 |         #xm.mark_step()
25 |         print(result_cpu, flush = True)
26 |     expected = torch.ones((2,3))*world_size
27 |     assert expected.allclose(result_cpu)
28 |     print('PASS')
29 | if __name__ == '__main__':
30 |     _mp_fn()
31 |     #xmp.spawn(_mp_fn, args=(),nprocs=2, join=True)
32 | 


--------------------------------------------------------------------------------
/.github/workflows/update-neuron-dependencies.yaml:
--------------------------------------------------------------------------------
 1 | name: "[CI] update-neuron-dependencies"
 2 | on:
 3 |   workflow_dispatch:
 4 |   schedule:
 5 |     # once a week
 6 |     - cron: "0 0 * * 0"
 7 | permissions:
 8 |   id-token: write
 9 |   contents: write
10 |   pull-requests: write
11 | jobs:
12 |   update-dependencies:
13 |     runs-on: ubuntu-latest
14 |     if: github.repository == 'aws/aws-k8s-tester'
15 |     steps:
16 |     - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2
17 |     - run: |
18 |         ./hack/update-neuron-dependencies.sh
19 |     - uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # 7.0.8
20 |       with:
21 |         branch: update-neuron-dependencies
22 |         base: main
23 |         add-paths: |
24 |           test/images/
25 |         commit-message: "chore: update neuron dependencies"
26 |         committer: "GitHub <noreply@github.com>"
27 |         author: "GitHub <noreply@github.com>"
28 |         title: "chore: update neuron dependencies"
29 |         body: |
30 |           Generated by:
31 |           ```
32 |           ./hack/update-neuron-dependencies.sh
33 |           ```
34 | 
35 |           See the following URL for artifactes in the latest Neuron SDK release: https://awsdocs-neuron.readthedocs-hosted.com/en/latest/release-notes/releasecontent.html#latest-neuron-release-artifacts
36 | 


--------------------------------------------------------------------------------
/hack/download-kubernetes-binaries.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -o errexit
 4 | set -o nounset
 5 | 
 6 | BUNDLES=(
 7 |   "kubernetes-client"
 8 |   "kubernetes-test"
 9 | )
10 | 
11 | if [ "$#" -ne 3 ]; then
12 |   echo >&2 "usage: $0 (KUBERNETES_MINOR_VERSION|latest) OS ARCH"
13 |   exit 1
14 | fi
15 | 
16 | if [ "$1" = "latest" ]; then
17 |   RELEASE_MARKER="latest.txt"
18 | else
19 |   RELEASE_MARKER="latest-$1.txt"
20 | fi
21 | 
22 | echo >&2 "Release marker: ${RELEASE_MARKER}"
23 | 
24 | OS="$2"
25 | ARCH="$3"
26 | 
27 | function download_binaries() {
28 |   local basePath=$1
29 | 
30 |   local KUBERNETES_VERSION=$(curl --silent "${basePath}/${RELEASE_MARKER}")
31 | 
32 |   echo "Kubernetes version: ${KUBERNETES_VERSION}"
33 |   echo "${KUBERNETES_VERSION}" > kubernetes-version.txt
34 | 
35 |   for BUNDLE in ${BUNDLES[@]}; do
36 |     echo >&2 "Downloading bundle: ${BUNDLE}"
37 |     local TARBALL="${BUNDLE}.tar.gz"
38 |     if ! wget --quiet --output-document=${TARBALL} $basePath/${KUBERNETES_VERSION}/${BUNDLE}-${OS}-${ARCH}.tar.gz; then
39 |       return 1
40 |     fi
41 |     tar xzf ${TARBALL}
42 |     rm ${TARBALL}
43 |   done
44 | }
45 | 
46 | if ! download_binaries https://storage.googleapis.com/kubernetes-release/release; then
47 |   echo >&2 "binary download failed from release bucket, falling back to ci dev release"
48 |   download_binaries https://storage.googleapis.com/k8s-release-dev/ci
49 | fi
50 | 


--------------------------------------------------------------------------------
/internal/deployers/eksapi/templates/cloudwatch-infra.yaml.template:
--------------------------------------------------------------------------------
 1 | AWSTemplateFormatVersion: '2010-09-09'
 2 | Description: kubetest2-eksapi CloudWatch using Pod Identity
 3 | 
 4 | Parameters:
 5 |   ClusterName:
 6 |     Description: Name of the EKS cluster
 7 |     Type: String
 8 |   
 9 |   ClusterUUID:
10 |     Description: UUID portion of the cluster name
11 |     Type: String
12 | 
13 | Resources:
14 |   CloudWatchRole:
15 |     Type: AWS::IAM::Role
16 |     Properties:
17 |       RoleName: !Sub "cloudwatch-role-${ClusterUUID}"
18 |       AssumeRolePolicyDocument:
19 |         Version: '2012-10-17'
20 |         Statement:
21 |           - Sid: AllowEksAuthToAssumeRoleForPodIdentity
22 |             Effect: Allow
23 |             Principal:
24 |               Service: pods.eks.amazonaws.com
25 |             Action:
26 |               - sts:AssumeRole
27 |               - sts:TagSession
28 |       ManagedPolicyArns:
29 |         - arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy
30 |       Description: Role for CloudWatch Agent in EKS cluster
31 | 
32 |   PodIdentityAssociation:
33 |     Type: AWS::EKS::PodIdentityAssociation
34 |     Properties:
35 |       ClusterName: !Ref ClusterName
36 |       Namespace: amazon-cloudwatch
37 |       ServiceAccount: cwagent
38 |       RoleArn: !GetAtt CloudWatchRole.Arn
39 | 
40 | Outputs:
41 |   CloudWatchRoleArn:
42 |     Description: ARN of the CloudWatch IAM role
43 |     Value: !GetAtt CloudWatchRole.Arn
44 |     Export:
45 |       Name: !Sub "${AWS::StackName}::CloudWatchRoleArn"
46 | 
47 |   PodIdentityAssociationArn:
48 |     Description: ARN of the Pod Identity Association
49 |     Value: !Ref PodIdentityAssociation
50 |     Export:
51 |       Name: !Sub '${AWS::StackName}-PodIdentityAssociationArn'
52 | 


--------------------------------------------------------------------------------
/internal/e2e/ec2.go:
--------------------------------------------------------------------------------
 1 | package e2e
 2 | 
 3 | import (
 4 | 	"context"
 5 | 	"fmt"
 6 | 
 7 | 	"github.com/aws/aws-k8s-tester/internal/awssdk"
 8 | 	"github.com/aws/aws-sdk-go-v2/service/ec2"
 9 | 	ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types"
10 | )
11 | 
12 | type EC2Client interface {
13 | 	DescribeInstanceType(instanceType string) (ec2types.InstanceTypeInfo, error)
14 | }
15 | 
16 | type ec2Client struct {
17 | 	client *ec2.Client
18 | }
19 | 
20 | func NewEC2Client() *ec2Client {
21 | 	return &ec2Client{
22 | 		client: ec2.NewFromConfig(awssdk.NewConfig()),
23 | 	}
24 | }
25 | 
26 | func (c *ec2Client) DescribeInstanceTopology(instanceIDs []string) ([]ec2types.InstanceTopology, error) {
27 | 	var instanceTopologies []ec2types.InstanceTopology
28 | 	paginator := ec2.NewDescribeInstanceTopologyPaginator(c.client, &ec2.DescribeInstanceTopologyInput{
29 | 		InstanceIds: instanceIDs,
30 | 	})
31 | 	for paginator.HasMorePages() {
32 | 		instanceTopologyOuput, err := paginator.NextPage(context.TODO())
33 | 		if err != nil {
34 | 			return []ec2types.InstanceTopology{}, err
35 | 		}
36 | 		instanceTopologies = append(instanceTopologies, instanceTopologyOuput.Instances...)
37 | 	}
38 | 	return instanceTopologies, nil
39 | }
40 | 
41 | func (c *ec2Client) DescribeInstanceType(instanceType string) (ec2types.InstanceTypeInfo, error) {
42 | 	describeResponse, err := c.client.DescribeInstanceTypes(context.TODO(), &ec2.DescribeInstanceTypesInput{
43 | 		InstanceTypes: []ec2types.InstanceType{ec2types.InstanceType(instanceType)},
44 | 	})
45 | 	if err != nil {
46 | 		return ec2types.InstanceTypeInfo{}, fmt.Errorf("failed to describe instance type: %s: %v", instanceType, err)
47 | 	} else {
48 | 		return describeResponse.InstanceTypes[0], nil
49 | 	}
50 | }
51 | 


--------------------------------------------------------------------------------
/hack/update-neuron-dependencies.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -o nounset
 4 | set -o errexit
 5 | set -o pipefail
 6 | 
 7 | # pip_versionsearch takes exactly 1 argument and returns its latest available version from the neuron pip repo
 8 | # usage: pip_versionsearch PACKAGE
 9 | pip_versionsearch() {
10 |     local PACKAGE_INDEX_NAME=$(echo $1 | tr -s '_' '-')
11 |     local PACKAGE_VERSION_NAME=$(echo $PACKAGE_INDEX_NAME | tr -s '-' '_')
12 |     curl -s https://pip.repos.neuron.amazonaws.com/${PACKAGE_INDEX_NAME} | grep -o -G "${PACKAGE_VERSION_NAME}-[0-9\.]*+[a-f0-9]*" | sed "s/$PACKAGE_VERSION_NAME-//" | sort -V | tail -n 1 
13 | }
14 | 
15 | # versionsearch takes exactly 1 argument and returns its latest available version from the neuron amd64 apt repo
16 | # usage: versionsearch PACKAGE
17 | versionsearch() {
18 |     local PACKAGE_NAME=$1
19 |     curl -s https://apt.repos.neuron.amazonaws.com/dists/focal/main/binary-amd64/Packages | grep -o "${PACKAGE_NAME}_[0-9\.]*-*[a-f0-9]*" | sed "s/${PACKAGE_NAME}_//" | sort -V | tail -n 1 
20 | }
21 | 
22 | # update_arg ARG NEW_VALUE
23 | update_arg() {
24 |     local ARG=$1
25 |     local NEW_VALUE=$2
26 |     echo "setting $ARG to $NEW_VALUE"
27 |     find . -type f -name Dockerfile -exec sed -i "s/${ARG}=.*/${ARG}=$NEW_VALUE/g" {} +
28 | }
29 | 
30 | update_arg NEURONX_RUNTIME_LIB_VERSION $(versionsearch aws-neuronx-runtime-lib)
31 | update_arg NEURONX_COLLECTIVES_LIB_VERSION $(versionsearch aws-neuronx-collectives)
32 | update_arg NEURONX_TOOLS_VERSION $(versionsearch aws-neuronx-tools)
33 | update_arg NEURONX_FRAMEWORK_VERSION $(pip_versionsearch torch-neuronx)
34 | update_arg NEURONX_CC_VERSION $(pip_versionsearch neuronx-cc)
35 | update_arg NEURONX_DISTRIBUTED_VERSION $(pip_versionsearch neuronx_distributed)


--------------------------------------------------------------------------------
/test/cases/neuron-training/manifests/bert-training.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: batch/v1
 2 | kind: Job
 3 | metadata:
 4 |   labels:
 5 |     app: bert-training
 6 |   name: bert-training
 7 | spec:
 8 |   completionMode: Indexed
 9 |   completions: {{.NodeCount}}
10 |   parallelism: {{.NodeCount}}
11 |   backoffLimit: 0
12 |   template:
13 |     spec:
14 |       restartPolicy: Never
15 |       containers:
16 |       - image: {{.BertTrainingImage}}
17 |         name: bert-training
18 |         env:
19 |         - name: MASTER_ADDR
20 |           value: bert-training-0.training
21 |         args:
22 |         - sh
23 |         - -c 
24 |         - |
25 |           # Enable EFA https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-runtime/nrt-troubleshoot.html#fi-efa-fork-safe (AL2 legacy requirement)
26 |           export FI_EFA_FORK_SAFE=1
27 |           export CCOM_SOCKET_IFNAME=eth0
28 |           export NCCL_DEBUG=ERROR
29 |           torchrun --nproc_per_node {{.NeuronCorePerNode}} --nnodes {{.NodeCount}} --node_rank $JOB_COMPLETION_INDEX --master_addr $MASTER_ADDR train.py
30 |         volumeMounts:
31 |         - name: dshm
32 |           mountPath: /dev/shm 
33 |         resources:
34 |           requests:
35 |             aws.amazon.com/neuron: {{.NeuronPerNode}}
36 |             aws.amazon.com/neuroncore: {{.NeuronCorePerNode}}
37 |             vpc.amazonaws.com/efa: {{.EFAPerNode}}
38 |           limits:
39 |             aws.amazon.com/neuron: {{.NeuronPerNode}}
40 |             aws.amazon.com/neuroncore: {{.NeuronCorePerNode}}
41 |             vpc.amazonaws.com/efa: {{.EFAPerNode}}
42 |       nodeSelector:
43 |         node.kubernetes.io/instance-type: {{.NodeType}}
44 |       subdomain: training
45 |       volumes:
46 |       - name: dshm
47 |         emptyDir:
48 |           medium: Memory
49 | 


--------------------------------------------------------------------------------
/internal/e2e/health.go:
--------------------------------------------------------------------------------
 1 | package e2e
 2 | 
 3 | import (
 4 | 	"context"
 5 | 	"fmt"
 6 | 	"strings"
 7 | 
 8 | 	"k8s.io/client-go/kubernetes"
 9 | 	"k8s.io/client-go/rest"
10 | )
11 | 
12 | // KubeletIsResponsive returns true if the kubelet /healthz endpoint responds with a 200 status code, and propagates
13 | // any non-connection specific errors
14 | func KubeletIsResponsive(ctx context.Context, cfg *rest.Config, nodeName string) (bool, error) {
15 | 	client, err := kubernetes.NewForConfig(cfg)
16 | 	if err != nil {
17 | 		return false, fmt.Errorf("failed to initialize client set: %v", err)
18 | 	}
19 | 
20 | 	nodeHealthResponse := client.CoreV1().RESTClient().Get().Resource("nodes").
21 | 		Name(nodeName).SubResource("proxy").Suffix("/healthz").
22 | 		Do(ctx)
23 | 
24 | 	if nodeHealthResponse.Error() != nil {
25 | 		errMsg := nodeHealthResponse.Error().Error()
26 | 		// TODO: match errors against types, e.g. syscall.ECONNREFUSED instead, the k8s client doesn't
27 | 		// currently properly wrap the underlying error to allow this though
28 | 		if strings.Contains(errMsg, "connection refused") ||
29 | 			strings.Contains(errMsg, "connection reset by peer") ||
30 | 			strings.Contains(errMsg, "http2: client connection lost") {
31 | 			// these errors indicate reachability to the node in general but an unstable connection to kubelet
32 | 			return false, nil
33 | 		}
34 | 
35 | 		// propagate other errors, e.g. i/o timeout, that may result from things unrelated to kubelet health,
36 | 		// e.g. security group rules on the instance restricting traffic from the CP
37 | 		return false, fmt.Errorf("could not reach /healthz endpoint for node %s: %w", nodeName, nodeHealthResponse.Error())
38 | 	}
39 | 
40 | 	var statusCode int
41 | 	nodeHealthResponse.StatusCode(&statusCode)
42 | 	return statusCode == 200, nil
43 | }
44 | 


--------------------------------------------------------------------------------
/test/cases/workload/main_test.go:
--------------------------------------------------------------------------------
 1 | //go:build e2e
 2 | 
 3 | package workload
 4 | 
 5 | import (
 6 | 	"context"
 7 | 	"flag"
 8 | 	"fmt"
 9 | 	"log"
10 | 	"os"
11 | 	"os/signal"
12 | 	"testing"
13 | 	"time"
14 | 
15 | 	"sigs.k8s.io/e2e-framework/pkg/env"
16 | 	"sigs.k8s.io/e2e-framework/pkg/envconf"
17 | )
18 | 
19 | const (
20 | 	defaultWorkloadTestTimeout = 10 * time.Minute
21 | )
22 | 
23 | var (
24 | 	testenv               env.Environment
25 | 	workloadTestCommand   *string
26 | 	workloadTestImage     *string
27 | 	workloadTestName      *string
28 | 	workloadTestResources *string
29 | 	workloadTestTimeout   *time.Duration
30 | )
31 | 
32 | func TestMain(m *testing.M) {
33 | 	workloadTestCommand = flag.String("workloadTestCommand", "", "command for workload test")
34 | 	workloadTestImage = flag.String("workloadTestImage", "", "image for workload test")
35 | 	workloadTestName = flag.String("workloadTestName", "workload-test", "name for workload test")
36 | 	workloadTestResources = flag.String("workloadTestResources", "", "JSON map of resources for workload test (e.g., '{\"nvidia.com/gpu\": \"1\"}')")
37 | 	workloadTestTimeout = flag.Duration("workloadTestTimeout", defaultWorkloadTestTimeout, fmt.Sprintf("timeout for workload test (default: %s)", defaultWorkloadTestTimeout))
38 | 	cfg, err := envconf.NewFromFlags()
39 | 	if err != nil {
40 | 		log.Fatalf("failed to initialize test environment: %v", err)
41 | 	}
42 | 	testenv = env.NewWithConfig(cfg)
43 | 	ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt)
44 | 	defer cancel()
45 | 	testenv = testenv.WithContext(ctx)
46 | 
47 | 	testenv.Setup(func(ctx context.Context, config *envconf.Config) (context.Context, error) {
48 | 		log.Println("Starting workload test suite...")
49 | 		return ctx, nil
50 | 	})
51 | 
52 | 	os.Exit(testenv.Run(m))
53 | }
54 | 


--------------------------------------------------------------------------------
/test/cases/nvidia/manifests/job-hpc-benchmarks.yaml:
--------------------------------------------------------------------------------
 1 | kind: Job
 2 | apiVersion: batch/v1
 3 | metadata:
 4 |   name: hpc-benckmarks-job
 5 |   labels:
 6 |     app: hpc-benckmarks-job
 7 | spec:
 8 |   completions: 1
 9 |   parallelism: 1
10 |   template:
11 |     metadata:
12 |       labels:
13 |         app: hpc-benckmarks-job
14 |     spec:
15 |       volumes:
16 |         - name: dshm
17 |           emptyDir:
18 |             medium: Memory
19 |       containers:
20 |       - name: hpc-benchmarks
21 |         image: "nvcr.io/nvidia/hpc-benchmarks:25.04"
22 |         command:
23 |         - mpirun
24 |         - --allow-run-as-root
25 |         - -np
26 |         - "{{.GpuPerNode}}"
27 |         - -bind-to
28 |         - none
29 |         - -x
30 |         - NCCL_DEBUG=INFO
31 |         - -x 
32 |         - HPL_FCT_COMM_POLICY=1 
33 |         - -x 
34 |         - HPL_USE_NVSHMEM=0
35 |         # TODO: for arm it will be
36 |         # - hpl-aarch64.sh
37 |         - hpl.sh 
38 |         - --mem-affinity 
39 |         - 0:0:0:0:1:1:1:1 
40 |         # --cpu-affinity needs to be tuned depending on the number of CPUs
41 |         # available on the instance type.
42 |         - --cpu-affinity 
43 |         - 0-13:14-27:28-41:42-55:56-69:70-83:84-97:98-111
44 |         - --no-multinode 
45 |         - --dat 
46 |         - hpl-linux-x86_64/sample-dat/HPL-dgx-1N.dat
47 |         # TODO: the path differs for arm64
48 |         # - hpl-linux-aarch64-gpu/sample-dat/HPL-dgx-1N.dat
49 |         volumeMounts:
50 |         - mountPath: /dev/shm
51 |           name: dshm
52 |         imagePullPolicy: Always
53 |         resources:
54 |           limits:
55 |             nvidia.com/gpu: {{.GpuPerNode}}
56 |         env:
57 |         - name: UCX_TLS
58 |           value: "^sysv"
59 |       restartPolicy: Never
60 |   backoffLimit: 4
61 | 


--------------------------------------------------------------------------------
/internal/deployers/eksapi/ami_resolver_test.go:
--------------------------------------------------------------------------------
 1 | //go:build integration
 2 | 
 3 | package eksapi
 4 | 
 5 | import (
 6 | 	"context"
 7 | 	"testing"
 8 | 
 9 | 	"github.com/aws/aws-sdk-go-v2/config"
10 | 	"github.com/stretchr/testify/assert"
11 | )
12 | 
13 | func TestAMIResolver(t *testing.T) {
14 | 	ctx := context.Background()
15 | 	awsCfg, err := config.LoadDefaultConfig(ctx)
16 | 	assert.NoError(t, err)
17 | 
18 | 	amiResolver := NewAMIResolver(newAWSClients(awsCfg, ""))
19 | 
20 | 	t.Run("AL2023-nvidia", func(t *testing.T) {
21 | 		opts := deployerOptions{
22 | 			UserDataFormat:    UserDataNodeadm,
23 | 			KubernetesVersion: "1.33",
24 | 		}
25 | 		t.Run("nvidia", func(t *testing.T) {
26 | 			opts := opts
27 | 			opts.InstanceTypes = []string{"g5.xlarge"}
28 | 
29 | 			ami, err := amiResolver.Resolve(ctx, &opts)
30 | 			assert.NoError(t, err)
31 | 			assert.Regexp(t, "ami-.*", ami)
32 | 		})
33 | 		t.Run("standard", func(t *testing.T) {
34 | 			opts := opts
35 | 			opts.InstanceTypes = []string{"m5.xlarge"}
36 | 
37 | 			ami, err := amiResolver.Resolve(ctx, &opts)
38 | 			assert.NoError(t, err)
39 | 			assert.Regexp(t, "ami-.*", ami)
40 | 		})
41 | 	})
42 | 
43 | 	t.Run("Bottlerocket", func(t *testing.T) {
44 | 		opts := deployerOptions{
45 | 			UserDataFormat:    UserDataBottlerocket,
46 | 			KubernetesVersion: "1.33",
47 | 		}
48 | 		t.Run("nvidia", func(t *testing.T) {
49 | 			opts := opts
50 | 			opts.InstanceTypes = []string{"g5.xlarge"}
51 | 
52 | 			ami, err := amiResolver.Resolve(ctx, &opts)
53 | 			assert.NoError(t, err)
54 | 			assert.Regexp(t, "ami-.*", ami)
55 | 		})
56 | 		t.Run("standard", func(t *testing.T) {
57 | 			opts := opts
58 | 			opts.InstanceTypes = []string{"m5.xlarge"}
59 | 
60 | 			ami, err := amiResolver.Resolve(ctx, &opts)
61 | 			assert.NoError(t, err)
62 | 			assert.Regexp(t, "ami-.*", ami)
63 | 		})
64 | 	})
65 | }
66 | 


--------------------------------------------------------------------------------
/test/cases/nvidia/manifests/mpi-job-pytorch-training-single-node.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | # container image from: https://github.com/aws/deep-learning-containers/blob/master/available_images.md
 3 | apiVersion: kubeflow.org/v2beta1
 4 | kind: MPIJob
 5 | metadata:
 6 |   name: pytorch-training-single-node
 7 | spec:
 8 |   slotsPerWorker: 4
 9 |   runPolicy:
10 |     cleanPodPolicy: Running
11 |   mpiImplementation: OpenMPI
12 |   mpiReplicaSpecs:
13 |     Launcher:
14 |       replicas: 1
15 |       template:
16 |          spec:
17 |            restartPolicy: OnFailure
18 |            containers:
19 |            - image: {{.PytorchTestImage}}
20 |              name: gpu-test
21 |              command:
22 |               - mpirun
23 |               - --allow-run-as-root
24 |               - -np
25 |               - "1"
26 |               - -mca
27 |               - btl_tcp_if_exclude
28 |               - lo
29 |               - -mca
30 |               - pml
31 |               - ob1
32 |               - -mca
33 |               - btl
34 |               - ^openib
35 |               - --bind-to
36 |               - none
37 |               - -map-by
38 |               - slot
39 |               - -x
40 |               - LD_LIBRARY_PATH
41 |               - -x
42 |               - PATH
43 |               - -x
44 |               - NCCL_SOCKET_IFNAME=eth0
45 |               - -x
46 |               - NCCL_DEBUG=INFO
47 |               - -x
48 |               - MXNET_CUDNN_AUTOTUNE_DEFAULT=0
49 |               - python
50 |               - -c
51 |               - import os; os.system("git clone https://github.com/pytorch/examples.git pytorch-examples"); os.system("git -C pytorch-examples checkout 0f0c9131ca5c79d1332dce1f4c06fe942fbdc665"); os.system("python pytorch-examples/mnist/main.py --epochs 1")
52 |              resources:
53 |                limits:
54 |                  nvidia.com/gpu: 1
55 | 


--------------------------------------------------------------------------------
/test/images/efa/scripts/unit-test.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -eu
 4 | 
 5 | get_instance_type()
 6 | {
 7 | 
 8 |     local token=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null)
 9 | 
10 |     if [ -n "$token" ]; then
11 |         curl -H "X-aws-ec2-metadata-token: $token" http://169.254.169.254/latest/meta-data/instance-type
12 |     else
13 |         curl http://169.254.169.254/latest/meta-data/instance-type
14 |     fi
15 | }
16 | 
17 | get_expected_efa_device_count() 
18 | {
19 |     aws ec2 describe-instance-types --instance-type="$EC2_INSTANCE_TYPE" | jq -r '.InstanceTypes[].NetworkInfo.EfaInfo.MaximumEfaInterfaces'
20 | }
21 | 
22 | EC2_INSTANCE_TYPE=${EC2_INSTANCE_TYPE:-$(get_instance_type)}
23 | EXPECTED_EFA_DEVICE_COUNT=${EXPECTED_EFA_DEVICE_COUNT:-$(get_expected_efa_device_count)}
24 | 
25 | echo "Running test on a $EC2_INSTANCE_TYPE"
26 | 
27 | fi_info -p efa
28 | DGRAM_ENDPOINT_COUNT=$(fi_info -p efa | grep 'type:\sFI_EP_DGRAM$' | wc -l)
29 | if ! test $EXPECTED_EFA_DEVICE_COUNT -le $DGRAM_ENDPOINT_COUNT; then
30 |     echo "Expected at least $EXPECTED_EFA_DEVICE_COUNT DGRAM endpoint(s) but found $DGRAM_ENDPOINT_COUNT"
31 |     exit 1
32 | else
33 |     echo "Verified at least $EXPECTED_EFA_DEVICE_COUNT DGRAM endpoint(s) are available (found $DGRAM_ENDPOINT_COUNT)"
34 | fi
35 | 
36 | RDM_ENDPOINT_COUNT=$(fi_info -p efa | grep 'type:\sFI_EP_RDM$' | wc -l)
37 | if ! test $EXPECTED_EFA_DEVICE_COUNT -le $RDM_ENDPOINT_COUNT; then
38 |     echo "Expected at least $EXPECTED_EFA_DEVICE_COUNT RDM endpoint(s) but found $RDM_ENDPOINT_COUNT"
39 |     exit 1
40 | else
41 |     echo "Verified at least $EXPECTED_EFA_DEVICE_COUNT RDM endpoint(s) are available (found $RDM_ENDPOINT_COUNT)"
42 | fi
43 | 
44 | 
45 | echo "Running single-node efa test"
46 | 
47 | # Run efa_test.sh, a utility added during the build while installing EFA
48 | efa_test.sh
49 | 
50 | echo "Success!"


--------------------------------------------------------------------------------
/internal/testers/ginkgov1/kubectl/kubectl.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2019 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package kubectl
18 | 
19 | import (
20 | 	"fmt"
21 | 	"os"
22 | 
23 | 	"sigs.k8s.io/kubetest2/pkg/exec"
24 | )
25 | 
26 | const (
27 | 	kubectl = "kubectl"
28 | )
29 | 
30 | // APIServerURL obtains the URL of the k8s master from kubectl
31 | func APIServerURL() (string, error) {
32 | 	kubecontext, err := execAndResult(kubectl, "config", "view", "-o", "jsonpath=\"{.current-context}\"")
33 | 	if err != nil {
34 | 		return "", fmt.Errorf("Could not get kube context: %v", err)
35 | 	}
36 | 
37 | 	clustername, err := execAndResult(kubectl, "config", "view", "-o",
38 | 		fmt.Sprintf("jsonpath=\"{.contexts[?(@.name == %s)].context.cluster}\"", kubecontext))
39 | 	if err != nil {
40 | 		return "", fmt.Errorf("Could not get cluster name: %v", err)
41 | 	}
42 | 
43 | 	apiServerURL, err := execAndResult(kubectl, "config", "view", "-o",
44 | 		fmt.Sprintf("jsonpath={.clusters[?(@.name == %s)].cluster.server}", clustername))
45 | 	if err != nil {
46 | 		return "", err
47 | 	}
48 | 	return apiServerURL, nil
49 | }
50 | 
51 | // execAndResult runs command with args and returns the entire output (or error)
52 | func execAndResult(command string, args ...string) (string, error) {
53 | 	cmd := exec.Command(command, args...)
54 | 	cmd.SetStderr(os.Stderr)
55 | 	bytes, err := exec.Output(cmd)
56 | 	return string(bytes), err
57 | }
58 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/common.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | get_instance_type()
 4 | {
 5 |     # Retrieve instance metadata: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html#instance-metadata-retrieval-examples
 6 |     [ -n "$FORCE_INSTANCE_TYPE" ] && echo $FORCE_INSTANCE_TYPE
 7 | 
 8 |     local token=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null)
 9 | 
10 |     if [ -n "$token" ]; then
11 |         curl -H "X-aws-ec2-metadata-token: $token" http://169.254.169.254/latest/meta-data/instance-type
12 |     else
13 |         curl http://169.254.169.254/latest/meta-data/instance-type
14 |     fi
15 | }
16 | 
17 | assert_gpu_unused()
18 | {
19 |     cmd="nvidia-smi --query-compute-apps timestamp,gpu_bus_id,gpu_uuid,pid,name,used_memory --format csv,noheader"
20 |     assert_equals "" "`$cmd`" "gpu is busy by other task, system misconfig?"
21 | }
22 | 
23 | _assert_data()
24 | {
25 |     local expected="$1"
26 |     local cmd="$2"
27 |     local message="${3:-}"
28 |     local cmd_out="$ACTUAL_RESULTS/$(basename $expected)"
29 |     [[ -z $message ]] || message="$message\n"
30 | 
31 |     eval "$cmd" > $cmd_out
32 |     diff_cmd="diff -up $expected $cmd_out"
33 |     diff_out="`$diff_cmd`"
34 | 
35 |     notify_trace_dbg "_assert_data $diff_cmd, out: $diff_out"
36 |     if [ -n "$diff_out" ]
37 |     then
38 | 	fail "$message test data value diff:\n$diff_out"
39 |     fi
40 | }
41 | 
42 | assert_data() {
43 |     _assert_data "$1" "$2" "$3"
44 | }
45 | 
46 | generate_data()
47 | {
48 |     local expected="$1"
49 |     local cmd="$2"
50 |     local msg="$3"
51 |     local cmd_out="$ACTUAL_RESULTS/$(basename $expected)"
52 | 
53 |     eval "$cmd" > $expected
54 |     _assert_data "$expected" "$cmd" "$msg"
55 | }
56 | 
57 | function is_vgpu()
58 | {
59 |   local instance_type=${EC2_INSTANCE_TYPE:-$(get_instance_type)}
60 |   case "${instance_type}" in
61 |     g6f.*|gr6f.*) return ;;
62 |     *) return 1 ;;  # Not supported
63 |   esac
64 | }
65 | 


--------------------------------------------------------------------------------
/test/manifests/assets/dcgm-exporter.yaml:
--------------------------------------------------------------------------------
 1 | # Derived from: Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
 2 | apiVersion: apps/v1
 3 | kind: DaemonSet
 4 | metadata:
 5 |   name: "dcgm-exporter"
 6 |   namespace: "kube-system"
 7 |   labels:
 8 |     app.kubernetes.io/name: "dcgm-exporter"
 9 |     app.kubernetes.io/version: "4.1.3"
10 | spec:
11 |   updateStrategy:
12 |     type: RollingUpdate
13 |   selector:
14 |     matchLabels:
15 |       app.kubernetes.io/name: "dcgm-exporter"
16 |       app.kubernetes.io/version: "4.1.3"
17 |   template:
18 |     metadata:
19 |       labels:
20 |         app.kubernetes.io/name: "dcgm-exporter"
21 |         app.kubernetes.io/version: "4.1.3"
22 |       name: "dcgm-exporter"
23 |     spec:
24 |       containers:
25 |       - image: "nvcr.io/nvidia/k8s/dcgm-exporter:4.2.3-4.1.3-ubuntu22.04"
26 |         env:
27 |         - name: "DCGM_EXPORTER_LISTEN"
28 |           value: ":9400"
29 |         - name: "DCGM_EXPORTER_INTERVAL"
30 |           value: "100"
31 |         - name: "DCGM_EXPORTER_KUBERNETES"
32 |           value: "true"
33 |         name: "dcgm-exporter"
34 |         ports:
35 |         - name: "metrics"
36 |           containerPort: 9400
37 |         securityContext:
38 |           runAsNonRoot: false
39 |           runAsUser: 0
40 |           capabilities:
41 |             add: ["SYS_ADMIN"]
42 |         volumeMounts:
43 |         - name: "pod-gpu-resources"
44 |           readOnly: true
45 |           mountPath: "/var/lib/kubelet/pod-resources"
46 |       volumes:
47 |       - name: "pod-gpu-resources"
48 |         hostPath:
49 |           path: "/var/lib/kubelet/pod-resources"
50 | 
51 | ---
52 | 
53 | kind: Service
54 | apiVersion: v1
55 | metadata:
56 |   name: "dcgm-exporter"
57 |   namespace: "kube-system"
58 |   labels:
59 |     app.kubernetes.io/name: "dcgm-exporter"
60 |     app.kubernetes.io/version: "4.1.3"
61 | spec:
62 |   clusterIP: "None"
63 |   selector:
64 |     app.kubernetes.io/name: "dcgm-exporter"
65 |     app.kubernetes.io/version: "4.1.3"
66 |   ports:
67 |   - name: "metrics"
68 |     port: 9400


--------------------------------------------------------------------------------
/test/manifests/assets/efa-device-plugin.yaml:
--------------------------------------------------------------------------------
 1 | # Source: https://raw.githubusercontent.com/aws-samples/aws-efa-eks/main/manifest/efa-k8s-device-plugin.yml
 2 | apiVersion: apps/v1
 3 | kind: DaemonSet
 4 | metadata:
 5 |   name: aws-efa-k8s-device-plugin-daemonset
 6 |   namespace: kube-system
 7 | spec:
 8 |   selector:
 9 |     matchLabels:
10 |       name:  aws-efa-k8s-device-plugin
11 |   updateStrategy:
12 |     type: RollingUpdate
13 |   template:
14 |     metadata:
15 |       labels:
16 |         name: aws-efa-k8s-device-plugin
17 |     spec:
18 |       serviceAccount: default
19 |       tolerations:
20 |         - key: CriticalAddonsOnly
21 |           operator: Exists
22 |         - key: aws.amazon.com/efa
23 |           operator: Exists
24 |           effect: NoSchedule
25 |       # Mark this pod as a critical add-on; when enabled, the critical add-on
26 |       # scheduler reserves resources for critical add-on pods so that they can
27 |       # be rescheduled after a failure.
28 |       # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
29 |       priorityClassName: "system-node-critical"
30 |       hostNetwork: true
31 |       containers:
32 |         - image: 602401143452.dkr.ecr.us-west-2.amazonaws.com/eks/aws-efa-k8s-device-plugin:v0.5.8
33 |           name: aws-efa-k8s-device-plugin
34 |           securityContext:
35 |             allowPrivilegeEscalation: false
36 |             capabilities:
37 |               drop: ["ALL"]
38 |             runAsNonRoot: false
39 |           volumeMounts:
40 |             - name: device-plugin
41 |               mountPath: /var/lib/kubelet/device-plugins
42 |             - name: infiniband-volume
43 |               mountPath: /dev/infiniband
44 |           resources:
45 |             requests:
46 |               cpu:    10m
47 |               memory: 20Mi
48 |       volumes:
49 |         - name: device-plugin
50 |           hostPath:
51 |             path: /var/lib/kubelet/device-plugins
52 |         - name: infiniband-volume
53 |           hostPath:
54 |             path: /dev/infiniband
55 | 


--------------------------------------------------------------------------------
/test/images/efa/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM public.ecr.aws/amazonlinux/amazonlinux:2023
 2 | 
 3 | ARG EFA_BIN_PATH="/opt/amazon/efa/bin"
 4 | 
 5 | RUN dnf -y swap gnupg2-minimal gnupg2 && \
 6 |     dnf install -y \
 7 |     gcc gcc-c++ make \  
 8 |     ca-certificates \
 9 |     cmake \
10 |     emacs \
11 |     git \
12 |     jq \
13 |     wget \
14 |     unzip \
15 |     vim \
16 |     zlib-devel \      
17 |     openssl \
18 |     openssl-devel \    
19 |     sqlite-devel \   
20 |     gdbm-devel \      
21 |     glibc-devel \     
22 |     bzip2-devel \     
23 |     ncurses-devel \    
24 |     tk-devel \        
25 |     libffi-devel \     
26 |     libcap-devel \  
27 |     tar \
28 |     gnupg2 
29 | 
30 | ENV PATH="$PATH:$EFA_BIN_PATH"
31 | 
32 | RUN cd $HOME \
33 |     && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \
34 |     && wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \
35 |     && cat aws-efa-installer.key | gpg --fingerprint \
36 |     && wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \
37 |     && tar -xf aws-efa-installer-latest.tar.gz \
38 |     && cd aws-efa-installer \
39 |     && ./efa_installer.sh -y -d --skip-kmod --skip-limit-conf --no-verify \
40 |     # TODO: remove this in favor of letting the efa installer add it if that ever becomes an option.
41 |     # At the moment, this is only installed if omitting --no-verify, which would require
42 |     # building in a context with EFA available
43 |     && install -T -m 0755 efa_test.sh "${EFA_BIN_PATH}/efa_test.sh" \
44 |     && cd $HOME \
45 |     && rm -rf aws-efa-installer
46 | 
47 | RUN dnf clean all
48 | 
49 | RUN INSTALL_DIR=$(mktemp -d) && \
50 |     cd $INSTALL_DIR && \
51 |     curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" && \
52 |     unzip awscliv2.zip && \
53 |     ./aws/install  && \
54 |     cd && \ 
55 |     rm -rf $INSTALL_DIR
56 | 
57 | COPY test/images/efa/scripts ./scripts
58 | 
59 | RUN chmod -R +x ./scripts


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/README.md:
--------------------------------------------------------------------------------
 1 | # What
 2 | 
 3 | gpu_unit_tests is the unit tests for gpu enabled platforms. Idea is to create compact
 4 | set of tests which will cover most of performance critical aspects for gpu
 5 | platforms. Test designed to run on single instance.
 6 | # Usage
 7 | 
 8 | ```
 9 | # Run tests
10 | ./unit_test
11 | ```
12 | 
13 | **Generate test data for new instance type**
14 | 
15 | Step 1: Copy the `gpu_unit_tests` folder to the EC2 instance where you want to generate the data.
16 | 
17 | Step 2:  Execute the following command in the `gpu_unit_tests` directory on the EC2 instance:
18 | ```
19 | GENERATE_DATA=1 ./unit_test
20 | ```
21 | Step 3:
22 | Copy the files from `tests/test_sysinfo.sh.data` (e.g., `tests/test_sysinfo.sh.data/p3.2xlarge`) to your local repository.
23 | 
24 | Step 4:
25 | Create PR with the new `tests/test_sysinfo.sh.data/xxx`
26 | 
27 | # Test list
28 | 
29 | -  test_sysinfo.sh :: Validate basic system configuration by comparing it with test config
30 |   - test_numa_topo_topo :: check cpu/numa topology
31 |   - test_nvidia_gpu_count :: fail if one of GPUs is broken or is not visiable
32 |   - test_nvidia_fabric_status :: fail if fabric manager is not active
33 |   - test_nvidia_smi_topo :: fail if nvidia-smi topology is differ
34 |   - test_nvidia_persistence_status :: validate persistence state
35 |   - test_nvidia_gpu_unused :: Check that no other process are using GPUs, fail is a signal system misconfiguration.
36 | 
37 | 
38 | - 10_test_basic_cuda.sh :: Execute trivial cuda binaries, fail if cuda subsys is not healthy
39 |   Use demo-suite binaries https://docs.nvidia.com/cuda/demo-suite/index.html and DCGM Diagnostics https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/dcgm-diagnostics.html#run-levels-and-tests 
40 |   If this test suite fail this is a sign that cuda subsystem is not usable at all.
41 |   Usually this is side effect of system misconfiguration (driver or fabric manager is not loaded)
42 |   - test_01_device_query
43 |   - test_02_vector_add
44 |   - test_03_nvbandwidth
45 |   - test_04_dcgm_diagnostics
46 | 
47 | 
48 | 


--------------------------------------------------------------------------------
/internal/deployers/eksapi/aws.go:
--------------------------------------------------------------------------------
 1 | package eksapi
 2 | 
 3 | import (
 4 | 	"github.com/aws/aws-sdk-go-v2/aws"
 5 | 	"github.com/aws/aws-sdk-go-v2/service/autoscaling"
 6 | 	"github.com/aws/aws-sdk-go-v2/service/cloudformation"
 7 | 	"github.com/aws/aws-sdk-go-v2/service/ec2"
 8 | 	"github.com/aws/aws-sdk-go-v2/service/eks"
 9 | 	"github.com/aws/aws-sdk-go-v2/service/iam"
10 | 	"github.com/aws/aws-sdk-go-v2/service/s3"
11 | 	"github.com/aws/aws-sdk-go-v2/service/ssm"
12 | )
13 | 
14 | type awsClients struct {
15 | 	_eks       *eks.Client
16 | 	_cfn       *cloudformation.Client
17 | 	_ec2       *ec2.Client
18 | 	_asg       *autoscaling.Client
19 | 	_ssm       *ssm.Client
20 | 	_iam       *iam.Client
21 | 	_s3        *s3.Client
22 | 	_s3Presign *s3.PresignClient
23 | }
24 | 
25 | func newAWSClients(config aws.Config, eksEndpointURL string) *awsClients {
26 | 	clients := awsClients{
27 | 		_cfn: cloudformation.NewFromConfig(config),
28 | 		_ec2: ec2.NewFromConfig(config),
29 | 		_asg: autoscaling.NewFromConfig(config),
30 | 		_ssm: ssm.NewFromConfig(config),
31 | 		_iam: iam.NewFromConfig(config),
32 | 		_s3:  s3.NewFromConfig(config),
33 | 	}
34 | 	clients._s3Presign = s3.NewPresignClient(clients._s3)
35 | 	if eksEndpointURL != "" {
36 | 		clients._eks = eks.NewFromConfig(config, func(o *eks.Options) {
37 | 			o.BaseEndpoint = aws.String(eksEndpointURL)
38 | 		})
39 | 	} else {
40 | 		clients._eks = eks.NewFromConfig(config)
41 | 	}
42 | 	return &clients
43 | }
44 | 
45 | func (c *awsClients) EKS() *eks.Client {
46 | 	return c._eks
47 | }
48 | 
49 | func (c *awsClients) CFN() *cloudformation.Client {
50 | 	return c._cfn
51 | }
52 | 
53 | func (c *awsClients) EC2() *ec2.Client {
54 | 	return c._ec2
55 | }
56 | 
57 | func (c *awsClients) ASG() *autoscaling.Client {
58 | 	return c._asg
59 | }
60 | 
61 | func (c *awsClients) SSM() *ssm.Client {
62 | 	return c._ssm
63 | }
64 | 
65 | func (c *awsClients) IAM() *iam.Client {
66 | 	return c._iam
67 | }
68 | 
69 | func (c *awsClients) S3() *s3.Client {
70 | 	return c._s3
71 | }
72 | 
73 | func (c *awsClients) S3Presign() *s3.PresignClient {
74 | 	return c._s3Presign
75 | }
76 | 


--------------------------------------------------------------------------------
/internal/deployers/eksapi/kubeconfig.go:
--------------------------------------------------------------------------------
 1 | package eksapi
 2 | 
 3 | import (
 4 | 	"bytes"
 5 | 	"fmt"
 6 | 	"os"
 7 | 	"text/template"
 8 | 
 9 | 	"k8s.io/klog"
10 | )
11 | 
12 | const kubeconfigPerm = 0666
13 | 
14 | var kubeconfigTemplate = `---
15 | apiVersion: v1
16 | kind: Config
17 | clusters:
18 | - cluster:
19 |     certificate-authority-data: {{ .ClusterCertificateAuthority }}
20 |     server: {{ .ClusterEndpoint }}
21 |   name: {{ .ClusterARN }}
22 | contexts:
23 | - context:
24 |     cluster: {{ .ClusterARN }}
25 |     user: {{ .ClusterARN }}
26 |   name: {{ .ClusterARN }}
27 | current-context: {{ .ClusterARN }}
28 | preferences: {}
29 | users:
30 | - name: {{ .ClusterARN }}
31 |   user:
32 |     exec:
33 |       apiVersion: client.authentication.k8s.io/v1beta1
34 |       command: aws
35 |       args:
36 |       - eks
37 |       - get-token
38 |       - --cluster-name
39 |       - {{ .ClusterName }}
40 | `
41 | 
42 | type kubeconfigTemplateParameters struct {
43 | 	ClusterCertificateAuthority string
44 | 	ClusterARN                  string
45 | 	ClusterEndpoint             string
46 | 	ClusterName                 string
47 | }
48 | 
49 | func writeKubeconfig(cluster *Cluster, kubeconfigPath string) error {
50 | 	if cluster == nil {
51 | 		return fmt.Errorf("Cluster is nil, you might need set --static-cluster-name or set --up to initial cluster resrouces")
52 | 	}
53 | 	klog.Infof("writing kubeconfig to %s for cluster: %s", kubeconfigPath, cluster.arn)
54 | 	templateParams := kubeconfigTemplateParameters{
55 | 		ClusterCertificateAuthority: cluster.certificateAuthorityData,
56 | 		ClusterARN:                  cluster.arn,
57 | 		ClusterEndpoint:             cluster.endpoint,
58 | 		ClusterName:                 cluster.name,
59 | 	}
60 | 
61 | 	kubeconfig := bytes.Buffer{}
62 | 
63 | 	t, err := template.New("kubeconfig").Parse(kubeconfigTemplate)
64 | 	if err != nil {
65 | 		return err
66 | 	}
67 | 	err = t.Execute(&kubeconfig, templateParams)
68 | 	if err != nil {
69 | 		return err
70 | 	}
71 | 
72 | 	err = os.WriteFile(kubeconfigPath, kubeconfig.Bytes(), kubeconfigPerm)
73 | 	if err != nil {
74 | 		return err
75 | 	}
76 | 
77 | 	klog.Infof("wrote kubeconfig: %s\n%s", kubeconfigPath, kubeconfig.String())
78 | 	return nil
79 | }
80 | 


--------------------------------------------------------------------------------
/internal/util/cloudformation.go:
--------------------------------------------------------------------------------
 1 | package util
 2 | 
 3 | import (
 4 | 	"context"
 5 | 	"fmt"
 6 | 	"strings"
 7 | 
 8 | 	"github.com/aws/aws-sdk-go-v2/aws"
 9 | 	"github.com/aws/aws-sdk-go-v2/service/cloudformation"
10 | 	types "github.com/aws/aws-sdk-go-v2/service/cloudformation/types"
11 | )
12 | 
13 | // TODO: implement AWS client wrappers, and incorporate this into the cfn:CreateStack call
14 | func WrapCFNStackFailure(ctx context.Context, cfnClient *cloudformation.Client, createStackErr error, stackName string) error {
15 | 	if createStackErr == nil {
16 | 		return nil
17 | 	}
18 | 	resourceByFailureMode := make(map[string][]string)
19 | 	eventsPaginator := cloudformation.NewDescribeStackEventsPaginator(cfnClient, &cloudformation.DescribeStackEventsInput{
20 | 		StackName: &stackName,
21 | 	})
22 | 	for eventsPaginator.HasMorePages() {
23 | 		page, err := eventsPaginator.NextPage(ctx)
24 | 		if err != nil {
25 | 			return createStackErr
26 | 		}
27 | 		for _, event := range page.StackEvents {
28 | 			if event.ResourceStatus == types.ResourceStatusCreateFailed {
29 | 				if _, ok := resourceByFailureMode[aws.ToString(event.ResourceStatusReason)]; !ok {
30 | 					resourceByFailureMode[aws.ToString(event.ResourceStatusReason)] = []string{}
31 | 				}
32 | 				resourceByFailureMode[aws.ToString(event.ResourceStatusReason)] = append(resourceByFailureMode[aws.ToString(event.ResourceStatusReason)], aws.ToString(event.LogicalResourceId))
33 | 			}
34 | 		}
35 | 	}
36 | 	nonCancellationFailure := len(resourceByFailureMode) > 1
37 | 	var enhancedDetails []string
38 | 	for reason, resources := range resourceByFailureMode {
39 | 		if nonCancellationFailure && reason == "Resource creation cancelled" {
40 | 			// Ignore resource cancellation errors if there's another failure reported, those failures
41 | 			// would just be a consequence of that failure. If all the failures are resource cancellation,
42 | 			// then there was likely a user initiated delete of the whole stack based on a timeout
43 | 			// waiting for one of the resources to create
44 | 			continue
45 | 		}
46 | 		enhancedDetails = append(enhancedDetails, fmt.Sprintf("%s: %s", strings.Join(resources, ","), reason))
47 | 	}
48 | 	return fmt.Errorf("%w: %s", createStackErr, strings.Join(enhancedDetails, "--"))
49 | }
50 | 


--------------------------------------------------------------------------------
/test/manifests/assets/nvidia-device-plugin.yaml:
--------------------------------------------------------------------------------
 1 | # Source: https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/main/deployments/static/nvidia-device-plugin.yml
 2 | 
 3 | # Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | apiVersion: apps/v1
18 | kind: DaemonSet
19 | metadata:
20 |   name: nvidia-device-plugin-daemonset
21 |   namespace: kube-system
22 | spec:
23 |   selector:
24 |     matchLabels:
25 |       name: nvidia-device-plugin-ds
26 |   updateStrategy:
27 |     type: RollingUpdate
28 |   template:
29 |     metadata:
30 |       labels:
31 |         name: nvidia-device-plugin-ds
32 |     spec:
33 |       tolerations:
34 |       - key: nvidia.com/gpu
35 |         operator: Exists
36 |         effect: NoSchedule
37 |       # Mark this pod as a critical add-on; when enabled, the critical add-on
38 |       # scheduler reserves resources for critical add-on pods so that they can
39 |       # be rescheduled after a failure.
40 |       # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
41 |       priorityClassName: "system-node-critical"
42 |       containers:
43 |       - image: nvcr.io/nvidia/k8s-device-plugin:v0.17.2
44 |         name: nvidia-device-plugin-ctr
45 |         env:
46 |           - name: FAIL_ON_INIT_ERROR
47 |             value: "false"
48 |         securityContext:
49 |           allowPrivilegeEscalation: false
50 |           capabilities:
51 |             drop: ["ALL"]
52 |         volumeMounts:
53 |         - name: device-plugin
54 |           mountPath: /var/lib/kubelet/device-plugins
55 |       volumes:
56 |       - name: device-plugin
57 |         hostPath:
58 |           path: /var/lib/kubelet/device-plugins
59 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM public.ecr.aws/amazonlinux/amazonlinux:2023 AS builder
 2 | ARG TARGETOS
 3 | ARG TARGETARCH
 4 | RUN dnf install -y git tar gzip make unzip gcc rsync wget jq
 5 | ARG GO_MINOR_VERSION=1.25
 6 | RUN curl https://go.dev/dl/?mode=json | jq -r .[].version | grep "^go${GO_MINOR_VERSION}" | head -n1 > go-version.txt
 7 | RUN  wget -O go.tar.gz https://go.dev/dl/$(cat go-version.txt).${TARGETOS}-${TARGETARCH}.tar.gz && \
 8 |     rm -rf /usr/local/go && \
 9 |     tar -C /usr/local -xzf go.tar.gz
10 | ENV GOPATH=/usr/local/go
11 | ENV PATH=$PATH:$GOPATH/bin
12 | ENV GOPROXY=direct
13 | 
14 | WORKDIR $GOPATH/src/github.com/aws/aws-k8s-tester
15 | COPY . .
16 | RUN go install ./...
17 | RUN go test -c -tags=e2e ./test/... -o $GOPATH/bin/
18 | 
19 | RUN go install sigs.k8s.io/kubetest2 && \
20 |     go install sigs.k8s.io/kubetest2/kubetest2-tester-exec && \
21 |     go install sigs.k8s.io/kubetest2/kubetest2-tester-ginkgo
22 | 
23 | FROM public.ecr.aws/amazonlinux/amazonlinux:2023
24 | ARG TARGETOS
25 | ARG TARGETARCH
26 | WORKDIR /workdir
27 | RUN dnf install -y tar gzip unzip wget openssh diffutils
28 | RUN wget -O awscli.zip https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip && \
29 |     unzip awscli.zip && \
30 |     ./aws/install
31 | # we need gsutil from the gcloud CLI for kubetest-tester-ginkgo
32 | RUN dnf install -y python3.13
33 | ARG GCLOUD_SDK_URL=https://dl.google.com/dl/cloudsdk/channels/rapid/google-cloud-sdk.tar.gz
34 | RUN wget -O google-cloud-sdk.tar.gz -q $GCLOUD_SDK_URL && \
35 |     tar xzf google-cloud-sdk.tar.gz -C / && \
36 |     rm google-cloud-sdk.tar.gz && \
37 |     /google-cloud-sdk/install.sh \
38 |         --disable-installation-options \
39 |         --bash-completion=false \
40 |         --path-update=false \
41 |         --usage-reporting=false
42 | ENV PATH=$PATH:/google-cloud-sdk/bin
43 | ARG EKSCTL_VERSION=latest
44 | RUN wget -O eksctl.tar.gz "https://github.com/eksctl-io/eksctl/releases/${EKSCTL_VERSION}/download/eksctl_Linux_${TARGETARCH}.tar.gz" && \
45 |     tar xzf eksctl.tar.gz -C /bin/ && \
46 |     rm eksctl.tar.gz
47 | ARG KUBERNETES_MINOR_VERSION
48 | COPY hack/download-kubernetes-binaries.sh .
49 | RUN ./download-kubernetes-binaries.sh "${KUBERNETES_MINOR_VERSION}" "${TARGETOS}" "${TARGETARCH}"
50 | RUN mkdir /info
51 | ENV PATH=$PATH:/info
52 | RUN cp kubernetes-version.txt /info/
53 | RUN mv kubernetes/*/bin/* /bin/
54 | RUN rm -rf /workdir
55 | COPY --from=builder /usr/local/go/bin/* /bin/
56 | 


--------------------------------------------------------------------------------
/test/cases/neuron/manifests/multi-node-test-neuron.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: kubeflow.org/v2beta1
 2 | kind: MPIJob
 3 | metadata:
 4 |   name: multi-node-nccom-test
 5 | spec:
 6 |   slotsPerWorker: {{.NeuronPerNode}}
 7 |   runPolicy:
 8 |     backoffLimit: 20
 9 |     cleanPodPolicy: Running
10 |   mpiReplicaSpecs:
11 |     Launcher:
12 |       replicas: 1
13 |       template:
14 |         spec:
15 |           restartPolicy: OnFailure
16 |           containers:
17 |           - image: {{.NeuronTestImage}}
18 |             imagePullPolicy: Always
19 |             name: nccom-test-launcher
20 |             env:
21 |             - name: POD_IP
22 |               valueFrom:
23 |                 fieldRef:
24 |                   fieldPath: status.podIP
25 |             command:
26 |             - /bin/bash
27 |             args:
28 |             - -c
29 |             - |
30 |                 WORKER_IPS=()
31 |                 for i in $(seq 0 $(({{.WorkerNodeCount}} - 1))); do
32 |                   WORKER_IP=$(getent hosts multi-node-nccom-test-worker-$i.multi-node-nccom-test | awk '{print $1}')
33 |                   WORKER_IPS+=("$WORKER_IP")
34 |                 done
35 | 
36 |                 export CCOM_SOCKET_IFNAME=eth0
37 |                 export NEURON_RT_ROOT_COMM_ID=${WORKER_IPS[0]}:63182
38 |                 nccom-test -r $(({{.NeuronCorePerNode}}*{{.WorkerNodeCount}})) -N {{.WorkerNodeCount}} -b "8" -e "2G" -f "2" -n "5" -w "5" -d "fp32" allr --hosts ${WORKER_IPS[*]} --data-collector-host $POD_IP --data-collector-port 60006 --debug
39 |     Worker:
40 |       replicas: {{.WorkerNodeCount}}
41 |       template:
42 |         spec:
43 |           securityContext:
44 |             runAsUser: 1000
45 |             runAsGroup: 2000
46 |             fsGroup: 3000
47 |           containers:
48 |           - image: {{.NeuronTestImage}}
49 |             name: nccom-test-worker
50 |             command: ["/bin/bash"]
51 |             args: ["-c", "echo password | sudo -S /usr/sbin/sshd -D"]
52 |             imagePullPolicy: Always
53 |             resources:
54 |               limits:
55 |                 aws.amazon.com/neuron: {{.NeuronPerNode}}
56 |                 aws.amazon.com/neuroncore: {{.NeuronCorePerNode}}
57 |                 vpc.amazonaws.com/efa: {{.EfaInterfacePerNode}}
58 |               requests:
59 |                 aws.amazon.com/neuron: {{.NeuronPerNode}}
60 |                 aws.amazon.com/neuroncore: {{.NeuronCorePerNode}}
61 |                 vpc.amazonaws.com/efa: {{.EfaInterfacePerNode}}


--------------------------------------------------------------------------------
/test/cases/nvidia-training/manifests/bert-training.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: kubeflow.org/v2beta1
 2 | kind: MPIJob
 3 | metadata:
 4 |   name: bert-training
 5 | spec:
 6 |   slotsPerWorker: {{.SlotsPerWorker}}
 7 |   runPolicy:
 8 |     backoffLimit: 20
 9 |     cleanPodPolicy: Running
10 |   mpiReplicaSpecs:
11 |     Launcher:
12 |       replicas: 1
13 |       template:
14 |         spec:
15 |           restartPolicy: OnFailure
16 |           containers:
17 |           - image: {{.BertTrainingImage}}
18 |             imagePullPolicy: Always
19 |             name: bert-training
20 |             env:
21 |             - name: NCCL_DEBUG
22 |               value: "TRACE"
23 |             - name: MASTER_ADDR
24 |               value: "bert-training"
25 |             - name: MASTER_PORT
26 |               value: "12355"
27 |             command:
28 |             - /opt/amazon/openmpi/bin/mpirun
29 |             - --allow-run-as-root
30 |             - --tag-output
31 |             - -np
32 |             - "{{.NP}}"           # Number of processes derived from node/gpu calculations
33 |             - -bind-to
34 |             - none
35 |             - -map-by
36 |             - slot
37 |             - -x
38 |             - PATH
39 |             - -x
40 |             - LD_LIBRARY_PATH
41 |             - -x
42 |             - NCCL_DEBUG
43 |             - -x
44 |             - MASTER_ADDR
45 |             - -x
46 |             - MASTER_PORT
47 |             - --mca 
48 |             - pml
49 |             - "^cm"
50 |             - --mca
51 |             - routed
52 |             - direct
53 |             - --oversubscribe
54 |             - --mca
55 |             - orte_base_help_aggregate 
56 |             - "0"
57 |             - python
58 |             - train.py
59 |     Worker:
60 |       replicas: {{.WorkerReplicas}}
61 |       template:
62 |         spec:
63 |           volumes:
64 |           - name: dshm
65 |             emptyDir:
66 |               medium: Memory
67 |           containers:
68 |           - image: {{.BertTrainingImage}}
69 |             imagePullPolicy: Always
70 |             name: bert-training-worker
71 |             volumeMounts:
72 |             - mountPath: /dev/shm
73 |               name: dshm
74 |             resources:
75 |               requests:
76 |                 nvidia.com/gpu: {{.GPUPerNode}}
77 |                 vpc.amazonaws.com/efa: {{.EFARequested}}
78 |               limits:
79 |                 nvidia.com/gpu: {{.GPUPerNode}}
80 |                 vpc.amazonaws.com/efa: {{.EFARequested}}
81 | 


--------------------------------------------------------------------------------
/test/cases/nvidia/containerd_test.go:
--------------------------------------------------------------------------------
 1 | //go:build e2e
 2 | 
 3 | package nvidia
 4 | 
 5 | import (
 6 | 	"context"
 7 | 	"log"
 8 | 	"testing"
 9 | 	"time"
10 | 
11 | 	"github.com/aws/aws-k8s-tester/internal/e2e"
12 | 
13 | 	appsv1 "k8s.io/api/apps/v1"
14 | 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
15 | 
16 | 	"sigs.k8s.io/e2e-framework/klient/wait"
17 | 	"sigs.k8s.io/e2e-framework/pkg/envconf"
18 | 	"sigs.k8s.io/e2e-framework/pkg/features"
19 | 
20 | 	_ "embed"
21 | )
22 | 
23 | //go:embed manifests/daemonset-containerd-check.yaml
24 | var containerdCheckDS []byte
25 | 
26 | func TestContainerdConfig(t *testing.T) {
27 | 	feat := features.New("containerd-config-check").
28 | 		WithLabel("suite", "nvidia").
29 | 		Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
30 | 			log.Println("[Setup] Applying containerd-check DaemonSet manifest.")
31 | 			if err := e2e.ApplyManifests(cfg.Client().RESTConfig(), containerdCheckDS); err != nil {
32 | 				t.Fatalf("Failed to apply containerd-check DS: %v", err)
33 | 			}
34 | 			return ctx
35 | 		}).
36 | 		Assess("DaemonSet becomes ready", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
37 | 			dsName := "containerd-check"
38 | 			dsNS := "default"
39 | 
40 | 			log.Println("[Assess] Waiting up to 1 minute for containerd-check DS to become Ready...")
41 | 			ds := &appsv1.DaemonSet{
42 | 				ObjectMeta: metav1.ObjectMeta{
43 | 					Name:      dsName,
44 | 					Namespace: dsNS,
45 | 				},
46 | 			}
47 | 			err := wait.For(
48 | 				e2e.NewConditionExtension(cfg.Client().Resources()).DaemonSetReady(ds),
49 | 				wait.WithTimeout(1*time.Minute),
50 | 			)
51 | 			if err != nil {
52 | 				t.Logf("[Assess] containerd-check DS did not become Ready: %v", err)
53 | 				e2e.PrintDaemonSetPodLogs(t, ctx, cfg.Client().RESTConfig(), dsNS, "app=containerd-check")
54 | 				t.Fatalf("containerd-check DS not Ready within 1 minute")
55 | 			}
56 | 
57 | 			log.Println("[Assess] containerd-check DS is Ready.")
58 | 			return ctx
59 | 		}).
60 | 		Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
61 | 			t.Log("[Teardown] Removing containerd-check DS (no additional logs).")
62 | 			if err := e2e.DeleteManifests(cfg.Client().RESTConfig(), containerdCheckDS); err != nil {
63 | 				t.Fatalf("Failed to delete containerd-check DS: %v", err)
64 | 			}
65 | 			t.Log("[Teardown] containerd-check DS removed successfully.")
66 | 			return ctx
67 | 		}).
68 | 		Feature()
69 | 
70 | 	testenv.Test(t, feat)
71 | }
72 | 


--------------------------------------------------------------------------------
/internal/e2e/logs.go:
--------------------------------------------------------------------------------
 1 | package e2e
 2 | 
 3 | import (
 4 | 	"context"
 5 | 	"fmt"
 6 | 	"io"
 7 | 	"testing"
 8 | 
 9 | 	corev1 "k8s.io/api/core/v1"
10 | 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
11 | 	"k8s.io/client-go/kubernetes"
12 | 	"k8s.io/client-go/rest"
13 | )
14 | 
15 | // PrintDaemonSetPodLogs retrieves logs from each container in each pod of a DaemonSet.
16 | // namespace & labelSelector identify the DaemonSet's pods (e.g. "default", "app=containerd-check").
17 | func PrintDaemonSetPodLogs(
18 | 	t *testing.T,
19 | 	ctx context.Context,
20 | 	restConfig *rest.Config,
21 | 	namespace string,
22 | 	labelSelector string,
23 | ) {
24 | 	clientset, err := kubernetes.NewForConfig(restConfig)
25 | 	if err != nil {
26 | 		t.Logf("failed to create typed clientset: %v", err)
27 | 		return
28 | 	}
29 | 
30 | 	pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{
31 | 		LabelSelector: labelSelector,
32 | 	})
33 | 	if err != nil {
34 | 		t.Logf("failed to list pods: %v", err)
35 | 		return
36 | 	}
37 | 	if len(pods.Items) == 0 {
38 | 		t.Logf("No pods found for DaemonSet with label %q in namespace %q.", labelSelector, namespace)
39 | 		return
40 | 	}
41 | 
42 | 	for _, pod := range pods.Items {
43 | 		t.Logf("Pod %s status: %s", pod.Name, pod.Status.Phase)
44 | 		for _, container := range pod.Spec.Containers {
45 | 			logs, logErr := ReadPodLogs(ctx, restConfig, pod.Namespace, pod.Name, container.Name)
46 | 			if logErr != nil {
47 | 				t.Logf("Failed reading logs from %s/%s: %v", pod.Name, container.Name, logErr)
48 | 			} else {
49 | 				t.Logf("=== Logs from %s/%s ===\n%s", pod.Name, container.Name, logs)
50 | 			}
51 | 		}
52 | 	}
53 | }
54 | 
55 | // ReadPodLogs streams logs for a specific container in a pod.
56 | func ReadPodLogs(
57 | 	ctx context.Context,
58 | 	restConfig *rest.Config,
59 | 	namespace, podName, containerName string,
60 | ) (string, error) {
61 | 	clientset, err := kubernetes.NewForConfig(restConfig)
62 | 	if err != nil {
63 | 		return "", fmt.Errorf("failed to create typed clientset: %w", err)
64 | 	}
65 | 	req := clientset.CoreV1().Pods(namespace).GetLogs(podName, &corev1.PodLogOptions{
66 | 		Container: containerName,
67 | 	})
68 | 	stream, err := req.Stream(ctx)
69 | 	if err != nil {
70 | 		return "", fmt.Errorf("failed to open log stream for %s/%s: %w", podName, containerName, err)
71 | 	}
72 | 	defer stream.Close()
73 | 
74 | 	data, err := io.ReadAll(stream)
75 | 	if err != nil {
76 | 		return "", fmt.Errorf("error reading logs: %w", err)
77 | 	}
78 | 	return string(data), nil
79 | }
80 | 


--------------------------------------------------------------------------------
/internal/deployers/eksapi/userdata.go:
--------------------------------------------------------------------------------
 1 | package eksapi
 2 | 
 3 | import (
 4 | 	"bytes"
 5 | 	"fmt"
 6 | 	"strconv"
 7 | 	"strings"
 8 | 	"text/template"
 9 | 
10 | 	"github.com/aws/aws-k8s-tester/internal/deployers/eksapi/templates"
11 | )
12 | 
13 | const (
14 | 	UserDataBootstrapSh  = "bootstrap.sh"
15 | 	UserDataNodeadm      = "nodeadm"
16 | 	UserDataBottlerocket = "bottlerocket"
17 | )
18 | 
19 | func generateUserData(cluster *Cluster, opts *deployerOptions) (string, bool, error) {
20 | 	userDataIsMimePart := true
21 | 	var t *template.Template
22 | 	switch opts.UserDataFormat {
23 | 	case UserDataBootstrapSh:
24 | 		t = templates.UserDataBootstrapSh
25 | 	case UserDataNodeadm:
26 | 		// TODO: replace the YAML template with proper usage of the nodeadm API go types
27 | 		t = templates.UserDataNodeadm
28 | 	case UserDataBottlerocket:
29 | 		t = templates.UserDataBottlerocket
30 | 		userDataIsMimePart = false
31 | 	default:
32 | 		return "", false, fmt.Errorf("unknown user data format: '%s'", opts.UserDataFormat)
33 | 	}
34 | 
35 | 	kubeletFeatureGates := map[string]bool{}
36 | 	// DRA is in beta for 1.33, and so needs to be explicitly enabled.
37 | 	if opts.KubernetesVersion == "1.33" {
38 | 		kubeletFeatureGates["DynamicResourceAllocation"] = true
39 | 	}
40 | 
41 | 	nodeadmFeatureGates, err := extractFeatureGates(opts.NodeadmFeatureGates)
42 | 	if err != nil {
43 | 		return "", false, err
44 | 	}
45 | 
46 | 	var buf bytes.Buffer
47 | 	if err := t.Execute(&buf, templates.UserDataTemplateData{
48 | 		APIServerEndpoint:    cluster.endpoint,
49 | 		CertificateAuthority: cluster.certificateAuthorityData,
50 | 		CIDR:                 cluster.cidr,
51 | 		Name:                 cluster.name,
52 | 		KubeletFeatureGates:  kubeletFeatureGates,
53 | 		NodeadmFeatureGates:  nodeadmFeatureGates,
54 | 	}); err != nil {
55 | 		return "", false, err
56 | 	}
57 | 	return buf.String(), userDataIsMimePart, nil
58 | }
59 | 
60 | func extractFeatureGates(featureGatePairs []string) (map[string]bool, error) {
61 | 	featureGateMap := make(map[string]bool)
62 | 	for _, keyValuePair := range featureGatePairs {
63 | 		components := strings.Split(keyValuePair, "=")
64 | 		if len(components) != 2 {
65 | 			return featureGateMap, fmt.Errorf("expected key=value pairs but %s has %d components", keyValuePair, len(components))
66 | 		}
67 | 		boolValue, err := strconv.ParseBool(components[1])
68 | 		if err != nil {
69 | 			return featureGateMap, fmt.Errorf("expected bool value in %s: %v", keyValuePair, err)
70 | 		}
71 | 		featureGateMap[components[0]] = boolValue
72 | 	}
73 | 	return featureGateMap, nil
74 | }
75 | 


--------------------------------------------------------------------------------
/test/cases/nvidia/manifests/mpi-job-nccl-test-multi-node.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: kubeflow.org/v2beta1
 2 | kind: MPIJob
 3 | metadata:
 4 |   name: {{.JobName}}
 5 | spec:
 6 |   slotsPerWorker: {{.GpuPerNode}}
 7 |   runPolicy:
 8 |     # it may take a bit for the workers to get ready (the container image is heavy)
 9 |     # and we don't want the launcher to reach it's CrashLoopBackoff limit in the meantime
10 |     backoffLimit: 20
11 |     cleanPodPolicy: Running
12 |   mpiReplicaSpecs:
13 |     Launcher:
14 |       replicas: 1
15 |       template:
16 |         spec:
17 |           restartPolicy: OnFailure
18 |           containers:
19 |           - image: {{.NvidiaTestImage}}
20 |             imagePullPolicy: Always
21 |             name: nccl-test-launcher
22 |             env:
23 |             command:
24 |             - mpirun
25 |             - --allow-run-as-root
26 |             - --tag-output
27 |             - -np
28 |             - "{{.WorkerNodeGpuCount}}"
29 |             - -bind-to
30 |             - none
31 |             - -map-by
32 |             - slot
33 |             - -x
34 |             - PATH
35 |             - -x
36 |             - LD_LIBRARY_PATH
37 |             - -x
38 |             - NCCL_DEBUG=INFO
39 |             - -x
40 |             - NCCL_BUFFSIZE={{.NcclBuffSize}}
41 |             - -x
42 |             - NCCL_TUNER_PLUGIN=/opt/aws-ofi-nccl/install/lib/libnccl-ofi-tuner.so
43 |             - --mca
44 |             - pml
45 |             - ^cm,ucx
46 |             - --mca
47 |             - btl
48 |             - tcp,self
49 |             - --mca
50 |             - btl_tcp_if_exclude
51 |             - lo,docker0,veth_def_agent
52 |             - /opt/nccl-tests/build/{{.TestName}}
53 |             - -b
54 |             - "8"
55 |             - -e
56 |             - {{.MaxBytes}}
57 |             - -f
58 |             - "2"
59 |             - -c
60 |             - "1"
61 |             - -n
62 |             - "10"
63 |     Worker:
64 |       replicas: {{.WorkerNodeCount}}
65 |       template:
66 |         spec:
67 |           volumes:
68 |           - name: dshm
69 |             emptyDir:
70 |               medium: Memory
71 |           containers:
72 |           - image: {{.NvidiaTestImage}}
73 |             imagePullPolicy: Always
74 |             name: nccl-test-worker
75 |             volumeMounts:
76 |             - mountPath: /dev/shm
77 |               name: dshm
78 |             resources:
79 |               requests:
80 |                 nvidia.com/gpu: {{.GpuPerNode}}
81 |                 vpc.amazonaws.com/efa: {{.EfaInterfacePerNode}}
82 |               limits:
83 |                 nvidia.com/gpu: {{.GpuPerNode}}
84 |                 vpc.amazonaws.com/efa: {{.EfaInterfacePerNode}}
85 | 


--------------------------------------------------------------------------------
/test/cases/nvidia/capabilities_test.go:
--------------------------------------------------------------------------------
 1 | //go:build e2e
 2 | 
 3 | package nvidia
 4 | 
 5 | import (
 6 | 	"context"
 7 | 	"testing"
 8 | 	"time"
 9 | 
10 | 	"github.com/aws/aws-k8s-tester/internal/e2e"
11 | 
12 | 	v1 "k8s.io/api/core/v1"
13 | 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
14 | 
15 | 	"k8s.io/apimachinery/pkg/util/wait"
16 | 	e2ewait "sigs.k8s.io/e2e-framework/klient/wait"
17 | 	"sigs.k8s.io/e2e-framework/pkg/envconf"
18 | 	"sigs.k8s.io/e2e-framework/pkg/features"
19 | 
20 | 	_ "embed"
21 | )
22 | 
23 | //go:embed manifests/nvidia-driver-capabilities-check.yaml
24 | var capabilitiesCheckPod []byte
25 | 
26 | const (
27 | 	PodName      = "moderngl-pod"
28 | 	PodNamespace = "default"
29 | )
30 | 
31 | func TestNvidiaDriverCapabilities(t *testing.T) {
32 | 	feat := features.New("nvidia-driver-capabilities-check").
33 | 		WithLabel("suite", "nvidia").
34 | 		Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
35 | 			t.Log("Applying nvidia driver capabilities check pod manifest.")
36 | 			// capabilitiesCheckPod only run moderngl.create_standalone_context() with NVIDIA_DRIVER_CAPABILITIES=all to load all capabilities enabled by nvidia driver.
37 | 			// If any lib required by any of nvidia driver capabilities is missing, it will failed with exception.
38 | 			if err := e2e.ApplyManifests(cfg.Client().RESTConfig(), capabilitiesCheckPod); err != nil {
39 | 				t.Fatalf("Failed to apply capabilities check pod manifest: %v", err)
40 | 			}
41 | 			return ctx
42 | 		}).
43 | 		Assess("Check Pod becomes ready", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
44 | 			t.Log("Waiting up to 5 minute for pod to complete...")
45 | 			pod := &v1.Pod{
46 | 				ObjectMeta: metav1.ObjectMeta{
47 | 					Name:      PodName,
48 | 					Namespace: PodNamespace,
49 | 				},
50 | 			}
51 | 			err := e2ewait.For(
52 | 				e2e.NewConditionExtension(cfg.Client().Resources()).PodSucceeded(pod),
53 | 				e2ewait.WithTimeout(5*time.Minute),
54 | 			)
55 | 			if err != nil {
56 | 				if err == wait.ErrWaitTimeout {
57 | 					t.Fatalf("nvidia capabilities check pod not in compeleted phase (succeeded or failed) within 5 minute and waiter timeout: %v", err)
58 | 				}
59 | 				t.Fatalf("nvidia capabilities pod in Failed status, ModernGL check failed. Could be caused by required library missing")
60 | 			}
61 | 			t.Log("nvidia driver capabilities check succeeded.")
62 | 			return ctx
63 | 		}).
64 | 		Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
65 | 			t.Log("Removing nvidia driver capabilities check pod.")
66 | 			if err := e2e.DeleteManifests(cfg.Client().RESTConfig(), capabilitiesCheckPod); err != nil {
67 | 				t.Errorf("Failed to delete pod: %v", err)
68 | 			}
69 | 			t.Log("all test resources removed successfully.")
70 | 			return ctx
71 | 		}).
72 | 		Feature()
73 | 
74 | 	testenv.Test(t, feat)
75 | }
76 | 


--------------------------------------------------------------------------------
/internal/metrics/cloudwatch.go:
--------------------------------------------------------------------------------
 1 | package metrics
 2 | 
 3 | import (
 4 | 	"context"
 5 | 	"sync"
 6 | 	"time"
 7 | 
 8 | 	"github.com/aws/aws-sdk-go-v2/service/cloudwatch"
 9 | 	"github.com/aws/aws-sdk-go-v2/service/cloudwatch/types"
10 | 	"github.com/aws/aws-sdk-go/aws"
11 | 	"k8s.io/klog"
12 | )
13 | 
14 | // NewCloudWatchRegistry creates a new metric registry that will emit values using the specified cloudwatch client
15 | func NewCloudWatchRegistry(cw *cloudwatch.Client) MetricRegistry {
16 | 	return &cloudwatchRegistry{
17 | 		cw:              cw,
18 | 		lock:            &sync.Mutex{},
19 | 		dataByNamespace: make(map[string][]*cloudwatchMetricDatum),
20 | 	}
21 | }
22 | 
23 | type cloudwatchRegistry struct {
24 | 	cw              *cloudwatch.Client
25 | 	lock            *sync.Mutex
26 | 	dataByNamespace map[string][]*cloudwatchMetricDatum
27 | }
28 | 
29 | type cloudwatchMetricDatum struct {
30 | 	spec       *MetricSpec
31 | 	value      float64
32 | 	dimensions map[string]string
33 | 	timestamp  time.Time
34 | }
35 | 
36 | func (r *cloudwatchRegistry) Record(spec *MetricSpec, value float64, dimensions map[string]string) {
37 | 	r.lock.Lock()
38 | 	defer r.lock.Unlock()
39 | 	r.dataByNamespace[spec.Namespace] = append(r.dataByNamespace[spec.Namespace], &cloudwatchMetricDatum{
40 | 		spec:       spec,
41 | 		value:      value,
42 | 		dimensions: dimensions,
43 | 		timestamp:  time.Now(),
44 | 	})
45 | }
46 | 
47 | func (r *cloudwatchRegistry) Emit() error {
48 | 	r.lock.Lock()
49 | 	defer r.lock.Unlock()
50 | 	for namespace, data := range r.dataByNamespace {
51 | 		for i := 0; i < len(data); {
52 | 			var metricData []types.MetricDatum
53 | 			// we can emit up to 1000 values per PutMetricData
54 | 			for j := 0; j < len(data) && j < 1000; j++ {
55 | 				datum := data[i]
56 | 				var dimensions []types.Dimension
57 | 				for key, val := range datum.dimensions {
58 | 					dimensions = append(dimensions, types.Dimension{
59 | 						Name:  aws.String(key),
60 | 						Value: aws.String(val),
61 | 					})
62 | 				}
63 | 				metricData = append(metricData, types.MetricDatum{
64 | 					MetricName: aws.String(datum.spec.Metric),
65 | 					Value:      aws.Float64(datum.value),
66 | 					Dimensions: dimensions,
67 | 					Timestamp:  &datum.timestamp,
68 | 				})
69 | 				i++
70 | 			}
71 | 			_, err := r.cw.PutMetricData(context.TODO(), &cloudwatch.PutMetricDataInput{
72 | 				Namespace:  aws.String(namespace),
73 | 				MetricData: metricData,
74 | 			})
75 | 			if err != nil {
76 | 				return err
77 | 			}
78 | 		}
79 | 		klog.Infof("emitted %d metrics to namespace: %s", len(data), namespace)
80 | 	}
81 | 	r.dataByNamespace = make(map[string][]*cloudwatchMetricDatum)
82 | 	return nil
83 | }
84 | 
85 | func (r *cloudwatchRegistry) GetRegistered() int {
86 | 	r.lock.Lock()
87 | 	defer r.lock.Unlock()
88 | 	registered := 0
89 | 	for _, data := range r.dataByNamespace {
90 | 		registered += len(data)
91 | 	}
92 | 	return registered
93 | }
94 | 


--------------------------------------------------------------------------------
/internal/deployers/eksapi/templates/templates.go:
--------------------------------------------------------------------------------
 1 | package templates
 2 | 
 3 | import (
 4 | 	_ "embed"
 5 | 	"text/template"
 6 | )
 7 | 
 8 | //go:embed infra.yaml
 9 | var Infrastructure string
10 | 
11 | //go:embed cloudwatch_agent_infra.yaml
12 | var CloudWatchAgentRbac []byte
13 | 
14 | var (
15 | 	//go:embed unmanaged-nodegroup.yaml.template
16 | 	unmanagedNodegroupTemplate string
17 | 	UnmanagedNodegroup         = template.Must(template.New("unmanagedNodegroup").Parse(unmanagedNodegroupTemplate))
18 | )
19 | 
20 | //go:embed cloudwatch-infra.yaml.template
21 | var CloudWatchInfra string
22 | 
23 | type NetworkInterface struct {
24 | 	Description         *string
25 | 	NetworkCardIndex    *int
26 | 	DeviceIndex         *int
27 | 	InterfaceType       *string
28 | 	Groups              []string
29 | 	SubnetId            *string
30 | 	DeleteOnTermination *bool
31 | }
32 | 
33 | type UnmanagedNodegroupTemplateData struct {
34 | 	NetworkInterfaces []NetworkInterface
35 | 	KubernetesVersion string
36 | 	InstanceTypes     []string
37 | }
38 | 
39 | type BusyboxDeploymentTemplateData struct {
40 | 	Nodes int
41 | }
42 | 
43 | type NvidiaStaticClusterNodepoolTemplateData struct {
44 | 	Arch          string
45 | 	InstanceTypes []string
46 | }
47 | 
48 | var (
49 | 	//go:embed userdata_bootstrap.sh.mimepart.template
50 | 	userDataBootstrapShTemplate string
51 | 	UserDataBootstrapSh         = template.Must(template.New("userDataBootstrapSh").Parse(userDataBootstrapShTemplate))
52 | 
53 | 	//go:embed userdata_nodeadm.yaml.mimepart.template
54 | 	userDataNodeadmTemplate string
55 | 	UserDataNodeadm         = template.Must(template.New("userDataNodeadm").Parse(userDataNodeadmTemplate))
56 | 
57 | 	//go:embed userdata_bottlerocket.toml.template
58 | 	userDataBottlerocketTemplate string
59 | 	UserDataBottlerocket         = template.Must(template.New("userDataBottlerocket").Parse(userDataBottlerocketTemplate))
60 | 
61 | 	//go:embed busybox_deployment.yaml.template
62 | 	busyboxDeploymentTemplate string
63 | 	BusyboxDeployment         = template.Must(template.New("busyboxDeployment").Parse(busyboxDeploymentTemplate))
64 | 
65 | 	//go:embed nvidia_static_cluster_nodepool.yaml.template
66 | 	nvidiaStaticClusterNodepoolTemplate string
67 | 	NvidiaStaticClusterNodepool         = template.Must(template.New("nvidiaStaticClusterNodepool").Parse(nvidiaStaticClusterNodepoolTemplate))
68 | )
69 | 
70 | type UserDataTemplateData struct {
71 | 	Name                 string
72 | 	CertificateAuthority string
73 | 	CIDR                 string
74 | 	APIServerEndpoint    string
75 | 	KubeletFeatureGates  map[string]bool
76 | 	NodeadmFeatureGates  map[string]bool
77 | }
78 | 
79 | var (
80 | 	//go:embed auth_map_role.yaml.template
81 | 	authMapRoleTemplate string
82 | 	AuthMapRole         = template.Must(template.New("authMapRole").Parse(authMapRoleTemplate))
83 | )
84 | 
85 | type AuthMapRoleTemplateData struct {
86 | 	NodeNameStrategy string
87 | 	Rolearn          string
88 | }
89 | 


--------------------------------------------------------------------------------
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh:
--------------------------------------------------------------------------------
 1 | # Validate basic system configuration by comparing with expected config
 2 | #
 3 | setup_suite()
 4 | {
 5 |     source common.sh
 6 | 
 7 |     EC2_INSTANCE_TYPE=${EC2_INSTANCE_TYPE:-$(get_instance_type)}
 8 |     data=test_sysinfo.sh.data/$EC2_INSTANCE_TYPE
 9 |     ACTUAL_RESULTS=`mktemp -t -d test_sysinfo.sh.actual-data.XXX`
10 |     assert_not_equals "" "$ACTUAL_RESULTS"
11 |     notify_trace_info "ACTUAL_RESULTS: $ACTUAL_RESULTS"
12 | 
13 |     if [ -n "$GENERATE_DATA" ]
14 |     then
15 | 	echo "GENERATE_DATA is enabled..."
16 | 	mkdir -p $data
17 | 	function assert_data() {
18 | 	    generate_data "$@"
19 | 	}
20 |     fi
21 | }
22 | 
23 | teardown_suite()
24 | {
25 |     assert "test -z \"$GENERATE_DATA\"" "GENERATE_DATA was enabled, fail full suite"
26 |     assert_gpu_unused
27 | }
28 | 
29 | 
30 | test_numa_topo_topo()
31 | {
32 |     assert_data $data/numa_topo.txt "grep . /sys/devices/system/node/node*/{cpulist,distance}" "Unexpected cpu topology"
33 | }
34 | 
35 | test_nvidia_gpu_count()
36 | {
37 |     #Just for logging purposesclear
38 |     assert_status_code 0 "nvidia-smi -q"
39 |     assert_data $data/gpu_count.txt "nvidia-smi --query-gpu=name,index,pci.bus_id --format csv" "Unexpected gpu count"
40 | }
41 | 
42 | 
43 | test_nvidia_smi_topo()
44 | {
45 |     assert_data $data/nvidia_smi_topo.txt "nvidia-smi topo -m | grep GPU | cut -f 1-11" \
46 | 		"Unexpected gpu topology, likely broken nvlinks"
47 | }
48 | 
49 | 
50 | test_nvidia_persistence_status()
51 | {
52 |     assert_data $data/nvidia_persistence_status.txt "nvidia-smi --query-gpu=name,pci.bus_id,persistence_mode --format=csv" \
53 | 		  "Unexpected perfistance status, likely system configuration issue"
54 | }
55 | 
56 | test_nvidia_gpu_unused()
57 | {
58 |     assert_gpu_unused
59 | }
60 | 
61 | test_nvidia_gpu_throttled()
62 | {
63 | 
64 |     # vGPU instances don't support GPU clock throttling detection.
65 |     # This test is not applicable for vGPU instance types.
66 |     if is_vgpu; then
67 |         skip "This test does not apply to vGPU instances (g6f.*, gr6f.*)"
68 |     fi
69 |     # https://docs.nvidia.com/deploy/nvml-api/group__nvmlClocksEventReasons.html#group__nvmlClocksEventReasons
70 |     # The only  bit allowed is nvmlClocksEventReasonGpuIdle 0x0000000000000001LL
71 |     filter="egrep -v -e '(0x0000000000000000|0x0000000000000001|0x0000000000000004)'"
72 |     cmd="nvidia-smi --query-gpu index,gpu_bus_id,gpu_uuid,clocks_throttle_reasons.active --format=csv,noheader"
73 |     assert_status_code 1 "$cmd | $filter" "Throttled gpu detected"
74 | }
75 | 
76 | 
77 | test_nvidia_vgpu_license_status()
78 | {
79 |     if ! is_vgpu; then
80 |         skip "This test only applies to vGPU instances (g6f.*, gr6f.*)"
81 |     fi
82 | 
83 |     assert_data $data/nvidia_vgpu_license_status.txt \
84 |           "nvidia-smi -q | grep 'vGPU Software' -A 2" \
85 |           "vGPU license status validation failed"
86 | }


--------------------------------------------------------------------------------
/.github/workflows/ci.yaml:
--------------------------------------------------------------------------------
 1 | name: "CI"
 2 | on:
 3 |   pull_request:
 4 |     types:
 5 |       - opened
 6 |       - reopened
 7 |       - synchronize
 8 | jobs:
 9 |   build:
10 |     runs-on: ubuntu-latest
11 |     steps:
12 |     - uses: actions/checkout@v3
13 |     - run: go build ./...
14 |     - run: go test ./...
15 |   build-test:
16 |     runs-on: ubuntu-latest
17 |     steps:
18 |     - uses: actions/checkout@v3
19 |     - run: go test -c -tags=e2e ./test/...
20 |   build-image:
21 |     runs-on: ubuntu-latest
22 |     steps:
23 |     - uses: actions/checkout@v3
24 |     - run: ./hack/free-disk-space.sh
25 |     - run: docker build --build-arg=KUBERNETES_MINOR_VERSION=latest --file Dockerfile .
26 |   build-image-efa:
27 |     runs-on: ubuntu-latest
28 |     steps:
29 |     - uses: actions/checkout@v3
30 |     - run: ./hack/free-disk-space.sh
31 |     - run: docker build --file test/images/efa/Dockerfile .
32 |   build-image-neuronx:
33 |     runs-on: ubuntu-latest
34 |     steps:
35 |     - uses: actions/checkout@v3
36 |     - run: ./hack/free-disk-space.sh
37 |     - run: docker build --file test/images/neuron/Dockerfile .
38 |   build-image-nvidia:
39 |     runs-on: ubuntu-latest
40 |     steps:
41 |     - uses: actions/checkout@v3
42 |     - run: ./hack/free-disk-space.sh
43 |     - run: docker build --file test/images/nvidia/Dockerfile .
44 |   build-image-nvidia-training:
45 |     runs-on: ubuntu-latest
46 |     steps:
47 |     - uses: actions/checkout@v3
48 |     - run: ./hack/free-disk-space.sh
49 |     - run: |
50 |         docker build --file test/images/nvidia-training/Dockerfile test/images/nvidia-training \
51 |           --build-arg PYTORCH_BUILD_ENV="MAX_JOBS=$(($(nproc) - 2)) USE_MKLDNN=0 USE_DISTRIBUTED=0 USE_CUDA=0 USE_ROCM=0 USE_CAFFE2=0 USE_QNNPACK=0 USE_NNPACK=0 USE_XNNPACK=0 USE_MPS=0 BUILD_SHARED_LIBS=OFF USE_FLASH_ATTENTION=0 USE_MEM_EFF_ATTENTION=0 BUILD_TEST=0"
52 |   build-image-nvidia-inference:
53 |     runs-on: ubuntu-latest
54 |     steps:
55 |     - uses: actions/checkout@v3
56 |     - run: ./hack/free-disk-space.sh
57 |     - run: |
58 |         docker build --file test/images/nvidia-inference/Dockerfile test/images/nvidia-inference \
59 |           --build-arg PYTORCH_BUILD_ENV="MAX_JOBS=$(($(nproc) - 2)) USE_MKLDNN=0 USE_DISTRIBUTED=0 USE_CUDA=0 USE_ROCM=0 USE_CAFFE2=0 USE_QNNPACK=0 USE_NNPACK=0 USE_XNNPACK=0 USE_MPS=0 BUILD_SHARED_LIBS=OFF USE_FLASH_ATTENTION=0 USE_MEM_EFF_ATTENTION=0 BUILD_TEST=0"
60 |   build-image-neuron-training:
61 |     runs-on: ubuntu-latest
62 |     steps:
63 |     - uses: actions/checkout@v3
64 |     - run: ./hack/free-disk-space.sh
65 |     - run: docker build --file test/images/neuron-training/Dockerfile test/images/neuron-training
66 |   build-image-neuron-inference:
67 |     runs-on: ubuntu-latest
68 |     steps:
69 |     - uses: actions/checkout@v3
70 |     - run: ./hack/free-disk-space.sh
71 |     - run: docker build --file test/images/neuron-inference/Dockerfile test/images/neuron-inference
72 | 


--------------------------------------------------------------------------------
/test/manifests/assets/cloudwatch-agent.yaml:
--------------------------------------------------------------------------------
  1 | apiVersion: v1
  2 | kind: ConfigMap
  3 | metadata:
  4 |   name: prometheus-cwagentconfig
  5 |   namespace: amazon-cloudwatch
  6 | data:
  7 |   cwagentconfig.json: |
  8 |     {
  9 |       "agent": {
 10 |         "debug": true
 11 |       },
 12 |       "logs": {
 13 |         "metrics_collected": {
 14 |           "prometheus": {
 15 |             "prometheus_config_path": "/etc/prometheusconfig/prometheus.yaml",
 16 |             "emf_processor": {
 17 |               "metric_declaration": [
 18 |                 {
 19 |                   "source_labels": ["job"],
 20 |                   "label_matcher": "dcgm-exporter",
 21 |                   "dimensions": [[{{.DimensionKeys}}]],
 22 |                   "metric_selectors": [
 23 |                     "^DCGM_FI_DEV_GPU_UTIL$",
 24 |                     "^DCGM_FI_DEV_MEM_COPY_UTIL$",
 25 |                     "^DCGM_FI_DEV_FB_USED$",
 26 |                     "^DCGM_FI_DEV_FB_FREE$",
 27 |                     "^DCGM_FI_DEV_POWER_USAGE$"
 28 |                   ]
 29 |                 }
 30 |               ]
 31 |             }
 32 |           }
 33 |         },
 34 |         "force_flush_interval": 5
 35 |       }
 36 |     }
 37 | 
 38 | ---
 39 | apiVersion: v1
 40 | kind: ConfigMap
 41 | metadata:
 42 |   name: prometheus-config
 43 |   namespace: amazon-cloudwatch
 44 | data:
 45 |   prometheus.yaml: |
 46 |     global:
 47 |       scrape_interval: 1s
 48 |       scrape_timeout: 1s
 49 |     scrape_configs:
 50 |       - job_name: dcgm-exporter
 51 |         static_configs:
 52 |           - targets:
 53 |             - dcgm-exporter.kube-system.svc.cluster.local:9400
 54 |         metrics_path: /metrics
 55 |         metric_relabel_configs:
 56 | {{- range $key, $value := .MetricDimensions}}
 57 |           - {action: replace, target_label: {{$key}}, replacement: '{{$value}}'}
 58 | {{- end}}
 59 | ---
 60 | apiVersion: apps/v1
 61 | kind: DaemonSet
 62 | metadata:
 63 |   name: cwagent
 64 |   namespace: amazon-cloudwatch
 65 | spec:
 66 |   selector:
 67 |     matchLabels:
 68 |       app: cwagent
 69 |   template:
 70 |     metadata:
 71 |       labels:
 72 |         app: cwagent
 73 |     spec:
 74 |       serviceAccountName: cwagent
 75 |       dnsPolicy: ClusterFirst 
 76 |       containers:
 77 |         - name: cloudwatch-agent
 78 |           image: public.ecr.aws/cloudwatch-agent/cloudwatch-agent:latest
 79 |           imagePullPolicy: Always
 80 |           resources:
 81 |             limits:
 82 |               cpu: 1000m
 83 |               memory: 1000Mi
 84 |             requests:
 85 |               cpu: 200m
 86 |               memory: 200Mi
 87 |           volumeMounts:
 88 |             - name: prometheus-cwagentconfig
 89 |               mountPath: /etc/cwagentconfig
 90 |             - name: prometheus-config
 91 |               mountPath: /etc/prometheusconfig
 92 |       volumes:
 93 |         - name: prometheus-cwagentconfig
 94 |           configMap:
 95 |             name: prometheus-cwagentconfig
 96 |         - name: prometheus-config
 97 |           configMap:
 98 |             name: prometheus-config
 99 |       terminationGracePeriodSeconds: 60
100 | ---


--------------------------------------------------------------------------------
/test/manifests/assets/k8s-neuron-device-plugin.yml:
--------------------------------------------------------------------------------
 1 | # Source: https://github.com/aws-neuron/aws-neuron-sdk/blob/master/src/k8/k8s-neuron-device-plugin.yml
 2 | apiVersion: apps/v1
 3 | kind: DaemonSet
 4 | metadata:
 5 |   name: neuron-device-plugin-daemonset
 6 |   namespace: kube-system
 7 | spec:
 8 |   selector:
 9 |     matchLabels:
10 |       name:  neuron-device-plugin-ds
11 |   updateStrategy:
12 |     type: RollingUpdate
13 |   template:
14 |     metadata:
15 |       # Uncomment the annotation below if k8s version is 1.13 or lower
16 |       # annotations:
17 |       #  scheduler.alpha.kubernetes.io/critical-pod: ""
18 |       labels:
19 |         name: neuron-device-plugin-ds
20 |     spec:
21 |       serviceAccount: neuron-device-plugin
22 |       tolerations:
23 |       - key: CriticalAddonsOnly
24 |         operator: Exists
25 |       - key: aws.amazon.com/neuron
26 |         operator: Exists
27 |         effect: NoSchedule
28 |       # Mark this pod as a critical add-on; when enabled, the critical add-on
29 |       # scheduler reserves resources for critical add-on pods so that they can
30 |       # be rescheduled after a failure.
31 |       # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
32 |       priorityClassName: "system-node-critical"
33 |       affinity:
34 |         nodeAffinity:
35 |           requiredDuringSchedulingIgnoredDuringExecution:
36 |             nodeSelectorTerms:
37 |               - matchExpressions:
38 |                   - key: "node.kubernetes.io/instance-type"
39 |                     operator: In
40 |                     values:
41 |                       - inf1.xlarge
42 |                       - inf1.2xlarge
43 |                       - inf1.6xlarge
44 |                       - inf1.24xlarge
45 |                       - inf2.xlarge
46 |                       - inf2.8xlarge
47 |                       - inf2.24xlarge
48 |                       - inf2.48xlarge
49 |                       - trn1.2xlarge
50 |                       - trn1.32xlarge
51 |                       - trn1n.32xlarge
52 |                       - trn2.48xlarge
53 |                       - trn2u.48xlarge
54 |       containers:
55 |         # Find all neuron-device-plugin images at https://gallery.ecr.aws/neuron/neuron-device-plugin
56 |       - image: public.ecr.aws/neuron/neuron-device-plugin:2.26.26.0
57 |         imagePullPolicy: Always
58 |         name: neuron-device-plugin
59 |         env:
60 |         - name: KUBECONFIG
61 |           value: /etc/kubernetes/kubelet.conf
62 |         - name: NODE_NAME
63 |           valueFrom:
64 |             fieldRef:
65 |               fieldPath: spec.nodeName
66 |         securityContext:
67 |           allowPrivilegeEscalation: false
68 |           capabilities:
69 |             drop: ["ALL"]
70 |         volumeMounts:
71 |           - name: device-plugin
72 |             mountPath: /var/lib/kubelet/device-plugins
73 |           - name: infa-map
74 |             mountPath: /run
75 |       volumes:
76 |         - name: device-plugin
77 |           hostPath:
78 |             path: /var/lib/kubelet/device-plugins
79 |         - name: infa-map
80 |           hostPath:
81 |             path: /run
82 | 
83 | 
84 | 
85 | 


--------------------------------------------------------------------------------
/hack/update-image-tags.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -o nounset
 4 | set -o errexit
 5 | set -o pipefail
 6 | 
 7 | ECR_PUBLIC_REGISTRY="public.ecr.aws"
 8 | EKS_CONTAINER_REGISTRY="602401143452.dkr.ecr.us-west-2.amazonaws.com"
 9 | 
10 | # get_ecr_image_tags <REGISTRY> <REPOSITORY>
11 | # e.g. get_ecr_image_tags $ECR_PUBLIC_REGISTRY amazonlinux/amazonlinux
12 | get_ecr_image_tags() {
13 |     set -e
14 |     local REGISTRY=$1 
15 |     local REPOSITORY=$2
16 |     local TOKEN
17 | 
18 |     # Get ECR public token if image is from a public registry, otherwise use a private token
19 |     # An authorization token is required for every ECR HTTP request
20 |     if [ "$REGISTRY" = "$ECR_PUBLIC_REGISTRY" ]; then
21 |         TOKEN=$(aws ecr-public get-authorization-token --region us-east-1 --output=text --query 'authorizationData.authorizationToken')
22 |         local AUTHORIZATION_TYPE="Bearer"
23 |     else 
24 |         TOKEN=$(aws ecr get-authorization-token --output text --query 'authorizationData[].authorizationToken')
25 |         local AUTHORIZATION_TYPE="Basic"
26 |     fi
27 | 
28 |     curl -s -H "Authorization: ${AUTHORIZATION_TYPE} $TOKEN" "https://$REGISTRY/v2/$REPOSITORY/tags/list" | jq '.tags'
29 | }
30 | 
31 | # update_image_uris REPOSITORY IMAGE_TAG
32 | update_image_uris() {
33 |     local REPOSITORY=$1
34 |     local NEW_TAG=$2
35 |     PREFIX="image: ${REPOSITORY}"
36 |     find ./test/manifests -type f -exec sed -i "s#$PREFIX:.*#$PREFIX:$NEW_TAG#g" {} +
37 | }
38 | 
39 | # update the nvidia k8s device plugin
40 | echo "Updating Nvidia device plugin image"
41 | NVIDIA_DEVICE_PLUGIN_TAG=$(curl -s 'https://catalog.ngc.nvidia.com/api/containers/images?orgName=nvidia&name=k8s-device-plugin&isPublic=true' | jq -r '.images | sort_by(.updatedDate) | reverse | map(select(.tag | test("^v[0-9]+.[0-9]+.[0-9]+$"))) | first | .tag')
42 | update_image_uris nvcr.io/nvidia/k8s-device-plugin $NVIDIA_DEVICE_PLUGIN_TAG
43 | 
44 | # below updates require authentication and should not exit early with a failure.
45 | # TODO: remove this once the aws credentials are setup and the paths are expected to succeed.
46 | set +e
47 | 
48 | # update the neuron k8s device plugin
49 | echo "Updating Neuron device plugin image"
50 | NEURON_DEVICE_PLUGIN_REPOSITORY_NAME="neuron/neuron-device-plugin"
51 | NEURON_DEVICE_PLUGIN_TAGS=$(get_ecr_image_tags $ECR_PUBLIC_REGISTRY $NEURON_DEVICE_PLUGIN_REPOSITORY_NAME)
52 | if [ $? -eq 0 ]; then
53 |     LATEST_NEURON_DEVICE_PLUGIN_TAG=$(echo $NEURON_DEVICE_PLUGIN_TAGS | jq -r 'max_by(split(".") | map(tonumber))')
54 |     update_image_uris "${ECR_PUBLIC_REGISTRY}/${NEURON_DEVICE_PLUGIN_REPOSITORY_NAME}" $LATEST_NEURON_DEVICE_PLUGIN_TAG
55 | fi
56 | 
57 | # update the efa k8s device plugin
58 | echo "Updating EFA device plugin image"
59 | EFA_DEVICE_PLUGIN_REPOSITORY_NAME="eks/aws-efa-k8s-device-plugin"
60 | EFA_DEVICE_PLUGIN_TAGS=$(get_ecr_image_tags $EKS_CONTAINER_REGISTRY $EFA_DEVICE_PLUGIN_REPOSITORY_NAME)
61 | if [ $? -eq 0 ]; then
62 |     LATEST_EFA_DEVICE_PLUGIN_TAG=$(echo $EFA_DEVICE_PLUGIN_TAGS | jq -r 'map(split("-") | .[0]) | max_by(sub("^v"; "") | split(".") | map(tonumber))')
63 |     update_image_uris "${EKS_CONTAINER_REGISTRY}/${EFA_DEVICE_PLUGIN_REPOSITORY_NAME}" $LATEST_EFA_DEVICE_PLUGIN_TAG
64 | fi


--------------------------------------------------------------------------------
/test/cases/quick/node_topology_test.go:
--------------------------------------------------------------------------------
 1 | //go:build e2e
 2 | 
 3 | package quick
 4 | 
 5 | import (
 6 | 	"context"
 7 | 	_ "embed"
 8 | 	"strconv"
 9 | 	"strings"
10 | 	"testing"
11 | 
12 | 	"github.com/aws/aws-k8s-tester/internal/e2e"
13 | 	"github.com/aws/aws-sdk-go-v2/aws"
14 | 	v1 "k8s.io/api/core/v1"
15 | 	cloudprovider "k8s.io/cloud-provider-aws/pkg/providers/v1"
16 | 	"sigs.k8s.io/e2e-framework/pkg/envconf"
17 | 	"sigs.k8s.io/e2e-framework/pkg/features"
18 | )
19 | 
20 | func TestNodeTopology(t *testing.T) {
21 | 	topology := features.New("node-topology").
22 | 		WithLabel("suite", "node-topology").
23 | 		Assess("Nodes have correct network topology labels", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
24 | 
25 | 			var nodes v1.NodeList
26 | 			cfg.Client().Resources().List(ctx, &nodes)
27 | 
28 | 			if len(nodes.Items) == 0 {
29 | 				t.Fatal("no nodes found in the cluster")
30 | 			}
31 | 
32 | 			nodeMap := make(map[string]v1.Node)
33 | 			var instanceIDs []string
34 | 			ec2Client := e2e.NewEC2Client()
35 | 			for _, node := range nodes.Items {
36 | 				providerIDParts := strings.Split(node.Spec.ProviderID, "/")
37 | 				instanceID := providerIDParts[len(providerIDParts)-1]
38 | 				instanceIDs = append(instanceIDs, instanceID)
39 | 				nodeMap[instanceID] = node
40 | 			}
41 | 
42 | 			nodeTopologies, err := ec2Client.DescribeInstanceTopology(instanceIDs)
43 | 			if err != nil {
44 | 				t.Fatalf("could not describe instance topologies: %v", err)
45 | 			}
46 | 
47 | 			t.Logf("checking instance topologies for %d node(s) (out of %d node(s) in the cluster)", len(nodeTopologies), len(instanceIDs))
48 | 
49 | 			for _, nodeTopology := range nodeTopologies {
50 | 				node := nodeMap[aws.ToString(nodeTopology.InstanceId)]
51 | 				instanceType := node.Labels["node.kubernetes.io/instance-type"]
52 | 
53 | 				t.Logf("verifying instance topology for node %s (type: %s)", node.Name, instanceType)
54 | 
55 | 				for i, networkNode := range nodeTopology.NetworkNodes {
56 | 					// https://github.com/kubernetes/cloud-provider-aws/blob/b47d2cf2a33ae655cd353ec42ea43362b804c397/pkg/providers/v1/well_known_labels.go#L26
57 | 					expectedLabel := cloudprovider.LabelNetworkNodePrefix + strconv.Itoa(i+1)
58 | 					if actualValue, ok := node.Labels[expectedLabel]; !ok {
59 | 						t.Errorf("node %s (type: %s) does not have expected network label %s", node.Name, instanceType, expectedLabel)
60 | 					} else if actualValue != networkNode {
61 | 						t.Errorf("node %s (type: %s) has incorrect value for label %s: expected %s, got %s", node.Name, instanceType, expectedLabel, networkNode, actualValue)
62 | 					}
63 | 				}
64 | 
65 | 				// https://github.com/kubernetes/cloud-provider-aws/blob/b47d2cf2a33ae655cd353ec42ea43362b804c397/pkg/providers/v1/well_known_labels.go#L22C2-L22C13
66 | 				if aws.ToString(nodeTopology.ZoneId) != node.Labels[cloudprovider.LabelZoneID] {
67 | 					t.Logf("node %s (type: %s) has incorrect value for label %s: expected %s, got %s", node.Name, instanceType, cloudprovider.LabelZoneID, aws.ToString(nodeTopology.ZoneId), node.Labels[cloudprovider.LabelZoneID])
68 | 					t.Fail()
69 | 				}
70 | 			}
71 | 
72 | 			return ctx
73 | 		}).Feature()
74 | 
75 | 	testenv.Test(t, topology)
76 | }
77 | 


--------------------------------------------------------------------------------
/test/cases/efa/commons.go:
--------------------------------------------------------------------------------
  1 | //go:build e2e
  2 | 
  3 | package efa
  4 | 
  5 | import (
  6 | 	"context"
  7 | 	_ "embed"
  8 | 	"fmt"
  9 | 	"log"
 10 | 
 11 | 	"github.com/aws/aws-k8s-tester/internal/e2e"
 12 | 	"github.com/aws/aws-sdk-go-v2/aws"
 13 | 	corev1 "k8s.io/api/core/v1"
 14 | 	v1 "k8s.io/api/core/v1"
 15 | 	"k8s.io/client-go/kubernetes"
 16 | 	"sigs.k8s.io/e2e-framework/pkg/env"
 17 | 	"sigs.k8s.io/e2e-framework/pkg/envconf"
 18 | 
 19 | 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 20 | )
 21 | 
 22 | var (
 23 | 	testenv   env.Environment
 24 | 	ec2Client e2e.EC2Client
 25 | 
 26 | 	testImage *string
 27 | 
 28 | 	pingPongSize            *string
 29 | 	pingPongIters           *int
 30 | 	pingPongDeadlineSeconds *int
 31 | 
 32 | 	nodeType               *string
 33 | 	expectedEFADeviceCount *int
 34 | 
 35 | 	verbose *bool
 36 | )
 37 | 
 38 | const (
 39 | 	EFA_RESOURCE_NAME   = "vpc.amazonaws.com/efa"
 40 | 	TEST_NAMESPACE_NAME = "efa-tests"
 41 | )
 42 | 
 43 | func getEfaCapacity(node corev1.Node) int {
 44 | 	capacity, ok := node.Status.Capacity[v1.ResourceName(EFA_RESOURCE_NAME)]
 45 | 	if !ok {
 46 | 		return 0
 47 | 	}
 48 | 	return int(capacity.Value())
 49 | }
 50 | 
 51 | func getEfaNodes(ctx context.Context, config *envconf.Config) ([]corev1.Node, error) {
 52 | 	var efaNodes []corev1.Node
 53 | 	clientset, err := kubernetes.NewForConfig(config.Client().RESTConfig())
 54 | 	if err != nil {
 55 | 		return []corev1.Node{}, fmt.Errorf("failed to create Kubernetes client: %w", err)
 56 | 	}
 57 | 
 58 | 	nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
 59 | 	if err != nil {
 60 | 		return []corev1.Node{}, fmt.Errorf("failed to list nodes: %w", err)
 61 | 	}
 62 | 
 63 | 	if len(nodes.Items) == 0 {
 64 | 		return []corev1.Node{}, fmt.Errorf("no nodes found in the cluster")
 65 | 	}
 66 | 
 67 | 	for _, node := range nodes.Items {
 68 | 		instanceType := node.Labels["node.kubernetes.io/instance-type"]
 69 | 
 70 | 		if aws.ToString(nodeType) != "" && instanceType != aws.ToString(nodeType) {
 71 | 			log.Printf("[INFO] Skipping node %s (type: %s), node is not of target type %s", node.Name, instanceType, aws.ToString(nodeType))
 72 | 			continue
 73 | 		}
 74 | 
 75 | 		numEfaDevices, err := e2e.GetNonZeroResourceCapacity(&node, EFA_RESOURCE_NAME)
 76 | 		if err != nil {
 77 | 			log.Printf("[INFO] Skipping node %s (type: %s): %v", node.Name, instanceType, err)
 78 | 			continue
 79 | 		}
 80 | 
 81 | 		expectedDeviceCount := aws.ToInt(expectedEFADeviceCount)
 82 | 		if expectedDeviceCount < 0 {
 83 | 			instanceInfo, err := ec2Client.DescribeInstanceType(instanceType)
 84 | 			if err != nil {
 85 | 				return []corev1.Node{}, err
 86 | 			}
 87 | 			expectedDeviceCount = int(aws.ToInt32(instanceInfo.NetworkInfo.EfaInfo.MaximumEfaInterfaces))
 88 | 		}
 89 | 
 90 | 		if expectedDeviceCount != numEfaDevices {
 91 | 			return []corev1.Node{}, fmt.Errorf("unexpected EFA device capacity on node %s: expected %d, got %d", node.Name, expectedDeviceCount, numEfaDevices)
 92 | 		}
 93 | 
 94 | 		efaNodes = append(efaNodes, node)
 95 | 	}
 96 | 
 97 | 	if len(efaNodes) == 0 {
 98 | 		return []corev1.Node{}, fmt.Errorf("no nodes with EFA capacity found in the cluster")
 99 | 	}
100 | 
101 | 	return efaNodes, nil
102 | }
103 | 


--------------------------------------------------------------------------------
/internal/deployers/eksapi/addons.go:
--------------------------------------------------------------------------------
  1 | package eksapi
  2 | 
  3 | import (
  4 | 	"context"
  5 | 	"fmt"
  6 | 	"strings"
  7 | 	"time"
  8 | 
  9 | 	"github.com/aws/aws-sdk-go-v2/aws"
 10 | 	"github.com/aws/aws-sdk-go-v2/service/eks"
 11 | 	"k8s.io/klog/v2"
 12 | )
 13 | 
 14 | const (
 15 | 	addonCreationTimeout = 5 * time.Minute
 16 | )
 17 | 
 18 | type AddonManager struct {
 19 | 	clients *awsClients
 20 | }
 21 | 
 22 | func NewAddonManager(clients *awsClients) *AddonManager {
 23 | 	return &AddonManager{
 24 | 		clients: clients,
 25 | 	}
 26 | }
 27 | 
 28 | func (m *AddonManager) createAddons(infra *Infrastructure, cluster *Cluster, opts *deployerOptions) error {
 29 | 	ctx := context.TODO()
 30 | 
 31 | 	addonMap := map[string]string{}
 32 | 	for _, addon := range opts.Addons {
 33 | 		addonParts := strings.Split(addon, ":")
 34 | 		if len(addonParts) != 2 {
 35 | 			return fmt.Errorf("invalid addon format: %s", addon)
 36 | 		}
 37 | 		name := addonParts[0]
 38 | 		version := addonParts[1]
 39 | 		klog.Infof("resolving addon %s version: %s", name, version)
 40 | 		resolvedVersion, err := m.resolveAddonVersion(name, version, opts.KubernetesVersion)
 41 | 		if err != nil {
 42 | 			return err
 43 | 		}
 44 | 		// dedupe addons with the same name. last provided entry wins.
 45 | 		addonMap[name] = resolvedVersion
 46 | 	}
 47 | 
 48 | 	for addonName, addonVersion := range addonMap {
 49 | 		klog.Infof("creating addon %s version: %s", addonName, addonVersion)
 50 | 		input := eks.CreateAddonInput{
 51 | 			AddonName:    aws.String(addonName),
 52 | 			AddonVersion: aws.String(addonVersion),
 53 | 			ClusterName:  aws.String(cluster.name),
 54 | 		}
 55 | 		_, err := m.clients.EKS().CreateAddon(ctx, &input)
 56 | 		if err != nil {
 57 | 			return fmt.Errorf("failed to create addon: %v", err)
 58 | 		}
 59 | 		klog.Infof("waiting for addon to be active: %s", addonName)
 60 | 		err = eks.NewAddonActiveWaiter(m.clients.EKS()).
 61 | 			Wait(ctx, &eks.DescribeAddonInput{
 62 | 				AddonName:   aws.String(addonName),
 63 | 				ClusterName: aws.String(cluster.name),
 64 | 			}, addonCreationTimeout)
 65 | 		if err != nil {
 66 | 			return fmt.Errorf("failed to wait for addon to be active: %v", err)
 67 | 		}
 68 | 	}
 69 | 
 70 | 	return nil
 71 | }
 72 | 
 73 | func (m *AddonManager) resolveAddonVersion(name string, versionMarker string, kubernetesVersion string) (string, error) {
 74 | 	input := eks.DescribeAddonVersionsInput{
 75 | 		AddonName:         aws.String(name),
 76 | 		KubernetesVersion: aws.String(kubernetesVersion),
 77 | 	}
 78 | 	descOutput, err := m.clients.EKS().DescribeAddonVersions(context.TODO(), &input)
 79 | 	if err != nil {
 80 | 		return "", err
 81 | 	}
 82 | 	for _, addon := range descOutput.Addons {
 83 | 		for _, versionInfo := range addon.AddonVersions {
 84 | 			switch versionMarker {
 85 | 			case "latest":
 86 | 				return *versionInfo.AddonVersion, nil
 87 | 			case "default":
 88 | 				for _, compatibility := range versionInfo.Compatibilities {
 89 | 					if compatibility.DefaultVersion {
 90 | 						return *versionInfo.AddonVersion, nil
 91 | 					}
 92 | 				}
 93 | 			default:
 94 | 				if *versionInfo.AddonVersion == versionMarker {
 95 | 					return *versionInfo.AddonVersion, nil
 96 | 				}
 97 | 			}
 98 | 		}
 99 | 	}
100 | 	return "", fmt.Errorf("failed to resolve addon version: %s=%s", name, versionMarker)
101 | }
102 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Tools for testing Kubernetes on AWS
  2 | 
  3 | ## Installation
  4 | 
  5 | This project will use rolling releases going forward; we recommend fetching the latest commit:
  6 | ```
  7 | go install github.com/aws/aws-k8s-tester/...@HEAD
  8 | ```
  9 | 
 10 | You'll need the standard `kubetest` tools as well:
 11 | ```
 12 | go install sigs.k8s.io/kubetest2/...@latest
 13 | ```
 14 | 
 15 | ## `kubetest2` deployers and testers for EKS
 16 | 
 17 | 
 18 | ### Usage
 19 | 
 20 | **Auto-detect cluster version**
 21 | 
 22 | The deployers will search for a file called `kubernetes-version.txt` on your `PATH`.
 23 | This file should contain a valid tag for a Kubernetes release.
 24 | The `--kubernetes-version` flag can be omitted if this file exists.
 25 | 
 26 | ---
 27 | 
 28 | ### `eksctl` deployer
 29 | 
 30 | This deployer is a thin wrapper around `eksctl`.
 31 | 
 32 | The simplest usage is:
 33 | ```
 34 | kubetest2 \
 35 |   eksctl \
 36 |   --kubernetes-version=X.XX \
 37 |   --up \
 38 |   --down \
 39 |   --test=exec \
 40 |   -- echo "Hello world"
 41 | ```
 42 | 
 43 | **Additional flags**
 44 | 
 45 | - `--instance-types` - comma-separated list of instance types to use for nodes
 46 | - `--ami` - AMI ID for nodes
 47 | - `--nodes` - number of nodes
 48 | - `--region` - AWS region
 49 | - `--config-file` - Path to eksctl config file (**if provided, other flags are ignored**)
 50 | - `--availability-zones` - Node availability zones
 51 | - `--ami-family` - AMI family to use: `AmazonLinux2023` | `Bottlerocket`
 52 | - `--efa-enabled` - Enable Elastic Fabric Adapter for the nodegroup
 53 | - `--volume-size` - Size of the node root volume in GB
 54 | - `--private-networking` - Use private networking for nodes
 55 | - `--with-oidc` - Enable OIDC provider for IAM roles for service accounts
 56 | - `--deploy-target` - The target to deploy: `cluster` | `nodegroup` (defaults to `cluster`)
 57 | - `--cluster-name` - Name of the EKS cluster (defaults to RunID if not specified)
 58 | - `--unmanaged-nodegroup` - Use unmanaged nodegroup instead of managed nodegroup
 59 | - `--nodegroup-name` - Name of the nodegroup (defaults to `ng-1`)
 60 | 
 61 | ---
 62 | 
 63 | ### `eksapi` deployer
 64 | 
 65 | This deployer calls the EKS API directly, instead of using CloudFormation for EKS resources.
 66 | 
 67 | The simplest usage is:
 68 | ```
 69 | kubetest2 \
 70 |   eksapi \
 71 |   --kubernetes-version=X.XX \
 72 |   --up \
 73 |   --down \
 74 |   --test=exec \
 75 |   -- echo "Hello world"
 76 | ```
 77 | 
 78 | **Additional flags**
 79 | 
 80 | - `--instance-types` - comma-separated list of instance types to use for nodes
 81 | - `--ami` - AMI ID for nodes
 82 | - `--nodes` - number of nodes
 83 | - `--region` - AWS region
 84 | - `--endpoint-url` - Override the EKS endpoint URL
 85 | - `--cluster-role-service-principal` - Additional service principal that can assume the cluster IAM role.
 86 | 
 87 | ---
 88 | 
 89 | ### `multi` tester
 90 | 
 91 | This tester wraps multiple executions of other testers.
 92 | 
 93 | Tester argument groups are separated by `--`, with the first group being passed to the `multi` tester itself.
 94 | 
 95 | The first positional argument of each subsequent group should be the name of a tester.
 96 | 
 97 | ```
 98 | kubetest2 \
 99 |   noop \
100 |   --test=multi \
101 |   -- \
102 |   --fail-fast=true \
103 |   -- \
104 |   ginkgo \
105 |   --focus-regex='\[Conformance\]' \
106 |   --parallel=4 \
107 |   -- \
108 |   exec \
109 |   go test ./my/test/package
110 | ```
111 | 


--------------------------------------------------------------------------------
/test/images/nvidia-inference/Dockerfile:
--------------------------------------------------------------------------------
 1 | ###############################################################################
 2 | # Base image, arguments, and environment
 3 | ###############################################################################
 4 | ARG CUDA_MAJOR_VERSION=12
 5 | ARG CUDA_MINOR_VERSION=8
 6 | 
 7 | FROM nvidia/cuda:$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION.0-devel-ubuntu22.04
 8 | 
 9 | ARG CUDA_MAJOR_VERSION
10 | ARG CUDA_MINOR_VERSION
11 | 
12 | # Disable interactive prompts
13 | ENV DEBIAN_FRONTEND=noninteractive
14 | 
15 | ###############################################################################
16 | # System packages
17 | ###############################################################################
18 | RUN apt update \
19 |  && apt upgrade -y \
20 |  && apt install -y --no-install-recommends \
21 |        build-essential \
22 |        ca-certificates \
23 |        cmake \
24 |        curl \
25 |        emacs \
26 |        git \
27 |        jq \
28 |        libopencv-dev \
29 |        software-properties-common \
30 |        wget \
31 |        unzip \
32 |        vim \
33 |        pkg-config \
34 |        gdb \
35 |        lcov \
36 |        libbz2-dev \
37 |        zlib1g-dev \
38 |        openssl \
39 |        libssl-dev \
40 |        libsqlite3-dev \
41 |        libgdbm-dev \
42 |        libc6-dev \
43 |        libbz2-dev \
44 |        libncurses-dev \
45 |        tk-dev \
46 |        libffi-dev \
47 |        libcap-dev \
48 |        gnupg2 \
49 |        gpg-agent \
50 |  && rm -rf /var/lib/apt/lists/*
51 | 
52 | ###############################################################################
53 | # Build and install Python from source
54 | ###############################################################################
55 | ARG PYTHON=python3.10
56 | ARG PYTHON_VERSION=3.10.12
57 | 
58 | RUN curl -sL https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz | tar xvz -C /tmp \
59 |  && cd /tmp/Python-$PYTHON_VERSION \
60 |  && ./configure --enable-shared --prefix=/usr/local \
61 |  && make -j$(nproc) \
62 |  && make install \
63 |  && cd && rm -rf /tmp/Python-$PYTHON_VERSION
64 | 
65 | RUN ln -s /usr/local/bin/pip3 /usr/bin/pip \
66 |  && ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
67 |  && pip3 --no-cache-dir install --upgrade pip setuptools
68 | 
69 | ###############################################################################
70 | # Install Pytorch from Source
71 | ###############################################################################
72 | ARG PYTORCH_BRANCH=v2.6.0
73 | ARG PYTORCH_BUILD_ENV="MAX_JOBS=8 BUILD_TEST=0"
74 | 
75 | # envs needed to make the path of NVCC known to the compilation
76 | ENV CUDA_HOME=/usr/local/cuda
77 | ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64
78 | ENV PATH=$PATH:$CUDA_HOME/bin
79 | ENV TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;8.7;8.9;9.0;10.0;12.0"
80 | 
81 | RUN pip3 install typing-extensions sympy pyyaml
82 | RUN git clone https://github.com/pytorch/pytorch.git /tmp/pytorch \
83 |       --recursive \
84 |       --branch $PYTORCH_BRANCH \
85 |  && cd /tmp/pytorch \
86 |  && eval "$PYTORCH_BUILD_ENV python3 setup.py install" \
87 |  && cd && rm -rf /tmp/pytorch
88 | 
89 | ###############################################################################
90 | # Application files and Python dependencies
91 | ###############################################################################
92 | WORKDIR /app
93 | COPY infer.py /app/
94 | COPY requirements.txt /app/
95 | RUN pip install --no-cache-dir -r requirements.txt
96 | 


--------------------------------------------------------------------------------
/test/cases/efa/main_test.go:
--------------------------------------------------------------------------------
  1 | //go:build e2e
  2 | 
  3 | package efa
  4 | 
  5 | import (
  6 | 	"context"
  7 | 	_ "embed"
  8 | 	"flag"
  9 | 	"log"
 10 | 	"os"
 11 | 	"os/signal"
 12 | 	"testing"
 13 | 	"time"
 14 | 
 15 | 	"github.com/aws/aws-k8s-tester/internal/e2e"
 16 | 	"github.com/aws/aws-k8s-tester/test/manifests"
 17 | 	appsv1 "k8s.io/api/apps/v1"
 18 | 	corev1 "k8s.io/api/core/v1"
 19 | 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 20 | 	"sigs.k8s.io/e2e-framework/klient/wait"
 21 | 	"sigs.k8s.io/e2e-framework/pkg/env"
 22 | 	"sigs.k8s.io/e2e-framework/pkg/envconf"
 23 | )
 24 | 
 25 | func getTestNamespace() *corev1.Namespace {
 26 | 	return &corev1.Namespace{
 27 | 		ObjectMeta: metav1.ObjectMeta{
 28 | 			Name: TEST_NAMESPACE_NAME,
 29 | 		},
 30 | 	}
 31 | }
 32 | 
 33 | func deployEFAPlugin(ctx context.Context, config *envconf.Config) (context.Context, error) {
 34 | 	err := e2e.ApplyManifests(config.Client().RESTConfig(), manifests.EfaDevicePluginManifest)
 35 | 	if err != nil {
 36 | 		return ctx, err
 37 | 	}
 38 | 	efaDS := appsv1.DaemonSet{
 39 | 		ObjectMeta: metav1.ObjectMeta{Name: "aws-efa-k8s-device-plugin-daemonset", Namespace: "kube-system"},
 40 | 	}
 41 | 	err = wait.For(e2e.NewConditionExtension(config.Client().Resources()).DaemonSetReady(&efaDS),
 42 | 		wait.WithContext(ctx),
 43 | 		wait.WithTimeout(5*time.Minute),
 44 | 	)
 45 | 	if err != nil {
 46 | 		return ctx, err
 47 | 	}
 48 | 
 49 | 	return ctx, nil
 50 | }
 51 | 
 52 | func TestMain(m *testing.M) {
 53 | 	testImage = flag.String("testImage", "", "container image to use for tests")
 54 | 	pingPongSize = flag.String("pingPongSize", "all", "sizes to use for ping pong")
 55 | 	pingPongIters = flag.Int("pingPongIters", 10000, "number of iterations to use for ping pong")
 56 | 	pingPongDeadlineSeconds = flag.Int("pingPongDeadlineSeconds", 120, "maximum run time for a ping pong attempt")
 57 | 	nodeType = flag.String("nodeType", "", "instance type to target for tests")
 58 | 	expectedEFADeviceCount = flag.Int("expectedEFADeviceCount", -1, "expected number of efa devices for the target nodes")
 59 | 	verbose = flag.Bool("verbose", true, "use verbose mode for tests")
 60 | 
 61 | 	cfg, err := envconf.NewFromFlags()
 62 | 	if err != nil {
 63 | 		log.Fatalf("failed to initialize test environment: %v", err)
 64 | 	}
 65 | 
 66 | 	if *testImage == "" {
 67 | 		log.Fatal("--testImage must be set, use https://github.com/aws/aws-k8s-tester/blob/main/test/efa/Dockerfile to build the image")
 68 | 	}
 69 | 
 70 | 	ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt)
 71 | 	defer cancel()
 72 | 	timedCtx, cancel := context.WithTimeout(ctx, 55*time.Minute)
 73 | 	defer cancel()
 74 | 
 75 | 	testenv = env.NewWithConfig(cfg)
 76 | 	testenv = testenv.WithContext(timedCtx)
 77 | 
 78 | 	ec2Client = e2e.NewEC2Client()
 79 | 
 80 | 	testenv.Setup(
 81 | 		deployEFAPlugin,
 82 | 		func(ctx context.Context, config *envconf.Config) (context.Context, error) {
 83 | 			select {
 84 | 			case <-ctx.Done():
 85 | 			// Cooldown to let device plugin update node object with resources
 86 | 			case <-time.After(15 * time.Second):
 87 | 			}
 88 | 
 89 | 			return ctx, cfg.Client().Resources().Create(ctx, getTestNamespace())
 90 | 		},
 91 | 	)
 92 | 
 93 | 	testenv.Finish(
 94 | 		func(ctx context.Context, config *envconf.Config) (context.Context, error) {
 95 | 			cfg.Client().Resources().Delete(context.TODO(), getTestNamespace())
 96 | 			err := e2e.DeleteManifests(cfg.Client().RESTConfig(), manifests.EfaDevicePluginManifest)
 97 | 			if err != nil {
 98 | 				return ctx, err
 99 | 			}
100 | 			return ctx, nil
101 | 		},
102 | 	)
103 | 
104 | 	os.Exit(testenv.Run(m))
105 | }
106 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing Guidelines
 2 | 
 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional
 4 | documentation, we greatly value feedback and contributions from our community.
 5 | 
 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary
 7 | information to effectively respond to your bug report or contribution.
 8 | 
 9 | 
10 | ## Reporting Bugs/Feature Requests
11 | 
12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features.
13 | 
14 | When filing an issue, please check [existing open](https://github.com/aws/aws-k8s-tester/issues), or [recently closed](https://github.com/aws/aws-k8s-tester/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already
15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:
16 | 
17 | * A reproducible test case or series of steps
18 | * The version of our code being used
19 | * Any modifications you've made relevant to the bug
20 | * Anything unusual about your environment or deployment
21 | 
22 | 
23 | ## Contributing via Pull Requests
24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:
25 | 
26 | 1. You are working against the latest source on the *master* branch.
27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted.
29 | 
30 | To send us a pull request, please:
31 | 
32 | 1. Fork the repository.
33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
34 | 3. Ensure local tests pass.
35 | 4. Commit to your fork using clear commit messages.
36 | 5. Send us a pull request, answering any default questions in the pull request interface.
37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.
38 | 
39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
41 | 
42 | 
43 | ## Finding contributions to work on
44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/aws/aws-k8s-tester/labels/help%20wanted) issues is a great place to start.
45 | 
46 | 
47 | ## Code of Conduct
48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
50 | opensource-codeofconduct@amazon.com with any additional questions or comments.
51 | 
52 | 
53 | ## Security issue notifications
54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue.
55 | 
56 | 
57 | ## Licensing
58 | 
59 | See the [LICENSE](https://github.com/aws/aws-k8s-tester/blob/master/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
60 | 
61 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes.
62 | 


--------------------------------------------------------------------------------
/internal/deployers/eksctl/cluster_config.go:
--------------------------------------------------------------------------------
  1 | package eksctl
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 
  6 | 	eksctl_api "github.com/weaveworks/eksctl/pkg/apis/eksctl.io/v1alpha5"
  7 | 	"k8s.io/klog"
  8 | 	"sigs.k8s.io/yaml"
  9 | )
 10 | 
 11 | // CreateClusterConfig constructs an eksctl_api.ClusterConfig object based on UpOptions.
 12 | // This function replaces the string-based template rendering.
 13 | func (d *deployer) CreateClusterConfig() (*eksctl_api.ClusterConfig, error) {
 14 | 	d.initClusterName()
 15 | 
 16 | 	cfg := eksctl_api.NewClusterConfig()
 17 | 	// Metadata
 18 | 	cfg.Metadata.Name = d.clusterName
 19 | 	cfg.Metadata.Region = d.Region
 20 | 	cfg.Metadata.Version = d.KubernetesVersion
 21 | 	// IAM
 22 | 	cfg.IAM.WithOIDC = &d.WithOIDC
 23 | 
 24 | 	amiFamily := d.AMIFamily
 25 | 	if amiFamily == "" {
 26 | 		amiFamily = eksctl_api.NodeImageFamilyAmazonLinux2
 27 | 	}
 28 | 	nodeGroupName := d.NodegroupName
 29 | 	if nodeGroupName == "" {
 30 | 		nodeGroupName = "ng-1"
 31 | 	}
 32 | 	// Create node group or managed node group (MNG)
 33 | 	if d.UseUnmanagedNodegroup {
 34 | 		ng := cfg.NewNodeGroup()
 35 | 		// TODO: update this when we add support for SSH.
 36 | 		ng.SSH = nil
 37 | 		ng.AMIFamily = amiFamily
 38 | 		ng.Name = nodeGroupName
 39 | 		if len(d.InstanceTypes) > 0 {
 40 | 			ng.InstanceType = d.InstanceTypes[0]
 41 | 		}
 42 | 		if d.Nodes >= 0 {
 43 | 			ng.MinSize = &d.Nodes
 44 | 			ng.MaxSize = &d.Nodes
 45 | 			ng.DesiredCapacity = &d.Nodes
 46 | 		}
 47 | 		if d.VolumeSize >= 0 {
 48 | 			ng.VolumeSize = &d.VolumeSize
 49 | 		}
 50 | 		ng.PrivateNetworking = d.PrivateNetworking
 51 | 		ng.EFAEnabled = &d.EFAEnabled
 52 | 		if len(d.AvailabilityZones) > 0 {
 53 | 			ng.AvailabilityZones = d.AvailabilityZones
 54 | 		}
 55 | 		if d.AMI != "" && amiFamily == eksctl_api.NodeImageFamilyAmazonLinux2 {
 56 | 			bootstrapCommand := fmt.Sprintf(`#!/bin/bash
 57 | source /var/lib/cloud/scripts/eksctl/bootstrap.helper.sh
 58 | /etc/eks/bootstrap.sh %s --kubelet-extra-args "--node-labels=${NODE_LABELS}"`, d.clusterName)
 59 | 			ng.OverrideBootstrapCommand = &bootstrapCommand
 60 | 		}
 61 | 	} else {
 62 | 		// Create managed node group
 63 | 		mng := eksctl_api.NewManagedNodeGroup()
 64 | 		cfg.ManagedNodeGroups = append(cfg.ManagedNodeGroups, mng)
 65 | 		// TODO: update this when we add support for SSH.
 66 | 		mng.SSH = nil
 67 | 		mng.AMIFamily = amiFamily
 68 | 		mng.Name = nodeGroupName
 69 | 		mng.InstanceTypes = d.InstanceTypes
 70 | 		if d.Nodes >= 0 {
 71 | 			mng.MinSize = &d.Nodes
 72 | 			mng.MaxSize = &d.Nodes
 73 | 			mng.DesiredCapacity = &d.Nodes
 74 | 		}
 75 | 		if d.VolumeSize >= 0 {
 76 | 			mng.VolumeSize = &d.VolumeSize
 77 | 		}
 78 | 		mng.PrivateNetworking = d.PrivateNetworking
 79 | 		mng.EFAEnabled = &d.EFAEnabled
 80 | 		if len(d.AvailabilityZones) > 0 {
 81 | 			mng.AvailabilityZones = d.AvailabilityZones
 82 | 		}
 83 | 		if d.AMI != "" && amiFamily == eksctl_api.NodeImageFamilyAmazonLinux2 {
 84 | 			bootstrapCommand := fmt.Sprintf(`#!/bin/bash
 85 | source /var/lib/cloud/scripts/eksctl/bootstrap.helper.sh
 86 | /etc/eks/bootstrap.sh %s --kubelet-extra-args "--node-labels=${NODE_LABELS}"`, d.clusterName)
 87 | 			mng.OverrideBootstrapCommand = &bootstrapCommand
 88 | 		} else if d.AMI != "" && amiFamily == eksctl_api.NodeImageFamilyBottlerocket {
 89 | 			mng.AMI = d.AMI
 90 | 		}
 91 | 	}
 92 | 	return cfg, nil
 93 | }
 94 | 
 95 | type clusterConfigTemplateParams struct {
 96 | 	UpOptions
 97 | 	ClusterName string
 98 | 	Region      string
 99 | }
100 | 
101 | func (d *deployer) RenderClusterConfig() ([]byte, error) {
102 | 
103 | 	cfg, err := d.CreateClusterConfig()
104 | 	if err != nil {
105 | 		klog.Errorf("failed to create ClusterConfig with the deployer: %v", err)
106 | 	}
107 | 	klog.Infof("rendering cluster config yaml based on the ClusterConfig: %v", cfg)
108 | 	return yaml.Marshal(cfg)
109 | }
110 | 


--------------------------------------------------------------------------------
/test/cases/nvidia/manifests/daemonset-containerd-check.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: apps/v1
 2 | kind: DaemonSet
 3 | metadata:
 4 |   name: containerd-check
 5 |   namespace: default
 6 |   labels:
 7 |     app: containerd-check
 8 | spec:
 9 |   selector:
10 |     matchLabels:
11 |       app: containerd-check
12 |   template:
13 |     metadata:
14 |       labels:
15 |         app: containerd-check
16 |     spec:
17 |       containers:
18 |       - name: containerd-check
19 |         image: public.ecr.aws/amazonlinux/amazonlinux:latest
20 |         command:
21 |         - sh
22 |         - -c
23 |         - |
24 |           # 1. Ensure the script fails on any command or pipeline error
25 |           set -e
26 |           set -o pipefail
27 | 
28 |           echo "=== content read by the container ==="
29 |           cat /host-etc/containerd/config.toml
30 | 
31 |           # 2. Check containerd config version and look for appropriate sandbox field
32 |           #    In containerd config version = 2 expect to find pattern `sandbox_image = "registry.k8s.io/pause:3.10.1"`
33 |           #    In containerd config version = 3 expect to find pattern `sandbox = "registry.k8s.io/pause:3.10.1"`
34 |           #    For more details: https://github.com/containerd/containerd/blob/main/docs/cri/config.md
35 |           version_line=$(grep -E '^version\s*=' /host-etc/containerd/config.toml || true)
36 |           if [ -z "$version_line" ]; then
37 |             echo "FAIL: no version line found in containerd config"
38 |             exit 1
39 |           fi
40 | 
41 |           version=$(echo "$version_line" | cut -d'=' -f2 | tr -d ' ')
42 |           echo "INFO: containerd config version = $version"
43 |           if [ "$version" = "2" ]; then
44 |             sandbox_line=$(grep -E 'sandbox_image\s*=' /host-etc/containerd/config.toml || true)
45 |           elif [ "$version" = "3" ]; then
46 |             sandbox_line=$(grep -E 'sandbox\s*=' /host-etc/containerd/config.toml || true)
47 |           else
48 |             echo "FAIL: unsupported containerd config version: $version"
49 |             exit 1
50 |           fi
51 | 
52 |           # 3. If no sandbox configuration is found, fail explicitly
53 |           if [ -z "$sandbox_line" ]; then
54 |             echo "FAIL: no sandbox_image or sandbox line found"
55 |             echo "=== debug ==="
56 |             exit 1
57 |           fi
58 |           sandbox_image=$(echo "$sandbox_line" | cut -d'"' -f2)
59 | 
60 |           # 4. Check that $sandbox_image references .ecr. or is provided on the instance
61 |           if [[ "$sandbox_image" == "localhost"* ]]; then
62 |             echo "INFO: skipping .ecr. check for localhost sandbox image"
63 |           else
64 |             if [[ "$sandbox_image" != *".ecr."* ]]; then
65 |               echo "FAIL: no .ecr. reference in $sandbox_image"
66 |               echo "=== debug ==="
67 |               exit 1
68 |             fi
69 |           fi
70 | 
71 |           # 5. Check for 'nvidia-container-runtime'
72 |           if ! grep -q "nvidia-container-runtime" /host-etc/containerd/config.toml; then
73 |             echo "FAIL: no nvidia-container-runtime found"
74 |             echo "=== debug ==="
75 |             exit 1
76 |           fi
77 | 
78 |           # 6. Check for 'systemd_cgroup = true' or 'SystemdCgroup = true'
79 |           if ! ( grep -q 'systemd_cgroup = true' /host-etc/containerd/config.toml || \
80 |                  grep -q 'SystemdCgroup = true' /host-etc/containerd/config.toml ); then
81 |             echo "FAIL: no systemd cgroup setting"
82 |             echo "=== debug ==="
83 |             exit 1
84 |           fi
85 | 
86 |           echo "containerd config check PASSED."
87 |           # Keep container running so DS can be marked Ready
88 |           tail -f /dev/null
89 |         volumeMounts:
90 |         - name: containerd-config
91 |           mountPath: /host-etc/containerd
92 |           readOnly: true
93 |       volumes:
94 |       - name: containerd-config
95 |         hostPath:
96 |           path: /etc/containerd
97 | 


--------------------------------------------------------------------------------
/test/cases/quick/io_uring_test.go:
--------------------------------------------------------------------------------
  1 | //go:build e2e
  2 | 
  3 | package quick
  4 | 
  5 | import (
  6 | 	"context"
  7 | 	"log"
  8 | 	"testing"
  9 | 	"time"
 10 | 
 11 | 	"github.com/aws/aws-k8s-tester/internal/e2e"
 12 | 
 13 | 	corev1 "k8s.io/api/core/v1"
 14 | 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 15 | 
 16 | 	"sigs.k8s.io/e2e-framework/klient/k8s"
 17 | 	"sigs.k8s.io/e2e-framework/klient/wait"
 18 | 	"sigs.k8s.io/e2e-framework/pkg/envconf"
 19 | 	"sigs.k8s.io/e2e-framework/pkg/features"
 20 | )
 21 | 
 22 | func TestNpmInstallWithCPULimits(t *testing.T) {
 23 | 	feat := features.New("npm-install").
 24 | 		WithLabel("suite", "quick").
 25 | 		Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
 26 | 			log.Println("[Setup] Verifying cluster nodes...")
 27 | 			var nodeList corev1.NodeList
 28 | 			if err := cfg.Client().Resources().List(ctx, &nodeList); err != nil {
 29 | 				t.Fatalf("Failed to list nodes: %v", err)
 30 | 			}
 31 | 
 32 | 			// Log node information
 33 | 			for _, node := range nodeList.Items {
 34 | 				arch := node.Labels["kubernetes.io/arch"]
 35 | 				kernelVersion := node.Status.NodeInfo.KernelVersion
 36 | 				t.Logf("Node: %s, Architecture: %s, Kernel: %s", node.Name, arch, kernelVersion)
 37 | 			}
 38 | 			return ctx
 39 | 		}).
 40 | 		Assess("Pod can successfully run npm install", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
 41 | 			podName := "npm-install-test"
 42 | 			podNS := "default"
 43 | 
 44 | 			pod := &corev1.Pod{
 45 | 				ObjectMeta: metav1.ObjectMeta{
 46 | 					Name:      podName,
 47 | 					Namespace: podNS,
 48 | 					Labels: map[string]string{
 49 | 						"app": "npm-install-test",
 50 | 					},
 51 | 				},
 52 | 				Spec: corev1.PodSpec{
 53 | 					Containers: []corev1.Container{
 54 | 						{
 55 | 							Name:    "test-container",
 56 | 							Image:   "public.ecr.aws/ubuntu/ubuntu:noble",
 57 | 							Command: []string{"/bin/sh", "-c"},
 58 | 							Args: []string{`
 59 | 								set -x
 60 | 								echo "[Test] Starting npm installation test..."
 61 | 								mkdir asd && 
 62 | 								cd asd && 
 63 | 								apt-get update && 
 64 | 								apt-get install -y npm nodejs && 
 65 | 								echo "[Test] Starting npm install webpack..."
 66 | 								npm install webpack --loglevel verbose || exit 1
 67 | 								echo "[Test] npm install completed successfully"
 68 | 							`},
 69 | 						},
 70 | 					},
 71 | 					RestartPolicy: corev1.RestartPolicyNever,
 72 | 				},
 73 | 			}
 74 | 
 75 | 			if err := cfg.Client().Resources().Create(ctx, pod); err != nil {
 76 | 				t.Fatalf("[Assess] Failed to create pod: %v", err)
 77 | 			}
 78 | 
 79 | 			log.Printf("[Assess] Waiting up to 10 minutes for pod %s to complete...", podName)
 80 | 			err := wait.For(
 81 | 				e2e.NewConditionExtension(cfg.Client().Resources()).ResourceMatch(pod, func(object k8s.Object) bool {
 82 | 					pod := object.(*corev1.Pod)
 83 | 					return pod.Status.Phase == corev1.PodSucceeded
 84 | 				}),
 85 | 				wait.WithTimeout(10*time.Minute),
 86 | 			)
 87 | 			if err != nil {
 88 | 				t.Logf("[Assess] Pod did not complete successfully: %v", err)
 89 | 				e2e.PrintDaemonSetPodLogs(t, ctx, cfg.Client().RESTConfig(), podNS, "app=npm-install-test")
 90 | 				t.Fatal("Pod did not complete within 10 minutes - possible io_uring hang detected")
 91 | 			}
 92 | 
 93 | 			log.Printf("[Assess] Pod %s completed successfully", podName)
 94 | 			return ctx
 95 | 		}).
 96 | 		Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
 97 | 			podName := "npm-install-test"
 98 | 			podNS := "default"
 99 | 
100 | 			t.Logf("[Teardown] Cleaning up pod %s/%s...", podNS, podName)
101 | 			pod := &corev1.Pod{
102 | 				ObjectMeta: metav1.ObjectMeta{
103 | 					Name:      podName,
104 | 					Namespace: podNS,
105 | 				},
106 | 			}
107 | 			if err := cfg.Client().Resources().Delete(ctx, pod); err != nil {
108 | 				t.Logf("[Teardown] Failed to delete pod: %v", err)
109 | 			}
110 | 			return ctx
111 | 		}).
112 | 		Feature()
113 | 
114 | 	testenv.Test(t, feat)
115 | }
116 | 


--------------------------------------------------------------------------------
/internal/deployers/eksctl/deployer.go:
--------------------------------------------------------------------------------
  1 | package eksctl
  2 | 
  3 | import (
  4 | 	"flag"
  5 | 	"fmt"
  6 | 	"os"
  7 | 	"path/filepath"
  8 | 
  9 | 	"github.com/aws/aws-k8s-tester/internal"
 10 | 	"github.com/aws/aws-k8s-tester/internal/awssdk"
 11 | 	"github.com/aws/aws-sdk-go-v2/aws"
 12 | 	"github.com/aws/aws-sdk-go-v2/service/eks"
 13 | 	"github.com/urfave/sflags/gen/gpflag"
 14 | 	"github.com/spf13/pflag"
 15 | 	"k8s.io/klog"
 16 | 	"sigs.k8s.io/kubetest2/pkg/types"
 17 | 	"sigs.k8s.io/yaml"
 18 | )
 19 | 
 20 | // DeployerName is the name of the deployer
 21 | const DeployerName = "eksctl"
 22 | 
 23 | type deployer struct {
 24 | 	// generic parts
 25 | 	commonOptions types.Options
 26 | 	*UpOptions
 27 | 	awsConfig      aws.Config
 28 | 	eksClient      *eks.Client
 29 | 	KubeconfigPath string `flag:"kubeconfig" desc:"Path to kubeconfig"`
 30 | 	// ClusterName is the effective cluster name (from flag or RunID)
 31 | 	clusterName string
 32 | }
 33 | 
 34 | // NewDeployer implements deployer.New for EKS using eksctl
 35 | func NewDeployer(opts types.Options) (types.Deployer, *pflag.FlagSet) {
 36 | 	// create a deployer object and set fields that are not flag controlled
 37 | 	awsConfig := awssdk.NewConfig()
 38 | 	d := &deployer{
 39 | 		commonOptions: opts,
 40 | 		awsConfig:     awsConfig,
 41 | 		eksClient:     eks.NewFromConfig(awsConfig),
 42 | 	}
 43 | 	// register flags and return
 44 | 	return d, bindFlags(d)
 45 | }
 46 | 
 47 | func (d *deployer) DumpClusterLogs() error {
 48 | 	return nil
 49 | }
 50 | 
 51 | func (d *deployer) Kubeconfig() (string, error) {
 52 | 	if d.KubeconfigPath != "" {
 53 | 		return d.KubeconfigPath, nil
 54 | 	}
 55 | 	return filepath.Join(d.commonOptions.RunDir(), "kubeconfig"), nil
 56 | }
 57 | 
 58 | func (d *deployer) Version() string {
 59 | 	return internal.Version
 60 | }
 61 | 
 62 | // bindFlags is a helper used to create & bind a flagset to the deployer
 63 | func bindFlags(d *deployer) *pflag.FlagSet {
 64 | 	flags, err := gpflag.Parse(d)
 65 | 	if err != nil {
 66 | 		klog.Fatalf("unable to bind flags for deployer")
 67 | 		return nil
 68 | 	}
 69 | 	klog.InitFlags(nil)
 70 | 	flags.AddGoFlagSet(flag.CommandLine)
 71 | 	return flags
 72 | }
 73 | 
 74 | // initClusterName sets the effective cluster name with this precedence:
 75 | // 1. config file
 76 | // 2. --cluster-name flag
 77 | // 3. RunID of the kubetest
 78 | func (d *deployer) initClusterName() {
 79 | 	// First priority: config file if provided
 80 | 	if d.UpOptions.ConfigFile != "" {
 81 | 		clusterName, err := d.parseClusterNameFromConfig(d.UpOptions.ConfigFile)
 82 | 		if err == nil {
 83 | 			d.clusterName = clusterName
 84 | 			klog.V(2).Infof("Using cluster name from config file: %s", d.clusterName)
 85 | 			return
 86 | 		}
 87 | 		klog.Warningf("Failed to extract cluster name from config file: %v", err)
 88 | 		// Continue with other methods if parsing fails
 89 | 	}
 90 | 
 91 | 	if d.UpOptions.ClusterName != "" {
 92 | 		d.clusterName = d.UpOptions.ClusterName
 93 | 		klog.V(2).Infof("Using cluster name from flag: %s", d.clusterName)
 94 | 	} else {
 95 | 		d.clusterName = d.commonOptions.RunID()
 96 | 		klog.V(2).Infof("Using RunID for cluster name: %s", d.clusterName)
 97 | 	}
 98 | }
 99 | 
100 | // parseClusterNameFromConfig extracts the cluster name from an eksctl config file
101 | func (d *deployer) parseClusterNameFromConfig(configFilePath string) (string, error) {
102 | 	configData, err := os.ReadFile(configFilePath)
103 | 	if err != nil {
104 | 		return "", fmt.Errorf("failed to read config file: %v", err)
105 | 	}
106 | 
107 | 	// Simple YAML parsing to extract the cluster name
108 | 	var configMap map[string]interface{}
109 | 	if err := yaml.Unmarshal(configData, &configMap); err != nil {
110 | 		return "", fmt.Errorf("failed to parse config file YAML: %v", err)
111 | 	}
112 | 
113 | 	// Extract metadata section
114 | 	metadata, ok := configMap["metadata"].(map[string]interface{})
115 | 	if !ok {
116 | 		return "", fmt.Errorf("metadata section missing in config file")
117 | 	}
118 | 
119 | 	// Extract name field
120 | 	name, ok := metadata["name"].(string)
121 | 	if !ok || name == "" {
122 | 		return "", fmt.Errorf("cluster name not found in config file metadata")
123 | 	}
124 | 
125 | 	return name, nil
126 | }
127 | 
128 | // assert that deployer implements types.DeployerWithKubeconfig
129 | var _ types.DeployerWithKubeconfig = &deployer{}
130 | 


--------------------------------------------------------------------------------
/test/cases/quick/limit_test.go:
--------------------------------------------------------------------------------
  1 | //go:build e2e
  2 | 
  3 | package quick
  4 | 
  5 | import (
  6 | 	"bytes"
  7 | 	"context"
  8 | 	_ "embed"
  9 | 	"io"
 10 | 	"strings"
 11 | 	"testing"
 12 | 	"time"
 13 | 
 14 | 	fwext "github.com/aws/aws-k8s-tester/internal/e2e"
 15 | 	corev1 "k8s.io/api/core/v1"
 16 | 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 17 | 	"k8s.io/client-go/kubernetes"
 18 | 	"sigs.k8s.io/e2e-framework/klient/k8s"
 19 | 	"sigs.k8s.io/e2e-framework/klient/wait"
 20 | 	"sigs.k8s.io/e2e-framework/klient/wait/conditions"
 21 | 	"sigs.k8s.io/e2e-framework/pkg/envconf"
 22 | 	"sigs.k8s.io/e2e-framework/pkg/features"
 23 | )
 24 | 
 25 | var (
 26 | 	//go:embed manifests/ulimit.yaml
 27 | 	ulimitManifest []byte
 28 | 
 29 | 	expectedResourceLimit = map[string]string{
 30 | 		"-R": "unlimited",
 31 | 		"-c": "unlimited",
 32 | 		"-d": "unlimited",
 33 | 		"-e": "0",
 34 | 		"-f": "unlimited",
 35 | 		"-i": "30446",
 36 | 		"-l": "unlimited",
 37 | 		"-m": "unlimited",
 38 | 		"-n": "1048576",
 39 | 		"-p": "8",
 40 | 		"-q": "819200",
 41 | 		"-r": "0",
 42 | 		"-s": "10240",
 43 | 		"-t": "unlimited",
 44 | 		"-u": "unlimited",
 45 | 		"-v": "unlimited",
 46 | 		"-x": "unlimited",
 47 | 	}
 48 | )
 49 | 
 50 | func TestUserLimits(t *testing.T) {
 51 | 	f1 := features.New("ulimit pod").
 52 | 		WithLabel("type", "ulimit").
 53 | 		Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
 54 | 			err := fwext.ApplyManifests(cfg.Client().RESTConfig(), ulimitManifest)
 55 | 			if err != nil {
 56 | 				t.Fatalf("failed to apply manifests: %v", err)
 57 | 			}
 58 | 			pod := &corev1.Pod{
 59 | 				ObjectMeta: metav1.ObjectMeta{Name: "ulimit", Namespace: "default"},
 60 | 			}
 61 | 			err = wait.For(conditions.New(cfg.Client().Resources()).ResourceMatch(pod, containerTerminated),
 62 | 				wait.WithTimeout(time.Minute*5))
 63 | 			if err != nil {
 64 | 				t.Fatalf("encounter error when waiting for container finished running commands: %v", err)
 65 | 			}
 66 | 			return ctx
 67 | 		}).
 68 | 		Assess("Use default resources limit", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
 69 | 			client, err := kubernetes.NewForConfig(cfg.Client().RESTConfig())
 70 | 			if err != nil {
 71 | 				t.Fatal(err)
 72 | 			}
 73 | 			tailLine := int64(10000)
 74 | 			podLogOptions := corev1.PodLogOptions{
 75 | 				Container: "al2023",
 76 | 				TailLines: &tailLine,
 77 | 			}
 78 | 			req := client.CoreV1().Pods("default").GetLogs("ulimit", &podLogOptions)
 79 | 			logs, err := req.Stream(ctx)
 80 | 			if err != nil {
 81 | 				t.Fatalf("error in opening stream: %v", err)
 82 | 			}
 83 | 			defer logs.Close()
 84 | 			compareResourceLimitsWithExpectedValues(t, logs)
 85 | 			return ctx
 86 | 		}).
 87 | 		Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
 88 | 			err := fwext.DeleteManifests(cfg.Client().RESTConfig(), ulimitManifest)
 89 | 			if err != nil {
 90 | 				t.Fatalf("failed to delete manifests: %v", err)
 91 | 			}
 92 | 			return ctx
 93 | 		}).Feature()
 94 | 
 95 | 	// test feature
 96 | 	testenv.Test(t, f1)
 97 | }
 98 | 
 99 | func compareResourceLimitsWithExpectedValues(t *testing.T, logs io.ReadCloser) {
100 | 	buf := new(bytes.Buffer)
101 | 	_, err := io.Copy(buf, logs)
102 | 	if err != nil {
103 | 		t.Fatalf("error in copy information from podLogs to buf: %v", err)
104 | 	}
105 | 	str := buf.String()
106 | 
107 | 	lines := strings.Split(str, "\n")
108 | 	for _, line := range lines[:len(lines)-1] {
109 | 		info := strings.Split(line, " ")
110 | 		marker := getMarker(info[len(info)-2])
111 | 		value := info[len(info)-1]
112 | 		if expectedResourceLimit[marker] != value {
113 | 			t.Errorf("resource limit doesn't match with the default value, limit we get %v, but default value is %v", line, expectedResourceLimit[marker])
114 | 		} else {
115 | 			t.Logf("resrouce limit fetched from ulimit: %v. Equal to the default value %v", line, expectedResourceLimit[marker])
116 | 		}
117 | 	}
118 | }
119 | 
120 | func containerTerminated(obj k8s.Object) bool {
121 | 	j := obj.(*corev1.Pod)
122 | 	containerTerminatedState := j.Status.ContainerStatuses[0].State.Terminated
123 | 	return containerTerminatedState.Reason == "Completed"
124 | }
125 | 
126 | func getMarker(str string) string {
127 | 	startIndex := 0
128 | 	if str[:1] == "(" {
129 | 		startIndex = 1
130 | 	}
131 | 	return str[startIndex : len(str)-1]
132 | }
133 | 


--------------------------------------------------------------------------------
/test/images/neuron/tests/testNeuronMlp.py:
--------------------------------------------------------------------------------
  1 | # Source: https://github.com/aws/deep-learning-containers/blob/master/test/dlc_tests/container_tests/bin/pytorch_tests/testNeuronMlp
  2 | import os
  3 | import time
  4 | import torch
  5 | 
  6 | from torchvision.datasets import mnist
  7 | from torch.utils.data import DataLoader
  8 | from torchvision.transforms import ToTensor
  9 | 
 10 | # XLA imports
 11 | import torch_xla.core.xla_model as xm
 12 | import torch_xla.runtime as xr
 13 | 
 14 | # XLA imports for parallel loader and multi-processing
 15 | import torch_xla.distributed.parallel_loader as pl
 16 | from torch.utils.data.distributed import DistributedSampler
 17 | 
 18 | # Initialize XLA process group for torchrun
 19 | import torch_xla.distributed.xla_backend
 20 | import torch.nn as nn
 21 | import torch.nn.functional as F
 22 | 
 23 | torch.distributed.init_process_group('xla')
 24 | 
 25 | # Global constants
 26 | EPOCHS = 4
 27 | WARMUP_STEPS = 2
 28 | BATCH_SIZE = 32
 29 | 
 30 | # Load MNIST train dataset
 31 | train_dataset = mnist.MNIST(root=os.path.join(os.path.expanduser("~") + '/MNIST_DATA_train', str(xr.global_ordinal())),
 32 |                             train=True, download=True, transform=ToTensor())
 33 | 
 34 | # Declare 3-layer MLP for MNIST dataset
 35 | class MLP(nn.Module):
 36 |   def __init__(self, input_size = 28 * 28, output_size = 10, layers = [120, 84]):
 37 |       super(MLP, self).__init__()
 38 |       self.fc1 = nn.Linear(input_size, layers[0])
 39 |       self.fc2 = nn.Linear(layers[0], layers[1])
 40 |       self.fc3 = nn.Linear(layers[1], output_size)
 41 | 
 42 |   def forward(self, x):
 43 |       x = F.relu(self.fc1(x))
 44 |       x = F.relu(self.fc2(x))
 45 |       x = self.fc3(x)
 46 |       return F.log_softmax(x, dim=1)
 47 | 
 48 | 
 49 | def main():
 50 |     # XLA MP: get world size
 51 |     world_size = xr.world_size()
 52 |     # multi-processing: ensure each worker has same initial weights
 53 |     torch.manual_seed(0)
 54 | 
 55 |     # Move model to device and declare optimizer and loss function
 56 |     device = 'xla'
 57 |     model = MLP().to(device)
 58 |     # For multiprocessing, scale up learning rate
 59 |     optimizer = torch.optim.SGD(model.parameters(), lr=0.01 * world_size)
 60 |     loss_fn = torch.nn.NLLLoss()
 61 | 
 62 |     # Prepare data loader
 63 |     train_sampler = None
 64 |     if world_size > 1:
 65 |         train_sampler = DistributedSampler(train_dataset,
 66 |                                            num_replicas=world_size,
 67 |                                            rank=xr.global_ordinal(),
 68 |                                            shuffle=True)
 69 |     train_loader = DataLoader(train_dataset,
 70 |                               batch_size=BATCH_SIZE,
 71 |                               sampler=train_sampler,
 72 |                               shuffle=False if train_sampler else True)
 73 |     # XLA MP: use MpDeviceLoader from torch_xla.distributed
 74 |     train_device_loader = pl.MpDeviceLoader(train_loader, device)
 75 | 
 76 |     # Run the training loop
 77 |     print('----------Training ---------------')
 78 |     model.train()
 79 |     for epoch in range(EPOCHS):
 80 |         start = time.time()
 81 |         for idx, (train_x, train_label) in enumerate(train_device_loader):
 82 |             optimizer.zero_grad()
 83 |             train_x = train_x.view(train_x.size(0), -1)
 84 |             output = model(train_x)
 85 |             loss = loss_fn(output, train_label)
 86 |             loss.backward()
 87 |             xm.optimizer_step(optimizer) # XLA MP: performs grad allreduce and optimizer step
 88 |             if idx < WARMUP_STEPS: # skip warmup iterations
 89 |                 start = time.time()
 90 | 
 91 |     # Compute statistics for the last epoch
 92 |     interval = idx - WARMUP_STEPS # skip warmup iterations
 93 |     throughput = interval / (time.time() - start)
 94 |     print("Train throughput (iter/sec): {}".format(throughput))
 95 |     print("Final loss is {:0.4f}".format(loss.detach().to('cpu')))
 96 | 
 97 |     # Save checkpoint for evaluation (xm.save ensures only one process save)
 98 |     os.makedirs(os.path.expanduser("~") + "/checkpoints", exist_ok=True)
 99 |     checkpoint = {'state_dict': model.state_dict()}
100 |     xm.save(checkpoint, os.path.expanduser("~") + '/checkpoints/checkpoint.pt')
101 | 
102 |     print('----------End Training ---------------')
103 | 
104 | if __name__ == '__main__':
105 |     main()
106 | 


--------------------------------------------------------------------------------
/internal/e2e/conditions.go:
--------------------------------------------------------------------------------
  1 | package e2e
  2 | 
  3 | import (
  4 | 	"context"
  5 | 	"fmt"
  6 | 
  7 | 	appsv1 "k8s.io/api/apps/v1"
  8 | 	batchv1 "k8s.io/api/batch/v1"
  9 | 	v1 "k8s.io/api/core/v1"
 10 | 	apimachinerywait "k8s.io/apimachinery/pkg/util/wait"
 11 | 
 12 | 	"sigs.k8s.io/e2e-framework/klient/k8s"
 13 | 	"sigs.k8s.io/e2e-framework/klient/k8s/resources"
 14 | )
 15 | 
 16 | type ConditionExtension struct {
 17 | 	resources *resources.Resources
 18 | }
 19 | 
 20 | func NewConditionExtension(r *resources.Resources) *ConditionExtension {
 21 | 	return &ConditionExtension{resources: r}
 22 | }
 23 | 
 24 | // ResourceMatch is a helper function used to check if the resource under question has met a pre-defined state. This can
 25 | // be leveraged for checking fields on a resource that may not be immediately present upon creation.
 26 | func (c *ConditionExtension) ResourceMatch(obj k8s.Object, matchFetcher func(object k8s.Object) bool) apimachinerywait.ConditionWithContextFunc {
 27 | 	return func(ctx context.Context) (done bool, err error) {
 28 | 		if err := c.resources.Get(ctx, obj.GetName(), obj.GetNamespace(), obj); err != nil {
 29 | 			return false, err
 30 | 		}
 31 | 		return matchFetcher(obj), nil
 32 | 	}
 33 | }
 34 | 
 35 | func (c *ConditionExtension) PodRunning(pod k8s.Object) apimachinerywait.ConditionWithContextFunc {
 36 | 	return func(ctx context.Context) (done bool, err error) {
 37 | 		if err := c.resources.Get(ctx, pod.GetName(), pod.GetNamespace(), pod); err != nil {
 38 | 			return false, err
 39 | 		}
 40 | 		status := pod.(*v1.Pod).Status
 41 | 		switch status.Phase {
 42 | 		case v1.PodRunning:
 43 | 			return true, nil
 44 | 		case v1.PodPending:
 45 | 			return false, nil
 46 | 		default:
 47 | 			return false, fmt.Errorf("pod cannot transition to running from current status: %s", status.Phase)
 48 | 		}
 49 | 	}
 50 | }
 51 | 
 52 | func (c *ConditionExtension) PodSucceeded(pod k8s.Object) apimachinerywait.ConditionWithContextFunc {
 53 | 	return func(ctx context.Context) (done bool, err error) {
 54 | 		if err := c.resources.Get(ctx, pod.GetName(), pod.GetNamespace(), pod); err != nil {
 55 | 			return false, err
 56 | 		}
 57 | 		status := pod.(*v1.Pod).Status
 58 | 		if status.Phase == v1.PodSucceeded {
 59 | 			return true, nil
 60 | 		} else if status.Phase == v1.PodFailed {
 61 | 			return false, fmt.Errorf("Pod in Failed status")
 62 | 		}
 63 | 		return false, nil
 64 | 	}
 65 | }
 66 | 
 67 | func (c *ConditionExtension) DaemonSetReady(daemonset k8s.Object) apimachinerywait.ConditionWithContextFunc {
 68 | 	return func(ctx context.Context) (done bool, err error) {
 69 | 		if err := c.resources.Get(ctx, daemonset.GetName(), daemonset.GetNamespace(), daemonset); err != nil {
 70 | 			return false, err
 71 | 		}
 72 | 		status := daemonset.(*appsv1.DaemonSet).Status
 73 | 		if status.NumberReady == status.DesiredNumberScheduled && status.NumberUnavailable == 0 {
 74 | 			done = true
 75 | 		}
 76 | 		return
 77 | 	}
 78 | }
 79 | 
 80 | func (c *ConditionExtension) JobSucceeded(job k8s.Object) apimachinerywait.ConditionWithContextFunc {
 81 | 	return func(ctx context.Context) (done bool, err error) {
 82 | 		if err := c.resources.Get(ctx, job.GetName(), job.GetNamespace(), job); err != nil {
 83 | 			return false, err
 84 | 		}
 85 | 		batchJob := job.(*batchv1.Job)
 86 | 		status := batchJob.Status
 87 | 		spec := batchJob.Spec
 88 | 		for _, condition := range status.Conditions {
 89 | 			if condition.Type == batchv1.JobFailed && condition.Status == v1.ConditionTrue {
 90 | 				return false, fmt.Errorf("job failed")
 91 | 			}
 92 | 		}
 93 | 		if status.Succeeded != *spec.Completions {
 94 | 			return false, nil
 95 | 		}
 96 | 		return true, nil
 97 | 	}
 98 | }
 99 | 
100 | func (c *ConditionExtension) AllNodesHaveNonZeroResourceCapacity(resourceLabel string) apimachinerywait.ConditionWithContextFunc {
101 | 	return func(ctx context.Context) (done bool, err error) {
102 | 		nodeList := &v1.NodeList{}
103 | 		if err := c.resources.List(ctx, nodeList); err != nil {
104 | 			return false, fmt.Errorf("failed to list nodes: %w", err)
105 | 		}
106 | 		if len(nodeList.Items) == 0 {
107 | 			return false, fmt.Errorf("no nodes found in the cluster")
108 | 		}
109 | 		for _, node := range nodeList.Items {
110 | 			resource, ok := node.Status.Capacity[v1.ResourceName(resourceLabel)]
111 | 			if !ok {
112 | 				return false, nil
113 | 			}
114 | 			if resource.Value() <= 0 {
115 | 				return false, nil
116 | 			}
117 | 		}
118 | 		return true, nil
119 | 	}
120 | }
121 | 


--------------------------------------------------------------------------------
/internal/deployers/eksapi/ami_resolver.go:
--------------------------------------------------------------------------------
  1 | package eksapi
  2 | 
  3 | import (
  4 | 	"context"
  5 | 	"fmt"
  6 | 
  7 | 	"github.com/aws/aws-sdk-go-v2/aws"
  8 | 	"github.com/aws/aws-sdk-go-v2/service/ec2"
  9 | 	ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types"
 10 | 	"github.com/aws/aws-sdk-go-v2/service/ssm"
 11 | 	"k8s.io/klog/v2"
 12 | )
 13 | 
 14 | func NewAMIResolver(awsClients *awsClients) *amiResolver {
 15 | 	return &amiResolver{
 16 | 		clients: awsClients,
 17 | 	}
 18 | }
 19 | 
 20 | type amiResolver struct {
 21 | 	clients *awsClients
 22 | }
 23 | 
 24 | func (r *amiResolver) Resolve(ctx context.Context, opts *deployerOptions) (string, error) {
 25 | 	switch opts.UserDataFormat {
 26 | 	case UserDataBootstrapSh:
 27 | 		// TODO: AL2 is not a high priority, skipping for now.
 28 | 		return "", fmt.Errorf("%s is not handled", opts.UserDataFormat)
 29 | 	case UserDataNodeadm:
 30 | 		return r.ResolveAL2023(ctx, opts)
 31 | 	case UserDataBottlerocket:
 32 | 		return r.ResolveBottlerocket(ctx, opts)
 33 | 	default:
 34 | 		return "", fmt.Errorf("unhandled userdata format: %s", opts.UserDataFormat)
 35 | 	}
 36 | }
 37 | 
 38 | func (r *amiResolver) ResolveAL2023(ctx context.Context, opts *deployerOptions) (string, error) {
 39 | 	describeInstanceTypesResponse, err := r.clients.EC2().DescribeInstanceTypes(ctx, &ec2.DescribeInstanceTypesInput{
 40 | 		InstanceTypes: []ec2types.InstanceType{ec2types.InstanceType(r.getInstance(opts))},
 41 | 	})
 42 | 	if err != nil {
 43 | 		return "", err
 44 | 	}
 45 | 	instanceTypeInfo := describeInstanceTypesResponse.InstanceTypes[0]
 46 | 
 47 | 	arch, err := r.resolveArch(instanceTypeInfo)
 48 | 	if err != nil {
 49 | 		return "", err
 50 | 	}
 51 | 
 52 | 	variant := "standard"
 53 | 	if instanceTypeInfo.NeuronInfo != nil {
 54 | 		if len(instanceTypeInfo.NeuronInfo.NeuronDevices) > 0 {
 55 | 			variant = "neuron"
 56 | 		}
 57 | 	} else if instanceTypeInfo.GpuInfo != nil {
 58 | 		for _, gpu := range instanceTypeInfo.GpuInfo.Gpus {
 59 | 			if aws.ToString(gpu.Manufacturer) == "NVIDIA" {
 60 | 				variant = "nvidia"
 61 | 				break
 62 | 			}
 63 | 		}
 64 | 	}
 65 | 
 66 | 	getParameterReponse, err := r.clients.SSM().GetParameter(ctx, &ssm.GetParameterInput{
 67 | 		Name: aws.String(fmt.Sprintf("/aws/service/eks/optimized-ami/%s/amazon-linux-2023/%s/%s/recommended/image_id", opts.KubernetesVersion, arch, variant)),
 68 | 	})
 69 | 	if err != nil {
 70 | 		return "", err
 71 | 	}
 72 | 
 73 | 	return aws.ToString(getParameterReponse.Parameter.Value), nil
 74 | }
 75 | 
 76 | func (r *amiResolver) ResolveBottlerocket(ctx context.Context, opts *deployerOptions) (string, error) {
 77 | 	describeInstanceTypesResponse, err := r.clients.EC2().DescribeInstanceTypes(ctx, &ec2.DescribeInstanceTypesInput{
 78 | 		InstanceTypes: []ec2types.InstanceType{ec2types.InstanceType(r.getInstance(opts))},
 79 | 	})
 80 | 	if err != nil {
 81 | 		return "", err
 82 | 	}
 83 | 	instanceTypeInfo := describeInstanceTypesResponse.InstanceTypes[0]
 84 | 
 85 | 	arch, err := r.resolveArch(instanceTypeInfo)
 86 | 	if err != nil {
 87 | 		return "", err
 88 | 	}
 89 | 
 90 | 	// TODO: enable fips
 91 | 	flavorSuffix := ""
 92 | 	if instanceTypeInfo.GpuInfo != nil {
 93 | 		for _, gpu := range instanceTypeInfo.GpuInfo.Gpus {
 94 | 			if aws.ToString(gpu.Manufacturer) == "NVIDIA" {
 95 | 				flavorSuffix = "-nvidia"
 96 | 				break
 97 | 			}
 98 | 		}
 99 | 	}
100 | 
101 | 	getParameterResponse, err := r.clients.SSM().GetParameter(ctx, &ssm.GetParameterInput{
102 | 		Name: aws.String(fmt.Sprintf("/aws/service/bottlerocket/aws-k8s-%s%s/%s/latest/image_id", opts.KubernetesVersion, flavorSuffix, arch)),
103 | 	})
104 | 	if err != nil {
105 | 		return "", err
106 | 	}
107 | 
108 | 	return aws.ToString(getParameterResponse.Parameter.Value), nil
109 | }
110 | 
111 | func (r *amiResolver) getInstance(opts *deployerOptions) string {
112 | 	instanceType := opts.InstanceTypes[0]
113 | 	if len(opts.InstanceTypes) > 1 {
114 | 		klog.Warningf("only resolving AMI based on first instance type: %s", instanceType)
115 | 	}
116 | 	return instanceType
117 | }
118 | 
119 | func (r *amiResolver) resolveArch(instanceTypeInfo ec2types.InstanceTypeInfo) (string, error) {
120 | 	// TODO: the ordering might be weird because old instances might support
121 | 	// both i386 and x8664.
122 | 	switch arch := instanceTypeInfo.ProcessorInfo.SupportedArchitectures[0]; arch {
123 | 	case ec2types.ArchitectureTypeArm64, ec2types.ArchitectureTypeX8664:
124 | 		return string(arch), nil
125 | 	default:
126 | 		return "", fmt.Errorf("unhandled arch: %s", arch)
127 | 	}
128 | }
129 | 


--------------------------------------------------------------------------------