├── bmg.json ├── .vscode └── settings.json ├── internal ├── version.go ├── e2e │ ├── doc.go │ ├── mpijobs │ │ ├── types.go │ │ ├── conditions_test.go │ │ └── conditions.go │ ├── resources.go │ ├── ec2.go │ ├── health.go │ ├── logs.go │ └── conditions.go ├── deployers │ ├── eksctl │ │ ├── build.go │ │ ├── down.go │ │ ├── cluster_config.go │ │ └── deployer.go │ └── eksapi │ │ ├── templates │ │ ├── auth_map_role.yaml.template │ │ ├── userdata_bootstrap.sh.mimepart.template │ │ ├── userdata_bottlerocket.toml.template │ │ ├── templates_test.go │ │ ├── userdata_nodeadm.yaml.mimepart.template │ │ ├── busybox_deployment.yaml.template │ │ ├── nvidia_static_cluster_nodepool.yaml.template │ │ ├── cloudwatch_agent_infra.yaml │ │ ├── cloudwatch-infra.yaml.template │ │ └── templates.go │ │ ├── vpccni_test.go │ │ ├── common_test.go │ │ ├── auth_map_role.go │ │ ├── common.go │ │ ├── logs_ssm_doc.json │ │ ├── metrics.go │ │ ├── vpccni.go │ │ ├── auth_map_role_test.go │ │ ├── ami_resolver_test.go │ │ ├── aws.go │ │ ├── kubeconfig.go │ │ ├── userdata.go │ │ ├── addons.go │ │ └── ami_resolver.go ├── util │ ├── lang.go │ ├── exec.go │ ├── version.go │ ├── path.go │ ├── http.go │ ├── http_test.go │ └── cloudformation.go ├── metrics │ ├── noop.go │ ├── registry.go │ └── cloudwatch.go ├── testers │ └── ginkgov1 │ │ ├── README.md │ │ └── kubectl │ │ └── kubectl.go └── awssdk │ └── config.go ├── test ├── images │ ├── nvidia-inference │ │ ├── requirements.txt │ │ └── Dockerfile │ ├── nvidia-training │ │ └── requirements.txt │ ├── nvidia │ │ └── gpu_unit_tests │ │ │ ├── tests │ │ │ ├── test_sysinfo.sh.data │ │ │ │ ├── g6f.2xlarge │ │ │ │ │ ├── efa_count.txt │ │ │ │ │ ├── gpu_count.txt │ │ │ │ │ ├── nvidia_smi_topo.txt │ │ │ │ │ ├── numa_topo.txt │ │ │ │ │ ├── nvidia_persistence_status.txt │ │ │ │ │ └── nvidia_vgpu_license_status.txt │ │ │ │ ├── g6f.4xlarge │ │ │ │ │ ├── efa_count.txt │ │ │ │ │ ├── gpu_count.txt │ │ │ │ │ ├── nvidia_smi_topo.txt │ │ │ │ │ ├── numa_topo.txt │ │ │ │ │ ├── 
nvidia_persistence_status.txt │ │ │ │ │ └── nvidia_vgpu_license_status.txt │ │ │ │ ├── g6f.large │ │ │ │ │ ├── efa_count.txt │ │ │ │ │ ├── gpu_count.txt │ │ │ │ │ ├── nvidia_smi_topo.txt │ │ │ │ │ ├── numa_topo.txt │ │ │ │ │ ├── nvidia_persistence_status.txt │ │ │ │ │ └── nvidia_vgpu_license_status.txt │ │ │ │ ├── g6f.xlarge │ │ │ │ │ ├── efa_count.txt │ │ │ │ │ ├── gpu_count.txt │ │ │ │ │ ├── nvidia_smi_topo.txt │ │ │ │ │ ├── numa_topo.txt │ │ │ │ │ ├── nvidia_persistence_status.txt │ │ │ │ │ └── nvidia_vgpu_license_status.txt │ │ │ │ ├── g5.8xlarge │ │ │ │ │ ├── gpu_count.txt │ │ │ │ │ ├── nvidia_smi_topo.txt │ │ │ │ │ ├── numa_topo.txt │ │ │ │ │ └── nvidia_persistence_status.txt │ │ │ │ ├── g5g.2xlarge │ │ │ │ │ ├── gpu_count.txt │ │ │ │ │ ├── nvidia_smi_topo.txt │ │ │ │ │ ├── numa_topo.txt │ │ │ │ │ └── nvidia_persistence_status.txt │ │ │ │ ├── p3.2xlarge │ │ │ │ │ ├── gpu_count.txt │ │ │ │ │ ├── nvidia_smi_topo.txt │ │ │ │ │ ├── numa_topo.txt │ │ │ │ │ └── nvidia_persistence_status.txt │ │ │ │ ├── g5.48xlarge │ │ │ │ │ ├── numa_topo.txt │ │ │ │ │ ├── gpu_count.txt │ │ │ │ │ ├── nvidia_persistence_status.txt │ │ │ │ │ └── nvidia_smi_topo.txt │ │ │ │ ├── p4d.24xlarge │ │ │ │ │ ├── numa_topo.txt │ │ │ │ │ ├── gpu_count.txt │ │ │ │ │ ├── nvidia_persistence_status.txt │ │ │ │ │ └── nvidia_smi_topo.txt │ │ │ │ ├── p4de.24xlarge │ │ │ │ │ ├── numa_topo.txt │ │ │ │ │ ├── gpu_count.txt │ │ │ │ │ ├── nvidia_persistence_status.txt │ │ │ │ │ └── nvidia_smi_topo.txt │ │ │ │ └── p5.48xlarge │ │ │ │ │ ├── numa_topo.txt │ │ │ │ │ ├── gpu_count.txt │ │ │ │ │ ├── nvidia_persistence_status.txt │ │ │ │ │ └── nvidia_smi_topo.txt │ │ │ ├── test_basic.sh │ │ │ ├── common.sh │ │ │ └── test_sysinfo.sh │ │ │ ├── unit_test │ │ │ └── README.md │ ├── neuron │ │ └── tests │ │ │ ├── singleNodeTest.sh │ │ │ ├── testNeuronSingleAllReduce.py │ │ │ └── testNeuronMlp.py │ └── efa │ │ ├── scripts │ │ └── unit-test.sh │ │ └── Dockerfile ├── cases │ ├── neuron-training │ │ ├── manifests │ │ │ ├── 
training-comm-service.yaml │ │ │ └── bert-training.yaml │ │ └── vars.go │ ├── quick │ │ ├── manifests │ │ │ └── ulimit.yaml │ │ ├── main_test.go │ │ ├── node_topology_test.go │ │ ├── io_uring_test.go │ │ └── limit_test.go │ ├── dra │ │ └── main_test.go │ ├── nvidia-training │ │ ├── vars.go │ │ └── manifests │ │ │ └── bert-training.yaml │ ├── disruptive │ │ └── main_test.go │ ├── nvidia-inference │ │ └── manifests │ │ │ └── bert-inference.yaml │ ├── neuron │ │ └── manifests │ │ │ ├── single-node-test-neuronx.yaml │ │ │ └── multi-node-test-neuron.yaml │ ├── neuron-inference │ │ ├── vars.go │ │ └── manifests │ │ │ └── neuron-bert-inference.yaml │ ├── nvidia │ │ ├── manifests │ │ │ ├── job-unit-test-single-node.yaml │ │ │ ├── nvidia-driver-capabilities-check.yaml │ │ │ ├── job-hpc-benchmarks.yaml │ │ │ ├── mpi-job-pytorch-training-single-node.yaml │ │ │ ├── mpi-job-nccl-test-multi-node.yaml │ │ │ └── daemonset-containerd-check.yaml │ │ ├── containerd_test.go │ │ └── capabilities_test.go │ ├── workload │ │ └── main_test.go │ └── efa │ │ ├── commons.go │ │ └── main_test.go ├── manifests │ ├── raw.go │ ├── rendered.go │ └── assets │ │ ├── k8s-neuron-device-plugin-rbac.yml │ │ ├── dcgm-exporter.yaml │ │ ├── efa-device-plugin.yaml │ │ ├── nvidia-device-plugin.yaml │ │ ├── cloudwatch-agent.yaml │ │ └── k8s-neuron-device-plugin.yml └── common │ ├── flags.go │ └── resources.go ├── NOTICE ├── .dockerignore ├── .gitignore ├── cmd ├── kubetest2-tester-multi │ └── main.go ├── kubetest2-tester-ginkgo-v1 │ └── main.go ├── kubetest2-eksapi │ └── main.go ├── kubetest2-eksctl │ └── main.go └── kubetest2-eksapi-janitor │ └── main.go ├── .github ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── update-image-tags.yaml │ ├── update-nvidia-dependencies.yaml │ ├── update-go-dependencies.yaml │ ├── update-neuron-dependencies.yaml │ └── ci.yaml ├── CODE_OF_CONDUCT.md ├── Makefile ├── external └── tools.go ├── Config ├── hack ├── update-go-dependencies.sh ├── free-disk-space.sh ├── 
update-nvidia-dependencies.sh ├── download-kubernetes-binaries.sh ├── update-neuron-dependencies.sh └── update-image-tags.sh ├── Dockerfile ├── README.md └── CONTRIBUTING.md /bmg.json: -------------------------------------------------------------------------------- 1 | { 2 | "binary_artifacts_only": true 3 | } 4 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "git.ignoreLimitWarning": true 3 | } -------------------------------------------------------------------------------- /internal/version.go: -------------------------------------------------------------------------------- 1 | package internal 2 | 3 | var Version string 4 | -------------------------------------------------------------------------------- /test/images/nvidia-inference/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.53.0 2 | numpy==1.26 3 | -------------------------------------------------------------------------------- /test/images/nvidia-training/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.53.0 2 | numpy==1.26 3 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/efa_count.txt: -------------------------------------------------------------------------------- 1 | 0 2 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/efa_count.txt: -------------------------------------------------------------------------------- 1 | 0 2 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/efa_count.txt: 
-------------------------------------------------------------------------------- 1 | 0 2 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/efa_count.txt: -------------------------------------------------------------------------------- 1 | 0 2 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Awstester 2 | Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | .git/ 2 | .github/ 3 | bin/ 4 | CHANGELOG/ 5 | Dockerfile 6 | Makefile 7 | aws-k8s-tester 8 | -------------------------------------------------------------------------------- /internal/e2e/doc.go: -------------------------------------------------------------------------------- 1 | // Package frameworkext contains extensions to sigs.k8s.io/e2e-framework 2 | package e2e 3 | -------------------------------------------------------------------------------- /internal/deployers/eksctl/build.go: -------------------------------------------------------------------------------- 1 | package eksctl 2 | 3 | // Build is a no-op 4 | func (d *deployer) Build() error { 5 | return nil 6 | } 7 | -------------------------------------------------------------------------------- /internal/util/lang.go: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | func Must[T any](t T, err error) T { 4 | if err != nil { 5 | panic(err) 6 | } 7 | return t 8 | } 9 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.8xlarge/gpu_count.txt: 
-------------------------------------------------------------------------------- 1 | name, index, pci.bus_id 2 | NVIDIA A10G, 0, 00000000:00:1E.0 3 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5g.2xlarge/gpu_count.txt: -------------------------------------------------------------------------------- 1 | name, index, pci.bus_id 2 | NVIDIA T4G, 0, 00000000:00:1F.0 3 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/gpu_count.txt: -------------------------------------------------------------------------------- 1 | name, index, pci.bus_id 2 | NVIDIA L4-6Q, 0, 00000000:31:00.0 3 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/gpu_count.txt: -------------------------------------------------------------------------------- 1 | name, index, pci.bus_id 2 | NVIDIA L4-12Q, 0, 00000000:35:00.0 3 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/gpu_count.txt: -------------------------------------------------------------------------------- 1 | name, index, pci.bus_id 2 | NVIDIA L4-3Q, 0, 00000000:31:00.0 3 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/gpu_count.txt: -------------------------------------------------------------------------------- 1 | name, index, pci.bus_id 2 | NVIDIA L4-3Q, 0, 00000000:31:00.0 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.DS_Store 2 | /bin 3 | /_tmp 4 | .idea 5 | *.swp 6 | /aws-k8s-tester 
7 | */*/.DS_Store 8 | */.DS_Store 9 | /_artifacts 10 | /_rundir 11 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p3.2xlarge/gpu_count.txt: -------------------------------------------------------------------------------- 1 | name, index, pci.bus_id 2 | Tesla V100-SXM2-16GB, 0, 00000000:00:1E.0 3 | -------------------------------------------------------------------------------- /cmd/kubetest2-tester-multi/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import "github.com/aws/aws-k8s-tester/internal/testers/multi" 4 | 5 | func main() { 6 | multi.Main() 7 | } 8 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.8xlarge/nvidia_smi_topo.txt: -------------------------------------------------------------------------------- 1 | GPU0 CPU Affinity NUMA Affinity GPU NUMA ID 2 | GPU0 X 0-31 0 N/A 3 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5g.2xlarge/nvidia_smi_topo.txt: -------------------------------------------------------------------------------- 1 | GPU0 CPU Affinity NUMA Affinity GPU NUMA ID 2 | GPU0 X 0-7 0 N/A 3 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/nvidia_smi_topo.txt: -------------------------------------------------------------------------------- 1 | GPU0 CPU Affinity NUMA Affinity GPU NUMA ID 2 | GPU0 X 0-7 0 N/A 3 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/nvidia_smi_topo.txt: -------------------------------------------------------------------------------- 1 | GPU0 CPU Affinity 
NUMA Affinity GPU NUMA ID 2 | GPU0 X 0-1 0 N/A 3 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/nvidia_smi_topo.txt: -------------------------------------------------------------------------------- 1 | GPU0 CPU Affinity NUMA Affinity GPU NUMA ID 2 | GPU0 X 0-3 0 N/A 3 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p3.2xlarge/nvidia_smi_topo.txt: -------------------------------------------------------------------------------- 1 | GPU0 CPU Affinity NUMA Affinity GPU NUMA ID 2 | GPU0 X 0-7 0 N/A 3 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/nvidia_smi_topo.txt: -------------------------------------------------------------------------------- 1 | GPU0 CPU Affinity NUMA Affinity GPU NUMA ID 2 | GPU0 X 0-15 0 N/A 3 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/numa_topo.txt: -------------------------------------------------------------------------------- 1 | /sys/devices/system/node/node0/cpulist:0-1 2 | /sys/devices/system/node/node0/distance:10 3 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.8xlarge/numa_topo.txt: -------------------------------------------------------------------------------- 1 | /sys/devices/system/node/node0/cpulist:0-31 2 | /sys/devices/system/node/node0/distance:10 3 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.8xlarge/nvidia_persistence_status.txt: 
-------------------------------------------------------------------------------- 1 | name, pci.bus_id, persistence_mode 2 | NVIDIA A10G, 00000000:00:1E.0, Enabled 3 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5g.2xlarge/numa_topo.txt: -------------------------------------------------------------------------------- 1 | /sys/devices/system/node/node0/cpulist:0-7 2 | /sys/devices/system/node/node0/distance:10 3 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5g.2xlarge/nvidia_persistence_status.txt: -------------------------------------------------------------------------------- 1 | name, pci.bus_id, persistence_mode 2 | NVIDIA T4G, 00000000:00:1F.0, Enabled 3 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/numa_topo.txt: -------------------------------------------------------------------------------- 1 | /sys/devices/system/node/node0/cpulist:0-7 2 | /sys/devices/system/node/node0/distance:10 3 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/numa_topo.txt: -------------------------------------------------------------------------------- 1 | /sys/devices/system/node/node0/cpulist:0-15 2 | /sys/devices/system/node/node0/distance:10 3 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/nvidia_persistence_status.txt: -------------------------------------------------------------------------------- 1 | name, pci.bus_id, persistence_mode 2 | NVIDIA L4-3Q, 00000000:31:00.0, Enabled 3 | -------------------------------------------------------------------------------- 
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/numa_topo.txt: -------------------------------------------------------------------------------- 1 | /sys/devices/system/node/node0/cpulist:0-3 2 | /sys/devices/system/node/node0/distance:10 3 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p3.2xlarge/numa_topo.txt: -------------------------------------------------------------------------------- 1 | /sys/devices/system/node/node0/cpulist:0-7 2 | /sys/devices/system/node/node0/distance:10 3 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/nvidia_persistence_status.txt: -------------------------------------------------------------------------------- 1 | name, pci.bus_id, persistence_mode 2 | NVIDIA L4-6Q, 00000000:31:00.0, Enabled 3 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/nvidia_persistence_status.txt: -------------------------------------------------------------------------------- 1 | name, pci.bus_id, persistence_mode 2 | NVIDIA L4-12Q, 00000000:35:00.0, Enabled 3 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/nvidia_persistence_status.txt: -------------------------------------------------------------------------------- 1 | name, pci.bus_id, persistence_mode 2 | NVIDIA L4-3Q, 00000000:31:00.0, Enabled 3 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p3.2xlarge/nvidia_persistence_status.txt: -------------------------------------------------------------------------------- 1 | name, pci.bus_id, persistence_mode 2 | Tesla 
V100-SXM2-16GB, 00000000:00:1E.0, Enabled 3 | -------------------------------------------------------------------------------- /cmd/kubetest2-tester-ginkgo-v1/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "github.com/aws/aws-k8s-tester/internal/testers/ginkgov1" 5 | ) 6 | 7 | func main() { 8 | ginkgov1.Main() 9 | } 10 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | *Issue #, if available:* 2 | 3 | *Description of changes:* 4 | 5 | 6 | By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license. 7 | -------------------------------------------------------------------------------- /internal/deployers/eksapi/templates/auth_map_role.yaml.template: -------------------------------------------------------------------------------- 1 | 2 | - username: system:node:{{"{{"}}{{.NodeNameStrategy}}{{"}}"}} 3 | groups: 4 | - system:bootstrappers 5 | - system:nodes 6 | rolearn: {{.Rolearn}} -------------------------------------------------------------------------------- /test/cases/neuron-training/manifests/training-comm-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: training 5 | labels: 6 | app: training 7 | spec: 8 | clusterIP: None 9 | selector: 10 | job-name: bert-training 11 | -------------------------------------------------------------------------------- /cmd/kubetest2-eksapi/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "github.com/aws/aws-k8s-tester/internal/deployers/eksapi" 5 | "sigs.k8s.io/kubetest2/pkg/app" 6 | ) 7 | 8 | func main() { 9 | app.Main(eksapi.DeployerName, eksapi.NewDeployer) 10 | } 11 
| -------------------------------------------------------------------------------- /cmd/kubetest2-eksctl/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "github.com/aws/aws-k8s-tester/internal/deployers/eksctl" 5 | "sigs.k8s.io/kubetest2/pkg/app" 6 | ) 7 | 8 | func main() { 9 | app.Main(eksctl.DeployerName, eksctl.NewDeployer) 10 | } 11 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/nvidia_vgpu_license_status.txt: -------------------------------------------------------------------------------- 1 | vGPU Software Licensed Product 2 | Product Name : NVIDIA RTX Virtual Workstation 3 | License Status : Licensed (Expiry: N/A) 4 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/nvidia_vgpu_license_status.txt: -------------------------------------------------------------------------------- 1 | vGPU Software Licensed Product 2 | Product Name : NVIDIA RTX Virtual Workstation 3 | License Status : Licensed (Expiry: N/A) 4 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/nvidia_vgpu_license_status.txt: -------------------------------------------------------------------------------- 1 | vGPU Software Licensed Product 2 | Product Name : NVIDIA RTX Virtual Workstation 3 | License Status : Licensed (Expiry: N/A) 4 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/nvidia_vgpu_license_status.txt: -------------------------------------------------------------------------------- 1 | vGPU Software Licensed Product 2 | Product Name : NVIDIA RTX Virtual Workstation 3 | License Status : 
Licensed (Expiry: N/A) 4 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.48xlarge/numa_topo.txt: -------------------------------------------------------------------------------- 1 | /sys/devices/system/node/node0/cpulist:0-47,96-143 2 | /sys/devices/system/node/node1/cpulist:48-95,144-191 3 | /sys/devices/system/node/node0/distance:10 32 4 | /sys/devices/system/node/node1/distance:32 10 5 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4d.24xlarge/numa_topo.txt: -------------------------------------------------------------------------------- 1 | /sys/devices/system/node/node0/cpulist:0-23,48-71 2 | /sys/devices/system/node/node1/cpulist:24-47,72-95 3 | /sys/devices/system/node/node0/distance:10 21 4 | /sys/devices/system/node/node1/distance:21 10 5 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4de.24xlarge/numa_topo.txt: -------------------------------------------------------------------------------- 1 | /sys/devices/system/node/node0/cpulist:0-23,48-71 2 | /sys/devices/system/node/node1/cpulist:24-47,72-95 3 | /sys/devices/system/node/node0/distance:10 21 4 | /sys/devices/system/node/node1/distance:21 10 5 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p5.48xlarge/numa_topo.txt: -------------------------------------------------------------------------------- 1 | /sys/devices/system/node/node0/cpulist:0-47,96-143 2 | /sys/devices/system/node/node1/cpulist:48-95,144-191 3 | /sys/devices/system/node/node0/distance:10 32 4 | /sys/devices/system/node/node1/distance:32 10 5 | -------------------------------------------------------------------------------- 
/test/images/neuron/tests/singleNodeTest.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | torchrun --nproc_per_node=2 --nnodes=1 tests/testNeuronSingleAllReduce.py 6 | torchrun --nproc_per_node=2 --nnodes=1 tests/testNeuronParallelState.py 7 | torchrun --nproc_per_node=2 --nnodes=1 tests/testNeuronMlp.py -------------------------------------------------------------------------------- /internal/util/exec.go: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import ( 4 | "os" 5 | "os/exec" 6 | ) 7 | 8 | func ExecuteCommand(name string, args ...string) error { 9 | command := exec.Command(name, args...) 10 | command.Stdout = os.Stdout 11 | command.Stderr = os.Stderr 12 | return command.Run() 13 | } 14 | -------------------------------------------------------------------------------- /test/cases/quick/manifests/ulimit.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: ulimit 5 | spec: 6 | restartPolicy: Never 7 | containers: 8 | - name: al2023 9 | image: public.ecr.aws/amazonlinux/amazonlinux:2023 10 | command: ["ulimit"] 11 | args: 12 | - -a 13 | -------------------------------------------------------------------------------- /internal/deployers/eksapi/templates/userdata_bootstrap.sh.mimepart.template: -------------------------------------------------------------------------------- 1 | Content-Type: text/x-shellscript; charset="us-ascii" 2 | MIME-Version: 1.0 3 | 4 | #!/usr/bin/env bash 5 | /etc/eks/bootstrap.sh {{.Name}} \ 6 | --b64-cluster-ca {{.CertificateAuthority}} \ 7 | --apiserver-endpoint {{.APIServerEndpoint}} 8 | -------------------------------------------------------------------------------- /internal/deployers/eksapi/vpccni_test.go: -------------------------------------------------------------------------------- 1 | package 
eksapi 2 | 3 | import ( 4 | "encoding/json" 5 | "testing" 6 | ) 7 | 8 | func Test_validVPCCNIDaemonSetPatch(t *testing.T) { 9 | var j json.RawMessage 10 | if err := json.Unmarshal([]byte(vpcCNIDaemonSetPatch), &j); err != nil { 11 | t.Error(err) 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /internal/deployers/eksapi/templates/userdata_bottlerocket.toml.template: -------------------------------------------------------------------------------- 1 | [settings.kubernetes] 2 | "cluster-name" = "{{.Name}}" 3 | "api-server" = "{{.APIServerEndpoint}}" 4 | "cluster-certificate" = "{{.CertificateAuthority}}" 5 | device-ownership-from-security-context = true 6 | 7 | [settings.host-containers.admin] 8 | "enabled" = true 9 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 
5 | -------------------------------------------------------------------------------- /internal/metrics/noop.go: -------------------------------------------------------------------------------- 1 | package metrics 2 | 3 | func NewNoopMetricRegistry() MetricRegistry { 4 | return &noopRegistry{} 5 | } 6 | 7 | type noopRegistry struct{} 8 | 9 | func (r *noopRegistry) Record(spec *MetricSpec, value float64, dimensions map[string]string) {} 10 | 11 | func (r *noopRegistry) Emit() error { 12 | return nil 13 | } 14 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | include ${BGO_MAKEFILE} 2 | 3 | pre-release:: 4 | go test -c -tags=e2e ./test/... -o $(GOBIN) 5 | go install sigs.k8s.io/kubetest2/...@latest 6 | 7 | update-deps: 8 | for SCRIPT in ./hack/update-*.sh; do \ 9 | "$$SCRIPT" ; \ 10 | done 11 | 12 | .PHONY: test-integration 13 | test-integration: ## Run unit and integration tests 14 | go test -v -tags=integration ./... 15 | -------------------------------------------------------------------------------- /external/tools.go: -------------------------------------------------------------------------------- 1 | //go:build tools 2 | // +build tools 3 | 4 | package external 5 | 6 | // this file allows us to declare direct dependencies on our required external tools. 7 | // this file will not compile! that's expected. 
8 | 9 | import ( 10 | _ "sigs.k8s.io/kubetest2" 11 | _ "sigs.k8s.io/kubetest2/kubetest2-tester-exec" 12 | _ "sigs.k8s.io/kubetest2/kubetest2-tester-ginkgo" 13 | ) 14 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.48xlarge/gpu_count.txt: -------------------------------------------------------------------------------- 1 | name, index, pci.bus_id 2 | NVIDIA A10G, 0, 00000000:00:16.0 3 | NVIDIA A10G, 1, 00000000:00:17.0 4 | NVIDIA A10G, 2, 00000000:00:18.0 5 | NVIDIA A10G, 3, 00000000:00:19.0 6 | NVIDIA A10G, 4, 00000000:00:1A.0 7 | NVIDIA A10G, 5, 00000000:00:1B.0 8 | NVIDIA A10G, 6, 00000000:00:1C.0 9 | NVIDIA A10G, 7, 00000000:00:1D.0 10 | -------------------------------------------------------------------------------- /Config: -------------------------------------------------------------------------------- 1 | # This file is for Amazon internal build processes 2 | 3 | # Copyright 2025 Amazon.com, Inc. or its affiliates. 
4 | # SPDX-License-Identifier: Apache-2.0 5 | 6 | package.Aws-k8s-tester-mirror = { 7 | interfaces = (1.0); 8 | 9 | build-system = bgo-wrap-make; 10 | build-tools = { 11 | 1.0 = { 12 | BrazilMakeGo = 3.0; 13 | GoLang = 1.x; 14 | }; 15 | }; 16 | }; 17 | -------------------------------------------------------------------------------- /internal/deployers/eksapi/common_test.go: -------------------------------------------------------------------------------- 1 | package eksapi 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | ) 8 | 9 | func Test_AZ_PRIORITY(t *testing.T) { 10 | t.Setenv(AvailabilityZonePriorityEnv, "us-west-2d") 11 | assert.Equal(t, 12 | []string{"us-west-2d", "us-west-2b", "us-west-2c"}, 13 | availabilityZoneHintedOrder([]string{"us-west-2b", "us-west-2c", "us-west-2d"}), 14 | ) 15 | } 16 | -------------------------------------------------------------------------------- /internal/testers/ginkgov1/README.md: -------------------------------------------------------------------------------- 1 | This tester supports ginkgo 1.x versions, which were used for Kubernetes versions prior to 1.25. 2 | 3 | --- 4 | 5 | This is a fork of the `ginkgo` tester: https://github.com/kubernetes-sigs/kubetest2/tree/master/pkg/testers/ginkgo 6 | 7 | The fork originated at commit `d7fcb799ce84ceda66c8b9b1ec8eefcbe226f293`. 8 | 9 | A copy of the original license is provided in the file named `LICENSE.original`. 
10 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/unit_test: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o errexit 4 | set -o nounset 5 | set -o pipefail 6 | 7 | TRACE_LOG=trace.log 8 | TEST_TIMEOUT=1800 9 | BASH="/usr/bin/bash" 10 | CURRENT_DIR=$(pwd) 11 | SKIP_TESTS_SUBCOMMAND=${SKIP_TESTS_SUBCOMMAND:-""} 12 | 13 | timeout -k 10 ${TEST_TIMEOUT} ${BASH} gpu_unit_tests/bash_unit -f tap ${SKIP_TESTS_SUBCOMMAND} -t gpu_unit_tests/${TRACE_LOG} gpu_unit_tests/tests/*test*.sh 14 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.48xlarge/nvidia_persistence_status.txt: -------------------------------------------------------------------------------- 1 | name, pci.bus_id, persistence_mode 2 | NVIDIA A10G, 00000000:00:16.0, Enabled 3 | NVIDIA A10G, 00000000:00:17.0, Enabled 4 | NVIDIA A10G, 00000000:00:18.0, Enabled 5 | NVIDIA A10G, 00000000:00:19.0, Enabled 6 | NVIDIA A10G, 00000000:00:1A.0, Enabled 7 | NVIDIA A10G, 00000000:00:1B.0, Enabled 8 | NVIDIA A10G, 00000000:00:1C.0, Enabled 9 | NVIDIA A10G, 00000000:00:1D.0, Enabled 10 | -------------------------------------------------------------------------------- /internal/deployers/eksapi/templates/templates_test.go: -------------------------------------------------------------------------------- 1 | package templates 2 | 3 | import ( 4 | "bytes" 5 | "testing" 6 | ) 7 | 8 | func Test_UnmanagedNodegroup(t *testing.T) { 9 | buf := bytes.Buffer{} 10 | err := UnmanagedNodegroup.Execute(&buf, UnmanagedNodegroupTemplateData{ 11 | KubernetesVersion: "1.28", 12 | InstanceTypes: []string{ 13 | "t2.medium", 14 | "t2.large", 15 | "t2.xlarge", 16 | }, 17 | }) 18 | if err != nil { 19 | t.Error(err) 20 | } 21 | } 22 | -------------------------------------------------------------------------------- 
/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4d.24xlarge/gpu_count.txt: -------------------------------------------------------------------------------- 1 | name, index, pci.bus_id 2 | NVIDIA A100-SXM4-40GB, 0, 00000000:10:1C.0 3 | NVIDIA A100-SXM4-40GB, 1, 00000000:10:1D.0 4 | NVIDIA A100-SXM4-40GB, 2, 00000000:20:1C.0 5 | NVIDIA A100-SXM4-40GB, 3, 00000000:20:1D.0 6 | NVIDIA A100-SXM4-40GB, 4, 00000000:90:1C.0 7 | NVIDIA A100-SXM4-40GB, 5, 00000000:90:1D.0 8 | NVIDIA A100-SXM4-40GB, 6, 00000000:A0:1C.0 9 | NVIDIA A100-SXM4-40GB, 7, 00000000:A0:1D.0 10 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4de.24xlarge/gpu_count.txt: -------------------------------------------------------------------------------- 1 | name, index, pci.bus_id 2 | NVIDIA A100-SXM4-80GB, 0, 00000000:10:1C.0 3 | NVIDIA A100-SXM4-80GB, 1, 00000000:10:1D.0 4 | NVIDIA A100-SXM4-80GB, 2, 00000000:20:1C.0 5 | NVIDIA A100-SXM4-80GB, 3, 00000000:20:1D.0 6 | NVIDIA A100-SXM4-80GB, 4, 00000000:90:1C.0 7 | NVIDIA A100-SXM4-80GB, 5, 00000000:90:1D.0 8 | NVIDIA A100-SXM4-80GB, 6, 00000000:A0:1C.0 9 | NVIDIA A100-SXM4-80GB, 7, 00000000:A0:1D.0 10 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p5.48xlarge/gpu_count.txt: -------------------------------------------------------------------------------- 1 | name, index, pci.bus_id 2 | NVIDIA H100 80GB HBM3, 0, 00000000:53:00.0 3 | NVIDIA H100 80GB HBM3, 1, 00000000:64:00.0 4 | NVIDIA H100 80GB HBM3, 2, 00000000:75:00.0 5 | NVIDIA H100 80GB HBM3, 3, 00000000:86:00.0 6 | NVIDIA H100 80GB HBM3, 4, 00000000:97:00.0 7 | NVIDIA H100 80GB HBM3, 5, 00000000:A8:00.0 8 | NVIDIA H100 80GB HBM3, 6, 00000000:B9:00.0 9 | NVIDIA H100 80GB HBM3, 7, 00000000:CA:00.0 10 | -------------------------------------------------------------------------------- 
// NewConfig returns an AWS SDK config loaded via the SDK's default chain
// (environment variables, shared config files, instance metadata, ...).
// It terminates the process (klog.Fatalf) if the config cannot be created,
// so callers never observe an invalid config.
func NewConfig() aws.Config {
	c, err := config.LoadDefaultConfig(context.TODO())
	if err != nil {
		klog.Fatalf("failed to create AWS SDK config: %v", err)
	}
	return c
}
// MetricRegistry accumulates metric data points and publishes them in a batch.
type MetricRegistry interface {
	// Record adds a new metric value to the registry
	Record(spec *MetricSpec, value float64, dimensions map[string]string)
	// Emit sends all registered metric values to cloudwatch, emptying the registry
	Emit() error
}

// MetricSpec identifies a metric: its CloudWatch namespace, metric name,
// and the unit attached to recorded values.
type MetricSpec struct {
	Namespace string
	Metric    string
	Unit      types.StandardUnit
}
string) *unstructured.Unstructured { 15 | u := unstructured.Unstructured{} 16 | u.SetGroupVersionKind(MPIJobGVK) 17 | u.SetName(name) 18 | u.SetNamespace(namespace) 19 | return &u 20 | } 21 | -------------------------------------------------------------------------------- /internal/e2e/resources.go: -------------------------------------------------------------------------------- 1 | package e2e 2 | 3 | import ( 4 | "fmt" 5 | 6 | v1 "k8s.io/api/core/v1" 7 | ) 8 | 9 | func GetNonZeroResourceCapacity(node *v1.Node, resourceName string) (int, error) { 10 | capacity, ok := node.Status.Capacity[v1.ResourceName(resourceName)] 11 | if !ok { 12 | return 0, fmt.Errorf("node %q has no resource %q", node.Name, resourceName) 13 | } 14 | if capacity.Value() == 0 { 15 | return 0, fmt.Errorf("node %q has zero capacity for resource %q", node.Name, resourceName) 16 | } 17 | return int(capacity.Value()), nil 18 | } 19 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4d.24xlarge/nvidia_persistence_status.txt: -------------------------------------------------------------------------------- 1 | name, pci.bus_id, persistence_mode 2 | NVIDIA A100-SXM4-40GB, 00000000:10:1C.0, Enabled 3 | NVIDIA A100-SXM4-40GB, 00000000:10:1D.0, Enabled 4 | NVIDIA A100-SXM4-40GB, 00000000:20:1C.0, Enabled 5 | NVIDIA A100-SXM4-40GB, 00000000:20:1D.0, Enabled 6 | NVIDIA A100-SXM4-40GB, 00000000:90:1C.0, Enabled 7 | NVIDIA A100-SXM4-40GB, 00000000:90:1D.0, Enabled 8 | NVIDIA A100-SXM4-40GB, 00000000:A0:1C.0, Enabled 9 | NVIDIA A100-SXM4-40GB, 00000000:A0:1D.0, Enabled 10 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4de.24xlarge/nvidia_persistence_status.txt: -------------------------------------------------------------------------------- 1 | name, pci.bus_id, persistence_mode 2 | NVIDIA A100-SXM4-80GB, 
00000000:10:1C.0, Enabled 3 | NVIDIA A100-SXM4-80GB, 00000000:10:1D.0, Enabled 4 | NVIDIA A100-SXM4-80GB, 00000000:20:1C.0, Enabled 5 | NVIDIA A100-SXM4-80GB, 00000000:20:1D.0, Enabled 6 | NVIDIA A100-SXM4-80GB, 00000000:90:1C.0, Enabled 7 | NVIDIA A100-SXM4-80GB, 00000000:90:1D.0, Enabled 8 | NVIDIA A100-SXM4-80GB, 00000000:A0:1C.0, Enabled 9 | NVIDIA A100-SXM4-80GB, 00000000:A0:1D.0, Enabled 10 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p5.48xlarge/nvidia_persistence_status.txt: -------------------------------------------------------------------------------- 1 | name, pci.bus_id, persistence_mode 2 | NVIDIA H100 80GB HBM3, 00000000:53:00.0, Enabled 3 | NVIDIA H100 80GB HBM3, 00000000:64:00.0, Enabled 4 | NVIDIA H100 80GB HBM3, 00000000:75:00.0, Enabled 5 | NVIDIA H100 80GB HBM3, 00000000:86:00.0, Enabled 6 | NVIDIA H100 80GB HBM3, 00000000:97:00.0, Enabled 7 | NVIDIA H100 80GB HBM3, 00000000:A8:00.0, Enabled 8 | NVIDIA H100 80GB HBM3, 00000000:B9:00.0, Enabled 9 | NVIDIA H100 80GB HBM3, 00000000:CA:00.0, Enabled 10 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.48xlarge/nvidia_smi_topo.txt: -------------------------------------------------------------------------------- 1 | GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 CPU Affinity NUMA Affinity 2 | GPU0 X PHB PHB PHB PHB PHB PHB PHB 0-191 0-1 3 | GPU1 PHB X PHB PHB PHB PHB PHB PHB 0-191 0-1 4 | GPU2 PHB PHB X PHB PHB PHB PHB PHB 0-191 0-1 5 | GPU3 PHB PHB PHB X PHB PHB PHB PHB 0-191 0-1 6 | GPU4 PHB PHB PHB PHB X PHB PHB PHB 0-191 0-1 7 | GPU5 PHB PHB PHB PHB PHB X PHB PHB 0-191 0-1 8 | GPU6 PHB PHB PHB PHB PHB PHB X PHB 0-191 0-1 9 | GPU7 PHB PHB PHB PHB PHB PHB PHB X 0-191 0-1 10 | -------------------------------------------------------------------------------- /internal/deployers/eksapi/auth_map_role.go: 
-------------------------------------------------------------------------------- 1 | package eksapi 2 | 3 | import ( 4 | "bytes" 5 | 6 | "github.com/aws/aws-k8s-tester/internal/deployers/eksapi/templates" 7 | ) 8 | 9 | func generateAuthMapRole(nodeNameStrategy string, rolearn string) (string, error) { 10 | template := templates.AuthMapRole 11 | buf := bytes.Buffer{} 12 | if err := template.Execute(&buf, templates.AuthMapRoleTemplateData{ 13 | NodeNameStrategy: nodeNameStrategy, 14 | Rolearn: rolearn, 15 | }); err != nil { 16 | return "", err 17 | } 18 | return buf.String(), nil 19 | } 20 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4d.24xlarge/nvidia_smi_topo.txt: -------------------------------------------------------------------------------- 1 | GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 CPU Affinity NUMA Affinity 2 | GPU0 X NV12 NV12 NV12 NV12 NV12 NV12 NV12 0-23,48-71 0 3 | GPU1 NV12 X NV12 NV12 NV12 NV12 NV12 NV12 0-23,48-71 0 4 | GPU2 NV12 NV12 X NV12 NV12 NV12 NV12 NV12 0-23,48-71 0 5 | GPU3 NV12 NV12 NV12 X NV12 NV12 NV12 NV12 0-23,48-71 0 6 | GPU4 NV12 NV12 NV12 NV12 X NV12 NV12 NV12 24-47,72-95 1 7 | GPU5 NV12 NV12 NV12 NV12 NV12 X NV12 NV12 24-47,72-95 1 8 | GPU6 NV12 NV12 NV12 NV12 NV12 NV12 X NV12 24-47,72-95 1 9 | GPU7 NV12 NV12 NV12 NV12 NV12 NV12 NV12 X 24-47,72-95 1 10 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4de.24xlarge/nvidia_smi_topo.txt: -------------------------------------------------------------------------------- 1 | GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 CPU Affinity NUMA Affinity 2 | GPU0 X NV12 NV12 NV12 NV12 NV12 NV12 NV12 0-23,48-71 0 3 | GPU1 NV12 X NV12 NV12 NV12 NV12 NV12 NV12 0-23,48-71 0 4 | GPU2 NV12 NV12 X NV12 NV12 NV12 NV12 NV12 0-23,48-71 0 5 | GPU3 NV12 NV12 NV12 X NV12 NV12 NV12 NV12 0-23,48-71 0 6 | GPU4 NV12 NV12 NV12 
NV12 X NV12 NV12 NV12 24-47,72-95 1 7 | GPU5 NV12 NV12 NV12 NV12 NV12 X NV12 NV12 24-47,72-95 1 8 | GPU6 NV12 NV12 NV12 NV12 NV12 NV12 X NV12 24-47,72-95 1 9 | GPU7 NV12 NV12 NV12 NV12 NV12 NV12 NV12 X 24-47,72-95 1 10 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p5.48xlarge/nvidia_smi_topo.txt: -------------------------------------------------------------------------------- 1 | GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 CPU Affinity NUMA Affinity 2 | GPU0 X NV18 NV18 NV18 NV18 NV18 NV18 NV18 0-47,96-143 0 3 | GPU1 NV18 X NV18 NV18 NV18 NV18 NV18 NV18 0-47,96-143 0 4 | GPU2 NV18 NV18 X NV18 NV18 NV18 NV18 NV18 0-47,96-143 0 5 | GPU3 NV18 NV18 NV18 X NV18 NV18 NV18 NV18 0-47,96-143 0 6 | GPU4 NV18 NV18 NV18 NV18 X NV18 NV18 NV18 48-95,144-191 1 7 | GPU5 NV18 NV18 NV18 NV18 NV18 X NV18 NV18 48-95,144-191 1 8 | GPU6 NV18 NV18 NV18 NV18 NV18 NV18 X NV18 48-95,144-191 1 9 | GPU7 NV18 NV18 NV18 NV18 NV18 NV18 NV18 X 48-95,144-191 1 10 | -------------------------------------------------------------------------------- /test/cases/dra/main_test.go: -------------------------------------------------------------------------------- 1 | //go:build e2e 2 | 3 | package dra 4 | 5 | import ( 6 | "context" 7 | _ "embed" 8 | "log" 9 | "os" 10 | "os/signal" 11 | "testing" 12 | 13 | "sigs.k8s.io/e2e-framework/pkg/env" 14 | "sigs.k8s.io/e2e-framework/pkg/envconf" 15 | ) 16 | 17 | var ( 18 | testenv env.Environment 19 | ) 20 | 21 | func TestMain(m *testing.M) { 22 | ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt) 23 | defer cancel() 24 | cfg, err := envconf.NewFromFlags() 25 | if err != nil { 26 | log.Fatalf("failed to initialize test environment: %v", err) 27 | } 28 | testenv = env.NewWithConfig(cfg).WithContext(ctx) 29 | os.Exit(testenv.Run(m)) 30 | } 31 | -------------------------------------------------------------------------------- 
/test/cases/nvidia-training/vars.go: -------------------------------------------------------------------------------- 1 | //go:build e2e 2 | 3 | package training 4 | 5 | import ( 6 | "github.com/aws/aws-k8s-tester/test/common" 7 | "sigs.k8s.io/e2e-framework/pkg/env" 8 | ) 9 | 10 | type Config struct { 11 | common.MetricOps 12 | BertTrainingImage string `flag:"bertTrainingImage" desc:"Docker image used for BERT training workload"` 13 | EfaEnabled bool `flag:"efaEnabled" desc:"Enable Elastic Fabric Adapter (EFA)"` 14 | NodeType string `flag:"nodeType" desc:"Instance type for cluster nodes"` 15 | } 16 | 17 | // Shared global variables 18 | var ( 19 | testenv env.Environment 20 | testConfig Config 21 | 22 | nodeCount int 23 | gpuPerNode int 24 | efaPerNode int 25 | ) 26 | -------------------------------------------------------------------------------- /test/manifests/raw.go: -------------------------------------------------------------------------------- 1 | package manifests 2 | 3 | import ( 4 | _ "embed" 5 | ) 6 | 7 | var ( 8 | //go:embed assets/nvidia-device-plugin.yaml 9 | NvidiaDevicePluginManifest []byte 10 | //go:embed assets/mpi-operator.yaml 11 | MpiOperatorManifest []byte 12 | 13 | //go:embed assets/efa-device-plugin.yaml 14 | EfaDevicePluginManifest []byte 15 | 16 | //go:embed assets/k8s-neuron-device-plugin-rbac.yml 17 | NeuronDevicePluginRbacManifest []byte 18 | //go:embed assets/k8s-neuron-device-plugin.yml 19 | NeuronDevicePluginManifest []byte 20 | 21 | //go:embed assets/dcgm-exporter.yaml 22 | DCGMExporterManifest []byte 23 | 24 | //go:embed assets/cloudwatch-agent.yaml 25 | cloudWatchAgentManifestTemplate []byte 26 | ) 27 | -------------------------------------------------------------------------------- /test/manifests/rendered.go: -------------------------------------------------------------------------------- 1 | package manifests 2 | 3 | import ( 4 | "html/template" 5 | "strings" 6 | 7 | fwext "github.com/aws/aws-k8s-tester/internal/e2e" 8 | ) 9 | 
10 | // RenderCloudWatchAgentManifest renders the CloudWatch Agent manifest with dynamic dimensions 11 | func RenderCloudWatchAgentManifest(metricDimensions map[string]string) ([]byte, error) { 12 | var keys []string 13 | for key := range metricDimensions { 14 | keys = append(keys, `"`+key+`"`) 15 | } 16 | dimensionsStr := strings.Join(keys, ", ") 17 | return fwext.RenderManifests(cloudWatchAgentManifestTemplate, map[string]interface{}{ 18 | "MetricDimensions": metricDimensions, 19 | "DimensionKeys": template.HTML(dimensionsStr), 20 | }) 21 | } 22 | -------------------------------------------------------------------------------- /internal/deployers/eksapi/common.go: -------------------------------------------------------------------------------- 1 | package eksapi 2 | 3 | import ( 4 | "os" 5 | "slices" 6 | "strings" 7 | ) 8 | 9 | const AvailabilityZonePriorityEnv = "EKSAPI_AZ_PRIORITY" 10 | 11 | func availabilityZoneHintedOrder(availabilityZones []string) []string { 12 | var priorityAZs []string 13 | if priorityAZsString, ok := os.LookupEnv(AvailabilityZonePriorityEnv); ok { 14 | priorityAZs = strings.Split(priorityAZsString, ",") 15 | } 16 | if len(priorityAZs) == 0 { 17 | return availabilityZones 18 | } 19 | return slices.SortedStableFunc(slices.Values(availabilityZones), func(az1, az2 string) int { 20 | if slices.Contains(priorityAZs, az1) { 21 | if slices.Contains(priorityAZs, az2) { 22 | return 0 23 | } 24 | return -1 25 | } 26 | return 0 27 | }) 28 | } 29 | -------------------------------------------------------------------------------- /internal/deployers/eksapi/templates/userdata_nodeadm.yaml.mimepart.template: -------------------------------------------------------------------------------- 1 | Content-Type: application/node.eks.aws 2 | MIME-Version: 1.0 3 | 4 | --- 5 | apiVersion: node.eks.aws/v1alpha1 6 | kind: NodeConfig 7 | spec: 8 | {{- if .NodeadmFeatureGates}} 9 | featureGates: 10 | {{- range $gate, $value := .NodeadmFeatureGates }} 11 | {{$gate}}: 
{{$value}} 12 | {{- end }} 13 | {{- end }} 14 | cluster: 15 | name: {{.Name}} 16 | apiServerEndpoint: {{.APIServerEndpoint}} 17 | certificateAuthority: {{.CertificateAuthority}} 18 | cidr: {{.CIDR}} 19 | {{- if .KubeletFeatureGates}} 20 | kubelet: 21 | config: 22 | featureGates: 23 | {{- range $gate, $value := .KubeletFeatureGates }} 24 | {{$gate}}: {{$value}} 25 | {{- end }} 26 | {{- end }} 27 | -------------------------------------------------------------------------------- /internal/deployers/eksapi/templates/busybox_deployment.yaml.template: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: busybox-deployment 5 | spec: 6 | replicas: {{.Nodes}} 7 | selector: 8 | matchLabels: 9 | app: busybox 10 | template: 11 | metadata: 12 | labels: 13 | app: busybox 14 | spec: 15 | affinity: 16 | podAntiAffinity: 17 | requiredDuringSchedulingIgnoredDuringExecution: 18 | - labelSelector: 19 | matchExpressions: 20 | - key: app 21 | operator: In 22 | values: 23 | - busybox 24 | topologyKey: "kubernetes.io/hostname" 25 | containers: 26 | - name: busybox 27 | image: busybox 28 | command: ["sleep", "infinity"] 29 | -------------------------------------------------------------------------------- /internal/util/version.go: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "strings" 7 | ) 8 | 9 | const KubernetesVersionFile = "kubernetes-version.txt" 10 | 11 | func DetectKubernetesVersion() (string, error) { 12 | versionFile, err := LookPath(KubernetesVersionFile) 13 | if err != nil { 14 | return "", err 15 | } 16 | bytes, err := os.ReadFile(versionFile) 17 | if err != nil { 18 | return "", err 19 | } 20 | // "v1.2.3" 21 | versionTag := string(bytes) 22 | return strings.ReplaceAll(versionTag, "v", ""), nil 23 | } 24 | 25 | func ParseMinorVersion(semanticVersion string) (string, error) { 26 | parts := 
strings.Split(semanticVersion, ".") 27 | if len(parts) < 2 { 28 | return "", fmt.Errorf("malformed semantic version: '%s'", semanticVersion) 29 | } 30 | return strings.Join(parts[:2], "."), nil 31 | } 32 | -------------------------------------------------------------------------------- /internal/deployers/eksapi/logs_ssm_doc.json: -------------------------------------------------------------------------------- 1 | { 2 | "schemaVersion": "2.2", 3 | "description": "Collect logs from an Amazon Linux EKS node", 4 | "parameters": { 5 | "s3Destination": { 6 | "type": "String" 7 | } 8 | }, 9 | "mainSteps": [ 10 | { 11 | "action": "aws:runShellScript", 12 | "name": "collectAndUploadLogs", 13 | "precondition": { 14 | "StringEquals": [ 15 | "platformType", 16 | "Linux" 17 | ] 18 | }, 19 | "inputs": { 20 | "runCommand": [ 21 | "bash /etc/eks/log-collector-script/eks-log-collector.sh >/dev/null 2>&1", 22 | "aws s3 cp /var/log/eks_i* {{s3Destination}}" 23 | ] 24 | } 25 | } 26 | ] 27 | } -------------------------------------------------------------------------------- /test/cases/quick/main_test.go: -------------------------------------------------------------------------------- 1 | //go:build e2e 2 | 3 | package quick 4 | 5 | import ( 6 | "context" 7 | _ "embed" 8 | "log" 9 | "os" 10 | "os/signal" 11 | "testing" 12 | 13 | "sigs.k8s.io/e2e-framework/pkg/env" 14 | "sigs.k8s.io/e2e-framework/pkg/envconf" 15 | ) 16 | 17 | var ( 18 | testenv env.Environment 19 | ) 20 | 21 | func TestMain(m *testing.M) { 22 | cfg, err := envconf.NewFromFlags() 23 | if err != nil { 24 | log.Fatalf("failed to initialize test environment: %v", err) 25 | } 26 | testenv = env.NewWithConfig(cfg) 27 | ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt) 28 | defer cancel() 29 | testenv = testenv.WithContext(ctx) 30 | 31 | testenv.Setup(func(ctx context.Context, config *envconf.Config) (context.Context, error) { 32 | log.Println("Starting quick test suite...") 33 | return ctx, nil 34 | }) 
35 | 36 | os.Exit(testenv.Run(m)) 37 | } 38 | -------------------------------------------------------------------------------- /test/cases/disruptive/main_test.go: -------------------------------------------------------------------------------- 1 | //go:build e2e 2 | 3 | package disruptive 4 | 5 | import ( 6 | "context" 7 | _ "embed" 8 | "log" 9 | "os" 10 | "os/signal" 11 | "testing" 12 | 13 | "sigs.k8s.io/e2e-framework/pkg/env" 14 | "sigs.k8s.io/e2e-framework/pkg/envconf" 15 | ) 16 | 17 | var ( 18 | testenv env.Environment 19 | ) 20 | 21 | func TestMain(m *testing.M) { 22 | cfg, err := envconf.NewFromFlags() 23 | if err != nil { 24 | log.Fatalf("failed to initialize test environment: %v", err) 25 | } 26 | testenv = env.NewWithConfig(cfg) 27 | ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt) 28 | defer cancel() 29 | testenv = testenv.WithContext(ctx) 30 | 31 | testenv.Setup(func(ctx context.Context, config *envconf.Config) (context.Context, error) { 32 | log.Println("Starting quick test suite...") 33 | return ctx, nil 34 | }) 35 | 36 | os.Exit(testenv.Run(m)) 37 | } 38 | -------------------------------------------------------------------------------- /test/cases/neuron-training/vars.go: -------------------------------------------------------------------------------- 1 | package training 2 | 3 | import ( 4 | "flag" 5 | 6 | "sigs.k8s.io/e2e-framework/pkg/env" 7 | ) 8 | 9 | // Shared global variables 10 | var ( 11 | testenv env.Environment 12 | 13 | bertTrainingImage *string 14 | efaEnabled *bool 15 | nodeType *string 16 | nodeCount int 17 | efaPerNode int 18 | neuronPerNode int 19 | neuronCorePerNode int 20 | retries *int 21 | ) 22 | 23 | func init() { 24 | // Define command-line flags 25 | bertTrainingImage = flag.String("bertTrainingImage", "", "Docker image used for BERT training workload") 26 | efaEnabled = flag.Bool("efaEnabled", false, "Enable Elastic Fabric Adapter (EFA)") 27 | nodeType = flag.String("nodeType", "", "Instance type for 
cluster nodes (e.g., inf1.24xlarge)") 28 | retries = flag.Int("retries", 2, "Number of retries to attempt before marking the test as failed.") 29 | } 30 | -------------------------------------------------------------------------------- /test/cases/nvidia-inference/manifests/bert-inference.yaml: -------------------------------------------------------------------------------- 1 | # Single-node BERT inference job with GPU. Memory-backed volume for /dev/shm 2 | apiVersion: batch/v1 3 | kind: Job 4 | metadata: 5 | name: bert-inference 6 | spec: 7 | backoffLimit: 4 8 | template: 9 | spec: 10 | restartPolicy: OnFailure 11 | volumes: 12 | - name: dshm 13 | emptyDir: 14 | medium: Memory 15 | containers: 16 | - name: bert-inference 17 | image: {{.BertInferenceImage}} 18 | imagePullPolicy: Always 19 | command: ["python", "infer.py"] 20 | env: 21 | - name: INFERENCE_MODE 22 | value: "{{.InferenceMode}}" 23 | volumeMounts: 24 | - mountPath: /dev/shm 25 | name: dshm 26 | resources: 27 | requests: 28 | nvidia.com/gpu: {{.GPUPerNode}} 29 | limits: 30 | nvidia.com/gpu: {{.GPUPerNode}} 31 | -------------------------------------------------------------------------------- /internal/deployers/eksapi/metrics.go: -------------------------------------------------------------------------------- 1 | package eksapi 2 | 3 | import ( 4 | "path" 5 | 6 | "github.com/aws/aws-k8s-tester/internal/metrics" 7 | cloudwatchtypes "github.com/aws/aws-sdk-go-v2/service/cloudwatch/types" 8 | ) 9 | 10 | var DeployerMetricNamespace = path.Join("kubetest2", DeployerName) 11 | 12 | var ( 13 | totalRuntimeSeconds = &metrics.MetricSpec{ 14 | Namespace: DeployerMetricNamespace, 15 | Metric: "TotalRuntimeSeconds", 16 | Unit: cloudwatchtypes.StandardUnitSeconds, 17 | } 18 | 19 | nodeTimeToRegistrationSeconds = &metrics.MetricSpec{ 20 | Namespace: DeployerMetricNamespace, 21 | Metric: "NodeTimeToRegistrationSeconds", 22 | Unit: cloudwatchtypes.StandardUnitSeconds, 23 | } 24 | 25 | nodeTimeToReadySeconds = 
&metrics.MetricSpec{ 26 | Namespace: DeployerMetricNamespace, 27 | Metric: "NodeTimeToReadySeconds", 28 | Unit: cloudwatchtypes.StandardUnitSeconds, 29 | } 30 | ) 31 | -------------------------------------------------------------------------------- /cmd/kubetest2-eksapi-janitor/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "flag" 6 | "time" 7 | 8 | "github.com/aws/aws-k8s-tester/internal/deployers/eksapi" 9 | "k8s.io/klog/v2" 10 | ) 11 | 12 | func main() { 13 | var maxResourceAge time.Duration 14 | flag.DurationVar(&maxResourceAge, "max-resource-age", time.Hour*3, "Maximum resource age") 15 | var workers int 16 | flag.IntVar(&workers, "workers", 1, "number of workers to processes resources in parallel") 17 | var stackStatus string 18 | flag.StringVar(&stackStatus, "stack-status", "", "only process stacks with a specific status") 19 | var emitMetrics bool 20 | flag.BoolVar(&emitMetrics, "emit-metrics", false, "Send metrics to CloudWatch") 21 | flag.Parse() 22 | j := eksapi.NewJanitor(maxResourceAge, emitMetrics, workers, stackStatus) 23 | if err := j.Sweep(context.Background()); err != nil { 24 | klog.Fatalf("failed to sweep resources: %v", err) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /internal/e2e/mpijobs/conditions_test.go: -------------------------------------------------------------------------------- 1 | package mpijobs 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" 8 | ) 9 | 10 | func Test_MPIJobSucceeded(t *testing.T) { 11 | u := unstructured.Unstructured{ 12 | Object: map[string]interface{}{ 13 | "status": map[string]interface{}{ 14 | "conditions": []interface{}{ 15 | map[string]interface{}{ 16 | "type": "Succeeded", 17 | "status": "True", 18 | }, 19 | }, 20 | }, 21 | }, 22 | } 23 | assert.True(t, MPIJobSucceeded(&u)) 24 
// ErrFileNotFoundInPath is returned when no directory on $PATH contains the
// requested file.
var ErrFileNotFoundInPath = errors.New("file not found in $PATH")

// LookPath finds a file on the PATH.
// It uses a similar process to exec.LookPath, but can find regular files.
func LookPath(file string) (string, error) {
	for _, dir := range filepath.SplitList(os.Getenv("PATH")) {
		if dir == "" {
			// Unix shell semantics: an empty path element means ".".
			dir = "."
		}
		candidate := filepath.Join(dir, file)
		if checkFile(candidate) != nil {
			continue
		}
		return candidate, nil
	}
	return "", ErrFileNotFoundInPath
}

// checkFile reports whether file exists and is not a directory; it returns
// the Stat error, or EISDIR for directories.
func checkFile(file string) error {
	info, err := os.Stat(file)
	if err != nil {
		return err
	}
	if info.Mode().IsDir() {
		return syscall.EISDIR
	}
	return nil
}
// init runs before TestMain and registers the command-line flags this test
// suite consumes. The flag values are read after the framework parses the
// command line; only bertInferenceImage is required.
func init() {
	// Docker image to run; required — validated by the consumer, not here.
	bertInferenceImage = flag.String("bertInferenceImage", "",
		"[REQUIRED] Docker image used for Neuron-based BERT inference")
	// Instance-type node label used for scheduling (optional).
	nodeType = flag.String("nodeType", "",
		"Node type label for K8s nodes, e.g., trn1.32xlarge or inf2.xlarge")
	// Defaults to throughput mode when not specified.
	inferenceMode = flag.String("inferenceMode", "throughput",
		"Inference mode for BERT (throughput or latency)")
}
17 | func NewHTTPHeaderAPIOptions(headers []string) ([]func(*middleware.Stack) error, error) { 18 | var opts []func(*middleware.Stack) error 19 | for _, header := range headers { 20 | boundary := strings.Index(header, httpHeaderBoundary) 21 | if boundary == -1 { 22 | return nil, fmt.Errorf("malformed HTTP header: '%s'", header) 23 | } 24 | key := header[:boundary] 25 | val := header[boundary+len(httpHeaderBoundary):] 26 | opts = append(opts, smithyhttp.AddHeaderValue(key, val)) 27 | } 28 | return opts, nil 29 | } 30 | -------------------------------------------------------------------------------- /test/cases/neuron-inference/manifests/neuron-bert-inference.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: neuron-inference 5 | spec: 6 | backoffLimit: 4 7 | template: 8 | spec: 9 | restartPolicy: OnFailure 10 | volumes: 11 | - name: dshm 12 | emptyDir: 13 | medium: Memory 14 | containers: 15 | - name: neuron-inference 16 | image: {{.BertInferenceImage}} 17 | imagePullPolicy: Always 18 | command: ["python", "/app/infer.py"] 19 | env: 20 | - name: INFERENCE_MODE 21 | value: "{{.InferenceMode}}" 22 | volumeMounts: 23 | - mountPath: /dev/shm 24 | name: dshm 25 | resources: 26 | requests: 27 | aws.amazon.com/neuroncore: "{{.NeuronCorePerNode}}" 28 | limits: 29 | aws.amazon.com/neuroncore: "{{.NeuronCorePerNode}}" 30 | nodeSelector: 31 | node.kubernetes.io/instance-type: {{.NodeType}} 32 | -------------------------------------------------------------------------------- /.github/workflows/update-image-tags.yaml: -------------------------------------------------------------------------------- 1 | name: "[CI] update-image-tags" 2 | on: 3 | workflow_dispatch: 4 | schedule: 5 | # once a week 6 | - cron: "0 0 * * 0" 7 | permissions: 8 | id-token: write 9 | contents: write 10 | pull-requests: write 11 | jobs: 12 | update-dependencies: 13 | runs-on: ubuntu-latest 14 | if: 
github.repository == 'aws/aws-k8s-tester' 15 | steps: 16 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2 17 | - run: ./hack/update-image-tags.sh 18 | - uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # 7.0.8 19 | with: 20 | branch: update-image-tags 21 | base: main 22 | add-paths: | 23 | test/images/ 24 | commit-message: "chore: update image tags" 25 | committer: "GitHub " 26 | author: "GitHub " 27 | title: "chore: update image tags" 28 | body: | 29 | Generated by: 30 | ``` 31 | ./hack/update-image-tags.sh 32 | ``` 33 | -------------------------------------------------------------------------------- /internal/util/http_test.go: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func Test_NewHTTPHeaderAPIOptions(t *testing.T) { 8 | testCases := []struct { 9 | name string 10 | headers []string 11 | expectError bool 12 | }{ 13 | { 14 | name: "empty", 15 | headers: []string{}, 16 | }, 17 | { 18 | name: "single valid header", 19 | headers: []string{"Content-Type: application/json"}, 20 | }, 21 | { 22 | name: "multiple valid headers", 23 | headers: []string{"Content-Type: application/json", "Accept: application/json"}, 24 | }, 25 | { 26 | name: "invalid header", 27 | headers: []string{"Invalid header"}, 28 | expectError: true, 29 | }, 30 | } 31 | for _, tc := range testCases { 32 | t.Run(tc.name, func(t *testing.T) { 33 | _, err := NewHTTPHeaderAPIOptions(tc.headers) 34 | if err != nil && !tc.expectError { 35 | t.Errorf("unexpected error: %v", err) 36 | } 37 | if err == nil && tc.expectError { 38 | t.Error("expected error but got none") 39 | } 40 | }) 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /test/cases/nvidia/manifests/job-unit-test-single-node.yaml: -------------------------------------------------------------------------------- 1 | kind: Job 2 | apiVersion: 
batch/v1 3 | metadata: 4 | name: unit-test-job 5 | labels: 6 | app: unit-test-job 7 | spec: 8 | template: 9 | metadata: 10 | labels: 11 | app: unit-test-job 12 | spec: 13 | containers: 14 | - name: unit-test-container 15 | image: {{.NvidiaTestImage}} 16 | command: 17 | - /bin/bash 18 | - ./gpu_unit_tests/unit_test 19 | env: 20 | - name: SKIP_TESTS_SUBCOMMAND 21 | value: {{.SkipTestSubcommand}} 22 | # because we started building these from source, this is just a 23 | # regular binary. 24 | - name: DEMO_SUITE_DIR 25 | value: /usr/bin 26 | - name: EC2_INSTANCE_TYPE 27 | value: {{.NodeType}} 28 | imagePullPolicy: Always 29 | resources: 30 | limits: 31 | nvidia.com/gpu: {{.GpuPerNode}} 32 | requests: 33 | cpu: "1" 34 | memory: 1Gi 35 | restartPolicy: Never 36 | backoffLimit: 4 37 | -------------------------------------------------------------------------------- /.github/workflows/update-nvidia-dependencies.yaml: -------------------------------------------------------------------------------- 1 | name: "[CI] update-nvidia-dependencies" 2 | on: 3 | workflow_dispatch: 4 | schedule: 5 | # once a week 6 | - cron: "0 0 * * 0" 7 | permissions: 8 | id-token: write 9 | contents: write 10 | pull-requests: write 11 | jobs: 12 | update-dependencies: 13 | runs-on: ubuntu-latest 14 | if: github.repository == 'aws/aws-k8s-tester' 15 | steps: 16 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2 17 | - run: ./hack/update-nvidia-dependencies.sh 18 | - uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # 7.0.8 19 | with: 20 | branch: update-nvidia-dependencies 21 | base: main 22 | add-paths: | 23 | test/images/ 24 | commit-message: "chore: update nvidia test dependencies" 25 | committer: "GitHub " 26 | author: "GitHub " 27 | title: "chore: update nvidia test dependencies" 28 | body: | 29 | Generated by: 30 | ``` 31 | ./hack/update-nvidia-dependencies.sh 32 | ``` 33 | 
-------------------------------------------------------------------------------- /internal/deployers/eksapi/templates/nvidia_static_cluster_nodepool.yaml.template: -------------------------------------------------------------------------------- 1 | apiVersion: karpenter.sh/v1 2 | kind: NodePool 3 | metadata: 4 | labels: 5 | app.kubernetes.io/managed-by: eks 6 | name: nvidia 7 | spec: 8 | weight: 50 9 | template: 10 | spec: 11 | requirements: 12 | - key: kubernetes.io/arch 13 | operator: In 14 | values: [{{.Arch}}] 15 | - key: kubernetes.io/os 16 | operator: In 17 | values: ["linux"] 18 | - key: karpenter.sh/capacity-type 19 | operator: In 20 | values: ["on-demand"] 21 | - key: node.kubernetes.io/instance-type 22 | operator: In 23 | values: 24 | {{- range .InstanceTypes}} 25 | - "{{.}}" 26 | {{- end}} 27 | - key: eks.amazonaws.com/instance-gpu-count 28 | operator: Exists 29 | nodeClassRef: 30 | group: eks.amazonaws.com 31 | kind: NodeClass 32 | name: default 33 | expireAfter: 336h 34 | disruption: 35 | budgets: 36 | - nodes: 10% 37 | consolidationPolicy: WhenEmpty 38 | consolidateAfter: 600s 39 | -------------------------------------------------------------------------------- /hack/update-nvidia-dependencies.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # following from the last updated dependency: 4 | # 1. get the latest release of aws-ofi-nccl 5 | # 2. get the supported version of libnccl 6 | # 3. get the latest correct cuda version used for libnccl 7 | 8 | set -o nounset 9 | set -o errexit 10 | set -o pipefail 11 | 12 | echo "Updating aws-ofi-nccl" 13 | AWS_OFI_NCCL_TAG=$(curl -s https://api.github.com/repos/aws/aws-ofi-nccl/releases/latest | jq -r .tag_name | sed 's/^v//') 14 | find . 
-type f -name Dockerfile -exec sed -i "s/AWS_OFI_NCCL_VERSION=.*/AWS_OFI_NCCL_VERSION=$AWS_OFI_NCCL_TAG/g" {} + 15 | 16 | echo "Updating nccl" 17 | LIB_NCCL_TAG=$(curl -s https://api.github.com/repos/aws/aws-ofi-nccl/releases/latest | jq -r .body | grep -oP '\[NCCL \K(\S*)(?=\])' | head -n 1 | sed 's/^v//') 18 | find . -type f -name Dockerfile -exec sed -i "s/LIBNCCL_VERSION=.*/LIBNCCL_VERSION=$LIB_NCCL_TAG/g" {} + 19 | 20 | echo "Updating nvbandwidth" 21 | NVBANDWIDTH_TAG=$(curl -s https://api.github.com/repos/NVIDIA/nvbandwidth/releases/latest | jq -r .tag_name) 22 | find . -type f -name Dockerfile -exec sed -i "s/NVBANDWIDTH_VERSION=.*/NVBANDWIDTH_VERSION=$NVBANDWIDTH_TAG/g" {} + 23 | 24 | -------------------------------------------------------------------------------- /test/common/flags.go: -------------------------------------------------------------------------------- 1 | //go:build e2e 2 | 3 | package common 4 | 5 | import ( 6 | "flag" 7 | "fmt" 8 | "github.com/urfave/sflags/gen/gpflag" 9 | "github.com/spf13/pflag" 10 | "reflect" 11 | ) 12 | 13 | // For CloudWatch metric dimension flag 14 | type MetricOps struct { 15 | MetricDimensions map[string]string `flag:"metricDimensions" desc:"CloudWatch metric dimensions as comma-separated key=value pairs"` 16 | } 17 | 18 | func ParseFlags(config interface{}) (*pflag.FlagSet, error) { 19 | flags, err := gpflag.Parse(config) 20 | if err != nil { 21 | return nil, fmt.Errorf("failed to parse flags: %w", err) 22 | } 23 | 24 | // Handle MetricDimensions map that gpflag doesn't support 25 | if _, hasField := reflect.TypeOf(config).Elem().FieldByName("MetricDimensions"); hasField { 26 | field := reflect.ValueOf(config).Elem().FieldByName("MetricDimensions") 27 | metricDims := field.Addr().Interface().(*map[string]string) 28 | flags.StringToStringVar(metricDims, "metricDimensions", nil, "CloudWatch metric dimensions as comma-separated key=value pairs") 29 | } 30 | 31 | flags.VisitAll(func(pf *pflag.Flag) { 32 | 
flag.CommandLine.Var(pf.Value, pf.Name, pf.Usage) 33 | }) 34 | 35 | return flags, nil 36 | } -------------------------------------------------------------------------------- /internal/deployers/eksapi/templates/cloudwatch_agent_infra.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: amazon-cloudwatch 5 | labels: 6 | name: amazon-cloudwatch 7 | 8 | --- 9 | apiVersion: v1 10 | kind: ServiceAccount 11 | metadata: 12 | name: cwagent 13 | namespace: amazon-cloudwatch 14 | 15 | --- 16 | # ClusterRole for cwagent 17 | apiVersion: rbac.authorization.k8s.io/v1 18 | kind: ClusterRole 19 | metadata: 20 | name: cwagent-role 21 | rules: 22 | - apiGroups: [""] 23 | resources: 24 | - nodes 25 | - nodes/proxy 26 | - services 27 | - endpoints 28 | - pods 29 | verbs: ["get", "list", "watch"] 30 | - apiGroups: ["extensions"] 31 | resources: 32 | - ingresses 33 | verbs: ["get", "list", "watch"] 34 | - nonResourceURLs: ["/metrics"] 35 | verbs: ["get"] 36 | 37 | --- 38 | # ClusterRoleBinding 39 | apiVersion: rbac.authorization.k8s.io/v1 40 | kind: ClusterRoleBinding 41 | metadata: 42 | name: cwagent-role-binding 43 | subjects: 44 | - kind: ServiceAccount 45 | name: cwagent 46 | namespace: amazon-cloudwatch 47 | roleRef: 48 | kind: ClusterRole 49 | name: cwagent-role 50 | apiGroup: rbac.authorization.k8s.io -------------------------------------------------------------------------------- /test/common/resources.go: -------------------------------------------------------------------------------- 1 | //go:build e2e 2 | 3 | package common 4 | 5 | import ( 6 | "context" 7 | "fmt" 8 | "log" 9 | "time" 10 | 11 | fwext "github.com/aws/aws-k8s-tester/internal/e2e" 12 | appsv1 "k8s.io/api/apps/v1" 13 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 14 | "sigs.k8s.io/e2e-framework/klient/wait" 15 | "sigs.k8s.io/e2e-framework/pkg/env" 16 | "sigs.k8s.io/e2e-framework/pkg/envconf" 17 | ) 18 | 19 | // 
// DeployDaemonSet returns a function to deploy and wait for a DaemonSet to be ready.
// NOTE(review): despite the name, the returned env.Func only WAITS (up to
// 5 minutes) for an existing DaemonSet named name in namespace to become
// ready; it does not create one — confirm callers apply the manifest first.
func DeployDaemonSet(name, namespace string) env.Func {
	return func(ctx context.Context, config *envconf.Config) (context.Context, error) {
		log.Printf("Waiting for %s daemonset to be ready.", name)
		// Only name/namespace identity is set here; the condition extension
		// presumably fetches live state from the cluster.
		daemonset := appsv1.DaemonSet{
			ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: namespace},
		}
		err := wait.For(
			fwext.NewConditionExtension(config.Client().Resources()).DaemonSetReady(&daemonset),
			wait.WithTimeout(5*time.Minute),
			wait.WithContext(ctx),
		)
		if err != nil {
			return ctx, fmt.Errorf("%s daemonset is not ready: %w", name, err)
		}
		log.Printf("%s daemonset is ready.", name)
		return ctx, nil
	}
}
patch.Bytes(), metav1.PatchOptions{}) 47 | return err 48 | } 49 | -------------------------------------------------------------------------------- /.github/workflows/update-go-dependencies.yaml: -------------------------------------------------------------------------------- 1 | name: "[CI] update-go-dependencies" 2 | on: 3 | workflow_dispatch: 4 | schedule: 5 | # once a week 6 | - cron: "0 0 * * 0" 7 | permissions: 8 | id-token: write 9 | contents: write 10 | pull-requests: write 11 | jobs: 12 | update-dependencies: 13 | runs-on: ubuntu-latest 14 | if: github.repository == 'aws/aws-k8s-tester' 15 | steps: 16 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2 17 | - uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # 5.5.0 18 | - run: | 19 | ./hack/update-go-dependencies.sh 20 | - uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # 7.0.8 21 | with: 22 | branch: update-go-dependencies 23 | base: main 24 | add-paths: | 25 | . 
26 | commit-message: "chore: update go dependencies" 27 | committer: "GitHub " 28 | author: "GitHub " 29 | title: "chore: update go dependencies" 30 | body: | 31 | Generated by: 32 | ``` 33 | ./hack/update-go-dependencies.sh 34 | ``` 35 | -------------------------------------------------------------------------------- /internal/deployers/eksctl/down.go: -------------------------------------------------------------------------------- 1 | package eksctl 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/aws/aws-k8s-tester/internal/util" 7 | "k8s.io/klog" 8 | ) 9 | 10 | func (d *deployer) Down() error { 11 | d.initClusterName() 12 | 13 | var err error 14 | 15 | if d.DeployTarget == "nodegroup" { 16 | klog.Infof("deleting nodegroup %s from cluster %s", d.NodegroupName, d.clusterName) 17 | err = util.ExecuteCommand("eksctl", "delete", "nodegroup", "--cluster", d.clusterName, "--name", d.NodegroupName, "--drain=false", "--wait") 18 | if err != nil { 19 | return fmt.Errorf("failed to delete nodegroup: %v", err) 20 | } 21 | klog.Infof("Successfully deleted nodegroup: %s from cluster: %s", d.NodegroupName, d.clusterName) 22 | } else if d.DeployTarget == "cluster" { 23 | klog.Infof("deleting cluster %s", d.clusterName) 24 | err = util.ExecuteCommand("eksctl", "delete", "cluster", "--name", d.clusterName, "--wait") 25 | if err != nil { 26 | return fmt.Errorf("failed to delete cluster: %v", err) 27 | } 28 | klog.Infof("Successfully deleted cluster: %s", d.clusterName) 29 | } else { 30 | return fmt.Errorf("Unsupported deploy target: %s, supported options: `cluster`, `nodegroup`.", d.DeployTarget) 31 | } 32 | return nil 33 | } 34 | -------------------------------------------------------------------------------- /internal/deployers/eksapi/auth_map_role_test.go: -------------------------------------------------------------------------------- 1 | package eksapi 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | ) 8 | 9 | const rolearn = "mock-role-arn" 10 | 
11 | const sessionNamedAuthMapRole = ` 12 | - username: system:node:{{SessionName}} 13 | groups: 14 | - system:bootstrappers 15 | - system:nodes 16 | rolearn: mock-role-arn` 17 | 18 | const privateDNSNamedAuthMapRole = ` 19 | - username: system:node:{{EC2PrivateDNSName}} 20 | groups: 21 | - system:bootstrappers 22 | - system:nodes 23 | rolearn: mock-role-arn` 24 | 25 | func Test_generateAuthRoleMap(t *testing.T) { 26 | cases := []struct { 27 | nodeNameStrategy string 28 | expected string 29 | }{ 30 | { 31 | nodeNameStrategy: "SessionName", 32 | expected: sessionNamedAuthMapRole, 33 | }, 34 | { 35 | nodeNameStrategy: "EC2PrivateDNSName", 36 | expected: privateDNSNamedAuthMapRole, 37 | }, 38 | } 39 | for _, c := range cases { 40 | t.Run(c.nodeNameStrategy, func(t *testing.T) { 41 | actual, err := generateAuthMapRole(c.nodeNameStrategy, rolearn) 42 | if err != nil { 43 | t.Log(err) 44 | t.Error(err) 45 | } 46 | assert.Equal(t, c.expected, actual) 47 | }) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_basic.sh: -------------------------------------------------------------------------------- 1 | # Trivial cuda tests to validate that GPU it functional 2 | # Use demu-suite binaries https://docs.nvidia.com/cuda/demo-suite/index.html 3 | # and DCGM Diagnostics https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/dcgm-diagnostics.html#run-levels-and-tests 4 | 5 | setup_suite() 6 | { 7 | source common.sh 8 | assert_gpu_unused 9 | DEMO_SUITE_DIR=${DEMO_SUITE_DIR:-$(realpath /usr/local/cuda/extras/demo_suite)} 10 | } 11 | 12 | teardown_suite() 13 | { 14 | assert_gpu_unused 15 | } 16 | 17 | test_01_device_query() 18 | { 19 | assert_status_code 0 "$DEMO_SUITE_DIR/deviceQuery" 20 | } 21 | 22 | test_02_vector_add() 23 | { 24 | assert_status_code 0 "$DEMO_SUITE_DIR/vectorAdd" 25 | } 26 | 27 | test_03_nvbandwidth() 28 | { 29 | assert_status_code 0 "$DEMO_SUITE_DIR/nvbandwidth" 30 | 
} 31 | 32 | test_04_dcgm_diagnostics() 33 | { 34 | # This test is not applicable for vGPU instance types. 35 | if is_vgpu; then 36 | skip "This test does not apply to vGPU instances (g6f.*, gr6f.*)" 37 | fi 38 | 39 | # https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/dcgm-diagnostics.html#run-levels-and-tests 40 | assert_status_code 0 "dcgmi diag -r 2" 41 | } 42 | -------------------------------------------------------------------------------- /test/cases/nvidia/manifests/nvidia-driver-capabilities-check.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: moderngl-pod 5 | spec: 6 | restartPolicy: Never 7 | tolerations: 8 | - key: "nvidia.com/gpu" 9 | operator: "Exists" 10 | effect: "NoSchedule" 11 | containers: 12 | - name: moderngl-container 13 | env: 14 | - name: NVIDIA_DRIVER_CAPABILITIES 15 | value: "all" 16 | image: public.ecr.aws/ubuntu/ubuntu:22.04 17 | command: ["/bin/bash"] 18 | args: 19 | - -c 20 | - | 21 | set -e 22 | apt-get update 23 | apt-get install -y \ 24 | python3 \ 25 | python3-pip \ 26 | libgl1-mesa-glx \ 27 | libegl1-mesa-dev \ 28 | libgles2-mesa-dev \ 29 | mesa-utils \ 30 | xvfb 31 | pip3 install moderngl 32 | sleep 60 33 | cat <<'EOF' > moderngl-script.py 34 | import moderngl 35 | moderngl.create_standalone_context(backend='egl') 36 | EOF 37 | python3 moderngl-script.py 38 | resources: 39 | requests: 40 | memory: "50Gi" 41 | cpu: "15" 42 | "nvidia.com/gpu": "1" 43 | limits: 44 | memory: "50Gi" 45 | "nvidia.com/gpu": "1" 46 | -------------------------------------------------------------------------------- /test/manifests/assets/k8s-neuron-device-plugin-rbac.yml: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/aws-neuron/aws-neuron-sdk/blob/master/src/k8/k8s-neuron-device-plugin-rbac.yml 2 | kind: ClusterRole 3 | apiVersion: rbac.authorization.k8s.io/v1 4 | metadata: 5 | name: 
// MPIJobSucceeded returns true if the specified k8s.Object is an unstructured.Unstructured
// with .status.conditions["Succeeded"] = "True"
//
// It panics if obj is not an *unstructured.Unstructured, or if any nested
// status field exists but has an unexpected type (schema mismatch is treated
// as a programming error in these tests).
func MPIJobSucceeded(obj k8s.Object) bool {
	u := obj.(*unstructured.Unstructured)
	conditions, found, err := unstructured.NestedSlice(u.Object, "status", "conditions")
	if err != nil {
		panic(fmt.Errorf("MPIJob does not match expected schema: %v", err))
	}
	if !found {
		// No conditions recorded yet: the job cannot have succeeded.
		return false
	}
	for _, condition := range conditions {
		c := condition.(map[string]interface{})
		cType, found, err := unstructured.NestedString(c, "type")
		if err != nil {
			panic(fmt.Errorf("MPIJob does not match expected schema: %v", err))
		}
		if !found {
			// Condition without a "type" key: skip it.
			continue
		}
		if cType == "Succeeded" {
			cStatus, found, err := unstructured.NestedString(c, "status")
			if err != nil {
				panic(fmt.Errorf("MPIJob does not match expected schema: %v", err))
			}
			if !found {
				// "Succeeded" condition without a status: keep scanning.
				continue
			}
			// The first "Succeeded" condition carrying a status decides the result.
			return cStatus == "True"
		}
	}
	return false
}
#!/usr/bin/env bash

# Downloads and extracts the kubernetes-client and kubernetes-test bundles
# for a given Kubernetes minor version (or "latest"), OS, and architecture.
# Falls back to the CI dev-release bucket when the release bucket fails.

set -o errexit
set -o nounset
# pipefail keeps curl/wget failures inside pipelines from being masked,
# matching the repo's other hack/ scripts.
set -o pipefail

BUNDLES=(
  "kubernetes-client"
  "kubernetes-test"
)

if [ "$#" -ne 3 ]; then
  echo >&2 "usage: $0 (KUBERNETES_MINOR_VERSION|latest) OS ARCH"
  exit 1
fi

if [ "$1" = "latest" ]; then
  RELEASE_MARKER="latest.txt"
else
  RELEASE_MARKER="latest-$1.txt"
fi

echo >&2 "Release marker: ${RELEASE_MARKER}"

OS="$2"
ARCH="$3"

# download_binaries BASE_PATH
# Resolves the version behind RELEASE_MARKER under BASE_PATH, records it in
# kubernetes-version.txt, then downloads and extracts each bundle.
function download_binaries() {
  local basePath=$1

  # Declare and assign separately so a curl failure is not masked by `local`
  # (ShellCheck SC2155).
  local KUBERNETES_VERSION
  KUBERNETES_VERSION=$(curl --silent "${basePath}/${RELEASE_MARKER}")

  echo "Kubernetes version: ${KUBERNETES_VERSION}"
  echo "${KUBERNETES_VERSION}" > kubernetes-version.txt

  for BUNDLE in "${BUNDLES[@]}"; do
    echo >&2 "Downloading bundle: ${BUNDLE}"
    local TARBALL="${BUNDLE}.tar.gz"
    if ! wget --quiet --output-document="${TARBALL}" "${basePath}/${KUBERNETES_VERSION}/${BUNDLE}-${OS}-${ARCH}.tar.gz"; then
      return 1
    fi
    tar xzf "${TARBALL}"
    rm "${TARBALL}"
  done
}

if ! download_binaries https://storage.googleapis.com/kubernetes-release/release; then
  echo >&2 "binary download failed from release bucket, falling back to ci dev release"
  download_binaries https://storage.googleapis.com/k8s-release-dev/ci
fi
// EC2Client is the subset of EC2 operations consumed by e2e tests.
type EC2Client interface {
	// DescribeInstanceType returns details for the named EC2 instance type.
	DescribeInstanceType(instanceType string) (ec2types.InstanceTypeInfo, error)
}

// ec2Client wraps the AWS SDK v2 EC2 client.
type ec2Client struct {
	client *ec2.Client
}

// NewEC2Client constructs an ec2Client using the shared AWS SDK
// configuration from the awssdk package.
func NewEC2Client() *ec2Client {
	return &ec2Client{
		client: ec2.NewFromConfig(awssdk.NewConfig()),
	}
}
37 | } 38 | return instanceTopologies, nil 39 | } 40 | 41 | func (c *ec2Client) DescribeInstanceType(instanceType string) (ec2types.InstanceTypeInfo, error) { 42 | describeResponse, err := c.client.DescribeInstanceTypes(context.TODO(), &ec2.DescribeInstanceTypesInput{ 43 | InstanceTypes: []ec2types.InstanceType{ec2types.InstanceType(instanceType)}, 44 | }) 45 | if err != nil { 46 | return ec2types.InstanceTypeInfo{}, fmt.Errorf("failed to describe instance type: %s: %v", instanceType, err) 47 | } else { 48 | return describeResponse.InstanceTypes[0], nil 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /hack/update-neuron-dependencies.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o nounset 4 | set -o errexit 5 | set -o pipefail 6 | 7 | # pip_versionsearch takes exactly 1 argument and returns its latest available version from the neuron pip repo 8 | # usage: pip_versionsearch PACKAGE 9 | pip_versionsearch() { 10 | local PACKAGE_INDEX_NAME=$(echo $1 | tr -s '_' '-') 11 | local PACKAGE_VERSION_NAME=$(echo $PACKAGE_INDEX_NAME | tr -s '-' '_') 12 | curl -s https://pip.repos.neuron.amazonaws.com/${PACKAGE_INDEX_NAME} | grep -o -G "${PACKAGE_VERSION_NAME}-[0-9\.]*+[a-f0-9]*" | sed "s/$PACKAGE_VERSION_NAME-//" | sort -V | tail -n 1 13 | } 14 | 15 | # versionsearch takes exactly 1 argument and returns its latest available version from the neuron amd64 apt repo 16 | # usage: versionsearch PACKAGE 17 | versionsearch() { 18 | local PACKAGE_NAME=$1 19 | curl -s https://apt.repos.neuron.amazonaws.com/dists/focal/main/binary-amd64/Packages | grep -o "${PACKAGE_NAME}_[0-9\.]*-*[a-f0-9]*" | sed "s/${PACKAGE_NAME}_//" | sort -V | tail -n 1 20 | } 21 | 22 | # update_arg ARG NEW_VALUE 23 | update_arg() { 24 | local ARG=$1 25 | local NEW_VALUE=$2 26 | echo "setting $ARG to $NEW_VALUE" 27 | find . 
-type f -name Dockerfile -exec sed -i "s/${ARG}=.*/${ARG}=$NEW_VALUE/g" {} + 28 | } 29 | 30 | update_arg NEURONX_RUNTIME_LIB_VERSION $(versionsearch aws-neuronx-runtime-lib) 31 | update_arg NEURONX_COLLECTIVES_LIB_VERSION $(versionsearch aws-neuronx-collectives) 32 | update_arg NEURONX_TOOLS_VERSION $(versionsearch aws-neuronx-tools) 33 | update_arg NEURONX_FRAMEWORK_VERSION $(pip_versionsearch torch-neuronx) 34 | update_arg NEURONX_CC_VERSION $(pip_versionsearch neuronx-cc) 35 | update_arg NEURONX_DISTRIBUTED_VERSION $(pip_versionsearch neuronx_distributed) -------------------------------------------------------------------------------- /test/cases/neuron-training/manifests/bert-training.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | labels: 5 | app: bert-training 6 | name: bert-training 7 | spec: 8 | completionMode: Indexed 9 | completions: {{.NodeCount}} 10 | parallelism: {{.NodeCount}} 11 | backoffLimit: 0 12 | template: 13 | spec: 14 | restartPolicy: Never 15 | containers: 16 | - image: {{.BertTrainingImage}} 17 | name: bert-training 18 | env: 19 | - name: MASTER_ADDR 20 | value: bert-training-0.training 21 | args: 22 | - sh 23 | - -c 24 | - | 25 | # Enable EFA https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-runtime/nrt-troubleshoot.html#fi-efa-fork-safe (AL2 legacy requirement) 26 | export FI_EFA_FORK_SAFE=1 27 | export CCOM_SOCKET_IFNAME=eth0 28 | export NCCL_DEBUG=ERROR 29 | torchrun --nproc_per_node {{.NeuronCorePerNode}} --nnodes {{.NodeCount}} --node_rank $JOB_COMPLETION_INDEX --master_addr $MASTER_ADDR train.py 30 | volumeMounts: 31 | - name: dshm 32 | mountPath: /dev/shm 33 | resources: 34 | requests: 35 | aws.amazon.com/neuron: {{.NeuronPerNode}} 36 | aws.amazon.com/neuroncore: {{.NeuronCorePerNode}} 37 | vpc.amazonaws.com/efa: {{.EFAPerNode}} 38 | limits: 39 | aws.amazon.com/neuron: {{.NeuronPerNode}} 40 | aws.amazon.com/neuroncore: 
package e2e

import (
	"context"
	"fmt"
	"strings"

	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
)

// KubeletIsResponsive returns true if the kubelet /healthz endpoint responds with a 200 status code, and propagates
// any non-connection specific errors.
//
// The probe goes through the API server's node proxy subresource, so the
// request path is: caller -> API server -> kubelet on nodeName. A (false, nil)
// result therefore means "the node was reachable but kubelet did not answer
// healthily", while a non-nil error means the health of kubelet could not be
// determined at all.
func KubeletIsResponsive(ctx context.Context, cfg *rest.Config, nodeName string) (bool, error) {
	client, err := kubernetes.NewForConfig(cfg)
	if err != nil {
		return false, fmt.Errorf("failed to initialize client set: %v", err)
	}

	// GET nodes/<nodeName>/proxy/healthz via the API server.
	nodeHealthResponse := client.CoreV1().RESTClient().Get().Resource("nodes").
		Name(nodeName).SubResource("proxy").Suffix("/healthz").
		Do(ctx)

	if nodeHealthResponse.Error() != nil {
		errMsg := nodeHealthResponse.Error().Error()
		// TODO: match errors against types, e.g. syscall.ECONNREFUSED instead, the k8s client doesn't
		// currently properly wrap the underlying error to allow this though
		//
		// String matching is fragile but is the only option until the client
		// wraps the transport error; keep these substrings in sync with the
		// messages produced by net/http and golang.org/x/net/http2.
		if strings.Contains(errMsg, "connection refused") ||
			strings.Contains(errMsg, "connection reset by peer") ||
			strings.Contains(errMsg, "http2: client connection lost") {
			// these errors indicate reachability to the node in general but an unstable connection to kubelet
			return false, nil
		}

		// propagate other errors, e.g. i/o timeout, that may result from things unrelated to kubelet health,
		// e.g. security group rules on the instance restricting traffic from the CP
		return false, fmt.Errorf("could not reach /healthz endpoint for node %s: %w", nodeName, nodeHealthResponse.Error())
	}

	// No transport error: report health purely from the HTTP status code.
	var statusCode int
	nodeHealthResponse.StatusCode(&statusCode)
	return statusCode == 200, nil
}
testenv = testenv.WithContext(ctx) 46 | 47 | testenv.Setup(func(ctx context.Context, config *envconf.Config) (context.Context, error) { 48 | log.Println("Starting workload test suite...") 49 | return ctx, nil 50 | }) 51 | 52 | os.Exit(testenv.Run(m)) 53 | } 54 | -------------------------------------------------------------------------------- /test/cases/nvidia/manifests/job-hpc-benchmarks.yaml: -------------------------------------------------------------------------------- 1 | kind: Job 2 | apiVersion: batch/v1 3 | metadata: 4 | name: hpc-benckmarks-job 5 | labels: 6 | app: hpc-benckmarks-job 7 | spec: 8 | completions: 1 9 | parallelism: 1 10 | template: 11 | metadata: 12 | labels: 13 | app: hpc-benckmarks-job 14 | spec: 15 | volumes: 16 | - name: dshm 17 | emptyDir: 18 | medium: Memory 19 | containers: 20 | - name: hpc-benchmarks 21 | image: "nvcr.io/nvidia/hpc-benchmarks:25.04" 22 | command: 23 | - mpirun 24 | - --allow-run-as-root 25 | - -np 26 | - "{{.GpuPerNode}}" 27 | - -bind-to 28 | - none 29 | - -x 30 | - NCCL_DEBUG=INFO 31 | - -x 32 | - HPL_FCT_COMM_POLICY=1 33 | - -x 34 | - HPL_USE_NVSHMEM=0 35 | # TODO: for arm it will be 36 | # - hpl-aarch64.sh 37 | - hpl.sh 38 | - --mem-affinity 39 | - 0:0:0:0:1:1:1:1 40 | # --cpu-affinity needs to be tuned depending on the number of CPUs 41 | # available on the instance type. 
42 | - --cpu-affinity 43 | - 0-13:14-27:28-41:42-55:56-69:70-83:84-97:98-111 44 | - --no-multinode 45 | - --dat 46 | - hpl-linux-x86_64/sample-dat/HPL-dgx-1N.dat 47 | # TODO: the path differs for arm64 48 | # - hpl-linux-aarch64-gpu/sample-dat/HPL-dgx-1N.dat 49 | volumeMounts: 50 | - mountPath: /dev/shm 51 | name: dshm 52 | imagePullPolicy: Always 53 | resources: 54 | limits: 55 | nvidia.com/gpu: {{.GpuPerNode}} 56 | env: 57 | - name: UCX_TLS 58 | value: "^sysv" 59 | restartPolicy: Never 60 | backoffLimit: 4 61 | -------------------------------------------------------------------------------- /internal/deployers/eksapi/ami_resolver_test.go: -------------------------------------------------------------------------------- 1 | //go:build integration 2 | 3 | package eksapi 4 | 5 | import ( 6 | "context" 7 | "testing" 8 | 9 | "github.com/aws/aws-sdk-go-v2/config" 10 | "github.com/stretchr/testify/assert" 11 | ) 12 | 13 | func TestAMIResolver(t *testing.T) { 14 | ctx := context.Background() 15 | awsCfg, err := config.LoadDefaultConfig(ctx) 16 | assert.NoError(t, err) 17 | 18 | amiResolver := NewAMIResolver(newAWSClients(awsCfg, "")) 19 | 20 | t.Run("AL2023-nvidia", func(t *testing.T) { 21 | opts := deployerOptions{ 22 | UserDataFormat: UserDataNodeadm, 23 | KubernetesVersion: "1.33", 24 | } 25 | t.Run("nvidia", func(t *testing.T) { 26 | opts := opts 27 | opts.InstanceTypes = []string{"g5.xlarge"} 28 | 29 | ami, err := amiResolver.Resolve(ctx, &opts) 30 | assert.NoError(t, err) 31 | assert.Regexp(t, "ami-.*", ami) 32 | }) 33 | t.Run("standard", func(t *testing.T) { 34 | opts := opts 35 | opts.InstanceTypes = []string{"m5.xlarge"} 36 | 37 | ami, err := amiResolver.Resolve(ctx, &opts) 38 | assert.NoError(t, err) 39 | assert.Regexp(t, "ami-.*", ami) 40 | }) 41 | }) 42 | 43 | t.Run("Bottlerocket", func(t *testing.T) { 44 | opts := deployerOptions{ 45 | UserDataFormat: UserDataBottlerocket, 46 | KubernetesVersion: "1.33", 47 | } 48 | t.Run("nvidia", func(t *testing.T) { 49 | 
opts := opts 50 | opts.InstanceTypes = []string{"g5.xlarge"} 51 | 52 | ami, err := amiResolver.Resolve(ctx, &opts) 53 | assert.NoError(t, err) 54 | assert.Regexp(t, "ami-.*", ami) 55 | }) 56 | t.Run("standard", func(t *testing.T) { 57 | opts := opts 58 | opts.InstanceTypes = []string{"m5.xlarge"} 59 | 60 | ami, err := amiResolver.Resolve(ctx, &opts) 61 | assert.NoError(t, err) 62 | assert.Regexp(t, "ami-.*", ami) 63 | }) 64 | }) 65 | } 66 | -------------------------------------------------------------------------------- /test/cases/nvidia/manifests/mpi-job-pytorch-training-single-node.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # container image from: https://github.com/aws/deep-learning-containers/blob/master/available_images.md 3 | apiVersion: kubeflow.org/v2beta1 4 | kind: MPIJob 5 | metadata: 6 | name: pytorch-training-single-node 7 | spec: 8 | slotsPerWorker: 4 9 | runPolicy: 10 | cleanPodPolicy: Running 11 | mpiImplementation: OpenMPI 12 | mpiReplicaSpecs: 13 | Launcher: 14 | replicas: 1 15 | template: 16 | spec: 17 | restartPolicy: OnFailure 18 | containers: 19 | - image: {{.PytorchTestImage}} 20 | name: gpu-test 21 | command: 22 | - mpirun 23 | - --allow-run-as-root 24 | - -np 25 | - "1" 26 | - -mca 27 | - btl_tcp_if_exclude 28 | - lo 29 | - -mca 30 | - pml 31 | - ob1 32 | - -mca 33 | - btl 34 | - ^openib 35 | - --bind-to 36 | - none 37 | - -map-by 38 | - slot 39 | - -x 40 | - LD_LIBRARY_PATH 41 | - -x 42 | - PATH 43 | - -x 44 | - NCCL_SOCKET_IFNAME=eth0 45 | - -x 46 | - NCCL_DEBUG=INFO 47 | - -x 48 | - MXNET_CUDNN_AUTOTUNE_DEFAULT=0 49 | - python 50 | - -c 51 | - import os; os.system("git clone https://github.com/pytorch/examples.git pytorch-examples"); os.system("git -C pytorch-examples checkout 0f0c9131ca5c79d1332dce1f4c06fe942fbdc665"); os.system("python pytorch-examples/mnist/main.py --epochs 1") 52 | resources: 53 | limits: 54 | nvidia.com/gpu: 1 55 | 
-------------------------------------------------------------------------------- /test/images/efa/scripts/unit-test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -eu 4 | 5 | get_instance_type() 6 | { 7 | 8 | local token=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null) 9 | 10 | if [ -n "$token" ]; then 11 | curl -H "X-aws-ec2-metadata-token: $token" http://169.254.169.254/latest/meta-data/instance-type 12 | else 13 | curl http://169.254.169.254/latest/meta-data/instance-type 14 | fi 15 | } 16 | 17 | get_expected_efa_device_count() 18 | { 19 | aws ec2 describe-instance-types --instance-type="$EC2_INSTANCE_TYPE" | jq -r '.InstanceTypes[].NetworkInfo.EfaInfo.MaximumEfaInterfaces' 20 | } 21 | 22 | EC2_INSTANCE_TYPE=${EC2_INSTANCE_TYPE:-$(get_instance_type)} 23 | EXPECTED_EFA_DEVICE_COUNT=${EXPECTED_EFA_DEVICE_COUNT:-$(get_expected_efa_device_count)} 24 | 25 | echo "Running test on a $EC2_INSTANCE_TYPE" 26 | 27 | fi_info -p efa 28 | DGRAM_ENDPOINT_COUNT=$(fi_info -p efa | grep 'type:\sFI_EP_DGRAM$' | wc -l) 29 | if ! test $EXPECTED_EFA_DEVICE_COUNT -le $DGRAM_ENDPOINT_COUNT; then 30 | echo "Expected at least $EXPECTED_EFA_DEVICE_COUNT DGRAM endpoint(s) but found $DGRAM_ENDPOINT_COUNT" 31 | exit 1 32 | else 33 | echo "Verified at least $EXPECTED_EFA_DEVICE_COUNT DGRAM endpoint(s) are available (found $DGRAM_ENDPOINT_COUNT)" 34 | fi 35 | 36 | RDM_ENDPOINT_COUNT=$(fi_info -p efa | grep 'type:\sFI_EP_RDM$' | wc -l) 37 | if ! 
test $EXPECTED_EFA_DEVICE_COUNT -le $RDM_ENDPOINT_COUNT; then 38 | echo "Expected at least $EXPECTED_EFA_DEVICE_COUNT RDM endpoint(s) but found $RDM_ENDPOINT_COUNT" 39 | exit 1 40 | else 41 | echo "Verified at least $EXPECTED_EFA_DEVICE_COUNT RDM endpoint(s) are available (found $RDM_ENDPOINT_COUNT)" 42 | fi 43 | 44 | 45 | echo "Running single-node efa test" 46 | 47 | # Run efa_test.sh, a utility added during the build while installing EFA 48 | efa_test.sh 49 | 50 | echo "Success!" -------------------------------------------------------------------------------- /internal/testers/ginkgov1/kubectl/kubectl.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2019 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
// APIServerURL obtains the URL of the k8s master from kubectl
//
// It chains three `kubectl config view -o jsonpath=...` invocations:
// current context -> cluster name for that context -> server URL for that
// cluster.
//
// NOTE(review): the quoting here is deliberate and fragile. The first two
// queries embed literal double quotes in the jsonpath template
// (`jsonpath=\"{...}\"`), so their output comes back wrapped in quotes; that
// quoted value is then interpolated into the next filter expression
// (`?(@.name == %s)`), where the carried quotes act as the string quoting the
// filter needs. The final query intentionally omits the quotes so the
// returned URL is unquoted. Confirm before "fixing" the asymmetry.
func APIServerURL() (string, error) {
	kubecontext, err := execAndResult(kubectl, "config", "view", "-o", "jsonpath=\"{.current-context}\"")
	if err != nil {
		return "", fmt.Errorf("Could not get kube context: %v", err)
	}

	// kubecontext still carries surrounding double quotes; they provide the
	// quoting for the == comparison below.
	clustername, err := execAndResult(kubectl, "config", "view", "-o",
		fmt.Sprintf("jsonpath=\"{.contexts[?(@.name == %s)].context.cluster}\"", kubecontext))
	if err != nil {
		return "", fmt.Errorf("Could not get cluster name: %v", err)
	}

	// No embedded quotes here: the server URL must come back bare.
	apiServerURL, err := execAndResult(kubectl, "config", "view", "-o",
		fmt.Sprintf("jsonpath={.clusters[?(@.name == %s)].cluster.server}", clustername))
	if err != nil {
		return "", err
	}
	return apiServerURL, nil
}
#!/bin/bash

# get_instance_type prints the EC2 instance type of this host.
# FORCE_INSTANCE_TYPE, when set, overrides metadata lookup entirely.
get_instance_type()
{
    # Honor an explicit override and STOP. The original version used
    # `[ -n "$FORCE_INSTANCE_TYPE" ] && echo $FORCE_INSTANCE_TYPE` with no
    # return, so it printed the override AND the IMDS value (two lines),
    # breaking any caller that captures the output. Also use ${...:-} so the
    # check is safe under `set -u` in sourcing scripts.
    if [ -n "${FORCE_INSTANCE_TYPE:-}" ]; then
        echo "$FORCE_INSTANCE_TYPE"
        return
    fi

    # Retrieve instance metadata: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html#instance-metadata-retrieval-examples
    local token=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null)

    # Prefer IMDSv2 (token-based); fall back to IMDSv1 if no token was issued.
    if [ -n "$token" ]; then
        curl -H "X-aws-ec2-metadata-token: $token" http://169.254.169.254/latest/meta-data/instance-type
    else
        curl http://169.254.169.254/latest/meta-data/instance-type
    fi
}

# assert_gpu_unused fails the test if any process currently holds a GPU.
assert_gpu_unused()
{
    cmd="nvidia-smi --query-compute-apps timestamp,gpu_bus_id,gpu_uuid,pid,name,used_memory --format csv,noheader"
    assert_equals "" "`$cmd`" "gpu is busy by other task, system misconfig?"
}

# _assert_data runs CMD, captures its output under $ACTUAL_RESULTS, and fails
# with a unified diff if the output differs from the EXPECTED data file.
_assert_data()
{
    local expected="$1"
    local cmd="$2"
    local message="${3:-}"
    local cmd_out="$ACTUAL_RESULTS/$(basename $expected)"
    [[ -z $message ]] || message="$message\n"

    eval "$cmd" > $cmd_out
    diff_cmd="diff -up $expected $cmd_out"
    diff_out="`$diff_cmd`"

    notify_trace_dbg "_assert_data $diff_cmd, out: $diff_out"
    if [ -n "$diff_out" ]
    then
        fail "$message test data value diff:\n$diff_out"
    fi
}

# assert_data EXPECTED_FILE CMD [MESSAGE] -- public wrapper for _assert_data.
assert_data() {
    _assert_data "$1" "$2" "$3"
}

# generate_data regenerates the EXPECTED data file from CMD's current output,
# then re-asserts to confirm the generated data round-trips.
generate_data()
{
    local expected="$1"
    local cmd="$2"
    local msg="$3"
    local cmd_out="$ACTUAL_RESULTS/$(basename $expected)"

    eval "$cmd" > $expected
    _assert_data "$expected" "$cmd" "$msg"
}

# is_vgpu returns success (0) only on instance families backed by vGPU
# (currently g6f/gr6f); all other types return 1.
function is_vgpu()
{
    local instance_type=${EC2_INSTANCE_TYPE:-$(get_instance_type)}
    case "${instance_type}" in
        g6f.*|gr6f.*) return ;;
        *) return 1 ;; # Not supported
    esac
}
2 | apiVersion: apps/v1 3 | kind: DaemonSet 4 | metadata: 5 | name: "dcgm-exporter" 6 | namespace: "kube-system" 7 | labels: 8 | app.kubernetes.io/name: "dcgm-exporter" 9 | app.kubernetes.io/version: "4.1.3" 10 | spec: 11 | updateStrategy: 12 | type: RollingUpdate 13 | selector: 14 | matchLabels: 15 | app.kubernetes.io/name: "dcgm-exporter" 16 | app.kubernetes.io/version: "4.1.3" 17 | template: 18 | metadata: 19 | labels: 20 | app.kubernetes.io/name: "dcgm-exporter" 21 | app.kubernetes.io/version: "4.1.3" 22 | name: "dcgm-exporter" 23 | spec: 24 | containers: 25 | - image: "nvcr.io/nvidia/k8s/dcgm-exporter:4.2.3-4.1.3-ubuntu22.04" 26 | env: 27 | - name: "DCGM_EXPORTER_LISTEN" 28 | value: ":9400" 29 | - name: "DCGM_EXPORTER_INTERVAL" 30 | value: "100" 31 | - name: "DCGM_EXPORTER_KUBERNETES" 32 | value: "true" 33 | name: "dcgm-exporter" 34 | ports: 35 | - name: "metrics" 36 | containerPort: 9400 37 | securityContext: 38 | runAsNonRoot: false 39 | runAsUser: 0 40 | capabilities: 41 | add: ["SYS_ADMIN"] 42 | volumeMounts: 43 | - name: "pod-gpu-resources" 44 | readOnly: true 45 | mountPath: "/var/lib/kubelet/pod-resources" 46 | volumes: 47 | - name: "pod-gpu-resources" 48 | hostPath: 49 | path: "/var/lib/kubelet/pod-resources" 50 | 51 | --- 52 | 53 | kind: Service 54 | apiVersion: v1 55 | metadata: 56 | name: "dcgm-exporter" 57 | namespace: "kube-system" 58 | labels: 59 | app.kubernetes.io/name: "dcgm-exporter" 60 | app.kubernetes.io/version: "4.1.3" 61 | spec: 62 | clusterIP: "None" 63 | selector: 64 | app.kubernetes.io/name: "dcgm-exporter" 65 | app.kubernetes.io/version: "4.1.3" 66 | ports: 67 | - name: "metrics" 68 | port: 9400 -------------------------------------------------------------------------------- /test/manifests/assets/efa-device-plugin.yaml: -------------------------------------------------------------------------------- 1 | # Source: https://raw.githubusercontent.com/aws-samples/aws-efa-eks/main/manifest/efa-k8s-device-plugin.yml 2 | apiVersion: apps/v1 
3 | kind: DaemonSet 4 | metadata: 5 | name: aws-efa-k8s-device-plugin-daemonset 6 | namespace: kube-system 7 | spec: 8 | selector: 9 | matchLabels: 10 | name: aws-efa-k8s-device-plugin 11 | updateStrategy: 12 | type: RollingUpdate 13 | template: 14 | metadata: 15 | labels: 16 | name: aws-efa-k8s-device-plugin 17 | spec: 18 | serviceAccount: default 19 | tolerations: 20 | - key: CriticalAddonsOnly 21 | operator: Exists 22 | - key: aws.amazon.com/efa 23 | operator: Exists 24 | effect: NoSchedule 25 | # Mark this pod as a critical add-on; when enabled, the critical add-on 26 | # scheduler reserves resources for critical add-on pods so that they can 27 | # be rescheduled after a failure. 28 | # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ 29 | priorityClassName: "system-node-critical" 30 | hostNetwork: true 31 | containers: 32 | - image: 602401143452.dkr.ecr.us-west-2.amazonaws.com/eks/aws-efa-k8s-device-plugin:v0.5.8 33 | name: aws-efa-k8s-device-plugin 34 | securityContext: 35 | allowPrivilegeEscalation: false 36 | capabilities: 37 | drop: ["ALL"] 38 | runAsNonRoot: false 39 | volumeMounts: 40 | - name: device-plugin 41 | mountPath: /var/lib/kubelet/device-plugins 42 | - name: infiniband-volume 43 | mountPath: /dev/infiniband 44 | resources: 45 | requests: 46 | cpu: 10m 47 | memory: 20Mi 48 | volumes: 49 | - name: device-plugin 50 | hostPath: 51 | path: /var/lib/kubelet/device-plugins 52 | - name: infiniband-volume 53 | hostPath: 54 | path: /dev/infiniband 55 | -------------------------------------------------------------------------------- /test/images/efa/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM public.ecr.aws/amazonlinux/amazonlinux:2023 2 | 3 | ARG EFA_BIN_PATH="/opt/amazon/efa/bin" 4 | 5 | RUN dnf -y swap gnupg2-minimal gnupg2 && \ 6 | dnf install -y \ 7 | gcc gcc-c++ make \ 8 | ca-certificates \ 9 | cmake \ 10 | emacs \ 11 | git \ 12 | jq \ 13 | 
wget \ 14 | unzip \ 15 | vim \ 16 | zlib-devel \ 17 | openssl \ 18 | openssl-devel \ 19 | sqlite-devel \ 20 | gdbm-devel \ 21 | glibc-devel \ 22 | bzip2-devel \ 23 | ncurses-devel \ 24 | tk-devel \ 25 | libffi-devel \ 26 | libcap-devel \ 27 | tar \ 28 | gnupg2 29 | 30 | ENV PATH="$PATH:$EFA_BIN_PATH" 31 | 32 | RUN cd $HOME \ 33 | && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \ 34 | && wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \ 35 | && cat aws-efa-installer.key | gpg --fingerprint \ 36 | && wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \ 37 | && tar -xf aws-efa-installer-latest.tar.gz \ 38 | && cd aws-efa-installer \ 39 | && ./efa_installer.sh -y -d --skip-kmod --skip-limit-conf --no-verify \ 40 | # TODO: remove this in favor of letting the efa installer add it if that ever becomes an option. 41 | # At the moment, this is only installed if omitting --no-verify, which would require 42 | # building in a context with EFA available 43 | && install -T -m 0755 efa_test.sh "${EFA_BIN_PATH}/efa_test.sh" \ 44 | && cd $HOME \ 45 | && rm -rf aws-efa-installer 46 | 47 | RUN dnf clean all 48 | 49 | RUN INSTALL_DIR=$(mktemp -d) && \ 50 | cd $INSTALL_DIR && \ 51 | curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" && \ 52 | unzip awscliv2.zip && \ 53 | ./aws/install && \ 54 | cd && \ 55 | rm -rf $INSTALL_DIR 56 | 57 | COPY test/images/efa/scripts ./scripts 58 | 59 | RUN chmod -R +x ./scripts -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/README.md: -------------------------------------------------------------------------------- 1 | # What 2 | 3 | gpu_unit_tests is the unit tests for gpu enabled platforms. 
The idea is to create a compact set of tests that covers the most performance-critical aspects of GPU platforms. The tests are designed to run on a single instance.
# Usage

```
# Run tests
./unit_test
```

**Generate test data for a new instance type**

Step 1: Copy the `gpu_unit_tests` folder to the EC2 instance where you want to generate the data.

Step 2: Execute the following command in the `gpu_unit_tests` directory on the EC2 instance:
```
GENERATE_DATA=1 ./unit_test
```
Step 3:
Copy the files from `tests/test_sysinfo.sh.data` (e.g., `tests/test_sysinfo.sh.data/p3.2xlarge`) to your local repository.

Step 4:
Create a PR with the new `tests/test_sysinfo.sh.data/xxx`

# Test list

- test_sysinfo.sh :: Validates basic system configuration by comparing it with the test config
  - test_numa_topo_topo :: check CPU/NUMA topology
  - test_nvidia_gpu_count :: fail if one of the GPUs is broken or not visible
  - test_nvidia_fabric_status :: fail if the fabric manager is not active
  - test_nvidia_smi_topo :: fail if the nvidia-smi topology differs
  - test_nvidia_persistence_status :: validate persistence state
  - test_nvidia_gpu_unused :: check that no other processes are using the GPUs; a failure signals system misconfiguration.

- 10_test_basic_cuda.sh :: Executes trivial CUDA binaries; fails if the CUDA subsystem is not healthy.
  Uses demo-suite binaries https://docs.nvidia.com/cuda/demo-suite/index.html and DCGM Diagnostics https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/dcgm-diagnostics.html#run-levels-and-tests
  If this test suite fails, it is a sign that the CUDA subsystem is not usable at all.
41 | Usually this is side effect of system misconfiguration (driver or fabric manager is not loaded) 42 | - test_01_device_query 43 | - test_02_vector_add 44 | - test_03_nvbandwidth 45 | - test_04_dcgm_diagnostics 46 | 47 | 48 | -------------------------------------------------------------------------------- /internal/deployers/eksapi/aws.go: -------------------------------------------------------------------------------- 1 | package eksapi 2 | 3 | import ( 4 | "github.com/aws/aws-sdk-go-v2/aws" 5 | "github.com/aws/aws-sdk-go-v2/service/autoscaling" 6 | "github.com/aws/aws-sdk-go-v2/service/cloudformation" 7 | "github.com/aws/aws-sdk-go-v2/service/ec2" 8 | "github.com/aws/aws-sdk-go-v2/service/eks" 9 | "github.com/aws/aws-sdk-go-v2/service/iam" 10 | "github.com/aws/aws-sdk-go-v2/service/s3" 11 | "github.com/aws/aws-sdk-go-v2/service/ssm" 12 | ) 13 | 14 | type awsClients struct { 15 | _eks *eks.Client 16 | _cfn *cloudformation.Client 17 | _ec2 *ec2.Client 18 | _asg *autoscaling.Client 19 | _ssm *ssm.Client 20 | _iam *iam.Client 21 | _s3 *s3.Client 22 | _s3Presign *s3.PresignClient 23 | } 24 | 25 | func newAWSClients(config aws.Config, eksEndpointURL string) *awsClients { 26 | clients := awsClients{ 27 | _cfn: cloudformation.NewFromConfig(config), 28 | _ec2: ec2.NewFromConfig(config), 29 | _asg: autoscaling.NewFromConfig(config), 30 | _ssm: ssm.NewFromConfig(config), 31 | _iam: iam.NewFromConfig(config), 32 | _s3: s3.NewFromConfig(config), 33 | } 34 | clients._s3Presign = s3.NewPresignClient(clients._s3) 35 | if eksEndpointURL != "" { 36 | clients._eks = eks.NewFromConfig(config, func(o *eks.Options) { 37 | o.BaseEndpoint = aws.String(eksEndpointURL) 38 | }) 39 | } else { 40 | clients._eks = eks.NewFromConfig(config) 41 | } 42 | return &clients 43 | } 44 | 45 | func (c *awsClients) EKS() *eks.Client { 46 | return c._eks 47 | } 48 | 49 | func (c *awsClients) CFN() *cloudformation.Client { 50 | return c._cfn 51 | } 52 | 53 | func (c *awsClients) EC2() *ec2.Client { 54 
| return c._ec2 55 | } 56 | 57 | func (c *awsClients) ASG() *autoscaling.Client { 58 | return c._asg 59 | } 60 | 61 | func (c *awsClients) SSM() *ssm.Client { 62 | return c._ssm 63 | } 64 | 65 | func (c *awsClients) IAM() *iam.Client { 66 | return c._iam 67 | } 68 | 69 | func (c *awsClients) S3() *s3.Client { 70 | return c._s3 71 | } 72 | 73 | func (c *awsClients) S3Presign() *s3.PresignClient { 74 | return c._s3Presign 75 | } 76 | -------------------------------------------------------------------------------- /internal/deployers/eksapi/kubeconfig.go: -------------------------------------------------------------------------------- 1 | package eksapi 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "os" 7 | "text/template" 8 | 9 | "k8s.io/klog" 10 | ) 11 | 12 | const kubeconfigPerm = 0666 13 | 14 | var kubeconfigTemplate = `--- 15 | apiVersion: v1 16 | kind: Config 17 | clusters: 18 | - cluster: 19 | certificate-authority-data: {{ .ClusterCertificateAuthority }} 20 | server: {{ .ClusterEndpoint }} 21 | name: {{ .ClusterARN }} 22 | contexts: 23 | - context: 24 | cluster: {{ .ClusterARN }} 25 | user: {{ .ClusterARN }} 26 | name: {{ .ClusterARN }} 27 | current-context: {{ .ClusterARN }} 28 | preferences: {} 29 | users: 30 | - name: {{ .ClusterARN }} 31 | user: 32 | exec: 33 | apiVersion: client.authentication.k8s.io/v1beta1 34 | command: aws 35 | args: 36 | - eks 37 | - get-token 38 | - --cluster-name 39 | - {{ .ClusterName }} 40 | ` 41 | 42 | type kubeconfigTemplateParameters struct { 43 | ClusterCertificateAuthority string 44 | ClusterARN string 45 | ClusterEndpoint string 46 | ClusterName string 47 | } 48 | 49 | func writeKubeconfig(cluster *Cluster, kubeconfigPath string) error { 50 | if cluster == nil { 51 | return fmt.Errorf("Cluster is nil, you might need set --static-cluster-name or set --up to initial cluster resrouces") 52 | } 53 | klog.Infof("writing kubeconfig to %s for cluster: %s", kubeconfigPath, cluster.arn) 54 | templateParams := 
kubeconfigTemplateParameters{ 55 | ClusterCertificateAuthority: cluster.certificateAuthorityData, 56 | ClusterARN: cluster.arn, 57 | ClusterEndpoint: cluster.endpoint, 58 | ClusterName: cluster.name, 59 | } 60 | 61 | kubeconfig := bytes.Buffer{} 62 | 63 | t, err := template.New("kubeconfig").Parse(kubeconfigTemplate) 64 | if err != nil { 65 | return err 66 | } 67 | err = t.Execute(&kubeconfig, templateParams) 68 | if err != nil { 69 | return err 70 | } 71 | 72 | err = os.WriteFile(kubeconfigPath, kubeconfig.Bytes(), kubeconfigPerm) 73 | if err != nil { 74 | return err 75 | } 76 | 77 | klog.Infof("wrote kubeconfig: %s\n%s", kubeconfigPath, kubeconfig.String()) 78 | return nil 79 | } 80 | -------------------------------------------------------------------------------- /internal/util/cloudformation.go: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "strings" 7 | 8 | "github.com/aws/aws-sdk-go-v2/aws" 9 | "github.com/aws/aws-sdk-go-v2/service/cloudformation" 10 | types "github.com/aws/aws-sdk-go-v2/service/cloudformation/types" 11 | ) 12 | 13 | // TODO: implement AWS client wrappers, and incorporate this into the cfn:CreateStack call 14 | func WrapCFNStackFailure(ctx context.Context, cfnClient *cloudformation.Client, createStackErr error, stackName string) error { 15 | if createStackErr == nil { 16 | return nil 17 | } 18 | resourceByFailureMode := make(map[string][]string) 19 | eventsPaginator := cloudformation.NewDescribeStackEventsPaginator(cfnClient, &cloudformation.DescribeStackEventsInput{ 20 | StackName: &stackName, 21 | }) 22 | for eventsPaginator.HasMorePages() { 23 | page, err := eventsPaginator.NextPage(ctx) 24 | if err != nil { 25 | return createStackErr 26 | } 27 | for _, event := range page.StackEvents { 28 | if event.ResourceStatus == types.ResourceStatusCreateFailed { 29 | if _, ok := resourceByFailureMode[aws.ToString(event.ResourceStatusReason)]; !ok { 30 | 
resourceByFailureMode[aws.ToString(event.ResourceStatusReason)] = []string{} 31 | } 32 | resourceByFailureMode[aws.ToString(event.ResourceStatusReason)] = append(resourceByFailureMode[aws.ToString(event.ResourceStatusReason)], aws.ToString(event.LogicalResourceId)) 33 | } 34 | } 35 | } 36 | nonCancellationFailure := len(resourceByFailureMode) > 1 37 | var enhancedDetails []string 38 | for reason, resources := range resourceByFailureMode { 39 | if nonCancellationFailure && reason == "Resource creation cancelled" { 40 | // Ignore resource cancellation errors if there's another failure reported, those failures 41 | // would just be a consequence of that failure. If all the failures are resource cancellation, 42 | // then there was likely a user initiated delete of the whole stack based on a timeout 43 | // waiting for one of the resources to create 44 | continue 45 | } 46 | enhancedDetails = append(enhancedDetails, fmt.Sprintf("%s: %s", strings.Join(resources, ","), reason)) 47 | } 48 | return fmt.Errorf("%w: %s", createStackErr, strings.Join(enhancedDetails, "--")) 49 | } 50 | -------------------------------------------------------------------------------- /test/manifests/assets/nvidia-device-plugin.yaml: -------------------------------------------------------------------------------- 1 | # Source: https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/main/deployments/static/nvidia-device-plugin.yml 2 | 3 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | apiVersion: apps/v1 18 | kind: DaemonSet 19 | metadata: 20 | name: nvidia-device-plugin-daemonset 21 | namespace: kube-system 22 | spec: 23 | selector: 24 | matchLabels: 25 | name: nvidia-device-plugin-ds 26 | updateStrategy: 27 | type: RollingUpdate 28 | template: 29 | metadata: 30 | labels: 31 | name: nvidia-device-plugin-ds 32 | spec: 33 | tolerations: 34 | - key: nvidia.com/gpu 35 | operator: Exists 36 | effect: NoSchedule 37 | # Mark this pod as a critical add-on; when enabled, the critical add-on 38 | # scheduler reserves resources for critical add-on pods so that they can 39 | # be rescheduled after a failure. 40 | # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ 41 | priorityClassName: "system-node-critical" 42 | containers: 43 | - image: nvcr.io/nvidia/k8s-device-plugin:v0.17.2 44 | name: nvidia-device-plugin-ctr 45 | env: 46 | - name: FAIL_ON_INIT_ERROR 47 | value: "false" 48 | securityContext: 49 | allowPrivilegeEscalation: false 50 | capabilities: 51 | drop: ["ALL"] 52 | volumeMounts: 53 | - name: device-plugin 54 | mountPath: /var/lib/kubelet/device-plugins 55 | volumes: 56 | - name: device-plugin 57 | hostPath: 58 | path: /var/lib/kubelet/device-plugins 59 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM public.ecr.aws/amazonlinux/amazonlinux:2023 AS builder 2 | ARG TARGETOS 3 | ARG TARGETARCH 4 | RUN dnf install -y git tar gzip make unzip gcc rsync wget jq 5 | ARG GO_MINOR_VERSION=1.25 6 | RUN curl https://go.dev/dl/?mode=json | jq -r .[].version | grep "^go${GO_MINOR_VERSION}" | head -n1 > go-version.txt 7 | RUN wget -O go.tar.gz https://go.dev/dl/$(cat go-version.txt).${TARGETOS}-${TARGETARCH}.tar.gz && \ 8 | rm -rf /usr/local/go && \ 9 | tar 
-C /usr/local -xzf go.tar.gz 10 | ENV GOPATH=/usr/local/go 11 | ENV PATH=$PATH:$GOPATH/bin 12 | ENV GOPROXY=direct 13 | 14 | WORKDIR $GOPATH/src/github.com/aws/aws-k8s-tester 15 | COPY . . 16 | RUN go install ./... 17 | RUN go test -c -tags=e2e ./test/... -o $GOPATH/bin/ 18 | 19 | RUN go install sigs.k8s.io/kubetest2 && \ 20 | go install sigs.k8s.io/kubetest2/kubetest2-tester-exec && \ 21 | go install sigs.k8s.io/kubetest2/kubetest2-tester-ginkgo 22 | 23 | FROM public.ecr.aws/amazonlinux/amazonlinux:2023 24 | ARG TARGETOS 25 | ARG TARGETARCH 26 | WORKDIR /workdir 27 | RUN dnf install -y tar gzip unzip wget openssh diffutils 28 | RUN wget -O awscli.zip https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip && \ 29 | unzip awscli.zip && \ 30 | ./aws/install 31 | # we need gsutil from the gcloud CLI for kubetest-tester-ginkgo 32 | RUN dnf install -y python3.13 33 | ARG GCLOUD_SDK_URL=https://dl.google.com/dl/cloudsdk/channels/rapid/google-cloud-sdk.tar.gz 34 | RUN wget -O google-cloud-sdk.tar.gz -q $GCLOUD_SDK_URL && \ 35 | tar xzf google-cloud-sdk.tar.gz -C / && \ 36 | rm google-cloud-sdk.tar.gz && \ 37 | /google-cloud-sdk/install.sh \ 38 | --disable-installation-options \ 39 | --bash-completion=false \ 40 | --path-update=false \ 41 | --usage-reporting=false 42 | ENV PATH=$PATH:/google-cloud-sdk/bin 43 | ARG EKSCTL_VERSION=latest 44 | RUN wget -O eksctl.tar.gz "https://github.com/eksctl-io/eksctl/releases/${EKSCTL_VERSION}/download/eksctl_Linux_${TARGETARCH}.tar.gz" && \ 45 | tar xzf eksctl.tar.gz -C /bin/ && \ 46 | rm eksctl.tar.gz 47 | ARG KUBERNETES_MINOR_VERSION 48 | COPY hack/download-kubernetes-binaries.sh . 
49 | RUN ./download-kubernetes-binaries.sh "${KUBERNETES_MINOR_VERSION}" "${TARGETOS}" "${TARGETARCH}" 50 | RUN mkdir /info 51 | ENV PATH=$PATH:/info 52 | RUN cp kubernetes-version.txt /info/ 53 | RUN mv kubernetes/*/bin/* /bin/ 54 | RUN rm -rf /workdir 55 | COPY --from=builder /usr/local/go/bin/* /bin/ 56 | -------------------------------------------------------------------------------- /test/cases/neuron/manifests/multi-node-test-neuron.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeflow.org/v2beta1 2 | kind: MPIJob 3 | metadata: 4 | name: multi-node-nccom-test 5 | spec: 6 | slotsPerWorker: {{.NeuronPerNode}} 7 | runPolicy: 8 | backoffLimit: 20 9 | cleanPodPolicy: Running 10 | mpiReplicaSpecs: 11 | Launcher: 12 | replicas: 1 13 | template: 14 | spec: 15 | restartPolicy: OnFailure 16 | containers: 17 | - image: {{.NeuronTestImage}} 18 | imagePullPolicy: Always 19 | name: nccom-test-launcher 20 | env: 21 | - name: POD_IP 22 | valueFrom: 23 | fieldRef: 24 | fieldPath: status.podIP 25 | command: 26 | - /bin/bash 27 | args: 28 | - -c 29 | - | 30 | WORKER_IPS=() 31 | for i in $(seq 0 $(({{.WorkerNodeCount}} - 1))); do 32 | WORKER_IP=$(getent hosts multi-node-nccom-test-worker-$i.multi-node-nccom-test | awk '{print $1}') 33 | WORKER_IPS+=("$WORKER_IP") 34 | done 35 | 36 | export CCOM_SOCKET_IFNAME=eth0 37 | export NEURON_RT_ROOT_COMM_ID=${WORKER_IPS[0]}:63182 38 | nccom-test -r $(({{.NeuronCorePerNode}}*{{.WorkerNodeCount}})) -N {{.WorkerNodeCount}} -b "8" -e "2G" -f "2" -n "5" -w "5" -d "fp32" allr --hosts ${WORKER_IPS[*]} --data-collector-host $POD_IP --data-collector-port 60006 --debug 39 | Worker: 40 | replicas: {{.WorkerNodeCount}} 41 | template: 42 | spec: 43 | securityContext: 44 | runAsUser: 1000 45 | runAsGroup: 2000 46 | fsGroup: 3000 47 | containers: 48 | - image: {{.NeuronTestImage}} 49 | name: nccom-test-worker 50 | command: ["/bin/bash"] 51 | args: ["-c", "echo password | sudo -S /usr/sbin/sshd 
-D"] 52 | imagePullPolicy: Always 53 | resources: 54 | limits: 55 | aws.amazon.com/neuron: {{.NeuronPerNode}} 56 | aws.amazon.com/neuroncore: {{.NeuronCorePerNode}} 57 | vpc.amazonaws.com/efa: {{.EfaInterfacePerNode}} 58 | requests: 59 | aws.amazon.com/neuron: {{.NeuronPerNode}} 60 | aws.amazon.com/neuroncore: {{.NeuronCorePerNode}} 61 | vpc.amazonaws.com/efa: {{.EfaInterfacePerNode}} -------------------------------------------------------------------------------- /test/cases/nvidia-training/manifests/bert-training.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeflow.org/v2beta1 2 | kind: MPIJob 3 | metadata: 4 | name: bert-training 5 | spec: 6 | slotsPerWorker: {{.SlotsPerWorker}} 7 | runPolicy: 8 | backoffLimit: 20 9 | cleanPodPolicy: Running 10 | mpiReplicaSpecs: 11 | Launcher: 12 | replicas: 1 13 | template: 14 | spec: 15 | restartPolicy: OnFailure 16 | containers: 17 | - image: {{.BertTrainingImage}} 18 | imagePullPolicy: Always 19 | name: bert-training 20 | env: 21 | - name: NCCL_DEBUG 22 | value: "TRACE" 23 | - name: MASTER_ADDR 24 | value: "bert-training" 25 | - name: MASTER_PORT 26 | value: "12355" 27 | command: 28 | - /opt/amazon/openmpi/bin/mpirun 29 | - --allow-run-as-root 30 | - --tag-output 31 | - -np 32 | - "{{.NP}}" # Number of processes derived from node/gpu calculations 33 | - -bind-to 34 | - none 35 | - -map-by 36 | - slot 37 | - -x 38 | - PATH 39 | - -x 40 | - LD_LIBRARY_PATH 41 | - -x 42 | - NCCL_DEBUG 43 | - -x 44 | - MASTER_ADDR 45 | - -x 46 | - MASTER_PORT 47 | - --mca 48 | - pml 49 | - "^cm" 50 | - --mca 51 | - routed 52 | - direct 53 | - --oversubscribe 54 | - --mca 55 | - orte_base_help_aggregate 56 | - "0" 57 | - python 58 | - train.py 59 | Worker: 60 | replicas: {{.WorkerReplicas}} 61 | template: 62 | spec: 63 | volumes: 64 | - name: dshm 65 | emptyDir: 66 | medium: Memory 67 | containers: 68 | - image: {{.BertTrainingImage}} 69 | imagePullPolicy: Always 70 | name: 
bert-training-worker 71 | volumeMounts: 72 | - mountPath: /dev/shm 73 | name: dshm 74 | resources: 75 | requests: 76 | nvidia.com/gpu: {{.GPUPerNode}} 77 | vpc.amazonaws.com/efa: {{.EFARequested}} 78 | limits: 79 | nvidia.com/gpu: {{.GPUPerNode}} 80 | vpc.amazonaws.com/efa: {{.EFARequested}} 81 | -------------------------------------------------------------------------------- /test/cases/nvidia/containerd_test.go: -------------------------------------------------------------------------------- 1 | //go:build e2e 2 | 3 | package nvidia 4 | 5 | import ( 6 | "context" 7 | "log" 8 | "testing" 9 | "time" 10 | 11 | "github.com/aws/aws-k8s-tester/internal/e2e" 12 | 13 | appsv1 "k8s.io/api/apps/v1" 14 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 15 | 16 | "sigs.k8s.io/e2e-framework/klient/wait" 17 | "sigs.k8s.io/e2e-framework/pkg/envconf" 18 | "sigs.k8s.io/e2e-framework/pkg/features" 19 | 20 | _ "embed" 21 | ) 22 | 23 | //go:embed manifests/daemonset-containerd-check.yaml 24 | var containerdCheckDS []byte 25 | 26 | func TestContainerdConfig(t *testing.T) { 27 | feat := features.New("containerd-config-check"). 28 | WithLabel("suite", "nvidia"). 29 | Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { 30 | log.Println("[Setup] Applying containerd-check DaemonSet manifest.") 31 | if err := e2e.ApplyManifests(cfg.Client().RESTConfig(), containerdCheckDS); err != nil { 32 | t.Fatalf("Failed to apply containerd-check DS: %v", err) 33 | } 34 | return ctx 35 | }). 
36 | Assess("DaemonSet becomes ready", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { 37 | dsName := "containerd-check" 38 | dsNS := "default" 39 | 40 | log.Println("[Assess] Waiting up to 1 minute for containerd-check DS to become Ready...") 41 | ds := &appsv1.DaemonSet{ 42 | ObjectMeta: metav1.ObjectMeta{ 43 | Name: dsName, 44 | Namespace: dsNS, 45 | }, 46 | } 47 | err := wait.For( 48 | e2e.NewConditionExtension(cfg.Client().Resources()).DaemonSetReady(ds), 49 | wait.WithTimeout(1*time.Minute), 50 | ) 51 | if err != nil { 52 | t.Logf("[Assess] containerd-check DS did not become Ready: %v", err) 53 | e2e.PrintDaemonSetPodLogs(t, ctx, cfg.Client().RESTConfig(), dsNS, "app=containerd-check") 54 | t.Fatalf("containerd-check DS not Ready within 1 minute") 55 | } 56 | 57 | log.Println("[Assess] containerd-check DS is Ready.") 58 | return ctx 59 | }). 60 | Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { 61 | t.Log("[Teardown] Removing containerd-check DS (no additional logs).") 62 | if err := e2e.DeleteManifests(cfg.Client().RESTConfig(), containerdCheckDS); err != nil { 63 | t.Fatalf("Failed to delete containerd-check DS: %v", err) 64 | } 65 | t.Log("[Teardown] containerd-check DS removed successfully.") 66 | return ctx 67 | }). 68 | Feature() 69 | 70 | testenv.Test(t, feat) 71 | } 72 | -------------------------------------------------------------------------------- /internal/e2e/logs.go: -------------------------------------------------------------------------------- 1 | package e2e 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "io" 7 | "testing" 8 | 9 | corev1 "k8s.io/api/core/v1" 10 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 11 | "k8s.io/client-go/kubernetes" 12 | "k8s.io/client-go/rest" 13 | ) 14 | 15 | // PrintDaemonSetPodLogs retrieves logs from each container in each pod of a DaemonSet. 16 | // namespace & labelSelector identify the DaemonSet's pods (e.g. 
"default", "app=containerd-check"). 17 | func PrintDaemonSetPodLogs( 18 | t *testing.T, 19 | ctx context.Context, 20 | restConfig *rest.Config, 21 | namespace string, 22 | labelSelector string, 23 | ) { 24 | clientset, err := kubernetes.NewForConfig(restConfig) 25 | if err != nil { 26 | t.Logf("failed to create typed clientset: %v", err) 27 | return 28 | } 29 | 30 | pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ 31 | LabelSelector: labelSelector, 32 | }) 33 | if err != nil { 34 | t.Logf("failed to list pods: %v", err) 35 | return 36 | } 37 | if len(pods.Items) == 0 { 38 | t.Logf("No pods found for DaemonSet with label %q in namespace %q.", labelSelector, namespace) 39 | return 40 | } 41 | 42 | for _, pod := range pods.Items { 43 | t.Logf("Pod %s status: %s", pod.Name, pod.Status.Phase) 44 | for _, container := range pod.Spec.Containers { 45 | logs, logErr := ReadPodLogs(ctx, restConfig, pod.Namespace, pod.Name, container.Name) 46 | if logErr != nil { 47 | t.Logf("Failed reading logs from %s/%s: %v", pod.Name, container.Name, logErr) 48 | } else { 49 | t.Logf("=== Logs from %s/%s ===\n%s", pod.Name, container.Name, logs) 50 | } 51 | } 52 | } 53 | } 54 | 55 | // ReadPodLogs streams logs for a specific container in a pod. 
56 | func ReadPodLogs( 57 | ctx context.Context, 58 | restConfig *rest.Config, 59 | namespace, podName, containerName string, 60 | ) (string, error) { 61 | clientset, err := kubernetes.NewForConfig(restConfig) 62 | if err != nil { 63 | return "", fmt.Errorf("failed to create typed clientset: %w", err) 64 | } 65 | req := clientset.CoreV1().Pods(namespace).GetLogs(podName, &corev1.PodLogOptions{ 66 | Container: containerName, 67 | }) 68 | stream, err := req.Stream(ctx) 69 | if err != nil { 70 | return "", fmt.Errorf("failed to open log stream for %s/%s: %w", podName, containerName, err) 71 | } 72 | defer stream.Close() 73 | 74 | data, err := io.ReadAll(stream) 75 | if err != nil { 76 | return "", fmt.Errorf("error reading logs: %w", err) 77 | } 78 | return string(data), nil 79 | } 80 | -------------------------------------------------------------------------------- /internal/deployers/eksapi/userdata.go: -------------------------------------------------------------------------------- 1 | package eksapi 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "strconv" 7 | "strings" 8 | "text/template" 9 | 10 | "github.com/aws/aws-k8s-tester/internal/deployers/eksapi/templates" 11 | ) 12 | 13 | const ( 14 | UserDataBootstrapSh = "bootstrap.sh" 15 | UserDataNodeadm = "nodeadm" 16 | UserDataBottlerocket = "bottlerocket" 17 | ) 18 | 19 | func generateUserData(cluster *Cluster, opts *deployerOptions) (string, bool, error) { 20 | userDataIsMimePart := true 21 | var t *template.Template 22 | switch opts.UserDataFormat { 23 | case UserDataBootstrapSh: 24 | t = templates.UserDataBootstrapSh 25 | case UserDataNodeadm: 26 | // TODO: replace the YAML template with proper usage of the nodeadm API go types 27 | t = templates.UserDataNodeadm 28 | case UserDataBottlerocket: 29 | t = templates.UserDataBottlerocket 30 | userDataIsMimePart = false 31 | default: 32 | return "", false, fmt.Errorf("unknown user data format: '%s'", opts.UserDataFormat) 33 | } 34 | 35 | kubeletFeatureGates := 
map[string]bool{} 36 | // DRA is in beta for 1.33, and so needs to be explicitly enabled. 37 | if opts.KubernetesVersion == "1.33" { 38 | kubeletFeatureGates["DynamicResourceAllocation"] = true 39 | } 40 | 41 | nodeadmFeatureGates, err := extractFeatureGates(opts.NodeadmFeatureGates) 42 | if err != nil { 43 | return "", false, err 44 | } 45 | 46 | var buf bytes.Buffer 47 | if err := t.Execute(&buf, templates.UserDataTemplateData{ 48 | APIServerEndpoint: cluster.endpoint, 49 | CertificateAuthority: cluster.certificateAuthorityData, 50 | CIDR: cluster.cidr, 51 | Name: cluster.name, 52 | KubeletFeatureGates: kubeletFeatureGates, 53 | NodeadmFeatureGates: nodeadmFeatureGates, 54 | }); err != nil { 55 | return "", false, err 56 | } 57 | return buf.String(), userDataIsMimePart, nil 58 | } 59 | 60 | func extractFeatureGates(featureGatePairs []string) (map[string]bool, error) { 61 | featureGateMap := make(map[string]bool) 62 | for _, keyValuePair := range featureGatePairs { 63 | components := strings.Split(keyValuePair, "=") 64 | if len(components) != 2 { 65 | return featureGateMap, fmt.Errorf("expected key=value pairs but %s has %d components", keyValuePair, len(components)) 66 | } 67 | boolValue, err := strconv.ParseBool(components[1]) 68 | if err != nil { 69 | return featureGateMap, fmt.Errorf("expected bool value in %s: %v", keyValuePair, err) 70 | } 71 | featureGateMap[components[0]] = boolValue 72 | } 73 | return featureGateMap, nil 74 | } 75 | -------------------------------------------------------------------------------- /test/cases/nvidia/manifests/mpi-job-nccl-test-multi-node.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeflow.org/v2beta1 2 | kind: MPIJob 3 | metadata: 4 | name: {{.JobName}} 5 | spec: 6 | slotsPerWorker: {{.GpuPerNode}} 7 | runPolicy: 8 | # it may take a bit for the workers to get ready (the container image is heavy) 9 | # and we don't want the launcher to reach it's CrashLoopBackoff 
limit in the meantime 10 | backoffLimit: 20 11 | cleanPodPolicy: Running 12 | mpiReplicaSpecs: 13 | Launcher: 14 | replicas: 1 15 | template: 16 | spec: 17 | restartPolicy: OnFailure 18 | containers: 19 | - image: {{.NvidiaTestImage}} 20 | imagePullPolicy: Always 21 | name: nccl-test-launcher 22 | env: 23 | command: 24 | - mpirun 25 | - --allow-run-as-root 26 | - --tag-output 27 | - -np 28 | - "{{.WorkerNodeGpuCount}}" 29 | - -bind-to 30 | - none 31 | - -map-by 32 | - slot 33 | - -x 34 | - PATH 35 | - -x 36 | - LD_LIBRARY_PATH 37 | - -x 38 | - NCCL_DEBUG=INFO 39 | - -x 40 | - NCCL_BUFFSIZE={{.NcclBuffSize}} 41 | - -x 42 | - NCCL_TUNER_PLUGIN=/opt/aws-ofi-nccl/install/lib/libnccl-ofi-tuner.so 43 | - --mca 44 | - pml 45 | - ^cm,ucx 46 | - --mca 47 | - btl 48 | - tcp,self 49 | - --mca 50 | - btl_tcp_if_exclude 51 | - lo,docker0,veth_def_agent 52 | - /opt/nccl-tests/build/{{.TestName}} 53 | - -b 54 | - "8" 55 | - -e 56 | - {{.MaxBytes}} 57 | - -f 58 | - "2" 59 | - -c 60 | - "1" 61 | - -n 62 | - "10" 63 | Worker: 64 | replicas: {{.WorkerNodeCount}} 65 | template: 66 | spec: 67 | volumes: 68 | - name: dshm 69 | emptyDir: 70 | medium: Memory 71 | containers: 72 | - image: {{.NvidiaTestImage}} 73 | imagePullPolicy: Always 74 | name: nccl-test-worker 75 | volumeMounts: 76 | - mountPath: /dev/shm 77 | name: dshm 78 | resources: 79 | requests: 80 | nvidia.com/gpu: {{.GpuPerNode}} 81 | vpc.amazonaws.com/efa: {{.EfaInterfacePerNode}} 82 | limits: 83 | nvidia.com/gpu: {{.GpuPerNode}} 84 | vpc.amazonaws.com/efa: {{.EfaInterfacePerNode}} 85 | -------------------------------------------------------------------------------- /test/cases/nvidia/capabilities_test.go: -------------------------------------------------------------------------------- 1 | //go:build e2e 2 | 3 | package nvidia 4 | 5 | import ( 6 | "context" 7 | "testing" 8 | "time" 9 | 10 | "github.com/aws/aws-k8s-tester/internal/e2e" 11 | 12 | v1 "k8s.io/api/core/v1" 13 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 14 | 
15 | "k8s.io/apimachinery/pkg/util/wait" 16 | e2ewait "sigs.k8s.io/e2e-framework/klient/wait" 17 | "sigs.k8s.io/e2e-framework/pkg/envconf" 18 | "sigs.k8s.io/e2e-framework/pkg/features" 19 | 20 | _ "embed" 21 | ) 22 | 23 | //go:embed manifests/nvidia-driver-capabilities-check.yaml 24 | var capabilitiesCheckPod []byte 25 | 26 | const ( 27 | PodName = "moderngl-pod" 28 | PodNamespace = "default" 29 | ) 30 | 31 | func TestNvidiaDriverCapabilities(t *testing.T) { 32 | feat := features.New("nvidia-driver-capabilities-check"). 33 | WithLabel("suite", "nvidia"). 34 | Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { 35 | t.Log("Applying nvidia driver capabilities check pod manifest.") 36 | // capabilitiesCheckPod only run moderngl.create_standalone_context() with NVIDIA_DRIVER_CAPABILITIES=all to load all capabilities enabled by nvidia driver. 37 | // If any lib required by any of nvidia driver capabilities is missing, it will failed with exception. 38 | if err := e2e.ApplyManifests(cfg.Client().RESTConfig(), capabilitiesCheckPod); err != nil { 39 | t.Fatalf("Failed to apply capabilities check pod manifest: %v", err) 40 | } 41 | return ctx 42 | }). 43 | Assess("Check Pod becomes ready", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { 44 | t.Log("Waiting up to 5 minute for pod to complete...") 45 | pod := &v1.Pod{ 46 | ObjectMeta: metav1.ObjectMeta{ 47 | Name: PodName, 48 | Namespace: PodNamespace, 49 | }, 50 | } 51 | err := e2ewait.For( 52 | e2e.NewConditionExtension(cfg.Client().Resources()).PodSucceeded(pod), 53 | e2ewait.WithTimeout(5*time.Minute), 54 | ) 55 | if err != nil { 56 | if err == wait.ErrWaitTimeout { 57 | t.Fatalf("nvidia capabilities check pod not in compeleted phase (succeeded or failed) within 5 minute and waiter timeout: %v", err) 58 | } 59 | t.Fatalf("nvidia capabilities pod in Failed status, ModernGL check failed. 
Could be caused by required library missing") 60 | } 61 | t.Log("nvidia driver capabilities check succeeded.") 62 | return ctx 63 | }). 64 | Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { 65 | t.Log("Removing nvidia driver capabilities check pod.") 66 | if err := e2e.DeleteManifests(cfg.Client().RESTConfig(), capabilitiesCheckPod); err != nil { 67 | t.Errorf("Failed to delete pod: %v", err) 68 | } 69 | t.Log("all test resources removed successfully.") 70 | return ctx 71 | }). 72 | Feature() 73 | 74 | testenv.Test(t, feat) 75 | } 76 | -------------------------------------------------------------------------------- /internal/metrics/cloudwatch.go: -------------------------------------------------------------------------------- 1 | package metrics 2 | 3 | import ( 4 | "context" 5 | "sync" 6 | "time" 7 | 8 | "github.com/aws/aws-sdk-go-v2/service/cloudwatch" 9 | "github.com/aws/aws-sdk-go-v2/service/cloudwatch/types" 10 | "github.com/aws/aws-sdk-go/aws" 11 | "k8s.io/klog" 12 | ) 13 | 14 | // NewCloudWatchRegistry creates a new metric registry that will emit values using the specified cloudwatch client 15 | func NewCloudWatchRegistry(cw *cloudwatch.Client) MetricRegistry { 16 | return &cloudwatchRegistry{ 17 | cw: cw, 18 | lock: &sync.Mutex{}, 19 | dataByNamespace: make(map[string][]*cloudwatchMetricDatum), 20 | } 21 | } 22 | 23 | type cloudwatchRegistry struct { 24 | cw *cloudwatch.Client 25 | lock *sync.Mutex 26 | dataByNamespace map[string][]*cloudwatchMetricDatum 27 | } 28 | 29 | type cloudwatchMetricDatum struct { 30 | spec *MetricSpec 31 | value float64 32 | dimensions map[string]string 33 | timestamp time.Time 34 | } 35 | 36 | func (r *cloudwatchRegistry) Record(spec *MetricSpec, value float64, dimensions map[string]string) { 37 | r.lock.Lock() 38 | defer r.lock.Unlock() 39 | r.dataByNamespace[spec.Namespace] = append(r.dataByNamespace[spec.Namespace], &cloudwatchMetricDatum{ 40 | spec: spec, 41 | value: value, 42 | 
dimensions: dimensions, 43 | timestamp: time.Now(), 44 | }) 45 | } 46 | 47 | func (r *cloudwatchRegistry) Emit() error { 48 | r.lock.Lock() 49 | defer r.lock.Unlock() 50 | for namespace, data := range r.dataByNamespace { 51 | for i := 0; i < len(data); { 52 | var metricData []types.MetricDatum 53 | // we can emit up to 1000 values per PutMetricData 54 | for j := 0; j < len(data) && j < 1000; j++ { 55 | datum := data[i] 56 | var dimensions []types.Dimension 57 | for key, val := range datum.dimensions { 58 | dimensions = append(dimensions, types.Dimension{ 59 | Name: aws.String(key), 60 | Value: aws.String(val), 61 | }) 62 | } 63 | metricData = append(metricData, types.MetricDatum{ 64 | MetricName: aws.String(datum.spec.Metric), 65 | Value: aws.Float64(datum.value), 66 | Dimensions: dimensions, 67 | Timestamp: &datum.timestamp, 68 | }) 69 | i++ 70 | } 71 | _, err := r.cw.PutMetricData(context.TODO(), &cloudwatch.PutMetricDataInput{ 72 | Namespace: aws.String(namespace), 73 | MetricData: metricData, 74 | }) 75 | if err != nil { 76 | return err 77 | } 78 | } 79 | klog.Infof("emitted %d metrics to namespace: %s", len(data), namespace) 80 | } 81 | r.dataByNamespace = make(map[string][]*cloudwatchMetricDatum) 82 | return nil 83 | } 84 | 85 | func (r *cloudwatchRegistry) GetRegistered() int { 86 | r.lock.Lock() 87 | defer r.lock.Unlock() 88 | registered := 0 89 | for _, data := range r.dataByNamespace { 90 | registered += len(data) 91 | } 92 | return registered 93 | } 94 | -------------------------------------------------------------------------------- /internal/deployers/eksapi/templates/templates.go: -------------------------------------------------------------------------------- 1 | package templates 2 | 3 | import ( 4 | _ "embed" 5 | "text/template" 6 | ) 7 | 8 | //go:embed infra.yaml 9 | var Infrastructure string 10 | 11 | //go:embed cloudwatch_agent_infra.yaml 12 | var CloudWatchAgentRbac []byte 13 | 14 | var ( 15 | //go:embed unmanaged-nodegroup.yaml.template 16 | 
unmanagedNodegroupTemplate string 17 | UnmanagedNodegroup = template.Must(template.New("unmanagedNodegroup").Parse(unmanagedNodegroupTemplate)) 18 | ) 19 | 20 | //go:embed cloudwatch-infra.yaml.template 21 | var CloudWatchInfra string 22 | 23 | type NetworkInterface struct { 24 | Description *string 25 | NetworkCardIndex *int 26 | DeviceIndex *int 27 | InterfaceType *string 28 | Groups []string 29 | SubnetId *string 30 | DeleteOnTermination *bool 31 | } 32 | 33 | type UnmanagedNodegroupTemplateData struct { 34 | NetworkInterfaces []NetworkInterface 35 | KubernetesVersion string 36 | InstanceTypes []string 37 | } 38 | 39 | type BusyboxDeploymentTemplateData struct { 40 | Nodes int 41 | } 42 | 43 | type NvidiaStaticClusterNodepoolTemplateData struct { 44 | Arch string 45 | InstanceTypes []string 46 | } 47 | 48 | var ( 49 | //go:embed userdata_bootstrap.sh.mimepart.template 50 | userDataBootstrapShTemplate string 51 | UserDataBootstrapSh = template.Must(template.New("userDataBootstrapSh").Parse(userDataBootstrapShTemplate)) 52 | 53 | //go:embed userdata_nodeadm.yaml.mimepart.template 54 | userDataNodeadmTemplate string 55 | UserDataNodeadm = template.Must(template.New("userDataNodeadm").Parse(userDataNodeadmTemplate)) 56 | 57 | //go:embed userdata_bottlerocket.toml.template 58 | userDataBottlerocketTemplate string 59 | UserDataBottlerocket = template.Must(template.New("userDataBottlerocket").Parse(userDataBottlerocketTemplate)) 60 | 61 | //go:embed busybox_deployment.yaml.template 62 | busyboxDeploymentTemplate string 63 | BusyboxDeployment = template.Must(template.New("busyboxDeployment").Parse(busyboxDeploymentTemplate)) 64 | 65 | //go:embed nvidia_static_cluster_nodepool.yaml.template 66 | nvidiaStaticClusterNodepoolTemplate string 67 | NvidiaStaticClusterNodepool = template.Must(template.New("nvidiaStaticClusterNodepool").Parse(nvidiaStaticClusterNodepoolTemplate)) 68 | ) 69 | 70 | type UserDataTemplateData struct { 71 | Name string 72 | CertificateAuthority 
string 73 | CIDR string 74 | APIServerEndpoint string 75 | KubeletFeatureGates map[string]bool 76 | NodeadmFeatureGates map[string]bool 77 | } 78 | 79 | var ( 80 | //go:embed auth_map_role.yaml.template 81 | authMapRoleTemplate string 82 | AuthMapRole = template.Must(template.New("authMapRole").Parse(authMapRoleTemplate)) 83 | ) 84 | 85 | type AuthMapRoleTemplateData struct { 86 | NodeNameStrategy string 87 | Rolearn string 88 | } 89 | -------------------------------------------------------------------------------- /test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh: -------------------------------------------------------------------------------- 1 | # Validate basic system configuration by comparing with expected config 2 | # 3 | setup_suite() 4 | { 5 | source common.sh 6 | 7 | EC2_INSTANCE_TYPE=${EC2_INSTANCE_TYPE:-$(get_instance_type)} 8 | data=test_sysinfo.sh.data/$EC2_INSTANCE_TYPE 9 | ACTUAL_RESULTS=`mktemp -t -d test_sysinfo.sh.actual-data.XXX` 10 | assert_not_equals "" "$ACTUAL_RESULTS" 11 | notify_trace_info "ACTUAL_RESULTS: $ACTUAL_RESULTS" 12 | 13 | if [ -n "$GENERATE_DATA" ] 14 | then 15 | echo "GENERATE_DATA is enabled..." 16 | mkdir -p $data 17 | function assert_data() { 18 | generate_data "$@" 19 | } 20 | fi 21 | } 22 | 23 | teardown_suite() 24 | { 25 | assert "test -z \"$GENERATE_DATA\"" "GENERATE_DATA was enabled, fail full suite" 26 | assert_gpu_unused 27 | } 28 | 29 | 30 | test_numa_topo_topo() 31 | { 32 | assert_data $data/numa_topo.txt "grep . 
/sys/devices/system/node/node*/{cpulist,distance}" "Unexpected cpu topology" 33 | } 34 | 35 | test_nvidia_gpu_count() 36 | { 37 | #Just for logging purposesclear 38 | assert_status_code 0 "nvidia-smi -q" 39 | assert_data $data/gpu_count.txt "nvidia-smi --query-gpu=name,index,pci.bus_id --format csv" "Unexpected gpu count" 40 | } 41 | 42 | 43 | test_nvidia_smi_topo() 44 | { 45 | assert_data $data/nvidia_smi_topo.txt "nvidia-smi topo -m | grep GPU | cut -f 1-11" \ 46 | "Unexpected gpu topology, likely broken nvlinks" 47 | } 48 | 49 | 50 | test_nvidia_persistence_status() 51 | { 52 | assert_data $data/nvidia_persistence_status.txt "nvidia-smi --query-gpu=name,pci.bus_id,persistence_mode --format=csv" \ 53 | "Unexpected perfistance status, likely system configuration issue" 54 | } 55 | 56 | test_nvidia_gpu_unused() 57 | { 58 | assert_gpu_unused 59 | } 60 | 61 | test_nvidia_gpu_throttled() 62 | { 63 | 64 | # vGPU instances don't support GPU clock throttling detection. 65 | # This test is not applicable for vGPU instance types. 66 | if is_vgpu; then 67 | skip "This test does not apply to vGPU instances (g6f.*, gr6f.*)" 68 | fi 69 | # https://docs.nvidia.com/deploy/nvml-api/group__nvmlClocksEventReasons.html#group__nvmlClocksEventReasons 70 | # The only bit allowed is nvmlClocksEventReasonGpuIdle 0x0000000000000001LL 71 | filter="egrep -v -e '(0x0000000000000000|0x0000000000000001|0x0000000000000004)'" 72 | cmd="nvidia-smi --query-gpu index,gpu_bus_id,gpu_uuid,clocks_throttle_reasons.active --format=csv,noheader" 73 | assert_status_code 1 "$cmd | $filter" "Throttled gpu detected" 74 | } 75 | 76 | 77 | test_nvidia_vgpu_license_status() 78 | { 79 | if ! 
is_vgpu; then 80 | skip "This test only applies to vGPU instances (g6f.*, gr6f.*)" 81 | fi 82 | 83 | assert_data $data/nvidia_vgpu_license_status.txt \ 84 | "nvidia-smi -q | grep 'vGPU Software' -A 2" \ 85 | "vGPU license status validation failed" 86 | } -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: "CI" 2 | on: 3 | pull_request: 4 | types: 5 | - opened 6 | - reopened 7 | - synchronize 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v3 13 | - run: go build ./... 14 | - run: go test ./... 15 | build-test: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v3 19 | - run: go test -c -tags=e2e ./test/... 20 | build-image: 21 | runs-on: ubuntu-latest 22 | steps: 23 | - uses: actions/checkout@v3 24 | - run: ./hack/free-disk-space.sh 25 | - run: docker build --build-arg=KUBERNETES_MINOR_VERSION=latest --file Dockerfile . 26 | build-image-efa: 27 | runs-on: ubuntu-latest 28 | steps: 29 | - uses: actions/checkout@v3 30 | - run: ./hack/free-disk-space.sh 31 | - run: docker build --file test/images/efa/Dockerfile . 32 | build-image-neuronx: 33 | runs-on: ubuntu-latest 34 | steps: 35 | - uses: actions/checkout@v3 36 | - run: ./hack/free-disk-space.sh 37 | - run: docker build --file test/images/neuron/Dockerfile . 38 | build-image-nvidia: 39 | runs-on: ubuntu-latest 40 | steps: 41 | - uses: actions/checkout@v3 42 | - run: ./hack/free-disk-space.sh 43 | - run: docker build --file test/images/nvidia/Dockerfile . 
44 | build-image-nvidia-training: 45 | runs-on: ubuntu-latest 46 | steps: 47 | - uses: actions/checkout@v3 48 | - run: ./hack/free-disk-space.sh 49 | - run: | 50 | docker build --file test/images/nvidia-training/Dockerfile test/images/nvidia-training \ 51 | --build-arg PYTORCH_BUILD_ENV="MAX_JOBS=$(($(nproc) - 2)) USE_MKLDNN=0 USE_DISTRIBUTED=0 USE_CUDA=0 USE_ROCM=0 USE_CAFFE2=0 USE_QNNPACK=0 USE_NNPACK=0 USE_XNNPACK=0 USE_MPS=0 BUILD_SHARED_LIBS=OFF USE_FLASH_ATTENTION=0 USE_MEM_EFF_ATTENTION=0 BUILD_TEST=0" 52 | build-image-nvidia-inference: 53 | runs-on: ubuntu-latest 54 | steps: 55 | - uses: actions/checkout@v3 56 | - run: ./hack/free-disk-space.sh 57 | - run: | 58 | docker build --file test/images/nvidia-inference/Dockerfile test/images/nvidia-inference \ 59 | --build-arg PYTORCH_BUILD_ENV="MAX_JOBS=$(($(nproc) - 2)) USE_MKLDNN=0 USE_DISTRIBUTED=0 USE_CUDA=0 USE_ROCM=0 USE_CAFFE2=0 USE_QNNPACK=0 USE_NNPACK=0 USE_XNNPACK=0 USE_MPS=0 BUILD_SHARED_LIBS=OFF USE_FLASH_ATTENTION=0 USE_MEM_EFF_ATTENTION=0 BUILD_TEST=0" 60 | build-image-neuron-training: 61 | runs-on: ubuntu-latest 62 | steps: 63 | - uses: actions/checkout@v3 64 | - run: ./hack/free-disk-space.sh 65 | - run: docker build --file test/images/neuron-training/Dockerfile test/images/neuron-training 66 | build-image-neuron-inference: 67 | runs-on: ubuntu-latest 68 | steps: 69 | - uses: actions/checkout@v3 70 | - run: ./hack/free-disk-space.sh 71 | - run: docker build --file test/images/neuron-inference/Dockerfile test/images/neuron-inference 72 | -------------------------------------------------------------------------------- /test/manifests/assets/cloudwatch-agent.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: prometheus-cwagentconfig 5 | namespace: amazon-cloudwatch 6 | data: 7 | cwagentconfig.json: | 8 | { 9 | "agent": { 10 | "debug": true 11 | }, 12 | "logs": { 13 | "metrics_collected": { 14 | 
"prometheus": { 15 | "prometheus_config_path": "/etc/prometheusconfig/prometheus.yaml", 16 | "emf_processor": { 17 | "metric_declaration": [ 18 | { 19 | "source_labels": ["job"], 20 | "label_matcher": "dcgm-exporter", 21 | "dimensions": [[{{.DimensionKeys}}]], 22 | "metric_selectors": [ 23 | "^DCGM_FI_DEV_GPU_UTIL$", 24 | "^DCGM_FI_DEV_MEM_COPY_UTIL$", 25 | "^DCGM_FI_DEV_FB_USED$", 26 | "^DCGM_FI_DEV_FB_FREE$", 27 | "^DCGM_FI_DEV_POWER_USAGE$" 28 | ] 29 | } 30 | ] 31 | } 32 | } 33 | }, 34 | "force_flush_interval": 5 35 | } 36 | } 37 | 38 | --- 39 | apiVersion: v1 40 | kind: ConfigMap 41 | metadata: 42 | name: prometheus-config 43 | namespace: amazon-cloudwatch 44 | data: 45 | prometheus.yaml: | 46 | global: 47 | scrape_interval: 1s 48 | scrape_timeout: 1s 49 | scrape_configs: 50 | - job_name: dcgm-exporter 51 | static_configs: 52 | - targets: 53 | - dcgm-exporter.kube-system.svc.cluster.local:9400 54 | metrics_path: /metrics 55 | metric_relabel_configs: 56 | {{- range $key, $value := .MetricDimensions}} 57 | - {action: replace, target_label: {{$key}}, replacement: '{{$value}}'} 58 | {{- end}} 59 | --- 60 | apiVersion: apps/v1 61 | kind: DaemonSet 62 | metadata: 63 | name: cwagent 64 | namespace: amazon-cloudwatch 65 | spec: 66 | selector: 67 | matchLabels: 68 | app: cwagent 69 | template: 70 | metadata: 71 | labels: 72 | app: cwagent 73 | spec: 74 | serviceAccountName: cwagent 75 | dnsPolicy: ClusterFirst 76 | containers: 77 | - name: cloudwatch-agent 78 | image: public.ecr.aws/cloudwatch-agent/cloudwatch-agent:latest 79 | imagePullPolicy: Always 80 | resources: 81 | limits: 82 | cpu: 1000m 83 | memory: 1000Mi 84 | requests: 85 | cpu: 200m 86 | memory: 200Mi 87 | volumeMounts: 88 | - name: prometheus-cwagentconfig 89 | mountPath: /etc/cwagentconfig 90 | - name: prometheus-config 91 | mountPath: /etc/prometheusconfig 92 | volumes: 93 | - name: prometheus-cwagentconfig 94 | configMap: 95 | name: prometheus-cwagentconfig 96 | - name: prometheus-config 97 | configMap: 
98 | name: prometheus-config 99 | terminationGracePeriodSeconds: 60 100 | --- -------------------------------------------------------------------------------- /test/manifests/assets/k8s-neuron-device-plugin.yml: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/aws-neuron/aws-neuron-sdk/blob/master/src/k8/k8s-neuron-device-plugin.yml 2 | apiVersion: apps/v1 3 | kind: DaemonSet 4 | metadata: 5 | name: neuron-device-plugin-daemonset 6 | namespace: kube-system 7 | spec: 8 | selector: 9 | matchLabels: 10 | name: neuron-device-plugin-ds 11 | updateStrategy: 12 | type: RollingUpdate 13 | template: 14 | metadata: 15 | # Uncomment the annotation below if k8s version is 1.13 or lower 16 | # annotations: 17 | # scheduler.alpha.kubernetes.io/critical-pod: "" 18 | labels: 19 | name: neuron-device-plugin-ds 20 | spec: 21 | serviceAccount: neuron-device-plugin 22 | tolerations: 23 | - key: CriticalAddonsOnly 24 | operator: Exists 25 | - key: aws.amazon.com/neuron 26 | operator: Exists 27 | effect: NoSchedule 28 | # Mark this pod as a critical add-on; when enabled, the critical add-on 29 | # scheduler reserves resources for critical add-on pods so that they can 30 | # be rescheduled after a failure. 
31 | # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ 32 | priorityClassName: "system-node-critical" 33 | affinity: 34 | nodeAffinity: 35 | requiredDuringSchedulingIgnoredDuringExecution: 36 | nodeSelectorTerms: 37 | - matchExpressions: 38 | - key: "node.kubernetes.io/instance-type" 39 | operator: In 40 | values: 41 | - inf1.xlarge 42 | - inf1.2xlarge 43 | - inf1.6xlarge 44 | - inf1.24xlarge 45 | - inf2.xlarge 46 | - inf2.8xlarge 47 | - inf2.24xlarge 48 | - inf2.48xlarge 49 | - trn1.2xlarge 50 | - trn1.32xlarge 51 | - trn1n.32xlarge 52 | - trn2.48xlarge 53 | - trn2u.48xlarge 54 | containers: 55 | # Find all neuron-device-plugin images at https://gallery.ecr.aws/neuron/neuron-device-plugin 56 | - image: public.ecr.aws/neuron/neuron-device-plugin:2.26.26.0 57 | imagePullPolicy: Always 58 | name: neuron-device-plugin 59 | env: 60 | - name: KUBECONFIG 61 | value: /etc/kubernetes/kubelet.conf 62 | - name: NODE_NAME 63 | valueFrom: 64 | fieldRef: 65 | fieldPath: spec.nodeName 66 | securityContext: 67 | allowPrivilegeEscalation: false 68 | capabilities: 69 | drop: ["ALL"] 70 | volumeMounts: 71 | - name: device-plugin 72 | mountPath: /var/lib/kubelet/device-plugins 73 | - name: infa-map 74 | mountPath: /run 75 | volumes: 76 | - name: device-plugin 77 | hostPath: 78 | path: /var/lib/kubelet/device-plugins 79 | - name: infa-map 80 | hostPath: 81 | path: /run 82 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /hack/update-image-tags.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o nounset 4 | set -o errexit 5 | set -o pipefail 6 | 7 | ECR_PUBLIC_REGISTRY="public.ecr.aws" 8 | EKS_CONTAINER_REGISTRY="602401143452.dkr.ecr.us-west-2.amazonaws.com" 9 | 10 | # get_ecr_image_tags 11 | # e.g. 
get_ecr_image_tags $ECR_PUBLIC_REGISTRY amazonlinux/amazonlinux
# get_ecr_image_tags REGISTRY REPOSITORY
# Prints the JSON array of tags for REPOSITORY on REGISTRY, or fails if the
# registry cannot be reached or the caller has no credentials.
# NOTE: deliberately does NOT `set -e` here — a `set -e` inside this function
# would re-enable errexit for the whole shell after the later `set +e`,
# defeating the best-effort section below.
get_ecr_image_tags() {
  local REGISTRY=$1
  local REPOSITORY=$2
  local TOKEN
  local AUTHORIZATION_TYPE

  # Get ECR public token if image is from a public registry, otherwise use a private token
  # An authorization token is required for every ECR HTTP request
  if [ "$REGISTRY" = "$ECR_PUBLIC_REGISTRY" ]; then
    TOKEN=$(aws ecr-public get-authorization-token --region us-east-1 --output=text --query 'authorizationData.authorizationToken')
    AUTHORIZATION_TYPE="Bearer"
  else
    TOKEN=$(aws ecr get-authorization-token --output text --query 'authorizationData[].authorizationToken')
    AUTHORIZATION_TYPE="Basic"
  fi

  curl -s -H "Authorization: ${AUTHORIZATION_TYPE} ${TOKEN}" "https://${REGISTRY}/v2/${REPOSITORY}/tags/list" | jq '.tags'
}

# update_image_uris REPOSITORY IMAGE_TAG
# Rewrites "image: REPOSITORY:<anything>" to "image: REPOSITORY:IMAGE_TAG" in
# every manifest under ./test/manifests.
update_image_uris() {
  local REPOSITORY=$1
  local NEW_TAG=$2
  # local so PREFIX does not leak into the caller's scope
  local PREFIX="image: ${REPOSITORY}"
  find ./test/manifests -type f -exec sed -i "s#$PREFIX:.*#$PREFIX:$NEW_TAG#g" {} +
}

# update the nvidia k8s device plugin
echo "Updating Nvidia device plugin image"
NVIDIA_DEVICE_PLUGIN_TAG=$(curl -s 'https://catalog.ngc.nvidia.com/api/containers/images?orgName=nvidia&name=k8s-device-plugin&isPublic=true' | jq -r '.images | sort_by(.updatedDate) | reverse | map(select(.tag | test("^v[0-9]+.[0-9]+.[0-9]+$"))) | first | .tag')
# quoted to avoid word splitting if the tag lookup unexpectedly returns spaces
update_image_uris nvcr.io/nvidia/k8s-device-plugin "$NVIDIA_DEVICE_PLUGIN_TAG"

# below updates require authentication and should not exit early with a failure.
# TODO: remove this once the aws credentials are setup and the paths are expected to succeed. 
46 | set +e 47 | 48 | # update the neuron k8s device plugin 49 | echo "Updating Neuron device plugin image" 50 | NEURON_DEVICE_PLUGIN_REPOSITORY_NAME="neuron/neuron-device-plugin" 51 | NEURON_DEVICE_PLUGIN_TAGS=$(get_ecr_image_tags $ECR_PUBLIC_REGISTRY $NEURON_DEVICE_PLUGIN_REPOSITORY_NAME) 52 | if [ $? -eq 0 ]; then 53 | LATEST_NEURON_DEVICE_PLUGIN_TAG=$(echo $NEURON_DEVICE_PLUGIN_TAGS | jq -r 'max_by(split(".") | map(tonumber))') 54 | update_image_uris "${ECR_PUBLIC_REGISTRY}/${NEURON_DEVICE_PLUGIN_REPOSITORY_NAME}" $LATEST_NEURON_DEVICE_PLUGIN_TAG 55 | fi 56 | 57 | # update the efa k8s device plugin 58 | echo "Updating EFA device plugin image" 59 | EFA_DEVICE_PLUGIN_REPOSITORY_NAME="eks/aws-efa-k8s-device-plugin" 60 | EFA_DEVICE_PLUGIN_TAGS=$(get_ecr_image_tags $EKS_CONTAINER_REGISTRY $EFA_DEVICE_PLUGIN_REPOSITORY_NAME) 61 | if [ $? -eq 0 ]; then 62 | LATEST_EFA_DEVICE_PLUGIN_TAG=$(echo $EFA_DEVICE_PLUGIN_TAGS | jq -r 'map(split("-") | .[0]) | max_by(sub("^v"; "") | split(".") | map(tonumber))') 63 | update_image_uris "${EKS_CONTAINER_REGISTRY}/${EFA_DEVICE_PLUGIN_REPOSITORY_NAME}" $LATEST_EFA_DEVICE_PLUGIN_TAG 64 | fi -------------------------------------------------------------------------------- /test/cases/quick/node_topology_test.go: -------------------------------------------------------------------------------- 1 | //go:build e2e 2 | 3 | package quick 4 | 5 | import ( 6 | "context" 7 | _ "embed" 8 | "strconv" 9 | "strings" 10 | "testing" 11 | 12 | "github.com/aws/aws-k8s-tester/internal/e2e" 13 | "github.com/aws/aws-sdk-go-v2/aws" 14 | v1 "k8s.io/api/core/v1" 15 | cloudprovider "k8s.io/cloud-provider-aws/pkg/providers/v1" 16 | "sigs.k8s.io/e2e-framework/pkg/envconf" 17 | "sigs.k8s.io/e2e-framework/pkg/features" 18 | ) 19 | 20 | func TestNodeTopology(t *testing.T) { 21 | topology := features.New("node-topology"). 22 | WithLabel("suite", "node-topology"). 
23 | Assess("Nodes have correct network topology labels", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { 24 | 25 | var nodes v1.NodeList 26 | cfg.Client().Resources().List(ctx, &nodes) 27 | 28 | if len(nodes.Items) == 0 { 29 | t.Fatal("no nodes found in the cluster") 30 | } 31 | 32 | nodeMap := make(map[string]v1.Node) 33 | var instanceIDs []string 34 | ec2Client := e2e.NewEC2Client() 35 | for _, node := range nodes.Items { 36 | providerIDParts := strings.Split(node.Spec.ProviderID, "/") 37 | instanceID := providerIDParts[len(providerIDParts)-1] 38 | instanceIDs = append(instanceIDs, instanceID) 39 | nodeMap[instanceID] = node 40 | } 41 | 42 | nodeTopologies, err := ec2Client.DescribeInstanceTopology(instanceIDs) 43 | if err != nil { 44 | t.Fatalf("could not describe instance topologies: %v", err) 45 | } 46 | 47 | t.Logf("checking instance topologies for %d node(s) (out of %d node(s) in the cluster)", len(nodeTopologies), len(instanceIDs)) 48 | 49 | for _, nodeTopology := range nodeTopologies { 50 | node := nodeMap[aws.ToString(nodeTopology.InstanceId)] 51 | instanceType := node.Labels["node.kubernetes.io/instance-type"] 52 | 53 | t.Logf("verifying instance topology for node %s (type: %s)", node.Name, instanceType) 54 | 55 | for i, networkNode := range nodeTopology.NetworkNodes { 56 | // https://github.com/kubernetes/cloud-provider-aws/blob/b47d2cf2a33ae655cd353ec42ea43362b804c397/pkg/providers/v1/well_known_labels.go#L26 57 | expectedLabel := cloudprovider.LabelNetworkNodePrefix + strconv.Itoa(i+1) 58 | if actualValue, ok := node.Labels[expectedLabel]; !ok { 59 | t.Errorf("node %s (type: %s) does not have expected network label %s", node.Name, instanceType, expectedLabel) 60 | } else if actualValue != networkNode { 61 | t.Errorf("node %s (type: %s) has incorrect value for label %s: expected %s, got %s", node.Name, instanceType, expectedLabel, networkNode, actualValue) 62 | } 63 | } 64 | 65 | // 
https://github.com/kubernetes/cloud-provider-aws/blob/b47d2cf2a33ae655cd353ec42ea43362b804c397/pkg/providers/v1/well_known_labels.go#L22C2-L22C13 66 | if aws.ToString(nodeTopology.ZoneId) != node.Labels[cloudprovider.LabelZoneID] { 67 | t.Logf("node %s (type: %s) has incorrect value for label %s: expected %s, got %s", node.Name, instanceType, cloudprovider.LabelZoneID, aws.ToString(nodeTopology.ZoneId), node.Labels[cloudprovider.LabelZoneID]) 68 | t.Fail() 69 | } 70 | } 71 | 72 | return ctx 73 | }).Feature() 74 | 75 | testenv.Test(t, topology) 76 | } 77 | -------------------------------------------------------------------------------- /test/cases/efa/commons.go: -------------------------------------------------------------------------------- 1 | //go:build e2e 2 | 3 | package efa 4 | 5 | import ( 6 | "context" 7 | _ "embed" 8 | "fmt" 9 | "log" 10 | 11 | "github.com/aws/aws-k8s-tester/internal/e2e" 12 | "github.com/aws/aws-sdk-go-v2/aws" 13 | corev1 "k8s.io/api/core/v1" 14 | v1 "k8s.io/api/core/v1" 15 | "k8s.io/client-go/kubernetes" 16 | "sigs.k8s.io/e2e-framework/pkg/env" 17 | "sigs.k8s.io/e2e-framework/pkg/envconf" 18 | 19 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 20 | ) 21 | 22 | var ( 23 | testenv env.Environment 24 | ec2Client e2e.EC2Client 25 | 26 | testImage *string 27 | 28 | pingPongSize *string 29 | pingPongIters *int 30 | pingPongDeadlineSeconds *int 31 | 32 | nodeType *string 33 | expectedEFADeviceCount *int 34 | 35 | verbose *bool 36 | ) 37 | 38 | const ( 39 | EFA_RESOURCE_NAME = "vpc.amazonaws.com/efa" 40 | TEST_NAMESPACE_NAME = "efa-tests" 41 | ) 42 | 43 | func getEfaCapacity(node corev1.Node) int { 44 | capacity, ok := node.Status.Capacity[v1.ResourceName(EFA_RESOURCE_NAME)] 45 | if !ok { 46 | return 0 47 | } 48 | return int(capacity.Value()) 49 | } 50 | 51 | func getEfaNodes(ctx context.Context, config *envconf.Config) ([]corev1.Node, error) { 52 | var efaNodes []corev1.Node 53 | clientset, err := 
kubernetes.NewForConfig(config.Client().RESTConfig()) 54 | if err != nil { 55 | return []corev1.Node{}, fmt.Errorf("failed to create Kubernetes client: %w", err) 56 | } 57 | 58 | nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) 59 | if err != nil { 60 | return []corev1.Node{}, fmt.Errorf("failed to list nodes: %w", err) 61 | } 62 | 63 | if len(nodes.Items) == 0 { 64 | return []corev1.Node{}, fmt.Errorf("no nodes found in the cluster") 65 | } 66 | 67 | for _, node := range nodes.Items { 68 | instanceType := node.Labels["node.kubernetes.io/instance-type"] 69 | 70 | if aws.ToString(nodeType) != "" && instanceType != aws.ToString(nodeType) { 71 | log.Printf("[INFO] Skipping node %s (type: %s), node is not of target type %s", node.Name, instanceType, aws.ToString(nodeType)) 72 | continue 73 | } 74 | 75 | numEfaDevices, err := e2e.GetNonZeroResourceCapacity(&node, EFA_RESOURCE_NAME) 76 | if err != nil { 77 | log.Printf("[INFO] Skipping node %s (type: %s): %v", node.Name, instanceType, err) 78 | continue 79 | } 80 | 81 | expectedDeviceCount := aws.ToInt(expectedEFADeviceCount) 82 | if expectedDeviceCount < 0 { 83 | instanceInfo, err := ec2Client.DescribeInstanceType(instanceType) 84 | if err != nil { 85 | return []corev1.Node{}, err 86 | } 87 | expectedDeviceCount = int(aws.ToInt32(instanceInfo.NetworkInfo.EfaInfo.MaximumEfaInterfaces)) 88 | } 89 | 90 | if expectedDeviceCount != numEfaDevices { 91 | return []corev1.Node{}, fmt.Errorf("unexpected EFA device capacity on node %s: expected %d, got %d", node.Name, expectedDeviceCount, numEfaDevices) 92 | } 93 | 94 | efaNodes = append(efaNodes, node) 95 | } 96 | 97 | if len(efaNodes) == 0 { 98 | return []corev1.Node{}, fmt.Errorf("no nodes with EFA capacity found in the cluster") 99 | } 100 | 101 | return efaNodes, nil 102 | } 103 | -------------------------------------------------------------------------------- /internal/deployers/eksapi/addons.go: 
-------------------------------------------------------------------------------- 1 | package eksapi 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "strings" 7 | "time" 8 | 9 | "github.com/aws/aws-sdk-go-v2/aws" 10 | "github.com/aws/aws-sdk-go-v2/service/eks" 11 | "k8s.io/klog/v2" 12 | ) 13 | 14 | const ( 15 | addonCreationTimeout = 5 * time.Minute 16 | ) 17 | 18 | type AddonManager struct { 19 | clients *awsClients 20 | } 21 | 22 | func NewAddonManager(clients *awsClients) *AddonManager { 23 | return &AddonManager{ 24 | clients: clients, 25 | } 26 | } 27 | 28 | func (m *AddonManager) createAddons(infra *Infrastructure, cluster *Cluster, opts *deployerOptions) error { 29 | ctx := context.TODO() 30 | 31 | addonMap := map[string]string{} 32 | for _, addon := range opts.Addons { 33 | addonParts := strings.Split(addon, ":") 34 | if len(addonParts) != 2 { 35 | return fmt.Errorf("invalid addon format: %s", addon) 36 | } 37 | name := addonParts[0] 38 | version := addonParts[1] 39 | klog.Infof("resolving addon %s version: %s", name, version) 40 | resolvedVersion, err := m.resolveAddonVersion(name, version, opts.KubernetesVersion) 41 | if err != nil { 42 | return err 43 | } 44 | // dedupe addons with the same name. last provided entry wins. 45 | addonMap[name] = resolvedVersion 46 | } 47 | 48 | for addonName, addonVersion := range addonMap { 49 | klog.Infof("creating addon %s version: %s", addonName, addonVersion) 50 | input := eks.CreateAddonInput{ 51 | AddonName: aws.String(addonName), 52 | AddonVersion: aws.String(addonVersion), 53 | ClusterName: aws.String(cluster.name), 54 | } 55 | _, err := m.clients.EKS().CreateAddon(ctx, &input) 56 | if err != nil { 57 | return fmt.Errorf("failed to create addon: %v", err) 58 | } 59 | klog.Infof("waiting for addon to be active: %s", addonName) 60 | err = eks.NewAddonActiveWaiter(m.clients.EKS()). 
61 | Wait(ctx, &eks.DescribeAddonInput{ 62 | AddonName: aws.String(addonName), 63 | ClusterName: aws.String(cluster.name), 64 | }, addonCreationTimeout) 65 | if err != nil { 66 | return fmt.Errorf("failed to wait for addon to be active: %v", err) 67 | } 68 | } 69 | 70 | return nil 71 | } 72 | 73 | func (m *AddonManager) resolveAddonVersion(name string, versionMarker string, kubernetesVersion string) (string, error) { 74 | input := eks.DescribeAddonVersionsInput{ 75 | AddonName: aws.String(name), 76 | KubernetesVersion: aws.String(kubernetesVersion), 77 | } 78 | descOutput, err := m.clients.EKS().DescribeAddonVersions(context.TODO(), &input) 79 | if err != nil { 80 | return "", err 81 | } 82 | for _, addon := range descOutput.Addons { 83 | for _, versionInfo := range addon.AddonVersions { 84 | switch versionMarker { 85 | case "latest": 86 | return *versionInfo.AddonVersion, nil 87 | case "default": 88 | for _, compatibility := range versionInfo.Compatibilities { 89 | if compatibility.DefaultVersion { 90 | return *versionInfo.AddonVersion, nil 91 | } 92 | } 93 | default: 94 | if *versionInfo.AddonVersion == versionMarker { 95 | return *versionInfo.AddonVersion, nil 96 | } 97 | } 98 | } 99 | } 100 | return "", fmt.Errorf("failed to resolve addon version: %s=%s", name, versionMarker) 101 | } 102 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tools for testing Kubernetes on AWS 2 | 3 | ## Installation 4 | 5 | This project will use rolling releases going forward; we recommend fetching the latest commit: 6 | ``` 7 | go install github.com/aws/aws-k8s-tester/...@HEAD 8 | ``` 9 | 10 | You'll need the standard `kubetest` tools as well: 11 | ``` 12 | go install sigs.k8s.io/kubetest2/...@latest 13 | ``` 14 | 15 | ## `kubetest2` deployers and testers for EKS 16 | 17 | 18 | ### Usage 19 | 20 | **Auto-detect cluster version** 21 | 22 | The 
deployers will search for a file called `kubernetes-version.txt` on your `PATH`. 23 | This file should contain a valid tag for a Kubernetes release. 24 | The `--kubernetes-version` flag can be omitted if this file exists. 25 | 26 | --- 27 | 28 | ### `eksctl` deployer 29 | 30 | This deployer is a thin wrapper around `eksctl`. 31 | 32 | The simplest usage is: 33 | ``` 34 | kubetest2 \ 35 | eksctl \ 36 | --kubernetes-version=X.XX \ 37 | --up \ 38 | --down \ 39 | --test=exec \ 40 | -- echo "Hello world" 41 | ``` 42 | 43 | **Additional flags** 44 | 45 | - `--instance-types` - comma-separated list of instance types to use for nodes 46 | - `--ami` - AMI ID for nodes 47 | - `--nodes` - number of nodes 48 | - `--region` - AWS region 49 | - `--config-file` - Path to eksctl config file (**if provided, other flags are ignored**) 50 | - `--availability-zones` - Node availability zones 51 | - `--ami-family` - AMI family to use: `AmazonLinux2023` | `Bottlerocket` 52 | - `--efa-enabled` - Enable Elastic Fabric Adapter for the nodegroup 53 | - `--volume-size` - Size of the node root volume in GB 54 | - `--private-networking` - Use private networking for nodes 55 | - `--with-oidc` - Enable OIDC provider for IAM roles for service accounts 56 | - `--deploy-target` - The target to deploy: `cluster` | `nodegroup` (defaults to `cluster`) 57 | - `--cluster-name` - Name of the EKS cluster (defaults to RunID if not specified) 58 | - `--unmanaged-nodegroup` - Use unmanaged nodegroup instead of managed nodegroup 59 | - `--nodegroup-name` - Name of the nodegroup (defaults to `ng-1`) 60 | 61 | --- 62 | 63 | ### `eksapi` deployer 64 | 65 | This deployer calls the EKS API directly, instead of using CloudFormation for EKS resources. 
66 | 67 | The simplest usage is: 68 | ``` 69 | kubetest2 \ 70 | eksapi \ 71 | --kubernetes-version=X.XX \ 72 | --up \ 73 | --down \ 74 | --test=exec \ 75 | -- echo "Hello world" 76 | ``` 77 | 78 | **Additional flags** 79 | 80 | - `--instance-types` - comma-separated list of instance types to use for nodes 81 | - `--ami` - AMI ID for nodes 82 | - `--nodes` - number of nodes 83 | - `--region` - AWS region 84 | - `--endpoint-url` - Override the EKS endpoint URL 85 | - `--cluster-role-service-principal` - Additional service principal that can assume the cluster IAM role. 86 | 87 | --- 88 | 89 | ### `multi` tester 90 | 91 | This tester wraps multiple executions of other testers. 92 | 93 | Tester argument groups are separated by `--`, with the first group being passed to the `multi` tester itself. 94 | 95 | The first positional argument of each subsequent group should be the name of a tester. 96 | 97 | ``` 98 | kubetest2 \ 99 | noop \ 100 | --test=multi \ 101 | -- \ 102 | --fail-fast=true \ 103 | -- \ 104 | ginkgo \ 105 | --focus-regex='\[Conformance\]' \ 106 | --parallel=4 \ 107 | -- \ 108 | exec \ 109 | go test ./my/test/package 110 | ``` 111 | -------------------------------------------------------------------------------- /test/images/nvidia-inference/Dockerfile: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Base image, arguments, and environment 3 | ############################################################################### 4 | ARG CUDA_MAJOR_VERSION=12 5 | ARG CUDA_MINOR_VERSION=8 6 | 7 | FROM nvidia/cuda:$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION.0-devel-ubuntu22.04 8 | 9 | ARG CUDA_MAJOR_VERSION 10 | ARG CUDA_MINOR_VERSION 11 | 12 | # Disable interactive prompts 13 | ENV DEBIAN_FRONTEND=noninteractive 14 | 15 | ############################################################################### 16 | # System packages 17 | 
###############################################################################
# NOTE: apt-get is used instead of apt; apt warns that it does not have a
# stable CLI interface when used in scripts. The apt cache is removed in the
# same layer to keep the image small.
RUN apt-get update \
    && apt-get upgrade -y \
    && apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    cmake \
    curl \
    emacs \
    git \
    jq \
    libopencv-dev \
    software-properties-common \
    wget \
    unzip \
    vim \
    pkg-config \
    gdb \
    lcov \
    libbz2-dev \
    zlib1g-dev \
    openssl \
    libssl-dev \
    libsqlite3-dev \
    libgdbm-dev \
    libc6-dev \
    libncurses-dev \
    tk-dev \
    libffi-dev \
    libcap-dev \
    gnupg2 \
    gpg-agent \
    && rm -rf /var/lib/apt/lists/*
# (fix: libbz2-dev was previously listed twice in the package list)

###############################################################################
# Build and install Python from source
###############################################################################
ARG PYTHON=python3.10
ARG PYTHON_VERSION=3.10.12

RUN curl -sL https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz | tar xvz -C /tmp \
    && cd /tmp/Python-$PYTHON_VERSION \
    && ./configure --enable-shared --prefix=/usr/local \
    && make -j$(nproc) \
    && make install \
    && cd && rm -rf /tmp/Python-$PYTHON_VERSION

RUN ln -s /usr/local/bin/pip3 /usr/bin/pip \
    && ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
    && pip3 --no-cache-dir install --upgrade pip setuptools

###############################################################################
# Install Pytorch from Source
###############################################################################
ARG PYTORCH_BRANCH=v2.6.0
ARG PYTORCH_BUILD_ENV="MAX_JOBS=8 BUILD_TEST=0"

# envs needed to make the path of NVCC known to the compilation
ENV CUDA_HOME=/usr/local/cuda
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64
ENV 
PATH=$PATH:$CUDA_HOME/bin 79 | ENV TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;8.7;8.9;9.0;10.0;12.0" 80 | 81 | RUN pip3 install typing-extensions sympy pyyaml 82 | RUN git clone https://github.com/pytorch/pytorch.git /tmp/pytorch \ 83 | --recursive \ 84 | --branch $PYTORCH_BRANCH \ 85 | && cd /tmp/pytorch \ 86 | && eval "$PYTORCH_BUILD_ENV python3 setup.py install" \ 87 | && cd && rm -rf /tmp/pytorch 88 | 89 | ############################################################################### 90 | # Application files and Python dependencies 91 | ############################################################################### 92 | WORKDIR /app 93 | COPY infer.py /app/ 94 | COPY requirements.txt /app/ 95 | RUN pip install --no-cache-dir -r requirements.txt 96 | -------------------------------------------------------------------------------- /test/cases/efa/main_test.go: -------------------------------------------------------------------------------- 1 | //go:build e2e 2 | 3 | package efa 4 | 5 | import ( 6 | "context" 7 | _ "embed" 8 | "flag" 9 | "log" 10 | "os" 11 | "os/signal" 12 | "testing" 13 | "time" 14 | 15 | "github.com/aws/aws-k8s-tester/internal/e2e" 16 | "github.com/aws/aws-k8s-tester/test/manifests" 17 | appsv1 "k8s.io/api/apps/v1" 18 | corev1 "k8s.io/api/core/v1" 19 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 20 | "sigs.k8s.io/e2e-framework/klient/wait" 21 | "sigs.k8s.io/e2e-framework/pkg/env" 22 | "sigs.k8s.io/e2e-framework/pkg/envconf" 23 | ) 24 | 25 | func getTestNamespace() *corev1.Namespace { 26 | return &corev1.Namespace{ 27 | ObjectMeta: metav1.ObjectMeta{ 28 | Name: TEST_NAMESPACE_NAME, 29 | }, 30 | } 31 | } 32 | 33 | func deployEFAPlugin(ctx context.Context, config *envconf.Config) (context.Context, error) { 34 | err := e2e.ApplyManifests(config.Client().RESTConfig(), manifests.EfaDevicePluginManifest) 35 | if err != nil { 36 | return ctx, err 37 | } 38 | efaDS := appsv1.DaemonSet{ 39 | ObjectMeta: metav1.ObjectMeta{Name: 
"aws-efa-k8s-device-plugin-daemonset", Namespace: "kube-system"}, 40 | } 41 | err = wait.For(e2e.NewConditionExtension(config.Client().Resources()).DaemonSetReady(&efaDS), 42 | wait.WithContext(ctx), 43 | wait.WithTimeout(5*time.Minute), 44 | ) 45 | if err != nil { 46 | return ctx, err 47 | } 48 | 49 | return ctx, nil 50 | } 51 | 52 | func TestMain(m *testing.M) { 53 | testImage = flag.String("testImage", "", "container image to use for tests") 54 | pingPongSize = flag.String("pingPongSize", "all", "sizes to use for ping pong") 55 | pingPongIters = flag.Int("pingPongIters", 10000, "number of iterations to use for ping pong") 56 | pingPongDeadlineSeconds = flag.Int("pingPongDeadlineSeconds", 120, "maximum run time for a ping pong attempt") 57 | nodeType = flag.String("nodeType", "", "instance type to target for tests") 58 | expectedEFADeviceCount = flag.Int("expectedEFADeviceCount", -1, "expected number of efa devices for the target nodes") 59 | verbose = flag.Bool("verbose", true, "use verbose mode for tests") 60 | 61 | cfg, err := envconf.NewFromFlags() 62 | if err != nil { 63 | log.Fatalf("failed to initialize test environment: %v", err) 64 | } 65 | 66 | if *testImage == "" { 67 | log.Fatal("--testImage must be set, use https://github.com/aws/aws-k8s-tester/blob/main/test/efa/Dockerfile to build the image") 68 | } 69 | 70 | ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt) 71 | defer cancel() 72 | timedCtx, cancel := context.WithTimeout(ctx, 55*time.Minute) 73 | defer cancel() 74 | 75 | testenv = env.NewWithConfig(cfg) 76 | testenv = testenv.WithContext(timedCtx) 77 | 78 | ec2Client = e2e.NewEC2Client() 79 | 80 | testenv.Setup( 81 | deployEFAPlugin, 82 | func(ctx context.Context, config *envconf.Config) (context.Context, error) { 83 | select { 84 | case <-ctx.Done(): 85 | // Cooldown to let device plugin update node object with resources 86 | case <-time.After(15 * time.Second): 87 | } 88 | 89 | return ctx, 
cfg.Client().Resources().Create(ctx, getTestNamespace()) 90 | }, 91 | ) 92 | 93 | testenv.Finish( 94 | func(ctx context.Context, config *envconf.Config) (context.Context, error) { 95 | cfg.Client().Resources().Delete(context.TODO(), getTestNamespace()) 96 | err := e2e.DeleteManifests(cfg.Client().RESTConfig(), manifests.EfaDevicePluginManifest) 97 | if err != nil { 98 | return ctx, err 99 | } 100 | return ctx, nil 101 | }, 102 | ) 103 | 104 | os.Exit(testenv.Run(m)) 105 | } 106 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check [existing open](https://github.com/aws/aws-k8s-tester/issues), or [recently closed](https://github.com/aws/aws-k8s-tester/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. 
Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *master* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/aws/aws-k8s-tester/labels/help%20wanted) issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 
51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](https://github.com/aws/aws-k8s-tester/blob/master/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | 61 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. 62 | -------------------------------------------------------------------------------- /internal/deployers/eksctl/cluster_config.go: -------------------------------------------------------------------------------- 1 | package eksctl 2 | 3 | import ( 4 | "fmt" 5 | 6 | eksctl_api "github.com/weaveworks/eksctl/pkg/apis/eksctl.io/v1alpha5" 7 | "k8s.io/klog" 8 | "sigs.k8s.io/yaml" 9 | ) 10 | 11 | // CreateClusterConfig constructs an eksctl_api.ClusterConfig object based on UpOptions. 12 | // This function replaces the string-based template rendering. 13 | func (d *deployer) CreateClusterConfig() (*eksctl_api.ClusterConfig, error) { 14 | d.initClusterName() 15 | 16 | cfg := eksctl_api.NewClusterConfig() 17 | // Metadata 18 | cfg.Metadata.Name = d.clusterName 19 | cfg.Metadata.Region = d.Region 20 | cfg.Metadata.Version = d.KubernetesVersion 21 | // IAM 22 | cfg.IAM.WithOIDC = &d.WithOIDC 23 | 24 | amiFamily := d.AMIFamily 25 | if amiFamily == "" { 26 | amiFamily = eksctl_api.NodeImageFamilyAmazonLinux2 27 | } 28 | nodeGroupName := d.NodegroupName 29 | if nodeGroupName == "" { 30 | nodeGroupName = "ng-1" 31 | } 32 | // Create node group or managed node group (MNG) 33 | if d.UseUnmanagedNodegroup { 34 | ng := cfg.NewNodeGroup() 35 | // TODO: update this when we add support for SSH. 
36 | ng.SSH = nil 37 | ng.AMIFamily = amiFamily 38 | ng.Name = nodeGroupName 39 | if len(d.InstanceTypes) > 0 { 40 | ng.InstanceType = d.InstanceTypes[0] 41 | } 42 | if d.Nodes >= 0 { 43 | ng.MinSize = &d.Nodes 44 | ng.MaxSize = &d.Nodes 45 | ng.DesiredCapacity = &d.Nodes 46 | } 47 | if d.VolumeSize >= 0 { 48 | ng.VolumeSize = &d.VolumeSize 49 | } 50 | ng.PrivateNetworking = d.PrivateNetworking 51 | ng.EFAEnabled = &d.EFAEnabled 52 | if len(d.AvailabilityZones) > 0 { 53 | ng.AvailabilityZones = d.AvailabilityZones 54 | } 55 | if d.AMI != "" && amiFamily == eksctl_api.NodeImageFamilyAmazonLinux2 { 56 | bootstrapCommand := fmt.Sprintf(`#!/bin/bash 57 | source /var/lib/cloud/scripts/eksctl/bootstrap.helper.sh 58 | /etc/eks/bootstrap.sh %s --kubelet-extra-args "--node-labels=${NODE_LABELS}"`, d.clusterName) 59 | ng.OverrideBootstrapCommand = &bootstrapCommand 60 | } 61 | } else { 62 | // Create managed node group 63 | mng := eksctl_api.NewManagedNodeGroup() 64 | cfg.ManagedNodeGroups = append(cfg.ManagedNodeGroups, mng) 65 | // TODO: update this when we add support for SSH. 
66 | mng.SSH = nil 67 | mng.AMIFamily = amiFamily 68 | mng.Name = nodeGroupName 69 | mng.InstanceTypes = d.InstanceTypes 70 | if d.Nodes >= 0 { 71 | mng.MinSize = &d.Nodes 72 | mng.MaxSize = &d.Nodes 73 | mng.DesiredCapacity = &d.Nodes 74 | } 75 | if d.VolumeSize >= 0 { 76 | mng.VolumeSize = &d.VolumeSize 77 | } 78 | mng.PrivateNetworking = d.PrivateNetworking 79 | mng.EFAEnabled = &d.EFAEnabled 80 | if len(d.AvailabilityZones) > 0 { 81 | mng.AvailabilityZones = d.AvailabilityZones 82 | } 83 | if d.AMI != "" && amiFamily == eksctl_api.NodeImageFamilyAmazonLinux2 { 84 | bootstrapCommand := fmt.Sprintf(`#!/bin/bash 85 | source /var/lib/cloud/scripts/eksctl/bootstrap.helper.sh 86 | /etc/eks/bootstrap.sh %s --kubelet-extra-args "--node-labels=${NODE_LABELS}"`, d.clusterName) 87 | mng.OverrideBootstrapCommand = &bootstrapCommand 88 | } else if d.AMI != "" && amiFamily == eksctl_api.NodeImageFamilyBottlerocket { 89 | mng.AMI = d.AMI 90 | } 91 | } 92 | return cfg, nil 93 | } 94 | 95 | type clusterConfigTemplateParams struct { 96 | UpOptions 97 | ClusterName string 98 | Region string 99 | } 100 | 101 | func (d *deployer) RenderClusterConfig() ([]byte, error) { 102 | 103 | cfg, err := d.CreateClusterConfig() 104 | if err != nil { 105 | klog.Errorf("failed to create ClusterConfig with the deployer: %v", err) 106 | } 107 | klog.Infof("rendering cluster config yaml based on the ClusterConfig: %v", cfg) 108 | return yaml.Marshal(cfg) 109 | } 110 | -------------------------------------------------------------------------------- /test/cases/nvidia/manifests/daemonset-containerd-check.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: containerd-check 5 | namespace: default 6 | labels: 7 | app: containerd-check 8 | spec: 9 | selector: 10 | matchLabels: 11 | app: containerd-check 12 | template: 13 | metadata: 14 | labels: 15 | app: containerd-check 16 | spec: 17 | containers: 18 
| - name: containerd-check 19 | image: public.ecr.aws/amazonlinux/amazonlinux:latest 20 | command: 21 | - sh 22 | - -c 23 | - | 24 | # 1. Ensure the script fails on any command or pipeline error 25 | set -e 26 | set -o pipefail 27 | 28 | echo "=== content read by the container ===" 29 | cat /host-etc/containerd/config.toml 30 | 31 | # 2. Check containerd config version and look for appropriate sandbox field 32 | # In containerd config version = 2 expect to find pattern `sandbox_image = "registry.k8s.io/pause:3.10.1"` 33 | # In containerd config version = 3 expect to find pattern `sandbox = "registry.k8s.io/pause:3.10.1"` 34 | # For more details: https://github.com/containerd/containerd/blob/main/docs/cri/config.md 35 | version_line=$(grep -E '^version\s*=' /host-etc/containerd/config.toml || true) 36 | if [ -z "$version_line" ]; then 37 | echo "FAIL: no version line found in containerd config" 38 | exit 1 39 | fi 40 | 41 | version=$(echo "$version_line" | cut -d'=' -f2 | tr -d ' ') 42 | echo "INFO: containerd config version = $version" 43 | if [ "$version" = "2" ]; then 44 | sandbox_line=$(grep -E 'sandbox_image\s*=' /host-etc/containerd/config.toml || true) 45 | elif [ "$version" = "3" ]; then 46 | sandbox_line=$(grep -E 'sandbox\s*=' /host-etc/containerd/config.toml || true) 47 | else 48 | echo "FAIL: unsupported containerd config version: $version" 49 | exit 1 50 | fi 51 | 52 | # 3. If no sandbox configuration is found, fail explicitly 53 | if [ -z "$sandbox_line" ]; then 54 | echo "FAIL: no sandbox_image or sandbox line found" 55 | echo "=== debug ===" 56 | exit 1 57 | fi 58 | sandbox_image=$(echo "$sandbox_line" | cut -d'"' -f2) 59 | 60 | # 4. Check that $sandbox_image references .ecr. or is provided on the instance 61 | if [[ "$sandbox_image" == "localhost"* ]]; then 62 | echo "INFO: skipping .ecr. check for localhost sandbox image" 63 | else 64 | if [[ "$sandbox_image" != *".ecr."* ]]; then 65 | echo "FAIL: no .ecr. 
reference in $sandbox_image" 66 | echo "=== debug ===" 67 | exit 1 68 | fi 69 | fi 70 | 71 | # 5. Check for 'nvidia-container-runtime' 72 | if ! grep -q "nvidia-container-runtime" /host-etc/containerd/config.toml; then 73 | echo "FAIL: no nvidia-container-runtime found" 74 | echo "=== debug ===" 75 | exit 1 76 | fi 77 | 78 | # 6. Check for 'systemd_cgroup = true' or 'SystemdCgroup = true' 79 | if ! ( grep -q 'systemd_cgroup = true' /host-etc/containerd/config.toml || \ 80 | grep -q 'SystemdCgroup = true' /host-etc/containerd/config.toml ); then 81 | echo "FAIL: no systemd cgroup setting" 82 | echo "=== debug ===" 83 | exit 1 84 | fi 85 | 86 | echo "containerd config check PASSED." 87 | # Keep container running so DS can be marked Ready 88 | tail -f /dev/null 89 | volumeMounts: 90 | - name: containerd-config 91 | mountPath: /host-etc/containerd 92 | readOnly: true 93 | volumes: 94 | - name: containerd-config 95 | hostPath: 96 | path: /etc/containerd 97 | -------------------------------------------------------------------------------- /test/cases/quick/io_uring_test.go: -------------------------------------------------------------------------------- 1 | //go:build e2e 2 | 3 | package quick 4 | 5 | import ( 6 | "context" 7 | "log" 8 | "testing" 9 | "time" 10 | 11 | "github.com/aws/aws-k8s-tester/internal/e2e" 12 | 13 | corev1 "k8s.io/api/core/v1" 14 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 15 | 16 | "sigs.k8s.io/e2e-framework/klient/k8s" 17 | "sigs.k8s.io/e2e-framework/klient/wait" 18 | "sigs.k8s.io/e2e-framework/pkg/envconf" 19 | "sigs.k8s.io/e2e-framework/pkg/features" 20 | ) 21 | 22 | func TestNpmInstallWithCPULimits(t *testing.T) { 23 | feat := features.New("npm-install"). 24 | WithLabel("suite", "quick"). 
25 | Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { 26 | log.Println("[Setup] Verifying cluster nodes...") 27 | var nodeList corev1.NodeList 28 | if err := cfg.Client().Resources().List(ctx, &nodeList); err != nil { 29 | t.Fatalf("Failed to list nodes: %v", err) 30 | } 31 | 32 | // Log node information 33 | for _, node := range nodeList.Items { 34 | arch := node.Labels["kubernetes.io/arch"] 35 | kernelVersion := node.Status.NodeInfo.KernelVersion 36 | t.Logf("Node: %s, Architecture: %s, Kernel: %s", node.Name, arch, kernelVersion) 37 | } 38 | return ctx 39 | }). 40 | Assess("Pod can successfully run npm install", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { 41 | podName := "npm-install-test" 42 | podNS := "default" 43 | 44 | pod := &corev1.Pod{ 45 | ObjectMeta: metav1.ObjectMeta{ 46 | Name: podName, 47 | Namespace: podNS, 48 | Labels: map[string]string{ 49 | "app": "npm-install-test", 50 | }, 51 | }, 52 | Spec: corev1.PodSpec{ 53 | Containers: []corev1.Container{ 54 | { 55 | Name: "test-container", 56 | Image: "public.ecr.aws/ubuntu/ubuntu:noble", 57 | Command: []string{"/bin/sh", "-c"}, 58 | Args: []string{` 59 | set -x 60 | echo "[Test] Starting npm installation test..." 61 | mkdir asd && 62 | cd asd && 63 | apt-get update && 64 | apt-get install -y npm nodejs && 65 | echo "[Test] Starting npm install webpack..." 
66 | npm install webpack --loglevel verbose || exit 1 67 | echo "[Test] npm install completed successfully" 68 | `}, 69 | }, 70 | }, 71 | RestartPolicy: corev1.RestartPolicyNever, 72 | }, 73 | } 74 | 75 | if err := cfg.Client().Resources().Create(ctx, pod); err != nil { 76 | t.Fatalf("[Assess] Failed to create pod: %v", err) 77 | } 78 | 79 | log.Printf("[Assess] Waiting up to 10 minutes for pod %s to complete...", podName) 80 | err := wait.For( 81 | e2e.NewConditionExtension(cfg.Client().Resources()).ResourceMatch(pod, func(object k8s.Object) bool { 82 | pod := object.(*corev1.Pod) 83 | return pod.Status.Phase == corev1.PodSucceeded 84 | }), 85 | wait.WithTimeout(10*time.Minute), 86 | ) 87 | if err != nil { 88 | t.Logf("[Assess] Pod did not complete successfully: %v", err) 89 | e2e.PrintDaemonSetPodLogs(t, ctx, cfg.Client().RESTConfig(), podNS, "app=npm-install-test") 90 | t.Fatal("Pod did not complete within 10 minutes - possible io_uring hang detected") 91 | } 92 | 93 | log.Printf("[Assess] Pod %s completed successfully", podName) 94 | return ctx 95 | }). 96 | Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { 97 | podName := "npm-install-test" 98 | podNS := "default" 99 | 100 | t.Logf("[Teardown] Cleaning up pod %s/%s...", podNS, podName) 101 | pod := &corev1.Pod{ 102 | ObjectMeta: metav1.ObjectMeta{ 103 | Name: podName, 104 | Namespace: podNS, 105 | }, 106 | } 107 | if err := cfg.Client().Resources().Delete(ctx, pod); err != nil { 108 | t.Logf("[Teardown] Failed to delete pod: %v", err) 109 | } 110 | return ctx 111 | }). 
112 | Feature() 113 | 114 | testenv.Test(t, feat) 115 | } 116 | -------------------------------------------------------------------------------- /internal/deployers/eksctl/deployer.go: -------------------------------------------------------------------------------- 1 | package eksctl 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "os" 7 | "path/filepath" 8 | 9 | "github.com/aws/aws-k8s-tester/internal" 10 | "github.com/aws/aws-k8s-tester/internal/awssdk" 11 | "github.com/aws/aws-sdk-go-v2/aws" 12 | "github.com/aws/aws-sdk-go-v2/service/eks" 13 | "github.com/urfave/sflags/gen/gpflag" 14 | "github.com/spf13/pflag" 15 | "k8s.io/klog" 16 | "sigs.k8s.io/kubetest2/pkg/types" 17 | "sigs.k8s.io/yaml" 18 | ) 19 | 20 | // DeployerName is the name of the deployer 21 | const DeployerName = "eksctl" 22 | 23 | type deployer struct { 24 | // generic parts 25 | commonOptions types.Options 26 | *UpOptions 27 | awsConfig aws.Config 28 | eksClient *eks.Client 29 | KubeconfigPath string `flag:"kubeconfig" desc:"Path to kubeconfig"` 30 | // ClusterName is the effective cluster name (from flag or RunID) 31 | clusterName string 32 | } 33 | 34 | // NewDeployer implements deployer.New for EKS using eksctl 35 | func NewDeployer(opts types.Options) (types.Deployer, *pflag.FlagSet) { 36 | // create a deployer object and set fields that are not flag controlled 37 | awsConfig := awssdk.NewConfig() 38 | d := &deployer{ 39 | commonOptions: opts, 40 | awsConfig: awsConfig, 41 | eksClient: eks.NewFromConfig(awsConfig), 42 | } 43 | // register flags and return 44 | return d, bindFlags(d) 45 | } 46 | 47 | func (d *deployer) DumpClusterLogs() error { 48 | return nil 49 | } 50 | 51 | func (d *deployer) Kubeconfig() (string, error) { 52 | if d.KubeconfigPath != "" { 53 | return d.KubeconfigPath, nil 54 | } 55 | return filepath.Join(d.commonOptions.RunDir(), "kubeconfig"), nil 56 | } 57 | 58 | func (d *deployer) Version() string { 59 | return internal.Version 60 | } 61 | 62 | // bindFlags is a helper used to 
create & bind a flagset to the deployer 63 | func bindFlags(d *deployer) *pflag.FlagSet { 64 | flags, err := gpflag.Parse(d) 65 | if err != nil { 66 | klog.Fatalf("unable to bind flags for deployer") 67 | return nil 68 | } 69 | klog.InitFlags(nil) 70 | flags.AddGoFlagSet(flag.CommandLine) 71 | return flags 72 | } 73 | 74 | // initClusterName sets the effective cluster name with this precedence: 75 | // 1. config file 76 | // 2. --cluster-name flag 77 | // 3. RunID of the kubetest 78 | func (d *deployer) initClusterName() { 79 | // First priority: config file if provided 80 | if d.UpOptions.ConfigFile != "" { 81 | clusterName, err := d.parseClusterNameFromConfig(d.UpOptions.ConfigFile) 82 | if err == nil { 83 | d.clusterName = clusterName 84 | klog.V(2).Infof("Using cluster name from config file: %s", d.clusterName) 85 | return 86 | } 87 | klog.Warningf("Failed to extract cluster name from config file: %v", err) 88 | // Continue with other methods if parsing fails 89 | } 90 | 91 | if d.UpOptions.ClusterName != "" { 92 | d.clusterName = d.UpOptions.ClusterName 93 | klog.V(2).Infof("Using cluster name from flag: %s", d.clusterName) 94 | } else { 95 | d.clusterName = d.commonOptions.RunID() 96 | klog.V(2).Infof("Using RunID for cluster name: %s", d.clusterName) 97 | } 98 | } 99 | 100 | // parseClusterNameFromConfig extracts the cluster name from an eksctl config file 101 | func (d *deployer) parseClusterNameFromConfig(configFilePath string) (string, error) { 102 | configData, err := os.ReadFile(configFilePath) 103 | if err != nil { 104 | return "", fmt.Errorf("failed to read config file: %v", err) 105 | } 106 | 107 | // Simple YAML parsing to extract the cluster name 108 | var configMap map[string]interface{} 109 | if err := yaml.Unmarshal(configData, &configMap); err != nil { 110 | return "", fmt.Errorf("failed to parse config file YAML: %v", err) 111 | } 112 | 113 | // Extract metadata section 114 | metadata, ok := configMap["metadata"].(map[string]interface{}) 115 | 
if !ok { 116 | return "", fmt.Errorf("metadata section missing in config file") 117 | } 118 | 119 | // Extract name field 120 | name, ok := metadata["name"].(string) 121 | if !ok || name == "" { 122 | return "", fmt.Errorf("cluster name not found in config file metadata") 123 | } 124 | 125 | return name, nil 126 | } 127 | 128 | // assert that deployer implements types.DeployerWithKubeconfig 129 | var _ types.DeployerWithKubeconfig = &deployer{} 130 | -------------------------------------------------------------------------------- /test/cases/quick/limit_test.go: -------------------------------------------------------------------------------- 1 | //go:build e2e 2 | 3 | package quick 4 | 5 | import ( 6 | "bytes" 7 | "context" 8 | _ "embed" 9 | "io" 10 | "strings" 11 | "testing" 12 | "time" 13 | 14 | fwext "github.com/aws/aws-k8s-tester/internal/e2e" 15 | corev1 "k8s.io/api/core/v1" 16 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 17 | "k8s.io/client-go/kubernetes" 18 | "sigs.k8s.io/e2e-framework/klient/k8s" 19 | "sigs.k8s.io/e2e-framework/klient/wait" 20 | "sigs.k8s.io/e2e-framework/klient/wait/conditions" 21 | "sigs.k8s.io/e2e-framework/pkg/envconf" 22 | "sigs.k8s.io/e2e-framework/pkg/features" 23 | ) 24 | 25 | var ( 26 | //go:embed manifests/ulimit.yaml 27 | ulimitManifest []byte 28 | 29 | expectedResourceLimit = map[string]string{ 30 | "-R": "unlimited", 31 | "-c": "unlimited", 32 | "-d": "unlimited", 33 | "-e": "0", 34 | "-f": "unlimited", 35 | "-i": "30446", 36 | "-l": "unlimited", 37 | "-m": "unlimited", 38 | "-n": "1048576", 39 | "-p": "8", 40 | "-q": "819200", 41 | "-r": "0", 42 | "-s": "10240", 43 | "-t": "unlimited", 44 | "-u": "unlimited", 45 | "-v": "unlimited", 46 | "-x": "unlimited", 47 | } 48 | ) 49 | 50 | func TestUserLimits(t *testing.T) { 51 | f1 := features.New("ulimit pod"). 52 | WithLabel("type", "ulimit"). 
53 | Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { 54 | err := fwext.ApplyManifests(cfg.Client().RESTConfig(), ulimitManifest) 55 | if err != nil { 56 | t.Fatalf("failed to apply manifests: %v", err) 57 | } 58 | pod := &corev1.Pod{ 59 | ObjectMeta: metav1.ObjectMeta{Name: "ulimit", Namespace: "default"}, 60 | } 61 | err = wait.For(conditions.New(cfg.Client().Resources()).ResourceMatch(pod, containerTerminated), 62 | wait.WithTimeout(time.Minute*5)) 63 | if err != nil { 64 | t.Fatalf("encounter error when waiting for container finished running commands: %v", err) 65 | } 66 | return ctx 67 | }). 68 | Assess("Use default resources limit", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { 69 | client, err := kubernetes.NewForConfig(cfg.Client().RESTConfig()) 70 | if err != nil { 71 | t.Fatal(err) 72 | } 73 | tailLine := int64(10000) 74 | podLogOptions := corev1.PodLogOptions{ 75 | Container: "al2023", 76 | TailLines: &tailLine, 77 | } 78 | req := client.CoreV1().Pods("default").GetLogs("ulimit", &podLogOptions) 79 | logs, err := req.Stream(ctx) 80 | if err != nil { 81 | t.Fatalf("error in opening stream: %v", err) 82 | } 83 | defer logs.Close() 84 | compareResourceLimitsWithExpectedValues(t, logs) 85 | return ctx 86 | }). 
87 | Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { 88 | err := fwext.DeleteManifests(cfg.Client().RESTConfig(), ulimitManifest) 89 | if err != nil { 90 | t.Fatalf("failed to delete manifests: %v", err) 91 | } 92 | return ctx 93 | }).Feature() 94 | 95 | // test feature 96 | testenv.Test(t, f1) 97 | } 98 | 99 | func compareResourceLimitsWithExpectedValues(t *testing.T, logs io.ReadCloser) { 100 | buf := new(bytes.Buffer) 101 | _, err := io.Copy(buf, logs) 102 | if err != nil { 103 | t.Fatalf("error in copy information from podLogs to buf: %v", err) 104 | } 105 | str := buf.String() 106 | 107 | lines := strings.Split(str, "\n") 108 | for _, line := range lines[:len(lines)-1] { 109 | info := strings.Split(line, " ") 110 | marker := getMarker(info[len(info)-2]) 111 | value := info[len(info)-1] 112 | if expectedResourceLimit[marker] != value { 113 | t.Errorf("resource limit doesn't match with the default value, limit we get %v, but default value is %v", line, expectedResourceLimit[marker]) 114 | } else { 115 | t.Logf("resrouce limit fetched from ulimit: %v. 
Equal to the default value %v", line, expectedResourceLimit[marker])
		}
	}
}

// containerTerminated reports whether the pod's first container has
// terminated with reason "Completed". Bug fix: the previous version indexed
// ContainerStatuses[0] and dereferenced State.Terminated unconditionally,
// which panics while the wait condition polls a pod whose container status
// is not yet populated or whose container has not yet terminated.
func containerTerminated(obj k8s.Object) bool {
	j := obj.(*corev1.Pod)
	if len(j.Status.ContainerStatuses) == 0 {
		return false
	}
	terminated := j.Status.ContainerStatuses[0].State.Terminated
	if terminated == nil {
		return false
	}
	return terminated.Reason == "Completed"
}

// getMarker strips an optional leading "(" and the trailing character from a
// ulimit flag token, e.g. "(-c)" -> "-c".
func getMarker(str string) string {
	startIndex := 0
	if str[:1] == "(" {
		startIndex = 1
	}
	return str[startIndex : len(str)-1]
}
--------------------------------------------------------------------------------
/test/images/neuron/tests/testNeuronMlp.py:
--------------------------------------------------------------------------------
# Source: https://github.com/aws/deep-learning-containers/blob/master/test/dlc_tests/container_tests/bin/pytorch_tests/testNeuronMlp
import os
import time
import torch

from torchvision.datasets import mnist
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor

# XLA imports
import torch_xla.core.xla_model as xm
import torch_xla.runtime as xr

# XLA imports for parallel loader and multi-processing
import torch_xla.distributed.parallel_loader as pl
from torch.utils.data.distributed import DistributedSampler

# Initialize XLA process group for torchrun
import torch_xla.distributed.xla_backend
import torch.nn as nn
import torch.nn.functional as F

torch.distributed.init_process_group('xla')

# Global constants
EPOCHS = 4
WARMUP_STEPS = 2
BATCH_SIZE = 32

# Load MNIST train dataset
train_dataset = mnist.MNIST(root=os.path.join(os.path.expanduser("~") + '/MNIST_DATA_train', str(xr.global_ordinal())),
                            train=True, download=True, transform=ToTensor())

# Declare 3-layer MLP for MNIST dataset
class MLP(nn.Module):
    def __init__(self, input_size = 28 * 28, output_size = 10, layers
= [120, 84]): 37 | super(MLP, self).__init__() 38 | self.fc1 = nn.Linear(input_size, layers[0]) 39 | self.fc2 = nn.Linear(layers[0], layers[1]) 40 | self.fc3 = nn.Linear(layers[1], output_size) 41 | 42 | def forward(self, x): 43 | x = F.relu(self.fc1(x)) 44 | x = F.relu(self.fc2(x)) 45 | x = self.fc3(x) 46 | return F.log_softmax(x, dim=1) 47 | 48 | 49 | def main(): 50 | # XLA MP: get world size 51 | world_size = xr.world_size() 52 | # multi-processing: ensure each worker has same initial weights 53 | torch.manual_seed(0) 54 | 55 | # Move model to device and declare optimizer and loss function 56 | device = 'xla' 57 | model = MLP().to(device) 58 | # For multiprocessing, scale up learning rate 59 | optimizer = torch.optim.SGD(model.parameters(), lr=0.01 * world_size) 60 | loss_fn = torch.nn.NLLLoss() 61 | 62 | # Prepare data loader 63 | train_sampler = None 64 | if world_size > 1: 65 | train_sampler = DistributedSampler(train_dataset, 66 | num_replicas=world_size, 67 | rank=xr.global_ordinal(), 68 | shuffle=True) 69 | train_loader = DataLoader(train_dataset, 70 | batch_size=BATCH_SIZE, 71 | sampler=train_sampler, 72 | shuffle=False if train_sampler else True) 73 | # XLA MP: use MpDeviceLoader from torch_xla.distributed 74 | train_device_loader = pl.MpDeviceLoader(train_loader, device) 75 | 76 | # Run the training loop 77 | print('----------Training ---------------') 78 | model.train() 79 | for epoch in range(EPOCHS): 80 | start = time.time() 81 | for idx, (train_x, train_label) in enumerate(train_device_loader): 82 | optimizer.zero_grad() 83 | train_x = train_x.view(train_x.size(0), -1) 84 | output = model(train_x) 85 | loss = loss_fn(output, train_label) 86 | loss.backward() 87 | xm.optimizer_step(optimizer) # XLA MP: performs grad allreduce and optimizer step 88 | if idx < WARMUP_STEPS: # skip warmup iterations 89 | start = time.time() 90 | 91 | # Compute statistics for the last epoch 92 | interval = idx - WARMUP_STEPS # skip warmup iterations 93 | throughput = 
interval / (time.time() - start) 94 | print("Train throughput (iter/sec): {}".format(throughput)) 95 | print("Final loss is {:0.4f}".format(loss.detach().to('cpu'))) 96 | 97 | # Save checkpoint for evaluation (xm.save ensures only one process save) 98 | os.makedirs(os.path.expanduser("~") + "/checkpoints", exist_ok=True) 99 | checkpoint = {'state_dict': model.state_dict()} 100 | xm.save(checkpoint, os.path.expanduser("~") + '/checkpoints/checkpoint.pt') 101 | 102 | print('----------End Training ---------------') 103 | 104 | if __name__ == '__main__': 105 | main() 106 | -------------------------------------------------------------------------------- /internal/e2e/conditions.go: -------------------------------------------------------------------------------- 1 | package e2e 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | 7 | appsv1 "k8s.io/api/apps/v1" 8 | batchv1 "k8s.io/api/batch/v1" 9 | v1 "k8s.io/api/core/v1" 10 | apimachinerywait "k8s.io/apimachinery/pkg/util/wait" 11 | 12 | "sigs.k8s.io/e2e-framework/klient/k8s" 13 | "sigs.k8s.io/e2e-framework/klient/k8s/resources" 14 | ) 15 | 16 | type ConditionExtension struct { 17 | resources *resources.Resources 18 | } 19 | 20 | func NewConditionExtension(r *resources.Resources) *ConditionExtension { 21 | return &ConditionExtension{resources: r} 22 | } 23 | 24 | // ResourceMatch is a helper function used to check if the resource under question has met a pre-defined state. This can 25 | // be leveraged for checking fields on a resource that may not be immediately present upon creation. 
26 | func (c *ConditionExtension) ResourceMatch(obj k8s.Object, matchFetcher func(object k8s.Object) bool) apimachinerywait.ConditionWithContextFunc { 27 | return func(ctx context.Context) (done bool, err error) { 28 | if err := c.resources.Get(ctx, obj.GetName(), obj.GetNamespace(), obj); err != nil { 29 | return false, err 30 | } 31 | return matchFetcher(obj), nil 32 | } 33 | } 34 | 35 | func (c *ConditionExtension) PodRunning(pod k8s.Object) apimachinerywait.ConditionWithContextFunc { 36 | return func(ctx context.Context) (done bool, err error) { 37 | if err := c.resources.Get(ctx, pod.GetName(), pod.GetNamespace(), pod); err != nil { 38 | return false, err 39 | } 40 | status := pod.(*v1.Pod).Status 41 | switch status.Phase { 42 | case v1.PodRunning: 43 | return true, nil 44 | case v1.PodPending: 45 | return false, nil 46 | default: 47 | return false, fmt.Errorf("pod cannot transition to running from current status: %s", status.Phase) 48 | } 49 | } 50 | } 51 | 52 | func (c *ConditionExtension) PodSucceeded(pod k8s.Object) apimachinerywait.ConditionWithContextFunc { 53 | return func(ctx context.Context) (done bool, err error) { 54 | if err := c.resources.Get(ctx, pod.GetName(), pod.GetNamespace(), pod); err != nil { 55 | return false, err 56 | } 57 | status := pod.(*v1.Pod).Status 58 | if status.Phase == v1.PodSucceeded { 59 | return true, nil 60 | } else if status.Phase == v1.PodFailed { 61 | return false, fmt.Errorf("Pod in Failed status") 62 | } 63 | return false, nil 64 | } 65 | } 66 | 67 | func (c *ConditionExtension) DaemonSetReady(daemonset k8s.Object) apimachinerywait.ConditionWithContextFunc { 68 | return func(ctx context.Context) (done bool, err error) { 69 | if err := c.resources.Get(ctx, daemonset.GetName(), daemonset.GetNamespace(), daemonset); err != nil { 70 | return false, err 71 | } 72 | status := daemonset.(*appsv1.DaemonSet).Status 73 | if status.NumberReady == status.DesiredNumberScheduled && status.NumberUnavailable == 0 { 74 | done = true 75 
| } 76 | return 77 | } 78 | } 79 | 80 | func (c *ConditionExtension) JobSucceeded(job k8s.Object) apimachinerywait.ConditionWithContextFunc { 81 | return func(ctx context.Context) (done bool, err error) { 82 | if err := c.resources.Get(ctx, job.GetName(), job.GetNamespace(), job); err != nil { 83 | return false, err 84 | } 85 | batchJob := job.(*batchv1.Job) 86 | status := batchJob.Status 87 | spec := batchJob.Spec 88 | for _, condition := range status.Conditions { 89 | if condition.Type == batchv1.JobFailed && condition.Status == v1.ConditionTrue { 90 | return false, fmt.Errorf("job failed") 91 | } 92 | } 93 | if status.Succeeded != *spec.Completions { 94 | return false, nil 95 | } 96 | return true, nil 97 | } 98 | } 99 | 100 | func (c *ConditionExtension) AllNodesHaveNonZeroResourceCapacity(resourceLabel string) apimachinerywait.ConditionWithContextFunc { 101 | return func(ctx context.Context) (done bool, err error) { 102 | nodeList := &v1.NodeList{} 103 | if err := c.resources.List(ctx, nodeList); err != nil { 104 | return false, fmt.Errorf("failed to list nodes: %w", err) 105 | } 106 | if len(nodeList.Items) == 0 { 107 | return false, fmt.Errorf("no nodes found in the cluster") 108 | } 109 | for _, node := range nodeList.Items { 110 | resource, ok := node.Status.Capacity[v1.ResourceName(resourceLabel)] 111 | if !ok { 112 | return false, nil 113 | } 114 | if resource.Value() <= 0 { 115 | return false, nil 116 | } 117 | } 118 | return true, nil 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /internal/deployers/eksapi/ami_resolver.go: -------------------------------------------------------------------------------- 1 | package eksapi 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | 7 | "github.com/aws/aws-sdk-go-v2/aws" 8 | "github.com/aws/aws-sdk-go-v2/service/ec2" 9 | ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types" 10 | "github.com/aws/aws-sdk-go-v2/service/ssm" 11 | "k8s.io/klog/v2" 12 | ) 13 | 14 | 
func NewAMIResolver(awsClients *awsClients) *amiResolver { 15 | return &amiResolver{ 16 | clients: awsClients, 17 | } 18 | } 19 | 20 | type amiResolver struct { 21 | clients *awsClients 22 | } 23 | 24 | func (r *amiResolver) Resolve(ctx context.Context, opts *deployerOptions) (string, error) { 25 | switch opts.UserDataFormat { 26 | case UserDataBootstrapSh: 27 | // TODO: AL2 is not a high priority, skipping for now. 28 | return "", fmt.Errorf("%s is not handled", opts.UserDataFormat) 29 | case UserDataNodeadm: 30 | return r.ResolveAL2023(ctx, opts) 31 | case UserDataBottlerocket: 32 | return r.ResolveBottlerocket(ctx, opts) 33 | default: 34 | return "", fmt.Errorf("unhandled userdata format: %s", opts.UserDataFormat) 35 | } 36 | } 37 | 38 | func (r *amiResolver) ResolveAL2023(ctx context.Context, opts *deployerOptions) (string, error) { 39 | describeInstanceTypesResponse, err := r.clients.EC2().DescribeInstanceTypes(ctx, &ec2.DescribeInstanceTypesInput{ 40 | InstanceTypes: []ec2types.InstanceType{ec2types.InstanceType(r.getInstance(opts))}, 41 | }) 42 | if err != nil { 43 | return "", err 44 | } 45 | instanceTypeInfo := describeInstanceTypesResponse.InstanceTypes[0] 46 | 47 | arch, err := r.resolveArch(instanceTypeInfo) 48 | if err != nil { 49 | return "", err 50 | } 51 | 52 | variant := "standard" 53 | if instanceTypeInfo.NeuronInfo != nil { 54 | if len(instanceTypeInfo.NeuronInfo.NeuronDevices) > 0 { 55 | variant = "neuron" 56 | } 57 | } else if instanceTypeInfo.GpuInfo != nil { 58 | for _, gpu := range instanceTypeInfo.GpuInfo.Gpus { 59 | if aws.ToString(gpu.Manufacturer) == "NVIDIA" { 60 | variant = "nvidia" 61 | break 62 | } 63 | } 64 | } 65 | 66 | getParameterReponse, err := r.clients.SSM().GetParameter(ctx, &ssm.GetParameterInput{ 67 | Name: aws.String(fmt.Sprintf("/aws/service/eks/optimized-ami/%s/amazon-linux-2023/%s/%s/recommended/image_id", opts.KubernetesVersion, arch, variant)), 68 | }) 69 | if err != nil { 70 | return "", err 71 | } 72 | 73 | return 
aws.ToString(getParameterReponse.Parameter.Value), nil 74 | } 75 | 76 | func (r *amiResolver) ResolveBottlerocket(ctx context.Context, opts *deployerOptions) (string, error) { 77 | describeInstanceTypesResponse, err := r.clients.EC2().DescribeInstanceTypes(ctx, &ec2.DescribeInstanceTypesInput{ 78 | InstanceTypes: []ec2types.InstanceType{ec2types.InstanceType(r.getInstance(opts))}, 79 | }) 80 | if err != nil { 81 | return "", err 82 | } 83 | instanceTypeInfo := describeInstanceTypesResponse.InstanceTypes[0] 84 | 85 | arch, err := r.resolveArch(instanceTypeInfo) 86 | if err != nil { 87 | return "", err 88 | } 89 | 90 | // TODO: enable fips 91 | flavorSuffix := "" 92 | if instanceTypeInfo.GpuInfo != nil { 93 | for _, gpu := range instanceTypeInfo.GpuInfo.Gpus { 94 | if aws.ToString(gpu.Manufacturer) == "NVIDIA" { 95 | flavorSuffix = "-nvidia" 96 | break 97 | } 98 | } 99 | } 100 | 101 | getParameterResponse, err := r.clients.SSM().GetParameter(ctx, &ssm.GetParameterInput{ 102 | Name: aws.String(fmt.Sprintf("/aws/service/bottlerocket/aws-k8s-%s%s/%s/latest/image_id", opts.KubernetesVersion, flavorSuffix, arch)), 103 | }) 104 | if err != nil { 105 | return "", err 106 | } 107 | 108 | return aws.ToString(getParameterResponse.Parameter.Value), nil 109 | } 110 | 111 | func (r *amiResolver) getInstance(opts *deployerOptions) string { 112 | instanceType := opts.InstanceTypes[0] 113 | if len(opts.InstanceTypes) > 1 { 114 | klog.Warningf("only resolving AMI based on first instance type: %s", instanceType) 115 | } 116 | return instanceType 117 | } 118 | 119 | func (r *amiResolver) resolveArch(instanceTypeInfo ec2types.InstanceTypeInfo) (string, error) { 120 | // TODO: the ordering might be weird because old instances might support 121 | // both i386 and x8664. 
122 | switch arch := instanceTypeInfo.ProcessorInfo.SupportedArchitectures[0]; arch { 123 | case ec2types.ArchitectureTypeArm64, ec2types.ArchitectureTypeX8664: 124 | return string(arch), nil 125 | default: 126 | return "", fmt.Errorf("unhandled arch: %s", arch) 127 | } 128 | } 129 | --------------------------------------------------------------------------------