├── gke
    ├── data.tf
    ├── versions.tf
    ├── providers.tf
    ├── outputs.tf
    ├── terraform.tfvars
    ├── variables.tf
    ├── main.tf
    └── README.md
├── .gitignore
├── eks
    ├── versions.tf
    ├── setup-kube-config.sh
    ├── provider.tf
    ├── data.tf
    ├── output.tf
    ├── terraform.tfvars
    ├── main.tf
    ├── variables.tf
    └── README.md
├── aks
    ├── versions.tf
    ├── outputs.tf
    ├── providers.tf
    ├── terraform.tfvars
    ├── variables.tf
    ├── main.tf
    └── README.md
├── CONTRIBUTING.md
├── README.md
└── LICENSE


/gke/data.tf:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 | 
4 | data "google_client_config" "provider" {}


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 | **/.terraform/*
4 | *.tfstate
5 | *.tfstate.*
6 | crash.log
7 | tfplan
8 | 


--------------------------------------------------------------------------------
/eks/versions.tf:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | terraform {
 5 |   required_providers {
 6 |     aws = {
 7 |       source  = "hashicorp/aws"
 8 |       version = "~>5.93.0"
 9 |     }
10 |     kubernetes = {
11 |       source  = "hashicorp/kubernetes"
12 |       version = "~>2.19.0"
13 |     }
14 |   }
15 | 
16 |   required_version = ">= 1.2.4"
17 | }
18 | 


--------------------------------------------------------------------------------
/aks/versions.tf:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | terraform {
 5 |   required_providers {
 6 |     azurerm = {
 7 |       source  = "hashicorp/azurerm"
 8 |       version = "~>4.25.0"
 9 |     }
10 |     kubernetes = {
11 |       source  = "hashicorp/kubernetes"
12 |       version = "~>2.19.0"
13 |     }
14 |   }
15 | 
16 |   required_version = ">= 1.3.4"
17 | }
18 | 
19 | 


--------------------------------------------------------------------------------
/gke/versions.tf:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | terraform {
 4 |   required_providers {
 5 |     google = {
 6 |       source  = "hashicorp/google"
 7 |       version = "~>6.27.0"
 8 |     }
 9 |     google-beta = {
10 |       source  = "hashicorp/google-beta"
11 |       version = "~>6.27.0"
12 |     }
13 |     kubernetes = {
14 |       source  = "hashicorp/kubernetes"
15 |       version = "~>2.19.0"
16 |     }
17 |   }
18 | 
19 |   required_version = ">= 1.3.4"
20 | }
21 | 


--------------------------------------------------------------------------------
/eks/setup-kube-config.sh:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | #!/bin/bash
 5 | 
 6 | eval "$(jq -r '@sh "_aws_profile=\(.aws_profile) _aws_region=\(.aws_region) _eks_cluster_name=\(.eks_cluster_name) _eks_cluster_arn=\(.eks_cluster_arn)"')"
 7 | 
 8 | if aws --profile "${_aws_profile}" eks update-kubeconfig --region "${_aws_region}" --name "${_eks_cluster_name}" 1> /dev/null 2>&1; then
 9 |   jq -n --arg command_to_use "kubectl config use-context ${_eks_cluster_arn}" '{"status":"success","commandToUse":$command_to_use}'
10 | else
11 |   jq -n '{"status":"failure"}'
12 | fi


--------------------------------------------------------------------------------
/eks/provider.tf:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | 
 5 | provider "aws" {
 6 |   region  = var.aws_region # Ensure this variable is properly set
 7 | }
 8 | 
 9 | 
10 | provider "helm" {
11 |   kubernetes {
12 |     host                   = data.aws_eks_cluster.eks.endpoint
13 |     cluster_ca_certificate = base64decode(data.aws_eks_cluster.eks.certificate_authority.0.data)
14 |     exec {
15 |       api_version = "client.authentication.k8s.io/v1"
16 |       args        = ["eks", "get-token", "--cluster-name", data.aws_eks_cluster.eks.name]
17 |       command     = "aws"
18 |     }
19 |   }
20 | }
21 | 


--------------------------------------------------------------------------------
/gke/providers.tf:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | provider "kubernetes" {
 5 |   host  = "https://${google_container_cluster.gke.endpoint}"
 6 |   token = data.google_client_config.provider.access_token
 7 |   cluster_ca_certificate = base64decode(
 8 |     google_container_cluster.gke.master_auth.0.cluster_ca_certificate,
 9 |   )
10 | }
11 | 
12 | provider "helm" {
13 |   kubernetes {
14 |     token = data.google_client_config.provider.access_token
15 |     host  = "https://${google_container_cluster.gke.endpoint}"
16 |     cluster_ca_certificate = base64decode(
17 |       google_container_cluster.gke.master_auth.0.cluster_ca_certificate,
18 |     )
19 |   }
20 | }
21 | 


--------------------------------------------------------------------------------
/aks/outputs.tf:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | output "resource_group_name" {
 5 |   value = var.existing_resource_group_name == null ? azurerm_resource_group.aks[0].name : data.azurerm_resource_group.existing[0].name
 6 | }
 7 | 
 8 | output "kubernetes_cluster_name" {
 9 |   value = azurerm_kubernetes_cluster.aks.name
10 | }
11 | 
12 | output "client_certificate" {
13 |   sensitive = true
14 |   value     = azurerm_kubernetes_cluster.aks.kube_config.0.client_certificate
15 | }
16 | 
17 | output "kube_config" {
18 |   value     = azurerm_kubernetes_cluster.aks.kube_config_raw
19 |   sensitive = true
20 | }
21 | 
22 | output "location" {
23 |   value = azurerm_kubernetes_cluster.aks.location
24 | }
25 | 


--------------------------------------------------------------------------------
/eks/data.tf:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | data "aws_availability_zones" "available" {}
 5 | data "aws_region" "current" {}
 6 | 
 7 | data "aws_ami" "lookup" {
 8 |   most_recent = true
 9 |   owners      = local.ami_lookup.owners
10 |   dynamic "filter" {
11 |     for_each = local.ami_lookup.filters
12 |     content {
13 |       name   = filter.value["name"]
14 |       values = filter.value["values"]
15 |     }
16 |   }
17 | }
18 | 
19 | data "aws_instances" "nodes" {
20 |   filter {
21 |     name   = "tag:aws:autoscaling:groupName"
22 |     values = module.eks.eks_managed_node_groups["gpu_node_pool"]["node_group_autoscaling_group_names"]
23 |   }
24 |   instance_state_names = ["running"]
25 | }
26 | 
27 | data "aws_eks_cluster" "eks" {
28 |   name = module.eks.cluster_id
29 | }
30 | 


--------------------------------------------------------------------------------
/aks/providers.tf:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | provider "azurerm" {
 5 |   features {}
 6 |   
 7 |   subscription_id = var.subscription_id
 8 | }
 9 | 
10 | provider "kubernetes" {
11 |   host                   = azurerm_kubernetes_cluster.aks.kube_admin_config.0.host
12 |   client_certificate     = base64decode(azurerm_kubernetes_cluster.aks.kube_admin_config.0.client_certificate)
13 |   client_key             = base64decode(azurerm_kubernetes_cluster.aks.kube_admin_config.0.client_key)
14 |   cluster_ca_certificate = base64decode(azurerm_kubernetes_cluster.aks.kube_admin_config.0.cluster_ca_certificate)
15 | }
16 | 
17 | provider "helm" {
18 |   kubernetes {
19 |     host                   = azurerm_kubernetes_cluster.aks.kube_admin_config.0.host
20 |     client_certificate     = base64decode(azurerm_kubernetes_cluster.aks.kube_admin_config.0.client_certificate)
21 |     client_key             = base64decode(azurerm_kubernetes_cluster.aks.kube_admin_config.0.client_key)
22 |     cluster_ca_certificate = base64decode(azurerm_kubernetes_cluster.aks.kube_admin_config.0.cluster_ca_certificate)
23 |   }
24 | }
25 | 


--------------------------------------------------------------------------------
/eks/output.tf:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | output "private_subnet_ids" {
 5 |   value = module.vpc.*.private_subnets
 6 | }
 7 | 
 8 | output "public_subnet_ids" {
 9 |   value = module.vpc.*.public_subnets
10 | }
11 | 
12 | output "nodes" {
13 |   value = data.aws_instances.nodes.public_ips
14 | }
15 | 
16 | output "cluster_endpoint" {
17 |   value = module.eks.cluster_endpoint
18 | }
19 | 
20 | output "cpu_node_role_name" {
21 |   description = "IAM Node Role Bane for CPU node pools"
22 |   value       = module.eks.eks_managed_node_groups.cpu_node_pool.iam_role_name
23 | }
24 | 
25 | output "gpu_node_role_name" {
26 |   description = "IAM Node Role Name for GPU node pools"
27 |   value       = module.eks.eks_managed_node_groups.gpu_node_pool.iam_role_name
28 | }
29 | 
30 | output "oidc_endpoint" {
31 |   value = module.eks.oidc_provider
32 | }
33 | output "cluster_ca_certificate" {
34 |   value     = module.eks.cluster_certificate_authority_data
35 |   sensitive = true
36 | }
37 | 
38 | output "kube_exec_api_version" {
39 |   value = "client.authentication.k8s.io/v1beta1"
40 | }
41 | 
42 | output "kube_exec_command" {
43 |   value = "aws"
44 | }
45 | 
46 | output "kube_exec_args" {
47 |   value = [
48 |     # "--profile",
49 |     # var.aws_profile,
50 |     "eks",
51 |     "get-token",
52 |     "--region",
53 |     data.aws_region.current.name,
54 |     "--cluster-name",
55 |     module.eks.cluster_id
56 |   ]
57 | }
58 | 


--------------------------------------------------------------------------------
/gke/outputs.tf:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | /***************************
 5 | Outputs
 6 | ***************************/
 7 | 
 8 | output "region" {
 9 |   value       = var.region
10 |   description = "Region for Kubernetes Resources to be created in when using this module"
11 | }
12 | 
13 | output "project_id" {
14 |   value       = var.project_id
15 |   description = "GCloud Project ID"
16 | }
17 | 
18 | /***************************
19 | VPC Network Outputs
20 | ***************************/
21 | 
22 | output "vpc_project" {
23 |   value       = google_compute_network.gke-vpc[*].project
24 |   description = "Project of the VPC network (can be different from the project launching Kubernetes resources)"
25 | }
26 | 
27 | output "subnet_cidr_range" {
28 |   value       = google_compute_subnetwork.gke-subnet[*].ip_cidr_range
29 |   description = "The IPs and CIDRs of the subnets"
30 | }
31 | 
32 | output "subnet_region" {
33 |   value       = google_compute_subnetwork.gke-subnet[*].region
34 |   description = "The region of the VPC subnet used in this module"
35 | }
36 | /***************************
37 | GKE Outputs
38 | ***************************/
39 | output "kubernetes_cluster_name" {
40 |   value       = google_container_cluster.gke.name
41 |   description = "GKE Cluster Name"
42 | }
43 | 
44 | output "kubernetes_cluster_endpoint_ip" {
45 |   value       = google_container_cluster.gke.endpoint
46 |   description = "GKE Cluster IP Endpoint"
47 | }
48 | 
49 | output "kubernetes_config_file" {
50 |   value       = google_container_cluster.gke.master_auth.0.cluster_ca_certificate
51 |   description = "GKE Cluster IP Endpoint"
52 |   sensitive   = true
53 | }
54 | 


--------------------------------------------------------------------------------
/aks/terraform.tfvars:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | # Sample tfvars file. Uncomment out values to use
 5 | # Do not commit this file to Git with sensitive values
 6 | 
 7 | 
 8 | # MANDATORY PARAMETER:
 9 | admin_group_object_ids       = ["xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"]
10 | subscription_id              = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
11 | 
12 | # Uncomment and Modify below parameters if needed:
13 | # cluster_name                 = "aks-cluster-tf"
14 | # kubernetes_version           = "1.32"
15 | # location                     = "eastus"
16 | # cpu_machine_type             = "Standard_D16_v5"
17 | # cpu_node_pool_count          = 1
18 | # cpu_node_pool_disk_size      = 100
19 | # cpu_node_pool_max_count      = 5
20 | # cpu_node_pool_min_count      = 1
21 | # cpu_os_sku                   = "Ubuntu"
22 | # existing_resource_group_name = ""
23 | # gpu_machine_type             = "Standard_NC4as_T4_v3"
24 | # gpu_node_pool_count          = 2
25 | # gpu_node_pool_disk_size      = 100
26 | # gpu_node_pool_max_count      = 5
27 | # gpu_node_pool_min_count      = 1 
28 | 
29 | 
30 | 
31 | ########################
32 | # GPU OPERATOR         #
33 | ########################
34 | 
35 | # install_gpu_operator              = "true"
36 | # gpu_operator_driver_version       = "570.124.06"
37 | # gpu_operator_namespace            = "gpu-operator"
38 | # gpu_operator_version              = "v25.3.0"
39 | 
40 | 
41 | ########################
42 | # NIM OPERATOR         #
43 | ########################
44 | 
45 | # install_nim_operator              = "false"
46 | # nim_operator_version              = "v1.0.1"
47 | # nim_operator_namespace            = "nim-operator"
48 | 


--------------------------------------------------------------------------------
/gke/terraform.tfvars:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | # Sample tfvars file. Uncomment out values to use
 5 | # Do not commit this file to Git with sensitive values
 6 | 
 7 | 
 8 | ########################
 9 | #  CLUSTER PARAMETERS  #
10 | ########################
11 | # Useful link: https://cloud.google.com/kubernetes-engine/docs/how-to/gpus#limitations
12 | 
13 | # MANDATORY PARAMETER:
14 | project_id                          = "xx-xxxx-xxxx"
15 | 
16 | 
17 | # Uncomment and Modify below parameters if needed:
18 | # cluster_name                      = "gke-cluster-tf"
19 | # min_master_version                = "1.32"
20 | # cpu_instance_type                 = "n1-standard-4"
21 | # cpu_max_node_count                = "5"
22 | # cpu_min_node_count                = "1"
23 | # disk_size_gb                      = "512"
24 | # gpu_count                         = "1"
25 | # gpu_instance_tags                 = []
26 | # gpu_instance_type                 = "n1-standard-4"
27 | # gpu_max_node_count                = "5"
28 | # gpu_min_node_count                = "2"
29 | # gpu_type                          = "nvidia-tesla-t4"
30 | # network                           = ""
31 | # num_cpu_nodes                     = 1
32 | # num_gpu_nodes                     = 2
33 | # region                            = "us-west1"
34 | # node_zones                        = ["us-west1-b"]
35 | # release_channel                   = "REGULAR"
36 | # subnetwork                        = ""
37 | # use_cpu_spot_instances            = false
38 | # use_gpu_spot_instances            = false
39 | # vpc_enabled                       = true
40 | 
41 | 
42 | ########################
43 | # GPU OPERATOR         #
44 | ########################
45 | 
46 | # install_gpu_operator              = "true"
47 | # gpu_operator_driver_version       = "570.124.06"
48 | # gpu_operator_namespace            = "gpu-operator"
49 | # gpu_operator_version              = "v25.3.0"
50 | 
51 | 
52 | ########################
53 | # NIM OPERATOR         #
54 | ########################
55 | 
56 | # install_nim_operator              = "false"
57 | # nim_operator_version              = "v1.0.1"
58 | # nim_operator_namespace            = "nim-operator"
59 | 
60 | 


--------------------------------------------------------------------------------
/eks/terraform.tfvars:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | # Sample tfvars file. Uncomment out values to use
 5 | # Do not commit this file to Git with sensitive values
 6 | 
 7 | 
 8 | ########################
 9 | #  CLUSTER PARAMETERS  #
10 | ########################
11 | # For Instances refer https://docs.aws.amazon.com/dlami/latest/devguide/gpu.html
12 | 
13 | 
14 | # cluster_name                          = "eks-cluster-tf"
15 | # cluster_version                       = "1.32"
16 | # aws_region                            = "us-west-2"
17 | 
18 | 
19 | # additional_node_security_groups_rules = {}
20 | # additional_security_group_ids         = []
21 | # additional_user_data                  = ""
22 | # aws_profile                           = "development"
23 | # cidr_block                            = "10.0.0.0/16"
24 | # cpu_instance_type                     = "t2.xlarge"
25 | # cpu_node_pool_additional_user_data    = ""
26 | # cpu_node_pool_delete_on_termination   = true
27 | # cpu_node_pool_root_disk_size_gb       = 512
28 | # cpu_node_pool_root_volume_type        = "gp2"
29 | # desired_count_cpu_nodes               = "1"
30 | # desired_count_gpu_nodes               = "2"
31 | # enable_dns_hostnames                  = true
32 | # enable_dns_support                    = true
33 | # enable_nat_gateway                    = true
34 | # existing_vpc_details                  = ""
35 | # gpu_ami_id                            = ""
36 | # gpu_instance_type                     = "g4dn.2xlarge"
37 | # gpu_node_pool_additional_user_data    = ""
38 | # gpu_node_pool_delete_on_termination   = true
39 | # gpu_node_pool_root_disk_size_gb       = 512
40 | # gpu_node_pool_root_volume_type        = "gp2"
41 | # max_cpu_nodes                         = "2"
42 | # max_gpu_nodes                         = "5"
43 | # min_cpu_nodes                         = "0"
44 | # min_gpu_nodes                         = "1"
45 | # private_subnets = [
46 | #   "10.0.0.0/19",
47 | #   "10.0.32.0/19",
48 | #   "10.0.64.0/19"
49 | # ]
50 | # public_subnets = [
51 | #   "10.0.96.0/19",
52 | #   "10.0.128.0/19",
53 | #   "10.0.160.0/19"
54 | # ]
55 | # single_nat_gateway = false
56 | # ssh_key            = ""
57 | 
58 | 
59 | ########################
60 | # GPU OPERATOR         #
61 | ########################
62 | 
63 | #install_gpu_operator                  = "true"
64 | #gpu_operator_driver_version           = "570.124.06"
65 | #gpu_operator_namespace                = "gpu-operator"
66 | #gpu_operator_version                  = "v25.3.0"
67 | 
68 | 
69 | ########################
70 | # NIM OPERATOR         #
71 | ########################
72 | 
73 | # install_nim_operator                  = "false"
74 | # nim_operator_version                  = "v1.0.0"
75 | # nim_operator_namespace                = "nim-operator"
76 | 
77 | 


--------------------------------------------------------------------------------
/aks/variables.tf:
--------------------------------------------------------------------------------
  1 | # SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  2 | # SPDX-License-Identifier: Apache-2.0
  3 | 
  4 | /****************************
  5 | Azure Resource Group Variables
  6 | ****************************/
  7 | 
  8 | variable "existing_resource_group_name" {
  9 |   description = "The name of an existing resource group the Kubernetes cluster should be deployed into. Defaults to the name of the cluster + `-rg` if none is specified"
 10 |   default     = null
 11 |   type        = string
 12 | }
 13 | 
 14 | variable "location" {
 15 |   description = "The region to create resources in"
 16 |   default     = "eastus"
 17 | }
 18 | 
 19 | /****************************
 20 | AKS Variables
 21 | ****************************/
 22 | 
 23 | variable "cluster_name" {
 24 |   default     = "aks-cluster-tf"
 25 |   description = "The name of the AKS Cluster to be created"
 26 | }
 27 | 
 28 | variable "kubernetes_version" {
 29 |   default     = "1.32"
 30 |   description = "Version of Kubernetes to turn on. Run 'az aks get-versions --location <location> --output table' to view all available versions "
 31 | }
 32 | 
 33 | variable "cpu_node_pool_disk_size" {
 34 |   description = "Disk size in GB of nodes in the Default GPU pool"
 35 |   default     = 100
 36 | }
 37 | 
 38 | variable "cpu_node_pool_count" {
 39 |   description = "Count of nodes in Default GPU pool"
 40 |   default     = 1
 41 | }
 42 | 
 43 | variable "cpu_node_pool_min_count" {
 44 |   description = "Min ount of number of nodes in Default CPU pool"
 45 |   default     = 1
 46 | }
 47 | variable "cpu_node_pool_max_count" {
 48 |   description = "Max count of nodes in Default CPU pool"
 49 |   default     = 5
 50 | }
 51 | variable "cpu_machine_type" {
 52 |   default     = "Standard_D16_v5"
 53 |   description = "Machine instance type of the AKS CPU node pool"
 54 | }
 55 | variable "cpu_os_sku" {
 56 |   description = "Specifies the OS SKU used by the agent pool. Possible values include: Ubuntu, CBLMariner, Mariner, Windows2019, Windows2022"
 57 |   default     = "Ubuntu"
 58 | }
 59 | 
 60 | /****************************
 61 | GPU Node Pool Variables
 62 | ****************************/
 63 | variable "gpu_node_pool_disk_size" {
 64 |   description = "Disk size in GB of nodes in the Default GPU pool"
 65 |   default     = 100
 66 | }
 67 | variable "gpu_node_pool_count" {
 68 |   description = "Count of nodes in Default GPU pool"
 69 |   default     = 2
 70 | }
 71 | variable "gpu_node_pool_min_count" {
 72 |   description = "Min count of number of nodes in Default GPU pool"
 73 |   default     = 2
 74 | }
 75 | variable "gpu_node_pool_max_count" {
 76 |   description = "Max count of nodes in Default GPU pool"
 77 |   default     = 5
 78 | }
 79 | variable "gpu_machine_type" {
 80 |   default     = "Standard_NC4as_T4_v3"
 81 |   description = "Machine instance type of the AKS GPU node pool"
 82 | }
 83 | variable "gpu_os_sku" {
 84 |   description = "Specifies the OS SKU used by the agent pool. Possible values include: Ubuntu, CBLMariner, Mariner, Windows2019, Windows2022"
 85 |   default     = "Ubuntu"
 86 | }
 87 | /****************************
 88 | GPU Operator Variables
 89 | ****************************/
 90 | variable "install_gpu_operator" {
 91 |   default     = "true"
 92 |   description = "Whether to Install GPU Operator. Defaults to false available."
 93 | }
 94 | 
 95 | variable "gpu_operator_version" {
 96 |   default     = "v25.3.0"
 97 |   description = "Version of the GPU operator to be installed"
 98 | }
 99 | 
100 | variable "gpu_operator_namespace" {
101 |   type        = string
102 |   default     = "gpu-operator"
103 |   description = "The namespace to deploy the NVIDIA GPU operator into"
104 | }
105 | 
106 | variable "gpu_operator_driver_version" {
107 |   type        = string
108 |   default     = "570.124.06"
109 |   description = "The NVIDIA Driver version deployed with GPU Operator. Defaults to latest available."
110 | }
111 | 
112 | /************************
113 |   NIM Operator Variables
114 | *************************/
115 | variable "install_nim_operator" {
116 |   default     = "false"
117 |   description = "Whether to Install NIM Operator. Defaults to false available."
118 | }
119 | 
120 | variable "nim_operator_version" {
121 |   default     = "v1.0.1"
122 |   description = "Version of the GPU Operator to deploy. Defaults to latest available."
123 | }
124 | 
125 | variable "nim_operator_namespace" {
126 |   type        = string
127 |   default     = "nim-operator"
128 |   description = "The namespace for the GPU operator deployment"
129 | }
130 | 
131 | /****************************
132 | Active Directory Variables
133 | ****************************/
134 | variable "admin_group_object_ids" {
135 |   type        = list(any)
136 |   description = <<EOH
137 |   (Required) A list of Object IDs (GUIDs) of Azure Active Directory Groups which should have Owner Role on the Cluster. 
138 |   This is not the email address of the group, the GUID can be found in the Azure panel by searching for the AD Group
139 |   NOTE: You will need Azure "Owner" role (not "Contributor") to attach an AD role to the Kubernetes cluster.
140 |   EOH
141 | }
142 | 
143 | 
144 | 
145 | /****************************
146 | Subscription ID Variables
147 | ****************************/
148 | variable "subscription_id" {
149 |   description = "Subscription ID"
150 | }
151 | 


--------------------------------------------------------------------------------
/gke/variables.tf:
--------------------------------------------------------------------------------
  1 | # SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  2 | # SPDX-License-Identifier: Apache-2.0
  3 | /***************************
  4 | GCP Variables
  5 | ***************************/
  6 | 
  7 | variable "project_id" {
  8 |   description = "GCP Project ID for the VPC and K8s Cluster. This module currently does not support projects with a SharedVPC"
  9 | }
 10 | 
 11 | variable "region" {
 12 |   description = "The Region resources (VPC, GKE, Compute Nodes) will be created in"
 13 |   default     = "us-west1"
 14 | }
 15 | 
 16 | variable "vpc_enabled" {
 17 |   default     = true
 18 |   type        = bool
 19 |   description = "Variable to control nvidia-kubernetes GKE module VPC creation"
 20 | }
 21 | 
 22 | variable "network" {
 23 |   default     = ""
 24 |   type        = string
 25 |   description = "Network CIDR for VPC"
 26 | }
 27 | 
 28 | variable "subnetwork" {
 29 |   type        = string
 30 |   default     = ""
 31 |   description = "Subnet name used for k8s cluster nodes"
 32 | }
 33 | 
 34 | /***************************
 35 | GKE Variables
 36 | ***************************/
 37 | variable "cluster_name" {
 38 |   default     = "gke-cluster-tf"
 39 |   description = "Name of the Kubernetes Cluster to provision"
 40 |   type        = string
 41 | }
 42 | 
 43 | variable "node_zones" {
 44 |   default     = ["us-west1-b"]
 45 |   description = "Specify zones to put nodes in (must be in same region defined above)"
 46 |   type        = list(any)
 47 | }
 48 | 
 49 | variable "release_channel" {
 50 |   default     = "REGULAR"
 51 |   description = "Configuration options for the Release channel feature, which provide more control over automatic upgrades of your GKE clusters. When updating this field, GKE imposes specific version requirements"
 52 | }
 53 | 
 54 | variable "min_master_version" {
 55 |   default     = "1.32"
 56 |   description = "The minimum cluster version of the master."
 57 | }
 58 | 
 59 | /***************************
 60 | GKE CPU Node Pool Variables
 61 | ***************************/
 62 | 
 63 | variable "cpu_min_node_count" {
 64 |   default     = "1"
 65 |   description = "Number of CPU nodes in CPU nodepool"
 66 | }
 67 | 
 68 | variable "cpu_max_node_count" {
 69 |   default     = "5"
 70 |   description = "Max Number of CPU nodes in CPU nodepool"
 71 | }
 72 | 
 73 | variable "use_cpu_spot_instances" {
 74 |   default     = false
 75 |   description = "Use Spot instance for CPU pool"
 76 | }
 77 | 
 78 | variable "cpu_instance_type" {
 79 |   default     = "n1-standard-4"
 80 |   description = "Machine Type for CPU node pool"
 81 | }
 82 | 
 83 | variable "num_cpu_nodes" {
 84 |   default     = 1
 85 |   description = "Number of CPU nodes when pool is created"
 86 | }
 87 | 
 88 | /***************************
 89 | GKE GPU  Node Pool Variables
 90 | ***************************/
 91 | variable "gpu_type" {
 92 |   default     = "nvidia-tesla-t4"
 93 |   description = "GPU SKU To attach to NVIDIA GPU Node (eg. nvidia-tesla-k80)"
 94 | }
 95 | variable "gpu_min_node_count" {
 96 |   default     = "2"
 97 |   description = "Min number of GPU nodes in GPU nodepool"
 98 | }
 99 | 
100 | variable "gpu_max_node_count" {
101 |   default     = "5"
102 |   description = "Max Number of GPU nodes in GPU nodepool"
103 | }
104 | 
105 | variable "use_gpu_spot_instances" {
106 |   default     = false
107 |   description = "Use Spot instance for GPU pool"
108 | }
109 | 
110 | variable "num_gpu_nodes" {
111 |   default     = 2
112 |   description = "Number of GPU nodes when pool is created"
113 | }
114 | 
115 | variable "gpu_count" {
116 |   default     = "1"
117 |   description = "Number of GPUs to attach to each node in GPU pool"
118 | }
119 | 
120 | variable "gpu_instance_type" {
121 |   default     = "n1-standard-4"
122 |   description = "Machine Type for GPU node pool"
123 | }
124 | 
125 | variable "gpu_instance_tags" {
126 |   type        = list(string)
127 |   default     = []
128 |   description = "GPU instance nodes tags"
129 | }
130 | variable "disk_size_gb" {
131 |   default = "512"
132 |   type    = string
133 | }
134 | 
135 | /***************************
136 | GPU Operator Variables
137 | ***************************/
138 | variable "install_gpu_operator" {
139 |   default     = "true"
140 |   description = "Whether to Install GPU Operator. Defaults to false available."
141 | }
142 | 
143 | variable "gpu_operator_version" {
144 |   default     = "v25.3.0"
145 |   description = "Version of the GPU Operator to deploy. Defaults to latest available"
146 | }
147 | 
148 | variable "gpu_operator_driver_version" {
149 |   type        = string
150 |   default     = "570.124.06"
151 |   description = "The NVIDIA Driver version deployed with GPU Operator. Defaults to latest available"
152 | }
153 | 
154 | variable "gpu_operator_namespace" {
155 |   type        = string
156 |   default     = "gpu-operator"
157 |   description = "The namespace to deploy the NVIDIA GPU operator into"
158 | }
159 | 
160 | /************************
161 |   NIM Operator Variables
162 | *************************/
163 | variable "install_nim_operator" {
164 |   default     = "false"
165 |   description = "Whether to Install NIM Operator. Defaults to false available."
166 | }
167 | 
168 | variable "nim_operator_version" {
169 |   default     = "v1.0.1"
170 |   description = "Version of the GPU Operator to deploy. Defaults to latest available"
171 | }
172 | 
173 | variable "nim_operator_namespace" {
174 |   type        = string
175 |   default     = "nim-operator"
176 |   description = "The namespace for the GPU operator deployment"
177 | }
178 | 
179 | 


--------------------------------------------------------------------------------
/aks/main.tf:
--------------------------------------------------------------------------------
  1 | # SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  2 | # SPDX-License-Identifier: Apache-2.0
  3 | 
  4 | data "azurerm_resource_group" "existing" {
  5 |   count = var.existing_resource_group_name == null ? 0 : 1
  6 |   name  = var.existing_resource_group_name
  7 | }
  8 | 
  9 | resource "azurerm_resource_group" "aks" {
 10 |   count    = var.existing_resource_group_name == null ? 1 : 0
 11 |   name     = "${var.cluster_name}-rg"
 12 |   location = var.location
 13 |   tags = {
 14 |     group      = "aks"
 15 |     managed_by = "Terraform"
 16 |   }
 17 | }
 18 | 
 19 | resource "azurerm_kubernetes_cluster" "aks" {
 20 |   name                = var.cluster_name
 21 |   kubernetes_version  = var.kubernetes_version
 22 |   resource_group_name = var.existing_resource_group_name == null ? azurerm_resource_group.aks[0].name : data.azurerm_resource_group.existing[0].name
 23 |   location            = var.existing_resource_group_name == null ? azurerm_resource_group.aks[0].location : data.azurerm_resource_group.existing[0].location
 24 |   dns_prefix          = var.cluster_name
 25 | 
 26 |   default_node_pool {
 27 |     name                = "akscpu"
 28 |     node_count          = var.cpu_node_pool_count
 29 |     vm_size             = var.cpu_machine_type
 30 |     os_disk_size_gb     = var.cpu_node_pool_disk_size
 31 |   }
 32 | 
 33 |   azure_active_directory_role_based_access_control {
 34 |     azure_rbac_enabled     = true
 35 |     admin_group_object_ids = var.admin_group_object_ids
 36 |   }
 37 | 
 38 |   identity {
 39 |     type = "SystemAssigned"
 40 |   }
 41 | 
 42 |   tags = {
 43 |     group      = "aks"
 44 |     managed_by = "Terraform"
 45 |   }
 46 | 
 47 |   // As the cluster is being created, have the az CLI update the users kubeconfig file
 48 |   provisioner "local-exec" {
 49 |     command = "az aks get-credentials --resource-group ${var.existing_resource_group_name == null ? azurerm_resource_group.aks[0].name : data.azurerm_resource_group.existing[0].name} --name ${var.cluster_name} --overwrite-existing"
 50 |   }
 51 | 
 52 |   provisioner "local-exec" {
 53 |     command = "kubelogin convert-kubeconfig -l azurecli"
 54 |   }
 55 | }
 56 | 
 57 | data "azurerm_kubernetes_cluster" "akscluster" {
 58 |   name                = azurerm_kubernetes_cluster.aks.name
 59 |   resource_group_name = var.existing_resource_group_name == null ? azurerm_resource_group.aks[0].name : data.azurerm_resource_group.existing[0].name
 60 |   depends_on          = [azurerm_kubernetes_cluster.aks]
 61 | }
 62 | 
 63 | resource "azurerm_kubernetes_cluster_node_pool" "aks" {
 64 |   depends_on            = [azurerm_kubernetes_cluster.aks]
 65 |   name                  = "aksgpu1"
 66 |   kubernetes_cluster_id = azurerm_kubernetes_cluster.aks.id
 67 |   node_count            = var.gpu_node_pool_count
 68 |   vm_size               = var.gpu_machine_type
 69 |   os_disk_size_gb       = var.gpu_node_pool_disk_size
 70 | 
 71 | # Enable autoscaling using min_count and max_count
 72 |   auto_scaling_enabled = true
 73 |   min_count = var.gpu_node_pool_min_count
 74 |   max_count = var.gpu_node_pool_max_count
 75 | 
 76 |   tags = {
 77 |     group      = "aks"
 78 |     managed_by = "Terraform"
 79 |     SkipGPUDriverInstall = true
 80 |   }
 81 | }
 82 | 
 83 | /***************************
 84 | Create GPU Operator Namespace
 85 | ***************************/
 86 | resource "kubernetes_namespace_v1" "gpu-operator" {
 87 |   metadata {
 88 |     annotations = {
 89 |       name = "gpu-operator"
 90 |     }
 91 | 
 92 |     labels = {
 93 |       cluster    = var.cluster_name
 94 |       managed_by = "Terraform"
 95 |     }
 96 | 
 97 |     name = var.gpu_operator_namespace
 98 |   }
 99 | }
100 | 
101 | /***************************
102 | GPU Operator Configuration
103 | ***************************/
104 | resource "helm_release" "gpu-operator" {
105 |   depends_on       = [azurerm_kubernetes_cluster_node_pool.aks, kubernetes_namespace_v1.gpu-operator]
106 |   count            = var.install_gpu_operator ? 1 : 0
107 |   name             = "gpu-operator"
108 |   repository       = "https://helm.ngc.nvidia.com/nvidia"
109 |   chart            = "gpu-operator"
110 |   version          = var.gpu_operator_version
111 |   namespace        = var.gpu_operator_namespace
112 |   create_namespace = false
113 |   atomic           = true
114 |   cleanup_on_fail  = true
115 |   reset_values     = true
116 |   replace          = true
117 | 
118 |   set {
119 |     name  = "toolkit.enabled"
120 |     value = "true"
121 |   }
122 | 
123 |   set {
124 |     name  = "operator.cleanupCRD"
125 |     value = "true"
126 |   }
127 | 
128 |   set {
129 |     name  = "driver.enabled"
130 |     value = "true"
131 |   }
132 | 
133 |   set {
134 |     name  = "driver.version"
135 |     value = var.gpu_operator_driver_version
136 |   }
137 | 
138 | }
139 | 
140 | /***************************
141 | Create NIM Operator Namespace
142 | ***************************/
143 | resource "kubernetes_namespace_v1" "nim-operator" {
144 |   metadata {
145 |     annotations = {
146 |       name = "nim-operator"
147 |     }
148 | 
149 |     labels = {
150 |       cluster    = var.cluster_name
151 |       managed_by = "Terraform"
152 |     }
153 | 
154 |     name = var.nim_operator_namespace
155 |   }
156 | }
157 | 
158 | 
159 | /********************************************
160 |  NIM Operator Configuration
161 | ********************************************/
162 | resource "helm_release" "nim_operator" {
163 |   depends_on       = [azurerm_kubernetes_cluster_node_pool.aks, kubernetes_namespace_v1.nim-operator]
164 |   count            = var.install_nim_operator ? 1 : 0
165 |   name             = "nim-operator"
166 |   repository       = "https://helm.ngc.nvidia.com/nvidia"
167 |   chart            = "k8s-nim-operator"
168 |   version          = var.nim_operator_version
169 |   namespace        = var.nim_operator_namespace
170 |   create_namespace = true
171 |   atomic           = true
172 |   cleanup_on_fail  = true
173 |   reset_values     = true
174 |   replace          = true
175 | }
176 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
  1 | 
  2 | ## NVIDIA Terraform Modules OSS Contribution Rules
  3 | 
  4 | #### Issue Tracking
  5 | 
  6 | * All enhancement, bugfix, or change requests must begin with the creation of an [Issue Request](https://github.com/nvidia/nvidia-terraform-modules/issues).
  7 |   * The issue request must be reviewed by NVIDIA engineers and approved prior to code review.
  8 | 
  9 | 
 10 | #### Coding Guidelines
 11 | 
 12 | - All source code contributions must be formatted prior to checkin by running `terraform fmt -recursive`, and changes should not break validation checks provided by the `terraform validate` command.
 13 | 
 14 | - When updating a variable, output, or provider, update the documentation accordingly. We use [terraform-docs](https://terraform-docs.io/) to generate the Terraform documentation.
 15 |     - Run `terraform-docs markdown .` to generate the documentation and replace the bottom of the README section.
 16 |     - Run `terraform-docs tfvars hcl .` to generate the tfvar defaults, replace the existing comments of `terraform.tfvars` file.
 17 | 
 18 | - In addition, please follow the existing conventions in the relevant file, submodule, module, and project when you add new code or when you extend/fix existing functionality.
 19 | - Avoid introducing unnecessary complexity into existing code so that maintainability and readability are preserved.
 20 | 
 21 | - Try to keep pull requests (PRs) as concise as possible:
 22 |   - Avoid committing commented-out code.
 23 |   - Wherever possible, each PR should address a single concern. If there are several otherwise-unrelated things that should be fixed to reach a desired endpoint, our recommendation is to open several PRs and indicate the dependencies in the description. The more complex the changes are in a single PR, the more time it will take to review those changes.
 24 | 
 25 | - Write commit titles using imperative mood and [these rules](https://chris.beams.io/posts/git-commit/), and reference the Issue number corresponding to the PR. Following is the recommended format for commit texts:
 26 | ```
 27 | #<Issue Number> - <Commit Title>
 28 | 
 29 | <Commit Body>
 30 | ```
 31 | 
 32 | - Make sure that you can contribute your work to open source (no license and/or patent conflict is introduced by your code). You will need to [`sign`](#signing-your-work) your commit.
 33 | 
 34 | - Thanks in advance for your patience as we review your contributions; we do appreciate them!
 35 | 
 36 | 
 37 | #### Pull Requests
 38 | Developer workflow for code contributions is as follows:
 39 | 
 40 | 1. Developers must first [fork](https://help.github.com/en/articles/fork-a-repo) the [upstream](https://github.com/nvidia/nvidia-terraform-modules) NVIDIA Terraform Modules repository.
 41 | 
 42 | 2. Git clone the forked repository and push changes to the personal fork.
 43 | 
 44 |   ```bash
 45 | git clone https://github.com/YOUR_USERNAME/YOUR_FORK.git NVIDIAK8s
 46 | # Checkout the targeted branch and commit changes
 47 | # Push the commits to a branch on the fork (remote).
 48 | git push -u origin <local-branch>:<remote-branch>
 49 |   ```
 50 | 
 51 | 3. Once the code changes are staged on the fork and ready for review, a [Pull Request](https://help.github.com/en/articles/about-pull-requests) (PR) can be [requested](https://help.github.com/en/articles/creating-a-pull-request) to merge the changes from a branch of the fork into a selected branch of upstream.
 52 |   * Exercise caution when selecting the source and target branches for the PR.
 53 |     Note that versioned releases of TensorRT OSS are posted to `release/` branches of the upstream repo.
 54 |   * Creation of a PR creation kicks off the code review process.
 55 |   * Atleast one TensorRT engineer will be assigned for the review.
 56 |   * While under review, mark your PRs as work-in-progress by prefixing the PR title with [WIP].
 57 | 
 58 | 4. Since there is no CI/CD process in place yet, the PR will be accepted and the corresponding issue closed only after adequate testing has been completed, manually, by the developer and/or NVIDIA engineer reviewing the code.
 59 | 
 60 | 
 61 | #### Signing Your Work
 62 | 
 63 | * We require that all contributors "sign-off" on their commits. This certifies that the contribution is your original work, or you have rights to submit it under the same license, or a compatible license.
 64 | 
 65 |   * Any contribution which contains commits that are not Signed-Off will not be accepted.
 66 | 
 67 | * To sign off on a commit you simply use the `--signoff` (or `-s`) option when committing your changes:
 68 |   ```bash
 69 |   $ git commit -s -m "Add cool feature."
 70 |   ```
 71 |   This will append the following to your commit message:
 72 |   ```
 73 |   Signed-off-by: Your Name <your@email.com>
 74 |   ```
 75 | 
 76 | * Full text of the DCO:
 77 | 
 78 |   ```
 79 |     Developer Certificate of Origin
 80 |     Version 1.1
 81 |     
 82 |     Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
 83 |     1 Letterman Drive
 84 |     Suite D4700
 85 |     San Francisco, CA, 94129
 86 |     
 87 |     Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.
 88 |   ```
 89 | 
 90 |   ```
 91 |     Developer's Certificate of Origin 1.1
 92 |     
 93 |     By making a contribution to this project, I certify that:
 94 |     
 95 |     (a) The contribution was created in whole or in part by me and I have the right to submit it under the open source license indicated in the file; or
 96 |     
 97 |     (b) The contribution is based upon previous work that, to the best of my knowledge, is covered under an appropriate open source license and I have the right under that license to submit that work with modifications, whether created in whole or in part by me, under the same open source license (unless I am permitted to submit under a different license), as indicated in the file; or
 98 |     
 99 |     (c) The contribution was provided directly to me by some other person who certified (a), (b) or (c) and I have not modified it.
100 |     
101 |     (d) I understand and agree that this project and the contribution are public and that a record of the contribution (including all personal information I submit with it, including my sign-off) is maintained indefinitely and may be redistributed consistent with this project or the open source license(s) involved.
102 |   ```


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # NVIDIA Terraform Kubernetes Modules
  2 | 
  3 | ## Objective 
  4 | 
  5 | NVIDIA Terraform Modules provide a reference architecture for deploying CSP managed Kubernetes clusters equipped with NVIDIA softwares: 
  6 | 
  7 | - NVIDIA GPU Operator.
  8 | - NVIDIA NIM Operator.
  9 | 
 10 | All the components listed below have been tested successfully together.
 11 | 
 12 | 
 13 | ## Life Cycle 
 14 | 
 15 | When NVIDIA Terraform Modules is released, the previous release enters maintenance support and only receives patch release updates. All prior batches enter end-of-life (EOL) and are no longer supported and do not receive patch updates.
 16 | 
 17 | 
 18 | |  Release  | Status              |
 19 | | :-----: | :--------------:|
 20 | | [25.4.0](https://github.com/NVIDIA/cloud-native-stack/releases/tag/v25.4.0)                   | Generally Available |
 21 | | [24.11.0](https://github.com/NVIDIA/cloud-native-stack/releases/tag/v24.11.0)                   | Maintenance | 
 22 | 
 23 | 
 24 | ## Support Matrix
 25 | 
 26 | The Kubernetes clusters provisioned by the modules in this repository provide tested and certified versions of Kubernetes, the NVIDIA GPU operator, and NVIDIA NIM Operator.
 27 | 
 28 | If your application does not require a specific version of Kubernetes, we recommend using the latest available version. We also recommend you plan to upgrade your version of Kubernetes at least every 6 months.
 29 | 
 30 | 
 31 | NVIDIA Terraform Modules 25.4.0 Release:
 32 | 
 33 | | TF Modules               | K8s 1.32                                   | K8s 1.31                                   | K8s 1.30 |
 34 | | :---------               | :--------                                  | :-------                                   | :------- |
 35 | | Platforms                | Amazon EKS <br> Azure AKS <br> Google GKE  | Amazon EKS <br> Azure AKS <br> Google GKE  | Amazon EKS <br> Azure AKS <br> Google GKE  |
 36 | | Supported OS             | Ubuntu 24.04 LTS (GKE) <br> Ubuntu 22.04 LTS (AKS/EKS)     | Ubuntu 24.04 LTS (GKE) <br> Ubuntu 22.04 LTS (AKS/EKS)     | Ubuntu 24.04 LTS (GKE) <br> Ubuntu 22.04 LTS (AKS/EKS)     |
 37 | | Kernel                   | EKS: 6.8.0-1024-aws <br> AKS: 5.15.0-1082-azure <br> GKE: 6.8.0-1017-gke | EKS: 6.8.0-1024-aws <br> AKS: 5.15.0-1082-azure <br> GKE: 6.8.0-1017-gke | EKS: 6.8.0-1024-aws <br> AKS: 5.15.0-1082-azure <br> GKE: 6.8.0-1017-gke |
 38 | | Containerd               | EKS: 1.7.24 <br> AKS: 1.7.26-1 <br> GKE: 1.7.24 | EKS: 1.7.24 <br> AKS: 1.7.26-1 <br> GKE: 1.7.24 | EKS: 1.7.24 <br> AKS: 1.7.26-1 <br> GKE: 1.7.24 |
 39 | | CNI                      | CSP dependent                              | CSP dependent                              | CSP dependent                              |
 40 | | CSI                      | CSP dependent                              | CSP dependent                              | CSP dependent                              |
 41 | | NVIDIA GPU Operator      | 25.3.0                                     | 25.3.0                                     | 25.3.0                                     |
 42 | | NVIDIA GPU Operator Operands | NVIDIA Container Toolkit: 1.17.5 <br> NVIDIA Device Plugin: 0.17.1 <br> NVIDIA MIG Manager: 0.12.1 <br> NVIDIA DCGM Exporter: 4.1.1-4.0.4 | NVIDIA Container Toolkit: 1.17.5 <br> NVIDIA Device Plugin: 0.17.1 <br> NVIDIA MIG Manager: 0.12.1 <br> NVIDIA DCGM Exporter: 4.1.1-4.0.4 | NVIDIA Container Toolkit: 1.17.5 <br> NVIDIA Device Plugin: 0.17.1 <br> NVIDIA MIG Manager: 0.12.1 <br> NVIDIA DCGM Exporter: 4.1.1-4.0.4 |
 43 | | NVIDIA DataCenter Driver | 570.124.06                                 | 570.124.06                                 | 570.124.06                                 |
 44 | | NVIDIA NIM Operator      | 1.0.1                                      | 1.0.1                                      | 1.0.1                                      | 
 45 | | Helm                     | 3.17.2                                     | 3.17.2                                     | 3.17.2                                     |
 46 | 
 47 | 
 48 | ### CSP Managed K8s Services Life Cycle
 49 | 
 50 | Each CSP has its own end of life date for the versions of Kubernetes they support. For more information see:
 51 | 
 52 | - [Amazon EKS release calendar](https://docs.aws.amazon.com/eks/latest/userguide/kubernetes-versions.html#kubernetes-release-calendar)
 53 | - [Azure AKS release calendar](https://learn.microsoft.com/en-us/azure/aks/supported-kubernetes-versions?tabs=azure-cli#aks-kubernetes-release-calendar)
 54 | - [GCP GKE release calendar](https://cloud.google.com/kubernetes-engine/docs/release-schedule#schedule_for_static_no-channel_versions).
 55 | 
 56 | 
 57 | ## Getting Started
 58 | 
 59 | Infrastructure as code for GPU accelerated managed Kubernetes clusters. These scripts automate the deployment of GPU-Enabled Kubernetes clusters on various cloud service platforms.
 60 | 
 61 | Terraform is an open-source infrastructure as code software tool that we will use to automate the deployment of Kubernetes clusters with the required add-ons to enable NVIDIA GPUs. This repository contains Terraform [modules](https://developer.hashicorp.com/terraform/tutorials/modules/module), which are sets of Terraform configuration files ready for deployment. The modules in this repository can be incorporated into existing Terraform-managed infrastructure, or used to set up new infrastructure from scratch. You can learn more about Terraform [here](https://developer.hashicorp.com/terraform/tutorials/aws-get-started/infrastructure-as-code).
 62 | 
 63 | You can download Terraform (CLI) [here](https://developer.hashicorp.com/terraform/downloads).
 64 | 
 65 | ### Usage
 66 | 
 67 | Clone the repo
 68 |         
 69 |   ```
 70 |   git clone https://github.com/NVIDIA/nvidia-terraform-modules.git
 71 |   ```
 72 | 
 73 | #### Provision a GPU enabled Kubernetes Cluster
 74 | 
 75 | Select the CSP managed K8s cluster and follow steps indicated in the corresponding page:
 76 | 
 77 | - Create an [EKS Cluster](./eks/README.md)
 78 | - Create an [AKS Cluster](./aks/README.md)
 79 | - Create a [GKE Cluster](./gke/README.md)
 80 | 
 81 | 
 82 | ### State Management
 83 | These modules do not set up state management for the generated Terraform state file, deleting the statefile (`terraform.tfstate`) generated by Terraform could result in cloud resources needing to be manually deleted. We strongly encourage you [configure remote state](https://developer.hashicorp.com/terraform/language/state/remote).
 84 | 
 85 | Please see the [Terraform Documentation](https://developer.hashicorp.com/terraform/language/state) for more information.
 86 | 
 87 | ## Contributing
 88 | 
 89 | Pull requests are welcome! Please see our [contribution guidelines](./CONTRIBUTING.md).
 90 | 
 91 | ## Getting help or Providing feedback
 92 | 
 93 | Please open an [issue](https://github.com/NVIDIA/nvidia-terraform-modules/issues) on the GitHub project for any questions. Your feedback is appreciated.
 94 | 
 95 | 
 96 | ## Useful Links
 97 | - [NVIDIA GPU Operator](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/overview.html)
 98 | - [NVIDIA NIM Operator](https://docs.nvidia.com/nim-operator/latest/index.html)
 99 | - [NVIDIA GPU Cloud (NGC)](https://catalog.ngc.nvidia.com/)
100 | 


--------------------------------------------------------------------------------
/eks/main.tf:
--------------------------------------------------------------------------------
  1 | # SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  2 | # SPDX-License-Identifier: Apache-2.0
  3 | 
  4 | /********************************************
  5 |   Network Config
  6 | ********************************************/
  7 | module "vpc" {
  8 |   source  = "terraform-aws-modules/vpc/aws"
  9 |   version = "4.0.2"
 10 |   count   = var.existing_vpc_details == null ? 1 : 0
 11 |   name    = "tf-${var.cluster_name}-vpc"
 12 |   cidr    = var.cidr_block
 13 |   azs     = data.aws_availability_zones.available.names
 14 |   # FUTURE: Make configurable, or set statically for the max number of pods a cluster can handle
 15 |   private_subnets         = var.private_subnets
 16 |   public_subnets          = var.public_subnets
 17 |   enable_nat_gateway      = var.enable_nat_gateway
 18 |   single_nat_gateway      = var.single_nat_gateway # Future: Revisit the VPC defaults
 19 |   enable_dns_hostnames    = var.enable_dns_hostnames
 20 |   map_public_ip_on_launch = true
 21 | }
 22 | 
 23 | 
 24 | /********************************************
 25 |   Kubernetes Cluster Configuration
 26 | ********************************************/
 27 | 
 28 | locals {
 29 |   node_security_group_additional_rules = {
 30 |     ingress_self_all = {
 31 |       description = "Node to node ingress, no external ingress"
 32 |       protocol    = "-1"
 33 |       from_port   = 0
 34 |       to_port     = 0
 35 |       type        = "ingress"
 36 |       self        = true
 37 |     }
 38 | 
 39 |     egress_all = {
 40 |       description      = "Node egress to open internet"
 41 |       protocol         = "-1"
 42 |       from_port        = 0
 43 |       to_port          = 0
 44 |       type             = "egress"
 45 |       cidr_blocks      = ["0.0.0.0/0"]
 46 |       ipv6_cidr_blocks = ["::/0"]
 47 |     }
 48 |   }
 49 | }
 50 | 
 51 | module "eks" {
 52 |   source                          = "terraform-aws-modules/eks/aws"
 53 |   version                         = "18.29.0"
 54 |   cluster_name                    = "tf-${var.cluster_name}"
 55 |   cluster_version                 = var.cluster_version
 56 |   cluster_endpoint_private_access = true
 57 |   cluster_endpoint_public_access  = true
 58 |   create_cloudwatch_log_group     = false
 59 |   vpc_id                          = var.existing_vpc_details == null ? module.vpc[0].vpc_id : var.existing_vpc_details.vpc_id
 60 |   enable_irsa                     = true
 61 |   subnet_ids                      = var.existing_vpc_details == null ? module.vpc[0].private_subnets : var.existing_vpc_details.subnet_ids
 62 |   control_plane_subnet_ids        = var.existing_vpc_details == null ? module.vpc[0].private_subnets : var.existing_vpc_details.subnet_ids
 63 |   # KMS Config
 64 |   create_kms_key                  = true
 65 |   enable_kms_key_rotation         = true
 66 |   kms_key_deletion_window_in_days = 7
 67 |   kms_key_enable_default_policy   = true
 68 |   cluster_encryption_config = [
 69 |     {
 70 |       resources : ["secrets"]
 71 |     }
 72 |   ]
 73 |   # Cluster Security Group
 74 |   cluster_security_group_additional_rules = {
 75 |     egress_nodes_ephemeral_ports_tcp = {
 76 |       description                = "Control plane egress to nodes on TCP Ports 1025-65535"
 77 |       protocol                   = "tcp"
 78 |       from_port                  = 1025
 79 |       to_port                    = 65535
 80 |       type                       = "egress"
 81 |       source_node_security_group = true
 82 |     }
 83 |   }
 84 |   # NodeGroup Config
 85 |   node_security_group_additional_rules = merge(local.node_security_group_additional_rules, var.additional_node_security_groups_rules)
 86 | 
 87 |   eks_managed_node_groups = {
 88 |     gpu_node_pool = {
 89 |       name                       = "tf-gpu"
 90 |       instance_types             = [var.gpu_instance_type]
 91 |       min_size                   = var.min_gpu_nodes
 92 |       max_size                   = var.max_gpu_nodes
 93 |       desired_size               = var.desired_count_gpu_nodes
 94 |       ami_id                     = data.aws_ami.lookup.id
 95 |       ami_type                   = "CUSTOM"
 96 |       enable_bootstrap_user_data = true
 97 |       post_bootstrap_user_data   = var.gpu_node_pool_additional_user_data
 98 |       vpc_security_group_ids     = var.existing_vpc_details == null ? [] : var.additional_security_group_ids
 99 |       block_device_mappings = {
100 |         root = {
101 |           device_name = data.aws_ami.lookup.root_device_name
102 |           ebs = {
103 |             volume_size           = var.gpu_node_pool_root_disk_size_gb
104 |             volume_type           = var.gpu_node_pool_root_volume_type
105 |             delete_on_termination = var.gpu_node_pool_delete_on_termination
106 |           }
107 |         }
108 |       }
109 |       ssh_key = var.ssh_key
110 | 
111 |     },
112 |     cpu_node_pool = {
113 |       name                   = "tf-cpu"
114 |       instance_types         = [var.cpu_instance_type]
115 |       min_size               = var.min_cpu_nodes
116 |       max_size               = var.max_cpu_nodes
117 |       desired_size           = var.desired_count_cpu_nodes
118 |       vpc_security_group_ids = var.existing_vpc_details == null ? [] : var.additional_security_group_ids
119 |       ssh_key                = var.ssh_key
120 |       block_device_mappings = {
121 |         root = {
122 |           device_name = "/dev/xvda"
123 |           ebs = {
124 |             volume_size           = var.cpu_node_pool_root_disk_size_gb
125 |             volume_type           = var.cpu_node_pool_root_volume_type
126 |             delete_on_termination = var.cpu_node_pool_delete_on_termination
127 |           }
128 |         }
129 |       }
130 |     }
131 |   }
132 |   # Cluster Add-on Config
133 |   cluster_addons = {
134 |     aws-ebs-csi-driver = {
135 |       service_account_role_arn = module.ebs_csi_irsa_role.iam_role_arn
136 |       most_recent              = true
137 |     }
138 |   }
139 | }
140 | 
141 | /********************************************
142 |   Configure AWS Role for Service Account
143 |   for EKS CSI Driver
144 | ********************************************/
145 | module "ebs_csi_irsa_role" {
146 |   source                = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
147 |   role_name             = "${var.cluster_name}-ebs-csi"
148 |   attach_ebs_csi_policy = true
149 |   oidc_providers = {
150 |     cluster = {
151 |       provider_arn               = module.eks.oidc_provider_arn
152 |       namespace_service_accounts = ["kube-system:ebs-csi-controller-sa"]
153 |     }
154 |   }
155 | }
156 | 
157 | /********************************************
158 |   Custom AMI Lookup
159 | ********************************************/
160 | locals {
161 |   ubuntu_ami_lookup = {
162 |     owners = ["099720109477"] # Canonical
163 |     filters = [
164 |       {
165 |         name   = "name"
166 |         values = ["ubuntu-eks/k8s_${var.cluster_version}/images/hvm-ssd/ubuntu-jammy-22.04-amd64-server-*"]
167 |       },
168 |       {
169 |         name   = "virtualization-type"
170 |         values = ["hvm"]
171 |       }
172 |     ]
173 |   }
174 |   no_ami_lookup = {
175 |     owners  = []
176 |     filters = []
177 |   }
178 |   ami_lookup = var.gpu_ami_id == "" ? local.ubuntu_ami_lookup : local.no_ami_lookup
179 |   ami_id     = var.gpu_ami_id == "" ? data.aws_ami.lookup.id : var.gpu_ami_id
180 | }
181 | 
182 | 
183 | /********************************************
184 |   GPU Operator Configuration
185 | ********************************************/
186 | resource "helm_release" "gpu_operator" {
187 |   count            = var.install_gpu_operator ? 1 : 0
188 |   name             = "gpu-operator"
189 |   repository       = "https://helm.ngc.nvidia.com/nvidia"
190 |   chart            = "gpu-operator"
191 |   version          = var.gpu_operator_version
192 |   namespace        = var.gpu_operator_namespace
193 |   create_namespace = true
194 |   atomic           = true
195 |   cleanup_on_fail  = true
196 |   reset_values     = true
197 |   replace          = true
198 | 
199 |   set {
200 |     name  = "driver.version"
201 |     value = var.gpu_operator_driver_version
202 |   }
203 | 
204 | }
205 | 
206 | /********************************************
207 |  NIM Operator Configuration
208 | ********************************************/
209 | resource "helm_release" "nim_operator" {
210 |   count            = var.install_nim_operator ? 1 : 0
211 |   name             = "nim-operator"
212 |   repository       = "https://helm.ngc.nvidia.com/nvidia"
213 |   chart            = "k8s-nim-operator"
214 |   version          = var.nim_operator_version
215 |   namespace        = var.nim_operator_namespace
216 |   create_namespace = true
217 |   atomic           = true
218 |   cleanup_on_fail  = true
219 |   reset_values     = true
220 |   replace          = true
221 | }
222 | 


--------------------------------------------------------------------------------
/eks/variables.tf:
--------------------------------------------------------------------------------
  1 | # SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  2 | # SPDX-License-Identifier: Apache-2.0
  3 | 
  4 | /************************
  5 |   AWS Variables
  6 | *************************/
  7 | 
  8 | variable "aws_profile" {
  9 |   type        = string
 10 |   default     = "development"
 11 |   description = ""
 12 | }
 13 | 
 14 | variable "aws_region" {
 15 |   default     = "us-west-2"
 16 |   description = "AWS region to provision the Kubernetes Cluster"
 17 | }
 18 | 
 19 | 
 20 | variable "cluster_name" {
 21 |   default     = "eks-cluster-tf"
 22 |   type = string
 23 | }
 24 | 
 25 | variable "cluster_version" {
 26 |   type        = string
 27 |   default     = "1.32"
 28 |   description = "Version of EKS to install on the control plane (Major and Minor version only, do not include the patch)"
 29 | }
 30 | /************************
 31 |   GPU Operator Variables
 32 | *************************/
 33 | variable "install_gpu_operator" {
 34 |   default     = "true"
 35 |   description = "Whether to Install GPU Operator. Defaults to false available."
 36 | }
 37 | 
 38 | variable "gpu_operator_version" {
 39 |   default     = "v25.3.0"
 40 |   description = "Version of the GPU Operator to deploy. Defaults to latest available. "
 41 | }
 42 | 
 43 | variable "gpu_operator_driver_version" {
 44 |   type        = string
 45 |   default     = "570.124.06"
 46 |   description = "The NVIDIA Driver version deployed with GPU Operator. Defaults to latest available."
 47 | }
 48 | 
 49 | variable "gpu_operator_namespace" {
 50 |   type        = string
 51 |   default     = "gpu-operator"
 52 |   description = "The namespace for the GPU operator deployment"
 53 | }
 54 | 
 55 | /************************
 56 |   NIM Operator Variables
 57 | *************************/
 58 | variable "install_nim_operator" {
 59 |   default     = "false"
 60 |   description = "Whether to Install NIM Operator. Defaults to false available."
 61 | }
 62 | 
 63 | variable "nim_operator_version" {
 64 |   default     = "v1.0.1"
 65 |   description = "Version of the GPU Operator to deploy. Defaults to latest available."
 66 | }
 67 | 
 68 | variable "nim_operator_namespace" {
 69 |   type        = string
 70 |   default     = "nim-operator"
 71 |   description = "The namespace for the GPU operator deployment"
 72 | }
 73 | 
 74 | /*****************************
 75 |   Managed Node Pool Variables
 76 | ******************************/
 77 | 
 78 | /******************************
 79 |   GPU-only  Node Pool Variables
 80 | *******************************/
 81 | variable "gpu_ami_id" {
 82 |   type        = string
 83 |   description = "AMI ID of the EKS Ubuntu Image cooresponding to the region and version of the cluser. Not required as we do a lookup for this image"
 84 |   default     = ""
 85 | }
 86 | 
 87 | variable "gpu_instance_type" {
 88 |   type        = string
 89 |   default     = "g4dn.2xlarge"
 90 |   description = "GPU EC2 worker node instance type"
 91 | }
 92 | 
 93 | variable "max_gpu_nodes" {
 94 |   type        = string
 95 |   default     = "5"
 96 |   description = "Maximum number of GPU nodes in the Autoscaling Group"
 97 | }
 98 | 
 99 | variable "min_gpu_nodes" {
100 |   type        = string
101 |   default     = "2"
102 |   description = "Minimum number of GPU nodes in the Autoscaling Group"
103 | }
104 | 
105 | variable "desired_count_gpu_nodes" {
106 |   type        = string
107 |   default     = "2"
108 |   description = "Minimum number of GPU nodes in the Autoscaling Group"
109 | }
110 | 
111 | variable "gpu_node_pool_root_disk_size_gb" {
112 |   type        = number
113 |   default     = 512
114 |   description = "The size of the root disk on all GPU nodes in the EKS-managed GPU-only Node Pool. This is primarily for container image storage on the node"
115 | }
116 | 
117 | variable "gpu_node_pool_root_volume_type" {
118 |   type        = string
119 |   default     = "gp2"
120 |   description = "The type of disk to use for the GPU node pool root disk (eg. gp2, gp3). Note, this is different from the type of disk used by applications via EKS Storage classes/PVs & PVCs"
121 | }
122 | 
123 | variable "gpu_node_pool_delete_on_termination" {
124 |   type        = bool
125 |   default     = true
126 |   description = "Delete the VM nodes root filesystem on each node of the instance type. This is set to true by default, but can be changed when desired when using the 'local-storage provisioner' and are keeping important application data on the nodes"
127 | }
128 | 
129 | variable "gpu_node_pool_additional_user_data" {
130 |   type        = string
131 |   default     = ""
132 |   description = "User data that is appended to the user data script after of the EKS bootstrap script on EKS-managed GPU node pool."
133 | }
134 | 
135 | /************************
136 |   CPU-only  Node Pool Variables
137 | *************************/
138 | 
139 | variable "cpu_instance_type" {
140 |   type        = string
141 |   default     = "t2.xlarge"
142 |   description = "CPU EC2 worker node instance type"
143 | }
144 | 
145 | variable "cpu_node_pool_root_disk_size_gb" {
146 |   type        = number
147 |   default     = 512
148 |   description = "The size of the root disk on all GPU nodes in the EKS-managed GPU-only Node Pool. This is primarily for container image storage on the node"
149 | }
150 | 
151 | variable "cpu_node_pool_root_volume_type" {
152 |   type        = string
153 |   default     = "gp2"
154 |   description = "The type of disk to use for the GPU node pool root disk (eg. gp2, gp3). Note, this is different from the type of disk used by applications via EKS Storage classes/PVs & PVCs"
155 | }
156 | 
157 | variable "cpu_node_pool_delete_on_termination" {
158 |   type        = bool
159 |   default     = true
160 |   description = "Delete the VM nodes root filesystem on each node of the instance type. This is set to true by default, but can be changed when desired when using the 'local-storage provisioner' and are keeping important application data on the nodes"
161 | }
162 | 
163 | variable "cpu_node_pool_additional_user_data" {
164 |   type        = string
165 |   default     = ""
166 |   description = "User data that is appended to the user data script after of the EKS bootstrap script on EKS-managed GPU node pool."
167 | }
168 | 
169 | variable "max_cpu_nodes" {
170 |   type        = string
171 |   default     = "2"
172 |   description = "Maximum number of CPU nodes in the Autoscaling Group"
173 | }
174 | 
175 | variable "min_cpu_nodes" {
176 |   type        = string
177 |   default     = "0"
178 |   description = "Minimum number of CPU nodes in the Autoscaling Group"
179 | }
180 | 
181 | variable "desired_count_cpu_nodes" {
182 |   type        = string
183 |   default     = "1"
184 |   description = "Minimum number of CPU nodes in the Autoscaling Group"
185 | }
186 | 
187 | 
188 | variable "existing_vpc_details" {
189 |   type = object({
190 |     vpc_id     = string
191 |     subnet_ids = list(string)
192 |   })
193 |   default     = null
194 |   description = "Variables used for re-using existing VPC for vpc_id & subnet_id"
195 | }
196 | 
197 | variable "cidr_block" {
198 |   type        = string
199 |   default     = "10.0.0.0/16"
200 |   description = "CIDR for VPC"
201 | }
202 | 
203 | variable "additional_user_data" {
204 |   type        = string
205 |   default     = ""
206 |   description = "User data that is appended to the user data script after of the EKS bootstrap script."
207 | }
208 | 
209 | variable "private_subnets" {
210 |   type        = list(any)
211 |   description = "List of subnet ranges for the Private VPC"
212 |   default     = ["10.0.0.0/19", "10.0.32.0/19", "10.0.64.0/19"]
213 | }
214 | 
215 | variable "public_subnets" {
216 |   type        = list(any)
217 |   description = "List of subnet ranges for the Private VPC"
218 |   default     = ["10.0.96.0/19", "10.0.128.0/19", "10.0.160.0/19"]
219 | }
220 | 
221 | variable "ssh_key" {
222 |   type    = string
223 |   default = ""
224 | }
225 | 
226 | variable "enable_nat_gateway" {
227 |   description = "Should be true if you want to provision NAT Gateways for each of your private networks"
228 |   default     = true
229 |   type        = bool
230 | }
231 | 
232 | variable "single_nat_gateway" {
233 |   type        = bool
234 |   description = "Should be true if you want to provision a single shared NAT Gateway across all of your private networks"
235 |   default     = false
236 | }
237 | 
238 | variable "enable_dns_support" {
239 |   type        = bool
240 |   default     = true
241 |   description = "Whether or not the Default VPC has DNS support"
242 | }
243 | 
244 | variable "enable_dns_hostnames" {
245 |   description = "Whether or not the Default VPC has DNS hostname support"
246 |   default     = true
247 |   type        = bool
248 | }
249 | 
250 | variable "additional_security_group_ids" {
251 |   type        = list(any)
252 |   default     = []
253 |   description = "list of additional security groups to add to nodes"
254 | }
255 | 
256 | variable "additional_node_security_groups_rules" {
257 |   description = "List of additional security group rules to add to the node security group created"
258 |   type        = any
259 |   default     = {}
260 | }
261 | 


--------------------------------------------------------------------------------
/gke/main.tf:
--------------------------------------------------------------------------------
  1 | # SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  2 | # SPDX-License-Identifier: Apache-2.0
  3 | 
  4 | /***************************
  5 | VPC Network Configuration
  6 | ***************************/
  7 | resource "google_compute_network" "gke-vpc" {
  8 |   count                   = var.vpc_enabled ? 1 : 0
  9 |   name                    = "${var.cluster_name}-vpc"
 10 |   auto_create_subnetworks = "false"
 11 |   project                 = var.project_id
 12 | }
 13 | 
 14 | /***************************
 15 | Subnet Configuration
 16 | ***************************/
 17 | resource "google_compute_subnetwork" "gke-subnet" {
 18 |   name          = "${var.cluster_name}-subnet"
 19 |   count         = var.vpc_enabled ? 1 : 0
 20 |   region        = var.region
 21 |   network       = google_compute_network.gke-vpc[0].name
 22 |   ip_cidr_range = "10.150.0.0/24"
 23 |   project       = var.project_id
 24 | }
 25 | 
 26 | /***************************
 27 | GKE Configuration
 28 | ***************************/
 29 | 
 30 | # Add data block to provide latest k8s version as an output
 31 | data "google_container_engine_versions" "latest" {
 32 |   provider = google-beta
 33 |   location = var.region
 34 |   project  = var.project_id
 35 | }
 36 | 
 37 | resource "google_container_cluster" "gke" {
 38 |   name     = var.cluster_name
 39 |   project  = var.project_id
 40 |   location = length(var.node_zones) == 1 ? one(var.node_zones) : var.region
 41 |   release_channel {
 42 |     channel = var.release_channel
 43 |   }
 44 |   min_master_version = var.min_master_version
 45 |   # Default Node Pool is required, to create a cluster, but we need a custom one instead
 46 |   # So we delete the default
 47 |   remove_default_node_pool = true
 48 |   initial_node_count       = 1
 49 | 
 50 |   network    = var.vpc_enabled ? google_compute_network.gke-vpc[0].name : var.network
 51 |   subnetwork = var.vpc_enabled ? google_compute_subnetwork.gke-subnet[0].name : var.subnetwork
 52 | 
 53 |   deletion_protection = false
 54 |   
 55 |   // Workload Identity Configuration
 56 |   workload_identity_config {
 57 |     workload_pool = "${var.project_id}.svc.id.goog"
 58 |   }
 59 | }
 60 | /***************************
 61 | GKE CPU Node Pool Config
 62 | ***************************/
 63 | resource "google_container_node_pool" "cpu_nodes" {
 64 |   name           = "tf-${var.cluster_name}-cpu-pool"
 65 |   project        = var.project_id
 66 |   location       = length(var.node_zones) == 1 ? one(var.node_zones) : var.region
 67 |   node_locations = length(var.node_zones) > 1 ? var.node_zones : null
 68 |   cluster        = google_container_cluster.gke.name
 69 |   node_count     = var.num_cpu_nodes
 70 |   autoscaling {
 71 |     min_node_count = var.cpu_min_node_count
 72 |     max_node_count = var.cpu_max_node_count
 73 |   }
 74 |   node_config {
 75 |     image_type = "UBUNTU_CONTAINERD"
 76 |     oauth_scopes = [
 77 |       "https://www.googleapis.com/auth/logging.write",
 78 |       "https://www.googleapis.com/auth/monitoring",
 79 |       "https://www.googleapis.com/auth/devstorage.read_only",
 80 |       "https://www.googleapis.com/auth/compute"
 81 |     ]
 82 | 
 83 |     preemptible  = var.use_cpu_spot_instances
 84 |     machine_type = var.cpu_instance_type
 85 |     disk_size_gb = var.disk_size_gb
 86 |     tags         = concat(["tf-managed", "${var.cluster_name}"], var.gpu_instance_tags)
 87 |     metadata = {
 88 |       disable-legacy-endpoints = "true"
 89 |     }
 90 | 
 91 |     labels = {
 92 |       part_of    = var.cluster_name
 93 |       env        = var.project_id
 94 |       managed_by = "terraform"
 95 |     }
 96 |     workload_metadata_config {
 97 |       mode = "GKE_METADATA"
 98 |     }
 99 |   }
100 |   timeouts {
101 |     create = "30m"
102 |     update = "20m"
103 |   }
104 | }
105 | 
106 | /***************************
107 | GKE GPU Node Pool Config
108 | ***************************/
109 | resource "google_container_node_pool" "gpu_nodes" {
110 |   name           = "tf-${var.cluster_name}-gpu-pool"
111 |   project        = var.project_id
112 |   location       = length(var.node_zones) == 1 ? one(var.node_zones) : var.region
113 |   node_locations = length(var.node_zones) > 1 ? var.node_zones : null
114 |   cluster        = google_container_cluster.gke.name
115 |   node_count     = var.num_gpu_nodes
116 |   autoscaling {
117 |     min_node_count = var.gpu_min_node_count
118 |     max_node_count = var.gpu_max_node_count
119 |   }
120 |   node_config {
121 |     image_type = "UBUNTU_CONTAINERD"
122 |     oauth_scopes = [
123 |       "https://www.googleapis.com/auth/logging.write",
124 |       "https://www.googleapis.com/auth/monitoring",
125 |       "https://www.googleapis.com/auth/devstorage.read_only",
126 |       "https://www.googleapis.com/auth/compute"
127 |     ]
128 |     guest_accelerator {
129 |       type  = var.gpu_type
130 |       count = var.gpu_count
131 |       gpu_driver_installation_config {
132 |         gpu_driver_version = "INSTALLATION_DISABLED"
133 |       }
134 |     }
135 | 
136 |     preemptible  = var.use_gpu_spot_instances
137 |     machine_type = var.gpu_instance_type
138 |     disk_size_gb = var.disk_size_gb
139 |     tags         = concat(["tf-managed", "${var.cluster_name}"], var.gpu_instance_tags)
140 |     metadata = {
141 |       disable-legacy-endpoints = "true"
142 |     }
143 | 
144 |     labels = {
145 |       part_of    = var.cluster_name
146 |       env        = var.project_id
147 |       managed_by = "terraform"
148 |       "gke-no-default-nvidia-gpu-device-plugin" = "true"
149 |     }
150 |     workload_metadata_config {
151 |       mode = "GKE_METADATA"
152 |     }
153 |   }
154 |   timeouts {
155 |     create = "30m"
156 |     update = "20m"
157 |   }
158 | }
159 | 
160 | /***************************
161 | Create GPU Operator Namespace
162 | ***************************/
163 | resource "kubernetes_namespace_v1" "gpu-operator" {
164 |   metadata {
165 |     annotations = {
166 |       name = "gpu-operator"
167 |     }
168 | 
169 |     labels = {
170 |       cluster    = var.cluster_name
171 |       managed_by = "Terraform"
172 |     }
173 | 
174 |     name = var.gpu_operator_namespace
175 |   }
176 | }
177 | /***************************
178 | K8s Resource Quota Config
179 | ***************************/
180 | resource "kubernetes_resource_quota_v1" "gpu-operator-quota" {
181 |   depends_on = [google_container_node_pool.gpu_nodes, kubernetes_namespace_v1.gpu-operator]
182 |   metadata {
183 |     name      = "gpu-operator-quota"
184 |     namespace = var.gpu_operator_namespace
185 |   }
186 |   spec {
187 |     hard = {
188 |       pods = 100
189 |     }
190 |     scope_selector {
191 |       match_expression {
192 |         operator   = "In"
193 |         scope_name = "PriorityClass"
194 |         values     = ["system-node-critical", "system-cluster-critical"]
195 |       }
196 |     }
197 |   }
198 | }
199 | /***************************
200 | GPU Operator Configuration
201 | ***************************/
202 | resource "helm_release" "gpu-operator" {
203 |   depends_on       = [google_container_node_pool.gpu_nodes, kubernetes_resource_quota_v1.gpu-operator-quota, kubernetes_namespace_v1.gpu-operator]
204 |   count            = var.install_gpu_operator ? 1 : 0
205 |   name             = "gpu-operator"
206 |   repository       = "https://helm.ngc.nvidia.com/nvidia"
207 |   chart            = "gpu-operator"
208 |   version          = var.gpu_operator_version
209 |   namespace        = var.gpu_operator_namespace
210 |   create_namespace = false
211 |   atomic           = true
212 |   cleanup_on_fail  = true
213 |   reset_values     = true
214 |   replace          = true
215 | 
216 |   set {
217 |     name  = "driver.version"
218 |     value = var.gpu_operator_driver_version
219 |   }
220 | 
221 | }
222 | 
223 | /***************************
224 | Create NIM Operator Namespace
225 | ***************************/
226 | resource "kubernetes_namespace_v1" "nim-operator" {
227 |   metadata {
228 |     annotations = {
229 |       name = "nim-operator"
230 |     }
231 | 
232 |     labels = {
233 |       cluster    = var.cluster_name
234 |       managed_by = "Terraform"
235 |     }
236 | 
237 |     name = var.nim_operator_namespace
238 |   }
239 | }
240 | /***************************
241 | K8s Resource Quota Config
242 | ***************************/
243 | resource "kubernetes_resource_quota_v1" "nim-operator-quota" {
244 |   depends_on = [google_container_node_pool.gpu_nodes, kubernetes_namespace_v1.nim-operator]
245 |   metadata {
246 |     name      = "gpu-operator-quota"
247 |     namespace = var.nim_operator_namespace
248 |   }
249 |   spec {
250 |     hard = {
251 |       pods = 100
252 |     }
253 |     scope_selector {
254 |       match_expression {
255 |         operator   = "In"
256 |         scope_name = "PriorityClass"
257 |         values     = ["system-node-critical", "system-cluster-critical"]
258 |       }
259 |     }
260 |   }
261 | }
262 | 
263 | /********************************************
264 |  NIM Operator Configuration
265 | ********************************************/
266 | resource "helm_release" "nim_operator" {
267 |   depends_on       = [google_container_node_pool.gpu_nodes, kubernetes_resource_quota_v1.nim-operator-quota, kubernetes_namespace_v1.nim-operator]
268 |   count            = var.install_nim_operator ? 1 : 0
269 |   name             = "nim-operator"
270 |   repository       = "https://helm.ngc.nvidia.com/nvidia"
271 |   chart            = "k8s-nim-operator"
272 |   version          = var.nim_operator_version
273 |   namespace        = var.nim_operator_namespace
274 |   create_namespace = true
275 |   atomic           = true
276 |   cleanup_on_fail  = true
277 |   reset_values     = true
278 |   replace          = true
279 | }
280 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                 Apache License
  2 |                         Version 2.0, January 2004
  3 |                     http://www.apache.org/licenses/
  4 | 
  5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 | 1. Definitions.
  8 | 
  9 |     "License" shall mean the terms and conditions for use, reproduction,
 10 |     and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |     "Licensor" shall mean the copyright owner or entity authorized by
 13 |     the copyright owner that is granting the License.
 14 | 
 15 |     "Legal Entity" shall mean the union of the acting entity and all
 16 |     other entities that control, are controlled by, or are under common
 17 |     control with that entity. For the purposes of this definition,
 18 |     "control" means (i) the power, direct or indirect, to cause the
 19 |     direction or management of such entity, whether by contract or
 20 |     otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |     outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |     "You" (or "Your") shall mean an individual or Legal Entity
 24 |     exercising permissions granted by this License.
 25 | 
 26 |     "Source" form shall mean the preferred form for making modifications,
 27 |     including but not limited to software source code, documentation
 28 |     source, and configuration files.
 29 | 
 30 |     "Object" form shall mean any form resulting from mechanical
 31 |     transformation or translation of a Source form, including but
 32 |     not limited to compiled object code, generated documentation,
 33 |     and conversions to other media types.
 34 | 
 35 |     "Work" shall mean the work of authorship, whether in Source or
 36 |     Object form, made available under the License, as indicated by a
 37 |     copyright notice that is included in or attached to the work
 38 |     (an example is provided in the Appendix below).
 39 | 
 40 |     "Derivative Works" shall mean any work, whether in Source or Object
 41 |     form, that is based on (or derived from) the Work and for which the
 42 |     editorial revisions, annotations, elaborations, or other modifications
 43 |     represent, as a whole, an original work of authorship. For the purposes
 44 |     of this License, Derivative Works shall not include works that remain
 45 |     separable from, or merely link (or bind by name) to the interfaces of,
 46 |     the Work and Derivative Works thereof.
 47 | 
 48 |     "Contribution" shall mean any work of authorship, including
 49 |     the original version of the Work and any modifications or additions
 50 |     to that Work or Derivative Works thereof, that is intentionally
 51 |     submitted to Licensor for inclusion in the Work by the copyright owner
 52 |     or by an individual or Legal Entity authorized to submit on behalf of
 53 |     the copyright owner. For the purposes of this definition, "submitted"
 54 |     means any form of electronic, verbal, or written communication sent
 55 |     to the Licensor or its representatives, including but not limited to
 56 |     communication on electronic mailing lists, source code control systems,
 57 |     and issue tracking systems that are managed by, or on behalf of, the
 58 |     Licensor for the purpose of discussing and improving the Work, but
 59 |     excluding communication that is conspicuously marked or otherwise
 60 |     designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |     "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |     on behalf of whom a Contribution has been received by Licensor and
 64 |     subsequently incorporated within the Work.
 65 | 
 66 | 2. Grant of Copyright License. Subject to the terms and conditions of
 67 |     this License, each Contributor hereby grants to You a perpetual,
 68 |     worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |     copyright license to reproduce, prepare Derivative Works of,
 70 |     publicly display, publicly perform, sublicense, and distribute the
 71 |     Work and such Derivative Works in Source or Object form.
 72 | 
 73 | 3. Grant of Patent License. Subject to the terms and conditions of
 74 |     this License, each Contributor hereby grants to You a perpetual,
 75 |     worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |     (except as stated in this section) patent license to make, have made,
 77 |     use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |     where such license applies only to those patent claims licensable
 79 |     by such Contributor that are necessarily infringed by their
 80 |     Contribution(s) alone or by combination of their Contribution(s)
 81 |     with the Work to which such Contribution(s) was submitted. If You
 82 |     institute patent litigation against any entity (including a
 83 |     cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |     or a Contribution incorporated within the Work constitutes direct
 85 |     or contributory patent infringement, then any patent licenses
 86 |     granted to You under this License for that Work shall terminate
 87 |     as of the date such litigation is filed.
 88 | 
 89 | 4. Redistribution. You may reproduce and distribute copies of the
 90 |     Work or Derivative Works thereof in any medium, with or without
 91 |     modifications, and in Source or Object form, provided that You
 92 |     meet the following conditions:
 93 | 
 94 |     (a) You must give any other recipients of the Work or
 95 |         Derivative Works a copy of this License; and
 96 | 
 97 |     (b) You must cause any modified files to carry prominent notices
 98 |         stating that You changed the files; and
 99 | 
100 |     (c) You must retain, in the Source form of any Derivative Works
101 |         that You distribute, all copyright, patent, trademark, and
102 |         attribution notices from the Source form of the Work,
103 |         excluding those notices that do not pertain to any part of
104 |         the Derivative Works; and
105 | 
106 |     (d) If the Work includes a "NOTICE" text file as part of its
107 |         distribution, then any Derivative Works that You distribute must
108 |         include a readable copy of the attribution notices contained
109 |         within such NOTICE file, excluding those notices that do not
110 |         pertain to any part of the Derivative Works, in at least one
111 |         of the following places: within a NOTICE text file distributed
112 |         as part of the Derivative Works; within the Source form or
113 |         documentation, if provided along with the Derivative Works; or,
114 |         within a display generated by the Derivative Works, if and
115 |         wherever such third-party notices normally appear. The contents
116 |         of the NOTICE file are for informational purposes only and
117 |         do not modify the License. You may add Your own attribution
118 |         notices within Derivative Works that You distribute, alongside
119 |         or as an addendum to the NOTICE text from the Work, provided
120 |         that such additional attribution notices cannot be construed
121 |         as modifying the License.
122 | 
123 |     You may add Your own copyright statement to Your modifications and
124 |     may provide additional or different license terms and conditions
125 |     for use, reproduction, or distribution of Your modifications, or
126 |     for any such Derivative Works as a whole, provided Your use,
127 |     reproduction, and distribution of the Work otherwise complies with
128 |     the conditions stated in this License.
129 | 
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 |     any Contribution intentionally submitted for inclusion in the Work
132 |     by You to the Licensor shall be under the terms and conditions of
133 |     this License, without any additional terms or conditions.
134 |     Notwithstanding the above, nothing herein shall supersede or modify
135 |     the terms of any separate license agreement you may have executed
136 |     with Licensor regarding such Contributions.
137 | 
138 | 6. Trademarks. This License does not grant permission to use the trade
139 |     names, trademarks, service marks, or product names of the Licensor,
140 |     except as required for reasonable and customary use in describing the
141 |     origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 |     agreed to in writing, Licensor provides the Work (and each
145 |     Contributor provides its Contributions) on an "AS IS" BASIS,
146 |     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |     implied, including, without limitation, any warranties or conditions
148 |     of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |     PARTICULAR PURPOSE. You are solely responsible for determining the
150 |     appropriateness of using or redistributing the Work and assume any
151 |     risks associated with Your exercise of permissions under this License.
152 | 
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 |     whether in tort (including negligence), contract, or otherwise,
155 |     unless required by applicable law (such as deliberate and grossly
156 |     negligent acts) or agreed to in writing, shall any Contributor be
157 |     liable to You for damages, including any direct, indirect, special,
158 |     incidental, or consequential damages of any character arising as a
159 |     result of this License or out of the use or inability to use the
160 |     Work (including but not limited to damages for loss of goodwill,
161 |     work stoppage, computer failure or malfunction, or any and all
162 |     other commercial damages or losses), even if such Contributor
163 |     has been advised of the possibility of such damages.
164 | 
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 |     the Work or Derivative Works thereof, You may choose to offer,
167 |     and charge a fee for, acceptance of support, warranty, indemnity,
168 |     or other liability obligations and/or rights consistent with this
169 |     License. However, in accepting such obligations, You may act only
170 |     on Your own behalf and on Your sole responsibility, not on behalf
171 |     of any other Contributor, and only if You agree to indemnify,
172 |     defend, and hold each Contributor harmless for any liability
173 |     incurred by, or claims asserted against, such Contributor by reason
174 |     of your accepting any such warranty or additional liability.
175 | 
176 | END OF TERMS AND CONDITIONS
177 | 
178 | APPENDIX: How to apply the Apache License to your work.
179 | 
180 |     To apply the Apache License to your work, attach the following
181 |     boilerplate notice, with the fields enclosed by brackets "[]"
182 |     replaced with your own identifying information. (Don't include
183 |     the brackets!)  The text should be enclosed in the appropriate
184 |     comment syntax for the file format. We also recommend that a
185 |     file or class name and description of purpose be included on the
186 |     same "printed page" as the copyright notice for easier
187 |     identification within third-party archives.
188 | 
189 | Copyright 2022-2023  NVIDIA CORPORATION & AFFILIATES.
190 | 
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 | 
195 |     http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.


--------------------------------------------------------------------------------
/aks/README.md:
--------------------------------------------------------------------------------
  1 | #  NVIDIA AKS cluster
  2 | 
  3 | ## Tested on
  4 | This module was created and tested on Linux and MacOS.
  5 | 
  6 | ## Resources Created
  7 | - Azure Resource Group
  8 | - AKS Cluster
  9 | - 1x CPU nodepool (defaults to 1x CPU node -- Standard_D16_v5)
 10 | - 1x GPU nodepool (defaults to 2x GPU node -- Standard_NC4as_T4_v3 with T4)
 11 | - Installs Latest version of GPU Operator
 12 | 
 13 | ## Prerequisites
 14 | 1. Kubectl
 15 | 2. [Azure CLI](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli)
 16 | 3. Azure Account & Subscription where you are permitted to create cloud resources
 17 | 4. Terraform (CLI)
 18 | 5. [Azure Kubelogin](https://github.com/Azure/kubelogin#setup)
 19 | 
 20 | ## Usage
 21 | 
 22 | This module assumes that you have a working `terraform` binary and active Azure credentials.
 23 | 
 24 | No Terraform Provider is setup for remote state management but can be added.
 25 | We strongly encourage you [configure remote state](https://developer.hashicorp.com/terraform/language/state/remote) before running in production.
 26 | 
 27 | 1. Clone the repo
 28 |         
 29 |     ```
 30 |     git clone https://github.com/NVIDIA/nvidia-terraform-modules.git
 31 | 
 32 |     cd nvidia-terraform-modules/aks
 33 |     ```
 34 | 
 35 | 2. Logging in to Azure via the CLI
 36 |     - Run the below command , this will authenticate you to your Azure account
 37 |         
 38 |     ```
 39 |     az login
 40 |     ```
 41 | 
 42 | 3. Update `terraform.tfvars` file to customize a parameter from its default value, please uncomment the line and change the content
 43 | 
 44 |    Mandatory: provide your admin_group_object_ids    
 45 | 
 46 | 
 47 | Add the IDs of the members or groups who should have cluster access to the variable `admin_group_object_ids`. 
 48 | The GUID input can be retrieved in the Azure portal by searching for the desired user or group, for more info please refer [Find Object Id](https://learn.microsoft.com/en-us/partner-center/marketplace/find-tenant-object-id).
 49 | 
 50 | 
 51 | 
 52 |     ```
 53 |     admin_group_object_ids       = ["xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"]
 54 |     ```
 55 | 
 56 | 
 57 |    Mandatory: provide your subscription_id
 58 | 
 59 | 
 60 |     ```
 61 |     subscription_id              = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
 62 |     ```
 63 | 
 64 | 
 65 | 4. Initialize the module with the command:
 66 |         
 67 |     ```
 68 |     terraform init
 69 |     ```
 70 | 
 71 | 5. Run this command to view the proposed changes:
 72 | 
 73 |     ```
 74 |     terraform plan -out tfplan
 75 |     ```
 76 | 
 77 | 6. Run the below command to apply the configuration:
 78 | 
 79 |     ```
 80 |     terraform apply tfplan
 81 |     ```
 82 | 
 83 | 7. Once cluster is created run the below command with aks cluster name and resource group name to get kubeconfig so you are able to run `kubectl` commands:
 84 | 
 85 |     ```
 86 |     az aks get-credentials --resource-group aks-cluster-tf-rg --name aks-cluster-tf
 87 |     ```
 88 | 
 89 | #### Cleaning up / Deleting resources
 90 | 
 91 | 1. Run the beloe commands to delete all remaining Azure resources created by this module. You should see `Destroy complete!` message after a few minutes.
 92 |         
 93 |     ```
 94 |     terraform state rm kubernetes_namespace_v1.gpu-operator
 95 | 
 96 |     terraform state rm kubernetes_namespace_v1.nim-operator
 97 |     ```
 98 | 
 99 |     ```
100 |     terraform destroy --auto-approve
101 |     ```
102 |     
103 | ## Running as a module
104 | 
105 | Call the AKS module by adding this to an existing Terraform file:
106 | 
107 | ```hcl
108 | module "nvidia-aks" {
109 |   source                 = "git::github.com/NVIDIA/nvidia-terraform-modules/aks" 
110 |   cluster_name           = "nvidia-aks"
111 |   admin_group_object_ids = [] # See below for the value of this variable
112 | }
113 | ```
114 | All configurable options for this module are listed below.
115 | If you need additional values added, please open a pull request.        ```
116 | 
117 | ## Issues
118 | - None. If you do encounter an issue, please file a GitHub issue.
119 | 
120 | ## Troubleshooting
121 | -  ### Quota Errors
122 |     New Azure accounts which have not turned on VMs or GPU VMs in any region will need to request quota in that region. 
123 |     During installation, if you see a quota-related error, click the link in the error message to be redirected to the Azure console with a prepopulated quota request. Re-run `terraform apply` once the quota request is complete. This will take ~5m per quota request
124 | 
125 | - ### Azure Cloudshell Errors
126 |   When using Azure Cloudshell during installation, if you see a `MSI` or `Bad Request(400)` error, it is [known issue](https://github.com/Azure/azure-cli/issues/11749) with Azure Cloushell. 
127 |   There are 2 workarounds:
128 |   - Use Azure CLI on a local machine
129 |   -  In Cloud Shell, run `az login` and re-run `terraform apply`
130 | 
131 | 
132 | ## Requirements
133 | 
134 | | Name | Version |
135 | |------|---------|
136 | | <a name="requirement_terraform"></a> [terraform](#requirement\_terraform) | >= 1.3.4 |
137 | | <a name="requirement_azurerm"></a> [azurerm](#requirement\_azurerm) | ~>4.24.0 |
138 | | <a name="requirement_kubernetes"></a> [kubernetes](#requirement\_kubernetes) | ~>2.19.0 |
139 | 
140 | ## Providers
141 | 
142 | | Name | Version |
143 | |------|---------|
144 | | <a name="provider_azurerm"></a> [azurerm](#provider\_azurerm) | ~>4.24.0 |
145 | | <a name="provider_helm"></a> [helm](#provider\_helm) | n/a |
146 | | <a name="provider_kubernetes"></a> [kubernetes](#provider\_kubernetes) | ~>2.19.0 |
147 | 
148 | ## Modules
149 | 
150 | No modules.
151 | 
152 | ## Resources
153 | 
154 | | Name | Type |
155 | |------|------|
156 | | [azurerm_kubernetes_cluster.aks](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/kubernetes_cluster) | resource |
157 | | [azurerm_kubernetes_cluster_node_pool.aks](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/kubernetes_cluster_node_pool) | resource |
158 | | [azurerm_resource_group.aks](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/resource_group) | resource |
159 | | [helm_release.gpu-operator](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
160 | | [helm_release.nim_operator](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
161 | | [kubernetes_namespace_v1.gpu-operator](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/namespace_v1) | resource |
162 | | [kubernetes_namespace_v1.nim-operator](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/namespace_v1) | resource |
163 | | [azurerm_kubernetes_cluster.akscluster](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/kubernetes_cluster) | data source |
164 | | [azurerm_resource_group.existing](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/resource_group) | data source |
165 | 
166 | ## Inputs
167 | 
168 | | Name | Description | Type | Default | Required |
169 | |------|-------------|------|---------|:--------:|
170 | | <a name="input_admin_group_object_ids"></a> [admin\_group\_object\_ids](#input\_admin\_group\_object\_ids) | (Required) A list of Object IDs (GUIDs) of Azure Active Directory Groups which should have Owner Role on the Cluster. <br>  This is not the email address of the group, the GUID can be found in the Azure panel by searching for the AD Group<br>  NOTE: You will need Azure "Owner" role (not "Contributor") to attach an AD role to the Kubernetes cluster. | `list(any)` | n/a | yes |
171 | | <a name="input_cluster_name"></a> [cluster\_name](#input\_cluster\_name) | The name of the AKS Cluster to be created | `string` | `"aks-cluster"` | no |
172 | | <a name="input_cpu_machine_type"></a> [cpu\_machine\_type](#input\_cpu\_machine\_type) | Machine instance type of the AKS CPU node pool | `string` | `"Standard_D16_v5"` | no |
173 | | <a name="input_cpu_node_pool_count"></a> [cpu\_node\_pool\_count](#input\_cpu\_node\_pool\_count) | Count of nodes in Default GPU pool | `number` | `1` | no |
174 | | <a name="input_cpu_node_pool_disk_size"></a> [cpu\_node\_pool\_disk\_size](#input\_cpu\_node\_pool\_disk\_size) | Disk size in GB of nodes in the Default GPU pool | `number` | `100` | no |
175 | | <a name="input_cpu_node_pool_max_count"></a> [cpu\_node\_pool\_max\_count](#input\_cpu\_node\_pool\_max\_count) | Max count of nodes in Default CPU pool | `number` | `5` | no |
176 | | <a name="input_cpu_node_pool_min_count"></a> [cpu\_node\_pool\_min\_count](#input\_cpu\_node\_pool\_min\_count) | Min ount of number of nodes in Default CPU pool | `number` | `1` | no |
177 | | <a name="input_cpu_os_sku"></a> [cpu\_os\_sku](#input\_cpu\_os\_sku) | Specifies the OS SKU used by the agent pool. Possible values include: Ubuntu, CBLMariner, Mariner, Windows2019, Windows2022 | `string` | `"Ubuntu"` | no |
178 | | <a name="input_existing_resource_group_name"></a> [existing\_resource\_group\_name](#input\_existing\_resource\_group\_name) | The name of an existing resource group the Kubernetes cluster should be deployed into. Defaults to the name of the cluster + `-rg` if none is specified | `string` | `null` | no |
179 | | <a name="input_gpu_machine_type"></a> [gpu\_machine\_type](#input\_gpu\_machine\_type) | Machine instance type of the AKS GPU node pool | `string` | `"Standard_NC6s_v3"` | no |
180 | | <a name="input_gpu_node_pool_count"></a> [gpu\_node\_pool\_count](#input\_gpu\_node\_pool\_count) | Count of nodes in Default GPU pool | `number` | `2` | no |
181 | | <a name="input_gpu_node_pool_disk_size"></a> [gpu\_node\_pool\_disk\_size](#input\_gpu\_node\_pool\_disk\_size) | Disk size in GB of nodes in the Default GPU pool | `number` | `100` | no |
182 | | <a name="input_gpu_node_pool_max_count"></a> [gpu\_node\_pool\_max\_count](#input\_gpu\_node\_pool\_max\_count) | Max count of nodes in Default GPU pool | `number` | `5` | no |
183 | | <a name="input_gpu_node_pool_min_count"></a> [gpu\_node\_pool\_min\_count](#input\_gpu\_node\_pool\_min\_count) | Min count of number of nodes in Default GPU pool | `number` | `2` | no |
184 | | <a name="input_gpu_operator_driver_version"></a> [gpu\_operator\_driver\_version](#input\_gpu\_operator\_driver\_version) | The NVIDIA Driver version deployed with GPU Operator. Defaults to latest available. | `string` | `"570.124.06"` | no |
185 | | <a name="input_gpu_operator_namespace"></a> [gpu\_operator\_namespace](#input\_gpu\_operator\_namespace) | The namespace to deploy the NVIDIA GPU operator into | `string` | `"gpu-operator"` | no |
186 | | <a name="input_gpu_operator_version"></a> [gpu\_operator\_version](#input\_gpu\_operator\_version) | Version of the GPU operator to be installed | `string` | `"v25.3.0"` | no |
187 | | <a name="input_gpu_os_sku"></a> [gpu\_os\_sku](#input\_gpu\_os\_sku) | Specifies the OS SKU used by the agent pool. Possible values include: Ubuntu, CBLMariner, Mariner, Windows2019, Windows2022 | `string` | `"Ubuntu"` | no |
188 | | <a name="input_install_gpu_operator"></a> [install\_gpu\_operator](#input\_install\_gpu\_operator) | Whether to Install GPU Operator. Defaults to false available. | `string` | `"true"` | no |
189 | | <a name="input_install_nim_operator"></a> [install\_nim\_operator](#input\_install\_nim\_operator) | Whether to Install NIM Operator. Defaults to false available. | `string` | `"false"` | no |
190 | | <a name="input_kubernetes_version"></a> [kubernetes\_version](#input\_kubernetes\_version) | Version of Kubernetes to turn on. Run 'az aks get-versions --location <location> --output table' to view all available versions | `string` | `"1.30"` | no |
191 | | <a name="input_location"></a> [location](#input\_location) | The region to create resources in | `any` | n/a | yes |
192 | | <a name="input_nim_operator_namespace"></a> [nim\_operator\_namespace](#input\_nim\_operator\_namespace) | The namespace for the GPU operator deployment | `string` | `"nim-operator"` | no |
193 | | <a name="input_nim_operator_version"></a> [nim\_operator\_version](#input\_nim\_operator\_version) | Version of the GPU Operator to deploy. Defaults to latest available. | `string` | `"v1.0.0"` | no |
194 | 
195 | ## Outputs
196 | 
197 | | Name | Description |
198 | |------|-------------|
199 | | <a name="output_client_certificate"></a> [client\_certificate](#output\_client\_certificate) | n/a |
200 | | <a name="output_kube_config"></a> [kube\_config](#output\_kube\_config) | n/a |
201 | | <a name="output_kubernetes_cluster_name"></a> [kubernetes\_cluster\_name](#output\_kubernetes\_cluster\_name) | n/a |
202 | | <a name="output_location"></a> [location](#output\_location) | n/a |
203 | | <a name="output_resource_group_name"></a> [resource\_group\_name](#output\_resource\_group\_name) | n/a |
204 | 


--------------------------------------------------------------------------------
/gke/README.md:
--------------------------------------------------------------------------------
  1 | # NVIDIA GKE Cluster
  2 | 
  3 | This repo provides Terraform configuration to bring up a GKE Kubernetes Cluster with the GPU operator and GPU nodes from scratch.
  4 | 
  5 | 
  6 | ## Tested on
  7 | This module was created with and tested on Linux using Bash, it may or may not work on Windows or when using Powershell.
  8 | 
  9 | ## Resources Created
 10 | - VPC Network for GKE Cluster
 11 | - Subnet in VPC
 12 | - GKE Cluster
 13 | - 1 CPU nodepool (defaults to 1x CPU node -- n1-standard-4)
 14 | - 1 GPU nodepool (defaults to 2x GPU nodes -- n1-standard-4 with 1x Tesla T4)
 15 | - Installs latest version of GPU Operator via Helm 
 16 | 
 17 | ## Prerequisites
 18 | 1. Kubectl
 19 | 2. Google Cloud ((glcoud)[https://cloud.google.com/sdk/docs/install]) CLI
 20 | 3. GCP Account & Project where you are permitted to create cloud resources
 21 | 4. Terraform (CLI) 
 22 | 
 23 | 
 24 | #### Setup
 25 | 
 26 | 1. Requires the `gcloud` SDK binary -- [Download here](https://cloud.google.com/sdk/docs/install)
 27 | 
 28 | 2. Requires the Terraform cli @ Version 1.3.4 or higher -- [Download here](https://developer.hashicorp.com/terraform/downloads)
 29 | 
 30 | 3. To run this module assumes elevated permissions (Kubernetes Engine Admin) in your GCP account, specifically permissions to create VPC networks, GKE clusters, and Compute nodes. This will not work on accounts using the "free plan" as you cannot use GPU nodes until a billing account is attached and activated. 
 31 | 
 32 | 4. You will need to enable both the Kubernetes API and the Compute Engine APIs enabled. Click [the GKE tab in the GCP panel](https://console.cloud.google.com/kubernetes) for your project and enable the GKE API, which will also enable the Compute engine API at the same time
 33 | 
 34 | 5. Ensure you have [GPU Quota](https://cloud.google.com/compute/quotas#gpu_quota) in your desired region/zone. You can [request](GPU Quota) if it is not enabled in a new account. You will need quota for both `GPUS_ALL_REGIONS` and for the specific SKU in the desired region.
 35 | 
 36 | ## Usage 
 37 | 
 38 | 0. Authenticate to GCP:
 39 | 
 40 |    ```
 41 |    gcloud auth login
 42 |    ```
 43 | 
 44 | 
 45 | 1. Run this command to clone the repo:
 46 | 
 47 |     ```
 48 |     git clone https://github.com/NVIDIA/nvidia-terraform-modules.git
 49 | 
 50 |     cd nvidia-terraform-modules/gke
 51 |     ```
 52 | 
 53 | 2. Update `terraform.tfvars` to customize a parameter from its default value, please uncomment the line and change the content
 54 | 
 55 |     Mandatory: provide your project ID by updating the parameter `project_id`. 
 56 | 
 57 | You can get the `project_id` from your [GCP console](#https://cloud.google.com/resource-manager/docs/creating-managing-projects#identifying_projects).
 58 | 
 59 |    ```
 60 |    project_id                          = "xx-xxxx-xxxx"
 61 |    
 62 |    ```
 63 | 
 64 | 3. Run this command to make your Google Credentials available to the `terraform` executable:
 65 | 
 66 |     ```
 67 |     gcloud auth application-default login
 68 |     ```
 69 | 
 70 | 4. Run this command to fetch the required Terraform provider plugins:
 71 | 
 72 |     ```
 73 |     terraform init
 74 |     ```
 75 | 
 76 | 5. If your credentials are setup correctly, you should see the proposed changes in GCP by running `terraform plan -out tfplan`:
 77 | 
 78 |     ```
 79 |     terraform plan -out tfplan
 80 |     ```
 81 | 
 82 | ** Note on IAM Permissions:** you need either `Admin` permissions or `Compute Instance Admin (v1)`, `Kubernetes Engine Admin` and `Compute Network Admin (v1)` to run this module. 
 83 | 
 84 | 6. If this configuration looks approproate run this command:
 85 | 
 86 |     ```
 87 |     terraform apply tfplan
 88 |     ```
 89 | 
 90 | 7. It will take ~5 minutes after the `terraform apply` successful completion message for the GPU operator to get to a running state
 91 | 
 92 | 8. Connect to the cluster with `kubectl` by running the following two commands after the cluster is created:
 93 | 
 94 |     ```
 95 |     gcloud components install gke-gcloud-auth-plugin
 96 | 
 97 |     gcloud container clusters get-credentials <CLUSTER_NAME> --region=<REGION>
 98 |     ```
 99 | 
100 | #### Cleaning up / Deleting resources
101 | 
102 | 1. Run these commands to delete all remaining GCP resources created by this module. You should see `Destroy complete!` message after a few minutes.
103 | 
104 |     ```
105 |     terraform state rm kubernetes_namespace_v1.gpu-operator
106 | 
107 |     terraform state rm kubernetes_namespace_v1.nim-operator
108 |     ```
109 | 
110 |     ```
111 |     sed -i '' 's/\"deletion_protection\": true\,/\"deletion_protection\": false\,/g' terraform.tfstate
112 |     ```
113 | 
114 |     ```
115 |     terraform destroy --auto-approve
116 |     ```
117 | 
118 | # Terraform Module Information
119 | ## Running as a module
120 | 
121 | Call the GKE module by adding this to an existing Terraform file:
122 | 
123 | ```hcl
124 | module "nvidia-gke" {
125 |   source     = "git::github.com/NVIDIA/nvidia-terraform-modules/gke"
126 |   project_id = "<your GKE Project ID>"
127 |   region     = "us-west1" # Can be any region
128 |   node_zones = ["us-west1-b"] # Can be any region but ensure your desired machine types/gpus exist
129 | }
130 | ```
131 | 
132 | All configurable options for this module are listed below.
133 | If you need additional values added, please open a merge request.
134 | 
135 | ## Requirements
136 | 
137 | | Name | Version |
138 | |------|---------|
139 | | <a name="requirement_terraform"></a> [terraform](#requirement\_terraform) | >= 1.3.4 |
140 | | <a name="requirement_google"></a> [google](#requirement\_google) | 6.27.0 |
141 | | <a name="requirement_google-beta"></a> [google-beta](#requirement\_google-beta) | 6.27.0 |
142 | 
143 | ## Providers
144 | 
145 | | Name | Version |
146 | |------|---------|
147 | | <a name="provider_google"></a> [google](#provider\_google) | 6.27.0 |
148 | | <a name="provider_google-beta"></a> [google-beta](#provider\_google-beta) | 6.27.0 |
149 | | <a name="provider_helm"></a> [helm](#provider\_helm) | n/a |
150 | | <a name="provider_kubernetes"></a> [kubernetes](#provider\_kubernetes) | 2.19.0 |
151 | 
152 | ## Modules
153 | 
154 | No modules.
155 | 
156 | ## Resources
157 | 
158 | | Name | Type |
159 | |------|------|
160 | | [google_compute_network.gke-vpc](https://registry.terraform.io/providers/hashicorp/google/4.27.0/docs/resources/compute_network) | resource |
161 | | [google_compute_subnetwork.gke-subnet](https://registry.terraform.io/providers/hashicorp/google/4.27.0/docs/resources/compute_subnetwork) | resource |
162 | | [google_container_cluster.gke](https://registry.terraform.io/providers/hashicorp/google/4.27.0/docs/resources/container_cluster) | resource |
163 | | [google_container_node_pool.cpu_nodes](https://registry.terraform.io/providers/hashicorp/google/4.27.0/docs/resources/container_node_pool) | resource |
164 | | [google_container_node_pool.gpu_nodes](https://registry.terraform.io/providers/hashicorp/google/4.27.0/docs/resources/container_node_pool) | resource |
165 | | [helm_release.gpu-operator](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
166 | | [helm_release.nim_operator](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
167 | | [kubernetes_namespace_v1.gpu-operator](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/namespace_v1) | resource |
168 | | [kubernetes_namespace_v1.nim-operator](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/namespace_v1) | resource |
169 | | [kubernetes_resource_quota_v1.gpu-operator-quota](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/resource_quota_v1) | resource |
170 | | [kubernetes_resource_quota_v1.nim-operator-quota](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/resource_quota_v1) | resource |
171 | | [google-beta_google_container_engine_versions.latest](https://registry.terraform.io/providers/hashicorp/google-beta/4.57.0/docs/data-sources/google_container_engine_versions) | data source |
172 | | [google_client_config.provider](https://registry.terraform.io/providers/hashicorp/google/4.27.0/docs/data-sources/client_config) | data source |
173 | | [google_container_cluster.gke-cluster](https://registry.terraform.io/providers/hashicorp/google/4.27.0/docs/data-sources/container_cluster) | data source |
174 | | [google_project.cluster](https://registry.terraform.io/providers/hashicorp/google/4.27.0/docs/data-sources/project) | data source |
175 | 
176 | ## Inputs
177 | 
178 | | Name | Description | Type | Default | Required |
179 | |------|-------------|------|---------|:--------:|
180 | | <a name="input_cluster_name"></a> [cluster\_name](#input\_cluster\_name) | Name of the Kubernetes Cluster to provision | `string` | n/a | yes |
181 | | <a name="input_cpu_instance_type"></a> [cpu\_instance\_type](#input\_cpu\_instance\_type) | Machine Type for CPU node pool | `string` | `"n1-standard-4"` | no |
182 | | <a name="input_cpu_max_node_count"></a> [cpu\_max\_node\_count](#input\_cpu\_max\_node\_count) | Max Number of CPU nodes in CPU nodepool | `string` | `"5"` | no |
183 | | <a name="input_cpu_min_node_count"></a> [cpu\_min\_node\_count](#input\_cpu\_min\_node\_count) | Number of CPU nodes in CPU nodepool | `string` | `"1"` | no |
184 | | <a name="input_disk_size_gb"></a> [disk\_size\_gb](#input\_disk\_size\_gb) | n/a | `string` | `"512"` | no |
185 | | <a name="input_gpu_count"></a> [gpu\_count](#input\_gpu\_count) | Number of GPUs to attach to each node in GPU pool | `string` | `"1"` | no |
186 | | <a name="input_gpu_instance_tags"></a> [gpu\_instance\_tags](#input\_gpu\_instance\_tags) | GPU instance nodes tags | `list(string)` | `[]` | no |
187 | | <a name="input_gpu_instance_type"></a> [gpu\_instance\_type](#input\_gpu\_instance\_type) | Machine Type for GPU node pool | `string` | `"n1-standard-4"` | no |
188 | | <a name="input_gpu_max_node_count"></a> [gpu\_max\_node\_count](#input\_gpu\_max\_node\_count) | Max Number of GPU nodes in GPU nodepool | `string` | `"5"` | no |
189 | | <a name="input_gpu_min_node_count"></a> [gpu\_min\_node\_count](#input\_gpu\_min\_node\_count) | Min number of GPU nodes in GPU nodepool | `string` | `"2"` | no |
190 | | <a name="input_gpu_operator_driver_version"></a> [gpu\_operator\_driver\_version](#input\_gpu\_operator\_driver\_version) | The NVIDIA Driver version deployed with GPU Operator. Defaults to latest available | `string` | `"570.124.06"` | no |
191 | | <a name="input_gpu_operator_namespace"></a> [gpu\_operator\_namespace](#input\_gpu\_operator\_namespace) | The namespace to deploy the NVIDIA GPU operator into | `string` | `"gpu-operator"` | no |
192 | | <a name="input_gpu_operator_version"></a> [gpu\_operator\_version](#input\_gpu\_operator\_version) | Version of the GPU Operator to deploy. Defaults to latest available | `string` | `"v25.3.0"` | no |
193 | | <a name="input_gpu_type"></a> [gpu\_type](#input\_gpu\_type) | GPU SKU To attach to NVIDIA GPU Node (eg. nvidia-tesla-k80) | `string` | `"nvidia-tesla-t4"` | no |
194 | | <a name="input_install_gpu_operator"></a> [install\_gpu\_operator](#input\_install\_gpu\_operator) | Whether to Install GPU Operator. Defaults to false available. | `string` | `"true"` | no |
195 | | <a name="input_install_nim_operator"></a> [install\_nim\_operator](#input\_install\_nim\_operator) | Whether to Install NIM Operator. Defaults to false available. | `string` | `"false"` | no |
196 | | <a name="input_min_master_version"></a> [min\_master\_version](#input\_min\_master\_version) | The minimum cluster version of the master. | `string` | `"1.32"` | no |
197 | | <a name="input_network"></a> [network](#input\_network) | Network CIDR for VPC | `string` | `""` | no |
198 | | <a name="input_nim_operator_namespace"></a> [nim\_operator\_namespace](#input\_nim\_operator\_namespace) | The namespace for the GPU operator deployment | `string` | `"nim-operator"` | no |
199 | | <a name="input_nim_operator_version"></a> [nim\_operator\_version](#input\_nim\_operator\_version) | Version of the GPU Operator to deploy. Defaults to latest available | `string` | `"v1.0.1"` | no |
200 | | <a name="input_node_zones"></a> [node\_zones](#input\_node\_zones) | Specify zones to put nodes in (must be in same region defined above) | `list(any)` | n/a | yes |
201 | | <a name="input_num_cpu_nodes"></a> [num\_cpu\_nodes](#input\_num\_cpu\_nodes) | Number of CPU nodes when pool is created | `number` | `1` | no |
202 | | <a name="input_num_gpu_nodes"></a> [num\_gpu\_nodes](#input\_num\_gpu\_nodes) | Number of GPU nodes when pool is created | `number` | `2` | no |
203 | | <a name="input_project_id"></a> [project\_id](#input\_project\_id) | GCP Project ID for the VPC and K8s Cluster. This module currently does not support projects with a SharedVPC | `any` | n/a | yes |
204 | | <a name="input_region"></a> [region](#input\_region) | The Region resources (VPC, GKE, Compute Nodes) will be created in | `any` | n/a | yes |
205 | | <a name="input_release_channel"></a> [release\_channel](#input\_release\_channel) | Configuration options for the Release channel feature, which provide more control over automatic upgrades of your GKE clusters. When updating this field, GKE imposes specific version requirements | `string` | `"REGULAR"` | no |
206 | | <a name="input_subnetwork"></a> [subnetwork](#input\_subnetwork) | Subnet name used for k8s cluster nodes | `string` | `""` | no |
207 | | <a name="input_use_cpu_spot_instances"></a> [use\_cpu\_spot\_instances](#input\_use\_cpu\_spot\_instances) | Use Spot instance for CPU pool | `bool` | `false` | no |
208 | | <a name="input_use_gpu_spot_instances"></a> [use\_gpu\_spot\_instances](#input\_use\_gpu\_spot\_instances) | Use Spot instance for GPU pool | `bool` | `false` | no |
209 | | <a name="input_vpc_enabled"></a> [vpc\_enabled](#input\_vpc\_enabled) | Variable to control nvidia-kubernetes GKE module VPC creation | `bool` | `true` | no |
210 | 
211 | ## Outputs
212 | 
213 | | Name | Description |
214 | |------|-------------|
215 | | <a name="output_kubernetes_cluster_endpoint_ip"></a> [kubernetes\_cluster\_endpoint\_ip](#output\_kubernetes\_cluster\_endpoint\_ip) | GKE Cluster IP Endpoint |
216 | | <a name="output_kubernetes_cluster_name"></a> [kubernetes\_cluster\_name](#output\_kubernetes\_cluster\_name) | GKE Cluster Name |
217 | | <a name="output_kubernetes_config_file"></a> [kubernetes\_config\_file](#output\_kubernetes\_config\_file) | GKE Cluster IP Endpoint |
218 | | <a name="output_project_id"></a> [project\_id](#output\_project\_id) | GCloud Project ID |
219 | | <a name="output_region"></a> [region](#output\_region) | Region for Kubernetes Resources to be created in when using this module |
220 | | <a name="output_subnet_cidr_range"></a> [subnet\_cidr\_range](#output\_subnet\_cidr\_range) | The IPs and CIDRs of the subnets |
221 | | <a name="output_subnet_region"></a> [subnet\_region](#output\_subnet\_region) | The region of the VPC subnet used in this module |
222 | | <a name="output_vpc_project"></a> [vpc\_project](#output\_vpc\_project) | Project of the VPC network (can be different from the project launching Kubernetes resources) |
223 | 


--------------------------------------------------------------------------------
/eks/README.md:
--------------------------------------------------------------------------------
  1 | # EKS Ready Kubernetes (EKS)
  2 | This repo provides Terraform configuration to bring up a EKS Kubernetes Cluster with the GPU operator and GPU nodes from scratch.
  3 | 
  4 | ## Tested on
  5 | This module was created and tested on Linux and MacOS.
  6 | 
  7 | ## Resources Created
  8 | - VPC Network for EKS Cluster
  9 | - Subnets in VPC for EKS CLuster
 10 | - EKS Cluster
 11 | - 1x CPU nodepool
 12 | - 1x GPU nodepool
 13 | - Installs latest version of GPU Operator via Helm
 14 | - 1x KMS Key to encrypt cluster secrets
 15 | 
 16 | For more details on resources created and their default values, please see the [Terraform Module Inputs](#inputs) section.
 17 | 
 18 | ## Prerequisites
 19 | 1. Kubectl 
 20 | 2. [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html)
 21 |     - You must run `aws-configure` once on your machine to populate the default region present in `~/.aws/config`
 22 | 3. AWS Account where you have permissions to create a cluster, IAM roles and networking
 23 | 4. Terraform (CLI)[https://developer.hashicorp.com/terraform/downloads]
 24 | 5. [JQ](https://stedolan.github.io/jq/download/)
 25 |     - The provisioning will fail without this step, as it is used to set up your Kubernetes configuration file file after the cluster is provisioned
 26 | 
 27 | #### Issues
 28 | - None. If you encounter any, please file a GitHub issue
 29 | 
 30 | ## Usage
 31 | 
 32 | This module assumes that you have a working `terraform` binary and active AWS credentials (admin access or finely scoped permissions with basic EC2, EKS, VPC and IAM creation permissions).
 33 | 
 34 | No Terraform Provider is setup for remote state management but can be added.
 35 | We strongly encourage you [configure remote state](https://developer.hashicorp.com/terraform/language/state/remote) before running in production.
 36 | 
 37 | 1. Clone the repo
 38 |         
 39 | 
 40 |     ```
 41 |     git clone https://github.com/NVIDIA/nvidia-terraform-modules.git
 42 | 
 43 |     cd nvidia-terraform-modules/eks
 44 |     ```
 45 | 
 46 | 2. Ensure you have active credentials set with the AWS CLI.
 47 | 
 48 |     ```
 49 |     aws configure
 50 |     ```
 51 | 
 52 | 3. Update `terraform.tfvars` to customize a parameter from its default value, please uncomment the line and change the content 
 53 |  
 54 | 
 55 | 4. Run the below command to initialize the configured
 56 | 
 57 |     ```
 58 |     terraform init
 59 |     ```
 60 | 
 61 | 5. Run the below command to see what will be applied
 62 | 
 63 |     ```
 64 |     terraform plan -out tfplan
 65 |     ```
 66 | 
 67 | 6. Run the below command to apply the code against your AWS environment
 68 | 
 69 |     ```
 70 |     terraform apply tfplan
 71 |     ```
 72 | 
 73 | 7. Connect to the cluster with `kubectl` by running below command with updated cluster name and region after the cluster is created
 74 | 
 75 |     ```
 76 |     aws eks update-kubeconfig --name <eks-cluster-name> --region <eks-region>
 77 |     ```
 78 | 
 79 | #### Cleaning up / Deleting resources
 80 | 
 81 | 1. Run the beloe commands to delete all remaining AWS resources created by this module. You should see `Destroy complete!` message after a few minutes.
 82 | 
 83 |     ```
 84 |     terraform destroy --auto-approve
 85 |     ```
 86 | 
 87 | ## Running as a module
 88 | 
 89 | Call the EKS module by adding this to an existing Terraform file:
 90 | 
 91 | ```hcl
 92 | module "nvidia-eks" {
 93 |   source       = "git::github.com/nvidia/nvidia-terraform-modules/eks" 
 94 |   cluster_name = "nvidia-eks"
 95 | }
 96 | ```
 97 | In a production environment, we suggest pinning to a known tag of this Terraform module
 98 | All configurable options for this module are listed below.
 99 | If you need additional values added, please open a pull request.
100 | 
101 | ## Requirements
102 | 
103 | | Name | Version |
104 | |------|---------|
105 | | <a name="requirement_terraform"></a> [terraform](#requirement\_terraform) | >= 1.3.4 |
106 | | <a name="requirement_aws"></a> [aws](#requirement\_aws) | ~>5.93.0 |
107 | | <a name="requirement_kubernetes"></a> [kubernetes](#requirement\_kubernetes) | ~>2.19.0 |
108 | 
109 | ## Providers
110 | 
111 | | Name | Version |
112 | |------|---------|
113 | | <a name="provider_aws"></a> [aws](#provider\_aws) | ~>5.93.0 |
114 | | <a name="provider_helm"></a> [helm](#provider\_helm) | n/a |
115 | 
116 | ## Modules
117 | 
118 | | Name | Source | Version |
119 | |------|--------|---------|
120 | | <a name="module_ebs_csi_irsa_role"></a> [ebs\_csi\_irsa\_role](#module\_ebs\_csi\_irsa\_role) | terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks | n/a |
121 | | <a name="module_eks"></a> [eks](#module\_eks) | terraform-aws-modules/eks/aws | 18.29.0 |
122 | | <a name="module_vpc"></a> [vpc](#module\_vpc) | terraform-aws-modules/vpc/aws | 4.0.2 |
123 | 
124 | ## Resources
125 | 
126 | | Name | Type |
127 | |------|------|
128 | | [helm_release.gpu_operator](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
129 | | [helm_release.nim_operator](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
130 | | [aws_ami.lookup](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ami) | data source |
131 | | [aws_availability_zones.available](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/availability_zones) | data source |
132 | | [aws_eks_cluster.eks](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/eks_cluster) | data source |
133 | | [aws_instances.nodes](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/instances) | data source |
134 | | [aws_region.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/region) | data source |
135 | 
136 | ## Inputs
137 | 
138 | | Name | Description | Type | Default | Required |
139 | |------|-------------|------|---------|:--------:|
140 | | <a name="input_additional_node_security_groups_rules"></a> [additional\_node\_security\_groups\_rules](#input\_additional\_node\_security\_groups\_rules) | List of additional security group rules to add to the node security group created | `any` | `{}` | no |
141 | | <a name="input_additional_security_group_ids"></a> [additional\_security\_group\_ids](#input\_additional\_security\_group\_ids) | list of additional security groups to add to nodes | `list(any)` | `[]` | no |
142 | | <a name="input_additional_user_data"></a> [additional\_user\_data](#input\_additional\_user\_data) | User data that is appended to the user data script after of the EKS bootstrap script. | `string` | `""` | no |
143 | | <a name="input_aws_profile"></a> [aws\_profile](#input\_aws\_profile) | n/a | `string` | `"development"` | no |
144 | | <a name="input_cidr_block"></a> [cidr\_block](#input\_cidr\_block) | CIDR for VPC | `string` | `"10.0.0.0/16"` | no |
145 | | <a name="input_cluster_name"></a> [cluster\_name](#input\_cluster\_name) | n/a | `string` | n/a | yes |
146 | | <a name="input_cluster_version"></a> [cluster\_version](#input\_cluster\_version) | Version of EKS to install on the control plane (Major and Minor version only, do not include the patch) | `string` | `"1.30"` | no |
147 | | <a name="input_cpu_instance_type"></a> [cpu\_instance\_type](#input\_cpu\_instance\_type) | CPU EC2 worker node instance type | `string` | `"t2.xlarge"` | no |
148 | | <a name="input_cpu_node_pool_additional_user_data"></a> [cpu\_node\_pool\_additional\_user\_data](#input\_cpu\_node\_pool\_additional\_user\_data) | User data that is appended to the user data script after of the EKS bootstrap script on EKS-managed GPU node pool. | `string` | `""` | no |
149 | | <a name="input_cpu_node_pool_delete_on_termination"></a> [cpu\_node\_pool\_delete\_on\_termination](#input\_cpu\_node\_pool\_delete\_on\_termination) | Delete the VM nodes root filesystem on each node of the instance type. This is set to true by default, but can be changed when desired when using the 'local-storage provisioner' and are keeping important application data on the nodes | `bool` | `true` | no |
150 | | <a name="input_cpu_node_pool_root_disk_size_gb"></a> [cpu\_node\_pool\_root\_disk\_size\_gb](#input\_cpu\_node\_pool\_root\_disk\_size\_gb) | The size of the root disk on all GPU nodes in the EKS-managed GPU-only Node Pool. This is primarily for container image storage on the node | `number` | `512` | no |
151 | | <a name="input_cpu_node_pool_root_volume_type"></a> [cpu\_node\_pool\_root\_volume\_type](#input\_cpu\_node\_pool\_root\_volume\_type) | The type of disk to use for the GPU node pool root disk (eg. gp2, gp3). Note, this is different from the type of disk used by applications via EKS Storage classes/PVs & PVCs | `string` | `"gp2"` | no |
152 | | <a name="input_desired_count_cpu_nodes"></a> [desired\_count\_cpu\_nodes](#input\_desired\_count\_cpu\_nodes) | Minimum number of CPU nodes in the Autoscaling Group | `string` | `"1"` | no |
153 | | <a name="input_desired_count_gpu_nodes"></a> [desired\_count\_gpu\_nodes](#input\_desired\_count\_gpu\_nodes) | Minimum number of GPU nodes in the Autoscaling Group | `string` | `"2"` | no |
154 | | <a name="input_enable_dns_hostnames"></a> [enable\_dns\_hostnames](#input\_enable\_dns\_hostnames) | Whether or not the Default VPC has DNS hostname support | `bool` | `true` | no |
155 | | <a name="input_enable_dns_support"></a> [enable\_dns\_support](#input\_enable\_dns\_support) | Whether or not the Default VPC has DNS support | `bool` | `true` | no |
156 | | <a name="input_enable_nat_gateway"></a> [enable\_nat\_gateway](#input\_enable\_nat\_gateway) | Should be true if you want to provision NAT Gateways for each of your private networks | `bool` | `true` | no |
157 | | <a name="input_existing_vpc_details"></a> [existing\_vpc\_details](#input\_existing\_vpc\_details) | Variables used for re-using existing VPC for vpc\_id & subnet\_id | <pre>object({<br>    vpc_id     = string<br>    subnet_ids = list(string)<br>  })</pre> | `null` | no |
158 | | <a name="input_gpu_ami_id"></a> [gpu\_ami\_id](#input\_gpu\_ami\_id) | AMI ID of the EKS Ubuntu Image cooresponding to the region and version of the cluser. Not required as we do a lookup for this image | `string` | `""` | no |
159 | | <a name="input_gpu_instance_type"></a> [gpu\_instance\_type](#input\_gpu\_instance\_type) | GPU EC2 worker node instance type | `string` | `"g6e.12xlarge"` | no |
160 | | <a name="input_gpu_node_pool_additional_user_data"></a> [gpu\_node\_pool\_additional\_user\_data](#input\_gpu\_node\_pool\_additional\_user\_data) | User data that is appended to the user data script after of the EKS bootstrap script on EKS-managed GPU node pool. | `string` | `""` | no |
161 | | <a name="input_gpu_node_pool_delete_on_termination"></a> [gpu\_node\_pool\_delete\_on\_termination](#input\_gpu\_node\_pool\_delete\_on\_termination) | Delete the VM nodes root filesystem on each node of the instance type. This is set to true by default, but can be changed when desired when using the 'local-storage provisioner' and are keeping important application data on the nodes | `bool` | `true` | no |
162 | | <a name="input_gpu_node_pool_root_disk_size_gb"></a> [gpu\_node\_pool\_root\_disk\_size\_gb](#input\_gpu\_node\_pool\_root\_disk\_size\_gb) | The size of the root disk on all GPU nodes in the EKS-managed GPU-only Node Pool. This is primarily for container image storage on the node | `number` | `512` | no |
163 | | <a name="input_gpu_node_pool_root_volume_type"></a> [gpu\_node\_pool\_root\_volume\_type](#input\_gpu\_node\_pool\_root\_volume\_type) | The type of disk to use for the GPU node pool root disk (eg. gp2, gp3). Note, this is different from the type of disk used by applications via EKS Storage classes/PVs & PVCs | `string` | `"gp2"` | no |
164 | | <a name="input_gpu_operator_driver_version"></a> [gpu\_operator\_driver\_version](#input\_gpu\_operator\_driver\_version) | The NVIDIA Driver version deployed with GPU Operator. Defaults to latest available. | `string` | `"570.124.06"` | no |
165 | | <a name="input_gpu_operator_namespace"></a> [gpu\_operator\_namespace](#input\_gpu\_operator\_namespace) | The namespace for the GPU operator deployment | `string` | `"gpu-operator"` | no |
166 | | <a name="input_gpu_operator_version"></a> [gpu\_operator\_version](#input\_gpu\_operator\_version) | Version of the GPU Operator to deploy. Defaults to latest available. | `string` | `"v25.3.0"` | no |
167 | | <a name="input_install_gpu_operator"></a> [install\_gpu\_operator](#input\_install\_gpu\_operator) | Whether to Install GPU Operator. Defaults to false available. | `string` | `"true"` | no |
168 | | <a name="input_install_nim_operator"></a> [install\_nim\_operator](#input\_install\_nim\_operator) | Whether to Install NIM Operator. Defaults to false available. | `string` | `"false"` | no |
169 | | <a name="input_max_cpu_nodes"></a> [max\_cpu\_nodes](#input\_max\_cpu\_nodes) | Maximum number of CPU nodes in the Autoscaling Group | `string` | `"2"` | no |
170 | | <a name="input_max_gpu_nodes"></a> [max\_gpu\_nodes](#input\_max\_gpu\_nodes) | Maximum number of GPU nodes in the Autoscaling Group | `string` | `"5"` | no |
171 | | <a name="input_min_cpu_nodes"></a> [min\_cpu\_nodes](#input\_min\_cpu\_nodes) | Minimum number of CPU nodes in the Autoscaling Group | `string` | `"0"` | no |
172 | | <a name="input_min_gpu_nodes"></a> [min\_gpu\_nodes](#input\_min\_gpu\_nodes) | Minimum number of GPU nodes in the Autoscaling Group | `string` | `"2"` | no |
173 | | <a name="input_nim_operator_namespace"></a> [nim\_operator\_namespace](#input\_nim\_operator\_namespace) | The namespace for the GPU operator deployment | `string` | `"nim-operator"` | no |
174 | | <a name="input_nim_operator_version"></a> [nim\_operator\_version](#input\_nim\_operator\_version) | Version of the GPU Operator to deploy. Defaults to latest available. | `string` | `"v1.0.1"` | no |
175 | | <a name="input_private_subnets"></a> [private\_subnets](#input\_private\_subnets) | List of subnet ranges for the Private VPC | `list(any)` | <pre>[<br>  "10.0.0.0/19",<br>  "10.0.32.0/19",<br>  "10.0.64.0/19"<br>]</pre> | no |
176 | | <a name="input_public_subnets"></a> [public\_subnets](#input\_public\_subnets) | List of subnet ranges for the Private VPC | `list(any)` | <pre>[<br>  "10.0.96.0/19",<br>  "10.0.128.0/19",<br>  "10.0.160.0/19"<br>]</pre> | no |
177 | | <a name="input_region"></a> [region](#input\_region) | AWS region to provision the Kubernetes Cluster | `string` | `"us-west-2"` | no |
178 | | <a name="input_single_nat_gateway"></a> [single\_nat\_gateway](#input\_single\_nat\_gateway) | Should be true if you want to provision a single shared NAT Gateway across all of your private networks | `bool` | `false` | no |
179 | | <a name="input_ssh_key"></a> [ssh\_key](#input\_ssh\_key) | n/a | `string` | `""` | no |
180 | 
181 | ## Outputs
182 | 
183 | | Name | Description |
184 | |------|-------------|
185 | | <a name="output_cluster_ca_certificate"></a> [cluster\_ca\_certificate](#output\_cluster\_ca\_certificate) | n/a |
186 | | <a name="output_cluster_endpoint"></a> [cluster\_endpoint](#output\_cluster\_endpoint) | n/a |
187 | | <a name="output_cpu_node_role_name"></a> [cpu\_node\_role\_name](#output\_cpu\_node\_role\_name) | IAM Node Role Bane for CPU node pools |
188 | | <a name="output_gpu_node_role_name"></a> [gpu\_node\_role\_name](#output\_gpu\_node\_role\_name) | IAM Node Role Name for GPU node pools |
189 | | <a name="output_kube_exec_api_version"></a> [kube\_exec\_api\_version](#output\_kube\_exec\_api\_version) | n/a |
190 | | <a name="output_kube_exec_args"></a> [kube\_exec\_args](#output\_kube\_exec\_args) | n/a |
191 | | <a name="output_kube_exec_command"></a> [kube\_exec\_command](#output\_kube\_exec\_command) | n/a |
192 | | <a name="output_nodes"></a> [nodes](#output\_nodes) | n/a |
193 | | <a name="output_oidc_endpoint"></a> [oidc\_endpoint](#output\_oidc\_endpoint) | n/a |
194 | | <a name="output_private_subnet_ids"></a> [private\_subnet\_ids](#output\_private\_subnet\_ids) | n/a |
195 | | <a name="output_public_subnet_ids"></a> [public\_subnet\_ids](#output\_public\_subnet\_ids) | n/a |
196 | 


--------------------------------------------------------------------------------