├── modules ├── custom_data │ ├── outputs.tf │ ├── files │ │ ├── default-storageclass.yaml │ │ ├── azure-cloud.conf.template │ │ └── rke2-init.sh │ ├── main.tf │ └── variables.tf ├── nodepool │ ├── outputs.tf │ ├── main.tf │ └── variables.tf ├── rke2-cluster │ ├── outputs.tf │ ├── variables.tf │ └── main.tf ├── lb │ ├── outputs.tf │ ├── variables.tf │ └── main.tf ├── rke2-server │ ├── outputs.tf │ ├── variables.tf │ └── main.tf ├── statestore │ └── main.tf ├── common │ └── download.sh └── rke2-agents │ ├── main.tf │ └── variables.tf ├── tests ├── load-balancer.yaml ├── pvc-pod.yaml ├── README.md └── smoke-test.sh ├── examples └── quickstart │ ├── output.tf │ ├── terraform.tfvars.sample │ ├── README.md │ ├── variables.tf │ └── main.tf ├── scripts ├── check-terraform.sh ├── fetch-kubeconfig.sh └── fetch-ssh-key.sh ├── outputs.tf ├── .gitignore ├── .devcontainer ├── devcontainer.json ├── Dockerfile └── scripts │ └── non-root-user.sh ├── main.tf ├── variables.tf └── README.md /modules/custom_data/outputs.tf: -------------------------------------------------------------------------------- 1 | output "templated" { 2 | value = data.template_file.init.rendered 3 | } -------------------------------------------------------------------------------- /modules/nodepool/outputs.tf: -------------------------------------------------------------------------------- 1 | output "scale_set_id" { 2 | value = azurerm_linux_virtual_machine_scale_set.this.id 3 | } 4 | -------------------------------------------------------------------------------- /tests/load-balancer.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: test-svc 5 | spec: 6 | type: LoadBalancer 7 | ports: 8 | - port: 80 9 | selector: 10 | app: dummy 11 | -------------------------------------------------------------------------------- /examples/quickstart/output.tf: -------------------------------------------------------------------------------- 1 | output "rke2_cluster" { 2 | value = module.rke2.rke2_cluster 3 | } 4 | 5 | output "kv_name" { 6 | value = module.rke2.kv_name 7 | } 8 | 9 | output "rg_name" { 10 | value = azurerm_resource_group.rke2.name 11 | } 12 | -------------------------------------------------------------------------------- /modules/rke2-cluster/outputs.tf: -------------------------------------------------------------------------------- 1 | output "token_vault_url" { 2 | value = module.rke2.token_vault_url 3 | } 4 | 5 | output "token_vault_name" { 6 | value = module.rke2.token_vault_name 7 | } 8 | 9 | output "cluster_data" { 10 | value = module.rke2.cluster_data 11 | } 12 | -------------------------------------------------------------------------------- /scripts/check-terraform.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | terraform init 5 | 6 | TFFMT_COUNT=$(terraform fmt -write=false -recursive | wc -l) 7 | if (( $TFFMT_COUNT > 0 )); then 8 | echo -e "\n*** ERROR! the following files require re-formatting" 9 | terraform fmt -recursive -check 10 | fi 11 | 12 | terraform validate 13 | -------------------------------------------------------------------------------- /examples/quickstart/terraform.tfvars.sample: -------------------------------------------------------------------------------- 1 | # General Azure settings 2 | cluster_name = "rke2-example" 3 | cloud = "AzureUSGovernmentCloud" 4 | location = "usgovvirginia" 5 | 6 | # RKE cluster sizing 7 | server_instance_count = 1 8 | agent_instance_count = 2 9 | vm_size = "Standard_D8_v3" 10 | 11 | # Connectivity options 12 | server_public_ip = true 13 | server_open_ssh_public = true 14 | -------------------------------------------------------------------------------- /outputs.tf: -------------------------------------------------------------------------------- 1 | output "rke2_cluster" { 2 | description = "RKE2 cluster data created" 3 | value = module.rke2_cluster.cluster_data 4 | } 5 | 6 | output "kv_name" { 7 | description = "Name of the key vault created" 8 | value = module.rke2_cluster.token_vault_name 9 | } 10 | 11 | output "rg_name" { 12 | description = "Name of the resource group used" 13 | value = local.resource_group_name 14 | } 15 | -------------------------------------------------------------------------------- /modules/lb/outputs.tf: -------------------------------------------------------------------------------- 1 | output "lb_url" { 2 | value = var.type == "public" ? azurerm_public_ip.pip[0].ip_address : azurerm_lb.this.private_ip_address 3 | } 4 | 5 | output "backend_pool_id" { 6 | value = azurerm_lb_backend_address_pool.bepool.id 7 | } 8 | 9 | output "azurerm_lb_nat_pool_ssh_id" { 10 | value = azurerm_lb_nat_pool.ssh.id 11 | } 12 | 13 | output "controlplane_probe_id" { 14 | value = azurerm_lb_probe.this.id 15 | } 16 | -------------------------------------------------------------------------------- /modules/custom_data/files/default-storageclass.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: storage.k8s.io/v1 2 | kind: StorageClass 3 | metadata: 4 | annotations: 5 | storageclass.beta.kubernetes.io/is-default-class: "true" 6 | labels: 7 | kubernetes.io/cluster-service: "true" 8 | name: default 9 | parameters: 10 | cachingmode: ReadOnly 11 | kind: Managed 12 | storageaccounttype: StandardSSD_LRS 13 | provisioner: kubernetes.io/azure-disk 14 | reclaimPolicy: Delete 15 | volumeBindingMode: Immediate 16 | allowVolumeExpansion: true 17 | -------------------------------------------------------------------------------- /modules/custom_data/main.tf: -------------------------------------------------------------------------------- 1 | data "template_file" "init" { 2 | template = file("${path.module}/files/rke2-init.sh") 3 | 4 | vars = { 5 | type = var.agent ? "agent" : "server" 6 | 7 | server_url = var.server_url 8 | vault_url = var.vault_url 9 | token_secret = var.token_secret 10 | config = var.config 11 | ccm = var.ccm 12 | cloud = var.cloud 13 | node_labels = var.node_labels 14 | node_taints = var.node_taints 15 | 16 | pre_userdata = var.pre_userdata 17 | post_userdata = var.post_userdata 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /examples/quickstart/README.md: -------------------------------------------------------------------------------- 1 | # Example RKE2 Deployment 2 | 3 | This is an example RKE2 deployment which uses the main module at the root of this repo to deploy RKE2 4 | 5 | ## Quick Deployment 6 | 7 | - Run from this examples directory, e.g. `cd examples/quickstart` 8 | - Copy `terraform.tfvars.sample` to `terraform.tfvars` 9 | - Change `cluster_name` and other settings, but most can be left as the defaults 10 | - Run `terraform apply -auto-approve` 11 | 12 | ## Connect 13 | 14 | For kubectl 15 | 16 | ```bash 17 | source ../../scripts/fetch-kubeconfig.sh 18 | kubectl get nodes 19 | ``` 20 | 21 | For SSH 22 | 23 | ```bash 24 | ../../scripts/fetch-ssh-key.sh 25 | ``` -------------------------------------------------------------------------------- /modules/custom_data/files/azure-cloud.conf.template: -------------------------------------------------------------------------------- 1 | { 2 | "cloud": "${cloud}", 3 | "tenantId": "${tenant_id}", 4 | "userAssignedIdentityID": "${user_assigned_identity_id}", 5 | "subscriptionId": "${subscription_id}", 6 | "resourceGroup": "${rg_name}", 7 | "vmType": "vmss", 8 | "location": "${location}", 9 | "subnetName": "${subnet_name}", 10 | "securityGroupName": "${nsg_name}", 11 | "securityGroupResourceGroup": "${rg_name}", 12 | "vnetName": "${virtual_network_name}", 13 | "vnetResourceGroup": "${rg_name}", 14 | "routeTableName": "${rg_name}", 15 | "useManagedIdentityExtension": true, 16 | "useInstanceMetadata": true, 17 | "loadBalancerSku": "standard" 18 | } 19 | -------------------------------------------------------------------------------- /scripts/fetch-kubeconfig.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ ! -f terraform.tfstate ]]; then 4 | echo "Unable to find terraform.tfstate please run from the directory you ran terraform apply" 5 | exit 1 6 | fi 7 | 8 | if [[ "$0" = "$BASH_SOURCE" ]]; then 9 | echo "Please source this script. Do not execute." 10 | exit 1 11 | fi 12 | 13 | DIRECTORY=$(dirname $0) 14 | 15 | KV_NAME=${1:-$(terraform output -raw kv_name)} 16 | FILE=rke2.kubeconfig 17 | 18 | 19 | echo "Fetching kubeconfig from KeyVault $KV_NAME" 20 | az keyvault secret show --name kubeconfig --vault-name $KV_NAME -o json | jq -r '.value' > $FILE 21 | 22 | if [ $? -eq 0 ]; then 23 | echo "Download successful. Setting KUBECONFIG to $FILE" 24 | export KUBECONFIG=$FILE 25 | fi 26 | -------------------------------------------------------------------------------- /modules/rke2-server/outputs.tf: -------------------------------------------------------------------------------- 1 | output "network_security_group_name" { 2 | value = azurerm_network_security_group.server.name 3 | } 4 | 5 | output "token_vault_url" { 6 | value = module.statestore.vault_url 7 | } 8 | 9 | output "token_vault_name" { 10 | value = module.statestore.vault_name 11 | } 12 | 13 | output "cluster_data" { 14 | value = { 15 | name = local.uname 16 | server_url = module.cp_lb.lb_url 17 | cluster_identity_id = azurerm_user_assigned_identity.cluster.id 18 | cluster_identity_client_id = azurerm_user_assigned_identity.cluster.client_id 19 | token = { 20 | vault_url = module.statestore.vault_url 21 | token_secret = module.statestore.token_secret_name 22 | vault_id = module.statestore.vault_id 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /tests/pvc-pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: test-pvc-azure-disk 5 | spec: 6 | accessModes: 7 | - ReadWriteOnce 8 | storageClassName: default 9 | resources: 10 | requests: 11 | storage: 5Gi 12 | 13 | --- 14 | kind: Pod 15 | apiVersion: v1 16 | metadata: 17 | name: test-pod 18 | spec: 19 | containers: 20 | - name: test-pod 21 | image: mcr.microsoft.com/oss/nginx/nginx:1.15.5-alpine 22 | resources: 23 | requests: 24 | cpu: 100m 25 | memory: 128Mi 26 | limits: 27 | cpu: 250m 28 | memory: 256Mi 29 | volumeMounts: 30 | - mountPath: "/mnt/azure" 31 | name: volume 32 | volumes: 33 | - name: volume 34 | persistentVolumeClaim: 35 | claimName: test-pvc-azure-disk 36 | -------------------------------------------------------------------------------- /modules/lb/variables.tf: -------------------------------------------------------------------------------- 1 | variable "name" {} 2 | 3 | variable "resource_group_name" {} 4 | 5 | variable "type" { 6 | description = "(Optional) Toggle between private or public load balancer" 7 | type = string 8 | default = "private" 9 | } 10 | 11 | variable "subnet_id" { 12 | type = string 13 | default = null 14 | } 15 | 16 | variable "private_ip_address" { 17 | type = string 18 | default = null 19 | } 20 | 21 | variable "private_ip_address_allocation" { 22 | type = string 23 | default = null 24 | } 25 | 26 | variable "lb_sku" { 27 | type = string 28 | default = "Standard" 29 | } 30 | 31 | variable "tags" { 32 | default = {} 33 | type = map(string) 34 | } 35 | 36 | variable "zone" { 37 | description = "(Optional) Defaults to No-Zone. Possible values Zone-Redundant, 1, 2, 3, No-Zone" 38 | type = string 39 | default = "No-Zone" 40 | } -------------------------------------------------------------------------------- /scripts/fetch-ssh-key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | if [[ ! -f terraform.tfstate ]]; then 6 | echo "Unable to find terraform.tfstate please run from the directory you ran terraform apply" 7 | exit 1 8 | fi 9 | 10 | FILE_NAME="rke2.priv_key" 11 | USERNAME="rke2" 12 | 13 | KV_NAME=${1:-$(terraform output -raw kv_name)} 14 | if [[ $1 != "" ]]; then 15 | RG=${1%-*} 16 | SERVER_URL=$(az network public-ip show -g $RG -n $1-pip --query "ipAddress" -o tsv) 17 | else 18 | SERVER_URL=$(terraform output -json rke2_cluster | jq -r '.server_url') 19 | fi 20 | 21 | az keyvault secret show --name node-key --vault-name $KV_NAME | jq -r '.value' > $FILE_NAME 22 | [[ $# != 0 ]] && { echo "Failed to fetch node-key secret from KeyVault: $KV_NAME"; exit 1; } 23 | chmod 600 $FILE_NAME 24 | 25 | echo "Connect to the first server with the following command:" 26 | echo " ssh ${USERNAME}@${SERVER_URL} -p 5000 -i $FILE_NAME" 27 | echo "For each server in the cluster increase the port by 1, e.g. 5001, 5002" 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Local .terraform directories 2 | **/.terraform/* 3 | 4 | # Crash log files 5 | crash.log 6 | 7 | # Ignore any .tfvars files that are generated automatically for each Terraform run. Most 8 | # .tfvars files are managed as part of configuration and so should be included in 9 | # version control. 10 | # 11 | # example.tfvars 12 | 13 | # Ignore override files as they are usually used to override resources locally and so 14 | # are not checked in 15 | override.tf 16 | override.tf.json 17 | *_override.tf 18 | *_override.tf.json 19 | 20 | # Include override files you do wish to add to version control using negated pattern 21 | # 22 | # !example_override.tf 23 | 24 | # Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan 25 | # example: *tfplan* 26 | 27 | # ides 28 | .idea 29 | 30 | # Generated 31 | *.pem 32 | rke2.yaml 33 | admin.conf 34 | 35 | # .tfstate files 36 | *.tfstate 37 | *.tfstate.* 38 | *.terraform.lock.hcl* 39 | 40 | # tf vars & plans 41 | *.tfvars 42 | *.tfplan 43 | !sample.tfvars 44 | 45 | # rke2 46 | *.kubeconfig 47 | *.priv_key 48 | .ssh 49 | 50 | # Test output 51 | tests/*.xml 52 | -------------------------------------------------------------------------------- /examples/quickstart/variables.tf: -------------------------------------------------------------------------------- 1 | variable "cloud" { 2 | description = "Which Azure cloud to use" 3 | type = string 4 | default = "AzureUSGovernmentCloud" 5 | validation { 6 | condition = contains(["AzureUSGovernmentCloud", "AzurePublicCloud"], var.cloud) 7 | error_message = "Allowed values for cloud are \"AzureUSGovernmentCloud\" or \"AzurePublicCloud\"." 8 | } 9 | } 10 | 11 | variable "server_public_ip" { 12 | description = "Assign a public IP to the control plane load balancer" 13 | type = bool 14 | default = true 15 | } 16 | 17 | variable "server_open_ssh_public" { 18 | description = "Allow SSH to the server nodes through the control plane load balancer" 19 | type = bool 20 | default = false 21 | } 22 | 23 | variable "vm_size" { 24 | type = string 25 | default = "Standard_D8_v3" 26 | } 27 | 28 | variable "server_instance_count" { 29 | type = number 30 | default = 1 31 | } 32 | 33 | variable "agent_instance_count" { 34 | type = number 35 | default = 2 36 | } 37 | 38 | variable "cluster_name" { 39 | type = string 40 | } 41 | 42 | variable "location" { 43 | type = string 44 | default = "usgovvirginia" 45 | } -------------------------------------------------------------------------------- /examples/quickstart/main.tf: -------------------------------------------------------------------------------- 1 | provider "azurerm" { 2 | features {} 3 | } 4 | 5 | resource "azurerm_resource_group" "rke2" { 6 | name = var.cluster_name 7 | location = var.location 8 | } 9 | 10 | resource "azurerm_virtual_network" "rke2" { 11 | name = "${var.cluster_name}-vnet" 12 | address_space = ["10.0.0.0/16"] 13 | 14 | resource_group_name = azurerm_resource_group.rke2.name 15 | location = azurerm_resource_group.rke2.location 16 | } 17 | 18 | resource "azurerm_subnet" "rke2" { 19 | name = "${var.cluster_name}-snet" 20 | 21 | resource_group_name = azurerm_resource_group.rke2.name 22 | virtual_network_name = azurerm_virtual_network.rke2.name 23 | 24 | address_prefixes = ["10.0.1.0/24"] 25 | } 26 | 27 | module "rke2" { 28 | source = "../.." 29 | cluster_name = var.cluster_name 30 | subnet_id = azurerm_subnet.rke2.id 31 | server_public_ip = var.server_public_ip 32 | server_open_ssh_public = var.server_open_ssh_public 33 | vm_size = var.vm_size 34 | server_instance_count = var.server_instance_count 35 | agent_instance_count = var.agent_instance_count 36 | cloud = var.cloud 37 | } 38 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the README at: 2 | // https://github.com/microsoft/vscode-dev-containers/tree/v0.155.1/containers/azure-functions-dotnetcore-3.1 3 | { 4 | "name": "RKE2 Azure", 5 | "dockerFile": "Dockerfile", 6 | "mounts": [ 7 | // Mounts the login details from the host machine to azcli works in the container 8 | "type=bind,source=${env:HOME}${env:USERPROFILE}/.azure,target=/home/vscode/.azure", 9 | // ssh keys for git repos 10 | "type=bind,source=${env:HOME}${env:USERPROFILE}/.ssh,target=/home/vscode/.ssh-localhost" 11 | ], 12 | // We need to be able to use the network for openVPN (see https://github.com/haugene/docker-transmission-openvpn/issues/488). 13 | "runArgs": [ 14 | "--cap-add=NET_ADMIN" 15 | ], 16 | // Set *default* container specific settings.json values on container create. 17 | "settings": { 18 | "terminal.integrated.shell.linux": "/bin/zsh" 19 | }, 20 | // Add the IDs of extensions you want installed when the container is created. 21 | "extensions": [ 22 | "ms-vscode.azure-account", 23 | "hashicorp.terraform" 24 | ], 25 | // Use 'postCreateCommand' to run commands after the container is created. 26 | "postCreateCommand": "sudo chown -R $(whoami) ~/", 27 | // Comment out to connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root. 28 | "remoteUser": "vscode" 29 | } 30 | -------------------------------------------------------------------------------- /.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | # See here for image contents: https://github.com/microsoft/vscode-dev-containers/tree/v0.177.0/containers/python-3/.devcontainer/base.Dockerfile 2 | # [Choice] Python version: 3, 3.9, 3.8, 3.7, 3.6 3 | ARG VERSION="20.04" 4 | FROM ubuntu:${VERSION} 5 | 6 | # Avoid warnings by switching to noninteractive 7 | ENV DEBIAN_FRONTEND=noninteractive 8 | 9 | ARG USERNAME=vscode 10 | ARG USER_UID=1000 11 | ARG USER_GID=$USER_UID 12 | ARG TERRAFORM_VERSION=1.0.1 13 | ARG TFLINT_VERSION=0.30.0 14 | 15 | # Set up non-root user 16 | COPY scripts/non-root-user.sh /tmp/ 17 | RUN bash /tmp/non-root-user.sh "${USERNAME}" "${USER_UID}" "${USER_GID}" 18 | 19 | # Install system tools 20 | RUN apt-get update && \ 21 | apt-get -y install --no-install-recommends sudo git zsh ca-certificates \ 22 | zip unzip curl jq openvpn 23 | 24 | # Install Az CLI. 25 | RUN curl -sL https://aka.ms/InstallAzureCLIDeb | bash 26 | 27 | # Install Terraform 28 | RUN mkdir -p /tmp/docker-downloads \ 29 | && curl -sSL -o /tmp/docker-downloads/terraform.zip https://releases.hashicorp.com/terraform/${TERRAFORM_VERSION}/terraform_${TERRAFORM_VERSION}_linux_amd64.zip \ 30 | && unzip /tmp/docker-downloads/terraform.zip \ 31 | && mv terraform /usr/local/bin \ 32 | && rm /tmp/docker-downloads/terraform.zip 33 | 34 | # Install TFlint 35 | RUN curl -sSL -o /tmp/docker-downloads/tflint.zip https://github.com/wata727/tflint/releases/download/v${TFLINT_VERSION}/tflint_linux_amd64.zip \ 36 | && unzip /tmp/docker-downloads/tflint.zip \ 37 | && mv tflint /usr/local/bin \ 38 | && rm /tmp/docker-downloads/tflint.zip 39 | -------------------------------------------------------------------------------- /main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | azurerm = { 4 | version = "~>2.67.0" 5 | source = "hashicorp/azurerm" 6 | } 7 | tls = { 8 | source = "hashicorp/tls" 9 | version = "~>3.1.0" 10 | } 11 | } 12 | } 13 | 14 | locals { 15 | tags = { 16 | "Environment" = var.cluster_name, 17 | "Terraform" = "true", 18 | } 19 | network_details = regex("^(?P\\/subscriptions\\/(?P[^\\/]*)\\/resourceGroups\\/(?P[^\\/]*)\\/providers\\/Microsoft\\.Network\\/virtualNetworks\\/(?P[^\\/]*))\\/subnets\\/(?P[^\\/]*)$", var.subnet_id) 20 | resource_group_name = length(var.resource_group_name) > 0 ? var.resource_group_name : local.network_details.resource_group 21 | } 22 | 23 | 24 | module "rke2_cluster" { 25 | source = "./modules/rke2-cluster" 26 | cluster_name = var.cluster_name 27 | resource_group_name = local.resource_group_name 28 | vnet_id = local.network_details.vnet_id 29 | subnet_id = var.subnet_id 30 | vnet_name = local.network_details.vnet 31 | subnet_name = local.network_details.subnet 32 | cloud = var.cloud 33 | tags = local.tags 34 | 35 | server_public_ip = var.server_public_ip 36 | server_open_ssh_public = var.server_open_ssh_public 37 | vm_size = var.vm_size 38 | agent_vm_size = var.agent_vm_size 39 | server_vm_size = var.server_vm_size 40 | server_instance_count = var.server_instance_count 41 | agent_instance_count = var.agent_instance_count 42 | } 43 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Smoke Tests for Bare Metal Kubernetes on Azure 2 | 3 | These resources can be deployed to Kubernetes to validate & smoke test that the Azure cloud provider is working, in particular: 4 | - A service of type LoadBalancer is able to get an external IP, via an Azure LoadBalancer 5 | - PV and PVCs can be bound using the default storage class which should use `azure-disk` as the provisioner. See modules/custom_data/files/default-storageclass.yaml 6 | - Pods can mount PVCs, therefor are labeled correctly 7 | 8 | ## Running - Automated Script 9 | 10 | Run test script, this will check the cluster nodes are ready, deploy test resources, poll for them to be in the correct statuses and states, then remove the resources. 11 | 12 | ```bash 13 | tests/smoke-test.sh 14 | ``` 15 | 16 | ## Running - Manually 17 | 18 | Deploy all test resources 19 | 20 | ```bash 21 | kubectl apply -f tests/ 22 | ``` 23 | 24 | Validate with 25 | 26 | ```bash 27 | kubectl get pods,svc,pvc 28 | ``` 29 | 30 | It might be several minutes before everything is ready, but you should expect to see: 31 | 32 | - `pod/test-pod` should be **Running** 33 | - `service/test-svc` should have an **external IP address** assigned 34 | - `test-pvc-azure-disk` should be **Bound** 35 | 36 | Running `kubectl describe` against any of these resources may provide additional information and trouble shooting details. 37 | 38 | Also in the Azure resource group you should see the following resources created: 39 | 40 | - Azure Load Balancer named `kubernetes` 41 | - Azure Managed Disk named `kubernetes-dynamic-pvc-{some_guid}` 42 | - Public IP named `kubernetes-{some_random_string}` 43 | -------------------------------------------------------------------------------- /modules/custom_data/variables.tf: -------------------------------------------------------------------------------- 1 | variable "agent" { 2 | description = "Toggle server or agent init, defaults to agent" 3 | type = bool 4 | default = true 5 | } 6 | 7 | variable "server_url" { 8 | description = "rke2 server url" 9 | type = string 10 | } 11 | 12 | variable "vault_url" { 13 | description = "Vault url where token secret is located" 14 | type = string 15 | } 16 | 17 | variable "token_secret" { 18 | description = "Secret name of token in key vault" 19 | type = string 20 | } 21 | 22 | variable "config" { 23 | description = "RKE2 config file yaml contents" 24 | type = string 25 | default = "" 26 | } 27 | 28 | variable "ccm" { 29 | description = "Toggle cloud controller manager" 30 | type = bool 31 | default = false 32 | } 33 | 34 | variable "cloud" { 35 | type = string 36 | default = "AzureUSGovernmentCloud" 37 | validation { 38 | condition = contains(["AzureUSGovernmentCloud", "AzurePublicCloud"], var.cloud) 39 | error_message = "Allowed values for cloud are \"AzureUSGovernmentCloud\" or \"AzurePublicCloud\"." 40 | } 41 | } 42 | 43 | variable "node_labels" { 44 | description = "Node labels to add to the cluster" 45 | type = string 46 | default = "[]" 47 | } 48 | 49 | variable "node_taints" { 50 | description = "Node taints to add to the cluster" 51 | type = string 52 | default = "[]" 53 | } 54 | 55 | # 56 | # Custom Userdata 57 | # 58 | variable "pre_userdata" { 59 | description = "Custom userdata to run immediately before rke2 node attempts to join cluster, after required rke2, dependencies are installed" 60 | default = "" 61 | } 62 | 63 | variable "post_userdata" { 64 | description = "Custom userdata to run immediately after rke2 node attempts to join cluster" 65 | default = "" 66 | } 67 | 68 | 69 | -------------------------------------------------------------------------------- /variables.tf: -------------------------------------------------------------------------------- 1 | variable "cluster_name" { 2 | description = "Prefix used for all resources" 3 | type = string 4 | } 5 | 6 | variable "subnet_id" { 7 | description = "Subnet where to deploy the cluster resources" 8 | type = string 9 | 10 | } 11 | 12 | variable "cloud" { 13 | description = "Which Azure cloud to use" 14 | type = string 15 | default = "AzureUSGovernmentCloud" 16 | validation { 17 | condition = contains(["AzureUSGovernmentCloud", "AzurePublicCloud"], var.cloud) 18 | error_message = "Allowed values for cloud are \"AzureUSGovernmentCloud\" or \"AzurePublicCloud\"." 19 | } 20 | } 21 | 22 | variable "server_public_ip" { 23 | description = "Assign a public IP to the control plane load balancer" 24 | type = bool 25 | default = false 26 | } 27 | 28 | variable "server_open_ssh_public" { 29 | description = "Allow SSH to the server nodes through the control plane load balancer" 30 | type = bool 31 | default = false 32 | } 33 | 34 | variable "vm_size" { 35 | description = "Default VM size to use for the cluster" 36 | type = string 37 | default = "Standard_D8_v3" 38 | } 39 | 40 | variable "server_vm_size" { 41 | type = string 42 | description = "VM size to use for the server nodes" 43 | default = "" 44 | } 45 | 46 | variable "agent_vm_size" { 47 | type = string 48 | description = "VM size to use for the agent nodes" 49 | default = "" 50 | } 51 | 52 | variable "server_instance_count" { 53 | description = "Number of server nodes to deploy" 54 | type = number 55 | default = 1 56 | } 57 | 58 | variable "agent_instance_count" { 59 | description = "Number of agent nodes to deploy" 60 | type = number 61 | default = 2 62 | } 63 | 64 | variable "resource_group_name" { 65 | description = "(Optional) the name of an existing resource group to be used if not specified the subnet resource group will be used" 66 | type = string 67 | default = "" 68 | } 69 | -------------------------------------------------------------------------------- /modules/statestore/main.tf: -------------------------------------------------------------------------------- 1 | data "azurerm_client_config" "current" {} 2 | 3 | resource "azurerm_key_vault" "this" { 4 | name = var.name 5 | location = var.location 6 | resource_group_name = var.resource_group_name 7 | 8 | sku_name = "standard" 9 | tenant_id = data.azurerm_client_config.current.tenant_id 10 | enabled_for_template_deployment = true 11 | 12 | tags = merge({}, var.tags) 13 | } 14 | 15 | resource "azurerm_key_vault_access_policy" "policy" { 16 | key_vault_id = azurerm_key_vault.this.id 17 | object_id = data.azurerm_client_config.current.object_id 18 | tenant_id = data.azurerm_client_config.current.tenant_id 19 | 20 | key_permissions = [] 21 | 22 | secret_permissions = [ 23 | "Backup", 24 | "Delete", 25 | "Get", 26 | "List", 27 | "Purge", 28 | "Recover", 29 | "Restore", 30 | "Set", 31 | ] 32 | } 33 | 34 | resource "azurerm_key_vault_access_policy" "service_reader" { 35 | key_vault_id = azurerm_key_vault.this.id 36 | tenant_id = data.azurerm_client_config.current.tenant_id 37 | object_id = var.reader_object_id 38 | 39 | key_permissions = [] 40 | secret_permissions = ["Get", "Set"] 41 | certificate_permissions = [] 42 | storage_permissions = [] 43 | 44 | lifecycle { 45 | create_before_destroy = true 46 | } 47 | } 48 | 49 | resource "azurerm_key_vault_secret" "token" { 50 | name = "${var.name}-token" 51 | key_vault_id = azurerm_key_vault.this.id 52 | value = var.token 53 | tags = merge({}, var.tags) 54 | 55 | depends_on = [azurerm_key_vault_access_policy.policy] 56 | } 57 | 58 | variable "name" {} 59 | variable "location" {} 60 | variable "resource_group_name" {} 61 | variable "token" {} 62 | variable "reader_object_id" {} 63 | variable "tags" { 64 | type = map(string) 65 | default = {} 66 | } 67 | 68 | output "vault_url" { 69 | value = azurerm_key_vault.this.vault_uri 70 | } 71 | 72 | output "token_secret_name" { 73 | value = azurerm_key_vault_secret.token.name 74 | } 75 | 76 | output "vault_name" { 77 | value = azurerm_key_vault.this.name 78 | } 79 | 80 | output "vault_id" { 81 | value = azurerm_key_vault.this.id 82 | } 83 | -------------------------------------------------------------------------------- /.devcontainer/scripts/non-root-user.sh: -------------------------------------------------------------------------------- 1 | # usage: non-root-user.sh [username] [user UID] [user GID] 2 | 3 | USERNAME=${1:-"automatic"} 4 | USER_UID=${2:-"automatic"} 5 | USER_GID=${3:-"automatic"} 6 | 7 | set -e 8 | 9 | if [ "$(id -u)" -ne 0 ]; then 10 | echo -e 'Script must be run as root. Use sudo, su, or add "USER root" to your Dockerfile before running this script.' 11 | exit 1 12 | fi 13 | 14 | 15 | # If in automatic mode, determine if a user already exists, if not use vscode 16 | if [ "${USERNAME}" = "auto" ] || [ "${USERNAME}" = "automatic" ]; then 17 | USERNAME="" 18 | POSSIBLE_USERS=("vscode" "node" "codespace" "$(awk -v val=1000 -F ":" '$3==val{print $1}' /etc/passwd)") 19 | for CURRENT_USER in ${POSSIBLE_USERS[@]}; do 20 | if id -u ${CURRENT_USER} > /dev/null 2>&1; then 21 | USERNAME=${CURRENT_USER} 22 | break 23 | fi 24 | done 25 | if [ "${USERNAME}" = "" ]; then 26 | USERNAME=vscode 27 | fi 28 | elif [ "${USERNAME}" = "none" ]; then 29 | USERNAME=root 30 | USER_UID=0 31 | USER_GID=0 32 | fi 33 | 34 | 35 | 36 | 37 | # Create or update a non-root user to match UID/GID. 38 | if id -u ${USERNAME} > /dev/null 2>&1; then 39 | # User exists, update if needed 40 | if [ "${USER_GID}" != "automatic" ] && [ "$USER_GID" != "$(id -G $USERNAME)" ]; then 41 | groupmod --gid $USER_GID $USERNAME 42 | usermod --gid $USER_GID $USERNAME 43 | fi 44 | if [ "${USER_UID}" != "automatic" ] && [ "$USER_UID" != "$(id -u $USERNAME)" ]; then 45 | usermod --uid $USER_UID $USERNAME 46 | fi 47 | else 48 | # Create user 49 | if [ "${USER_GID}" = "automatic" ]; then 50 | groupadd $USERNAME 51 | else 52 | groupadd --gid $USER_GID $USERNAME 53 | fi 54 | if [ "${USER_UID}" = "automatic" ]; then 55 | useradd -s /bin/bash --gid $USERNAME -m $USERNAME 56 | else 57 | useradd -s /bin/bash --uid $USER_UID --gid $USERNAME -m $USERNAME 58 | fi 59 | fi 60 | 61 | # Add add sudo support for non-root user 62 | if [ "${USERNAME}" != "root" ] && [ "${EXISTING_NON_ROOT_USER}" != "${USERNAME}" ]; then 63 | mkdir -p /etc/sudoers.d 64 | echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME 65 | chmod 0440 /etc/sudoers.d/$USERNAME 66 | EXISTING_NON_ROOT_USER="${USERNAME}" 67 | fi 68 | -------------------------------------------------------------------------------- /tests/smoke-test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DIRECTORY=$(dirname $0) 4 | 5 | # Import testing library 6 | eval "$(curl -q -s https://raw.githubusercontent.com/coryb/osht/master/osht.sh)" 7 | OSHT_JUNIT=1 8 | PLAN 4 9 | 10 | which kubectl > /dev/null || { echo -e "💥 Error! Command kubectl not installed"; exit 1; } 11 | kubectl version > /dev/null 2>&1 || { echo -e "💥 Error! kubectl is not pointing at a cluster, configure KUBECONFIG or $HOME/.kube/config"; exit 1; } 12 | 13 | readyNodes=0 14 | echo "💠 Checking nodes..." 15 | for i in {1..12}; do 16 | readyNodes=$(kubectl get nodes | grep Ready | wc -l) 17 | if (( "$readyNodes" >= 3 )); then 18 | echo "✅ Cluster has $readyNodes nodes ready" 19 | break 20 | fi 21 | echo "⏰ waiting 10 seconds for at least 3 nodes to be ready..." 22 | sleep 10 23 | done 24 | IS "$readyNodes" -ge 3 25 | 26 | echo "🚀 Creating smoke test resources..." 27 | 28 | kubectl apply -f ${DIRECTORY}/load-balancer.yaml > /dev/null 29 | kubectl apply -f ${DIRECTORY}/pvc-pod.yaml > /dev/null 30 | 31 | echo "🔍 Polling resources..." 32 | 33 | externalIp="" 34 | for i in {1..12}; do 35 | externalIp=$(kubectl get service/test-svc -o jsonpath='{.status.loadBalancer.ingress[0].ip}') 36 | if [[ "$externalIp" != "" ]]; then 37 | echo "✅ SERVICE: test-svc has external IP '$externalIp'" 38 | break 39 | fi 40 | echo "⏰ waiting 10 seconds for load-balancer to be ready..." 41 | sleep 10 42 | done 43 | IS $externalIp != "" 44 | 45 | for i in {1..12}; do 46 | pvcStatus=$(kubectl get pvc test-pvc-azure-disk | grep test-pvc-azure-disk | awk '{print $2}') 47 | if [[ "$pvcStatus" == "Bound" ]]; then 48 | echo "✅ PVC: test-pvc-azure-disk is 'Bound'" 49 | break 50 | fi 51 | echo "⏰ waiting 10 seconds for test-pvc-azure-disk to be bound..." 52 | sleep 10 53 | done 54 | IS $pvcStatus == "Bound" 55 | 56 | podStatus="" 57 | for i in {1..12}; do 58 | podStatus=$(kubectl get po test-pod | grep test-pod | awk '{print $3}') 59 | if [[ "$podStatus" == "Running" ]]; then 60 | echo "✅ POD: test-pod is 'Running'" 61 | break 62 | fi 63 | echo "⏰ waiting 10 seconds for test-pod to start..." 64 | sleep 10 65 | done 66 | IS $podStatus == "Running" 67 | 68 | echo "❌ Removing smoke test resources..." 69 | kubectl delete -f ${DIRECTORY}/load-balancer.yaml --wait=false 70 | kubectl delete -f ${DIRECTORY}/pvc-pod.yaml --wait=false 71 | kubectl delete pvc test-pvc-azure-disk --wait=false 72 | -------------------------------------------------------------------------------- /modules/rke2-cluster/variables.tf: -------------------------------------------------------------------------------- 1 | variable "cluster_name" { 2 | description = "Name of the cluster" 3 | type = string 4 | } 5 | 6 | variable "resource_group_name" { 7 | description = "Name of the resource group" 8 | type = string 9 | } 10 | 11 | variable "vnet_id" { 12 | description = "Id of the virtual network to deploy the cluster on" 13 | type = string 14 | } 15 | 16 | variable "subnet_id" { 17 | description = "Id of the subnet to deploy the cluster on" 18 | type = string 19 | } 20 | 21 | variable "vnet_name" { 22 | description = "Name of the virtual network to deploy the cluster on" 23 | type = string 24 | } 25 | 26 | variable "subnet_name" { 27 | description = "Name of the subnet to deploy the cluster on" 28 | type = string 29 | } 30 | 31 | variable "cloud" { 32 | description = "Cloud provider to use" 33 | type = string 34 | default = "AzureUSGovernmentCloud" 35 | validation { 36 | condition = contains(["AzureUSGovernmentCloud", "AzurePublicCloud"], var.cloud) 37 | error_message = "Allowed values for cloud are \"AzureUSGovernmentCloud\" or \"AzurePublicCloud\"." 38 | } 39 | } 40 | 41 | variable "vm_size" { 42 | description = "Size of the VM to deploy trhe cluster at" 43 | type = string 44 | default = "Standard_DS4_v3" 45 | } 46 | 47 | variable "server_vm_size" { 48 | type = string 49 | description = "VM size to use for the server nodes if you do not specify vm_size will be used" 50 | default = "" 51 | } 52 | 53 | variable "agent_vm_size" { 54 | type = string 55 | description = "VM size to use for the agent nodes if you do not specify vm_size will be used" 56 | default = "" 57 | } 58 | 59 | variable "server_instance_count" { 60 | description = "Number of server nodes to deploy" 61 | type = number 62 | default = 1 63 | } 64 | 65 | variable "agent_instance_count" { 66 | description = "Number of agent nodes to deploy" 67 | type = number 68 | default = 2 69 | } 70 | 71 | variable "tags" { 72 | description = "Tags to apply to the cluster" 73 | type = object({}) 74 | default = {} 75 | } 76 | 77 | variable "server_public_ip" { 78 | description = "If true assign a public ip to the server nodes" 79 | type = bool 80 | default = false 81 | } 82 | 83 | variable "server_open_ssh_public" { 84 | description = "If true open the ssh port for the server nodes" 85 | type = bool 86 | default = false 87 | } 88 | -------------------------------------------------------------------------------- /modules/common/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | export INSTALL_RKE2_TYPE="${type}" 5 | export INSTALL_RKE2_VERSION="${rke2_version}" 6 | 7 | if [ "$${DEBUG}" == 1 ]; then 8 | set -x 9 | fi 10 | 11 | # info logs the given argument at info log level. 12 | info() { 13 | echo "[INFO] " "$@" 14 | } 15 | 16 | # warn logs the given argument at warn log level. 17 | warn() { 18 | echo "[WARN] " "$@" >&2 19 | } 20 | 21 | # fatal logs the given argument at fatal log level. 22 | fatal() { 23 | echo "[ERROR] " "$@" >&2 24 | exit 1 25 | } 26 | 27 | read_os() { 28 | ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') 29 | VERSION=$(grep -oP '(?<=^VERSION_ID=).+' /etc/os-release | tr -d '"') 30 | } 31 | 32 | get_installer() { 33 | curl -fsSL https://get.rke2.io -o install.sh 34 | chmod u+x install.sh 35 | } 36 | 37 | do_download() { 38 | read_os 39 | get_installer 40 | 41 | case $ID in 42 | centos) 43 | yum install -y unzip 44 | 45 | # TODO: Determine minimum supported version, for now just carry on assuming ignorance 46 | case $VERSION in 47 | 7*) 48 | info "Identified CentOS 7" 49 | INSTALL_RKE2_METHOD='yum' INSTALL_RKE2_TYPE="${type}" ./install.sh 50 | 51 | ;; 52 | 8*) 53 | info "Identified CentOS 8" 54 | INSTALL_RKE2_METHOD='yum' INSTALL_RKE2_TYPE="${type}" ./install.sh 55 | 56 | ;; 57 | esac 58 | ;; 59 | 60 | rhel) 61 | yum install -y unzip 62 | 63 | case $VERSION in 64 | 7*) 65 | info "Identified RHEL 7" 66 | 67 | yum install -y http://mirror.centos.org/centos/7/extras/x86_64/Packages/container-selinux-2.119.2-1.911c772.el7_8.noarch.rpm 68 | INSTALL_RKE2_METHOD='yum' INSTALL_RKE2_TYPE="${type}" ./install.sh 69 | ;; 70 | 8*) 71 | info "Identified RHEL 8" 72 | 73 | INSTALL_RKE2_METHOD='yum' INSTALL_RKE2_TYPE="${type}" ./install.sh 74 | ;; 75 | esac 76 | 77 | ;; 78 | 79 | ubuntu) 80 | info "Identified Ubuntu" 81 | # TODO: Determine minimum supported version, for now just carry on assuming ignorance 82 | apt update -y 83 | 84 | apt install -y less iptables resolvconf linux-headers-$(uname -r) telnet jq 85 | 86 | INSTALL_RKE2_METHOD='tar' INSTALL_RKE2_TYPE="${type}" ./install.sh 87 | 88 | ;; 89 | amzn) 90 | # azurecli already present, only need rke2 91 | yum update -y 92 | 93 | case $VERSION in 94 | 2) 95 | info "Identified Amazon Linux 2" 96 | INSTALL_RKE2_METHOD='tar' INSTALL_RKE2_TYPE="${type}" ./install.sh 97 | ;; 98 | *) 99 | info "Identified Amazon Linux 1" 100 | INSTALL_RKE2_METHOD='tar' INSTALL_RKE2_TYPE="${type}" ./install.sh 101 | ;; 102 | esac 103 | ;; 104 | *) 105 | fatal "$${ID} $${VERSION} is not currently supported" 106 | ;; 107 | esac 108 | } 109 | 110 | { 111 | do_download 112 | } 113 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This project includes the Terraform configuration to deploy an RKE2 cluster in Azure. 2 | 3 | # Notes 4 | 5 | 1. The terraform script does not work within Azure Cloudshell because of a Cloudshell/Terraform AzureRM provider [issue](https://github.com/terraform-providers/terraform-provider-azurerm/issues/7787). 6 | 1. A .devcontainer is provided with all dependencies installed. It is not required to be used. 7 | 1. It is expected that the cloud target has been set using az cloud set -name AzureUSGovernment and an az login and subscription setting has been performed. 8 | 1. The .tfvar cloud variable values are determined by the Kubernetes azure cloud provider which utilises the [go-autorest library](https://github.com/Azure/go-autorest/blob/v9.9.0/autorest/azure/environments.go#L29) which doesn't use Azure defined cloud names. 9 | 10 | # Supported Azure regions 11 | 12 | These regions were deployed and tested other regions might also work 13 | 14 | 1. USGovVirginia 15 | 1. USGovArizona 16 | 17 | # Getting Started 18 | 19 | 1. Start devcontainer in VScode or install prerequisites in your system. The devcontainer is in [.devcontainer](.devcontainer). 20 | 21 | 2. An example usage of this module can be found in the [quickstart](./examples/quickstart) folder 22 | 23 | # Connecting to RKE2 24 | 25 | This section assumes you have a publicly accessible cluster, i.e. you have set `server_public_ip` to true 26 | 27 | A script is provided to download the kubeconfig file needed to access the cluster, from KeyVault to the local machine, it also sets KUBECONFIG to point to the new kubeconfig 28 | 29 | ```bash 30 | source scripts/fetch-kubeconfig.sh 31 | ``` 32 | 33 | > **Note.** You must run this from the location where Terraform apply has been run and there is a terraform.tfstate file 34 | > **Note.** You must source the script, also you may have to wait for a minute or two after deploying the cluster before the kubeconfig is ready 35 | 36 | Now you can run kubectl commands against the cluster as normal, e.g. `kubectl get nodes` or `kubectl get pods -A` to see the status and health of the cluster. 37 | 38 | # Smoke Tests 39 | 40 | A set of simple smoke tests is provided to validate the cluster is healthy and can communicate with Azure 41 | 42 | See [Smoke Tests for Bare Metal Kubernetes on Azure](./tests/README.md) 43 | 44 | # SSH to Servers (Control Plane) 45 | 46 | If you set `server_open_ssh_public` to true, then SSH will be allowed onto the server nodes, through the control plane load balancer. 47 | 48 | > Note. This is only recommended when troubleshooting RKE2 itself, and associated configuration such as the Azure cloud provider. For normal operation SSH access is not required. 49 | 50 | This is done with a Azure Load Balancer NAT pool, the pool maps ports from 5000 onwards to port 22 on each of the instances, e.g. 51 | 52 | - Port 5000 -> port 22 on instance 0 53 | - Port 5001 -> port 22 on instance 1 54 | - Port 5002 -> port 22 on instance 2 55 | - etc 56 | 57 | A script is provided that will download the SSH private key from KeyVault and tell you the public IP you need to use. The SSH username is `rke2` 58 | 59 | ```bash 60 | ./scripts/fetch-ssh-key.sh 61 | ``` 62 | 63 | > **Note.** You must run this from the location where Terraform apply has been run and there is a terraform.tfstate file 64 | 65 | > **Note.** For reasons unknown sometimes the scale set takes some time to settle down, and even with a single instance, it might not be instance 0, it can be 1 or even 2, so try ports 5001 and 5002 if 5000 doesn't work -------------------------------------------------------------------------------- /modules/lb/main.tf: -------------------------------------------------------------------------------- 1 | data "azurerm_resource_group" "rg" { 2 | name = var.resource_group_name 3 | } 4 | 5 | resource "azurerm_public_ip" "pip" { 6 | count = var.type == "public" ? 1 : 0 7 | 8 | name = "${var.name}-pip" 9 | allocation_method = "Static" 10 | sku = "Standard" 11 | availability_zone = var.zone 12 | 13 | resource_group_name = data.azurerm_resource_group.rg.name 14 | location = data.azurerm_resource_group.rg.location 15 | 16 | 17 | tags = merge({}, var.tags) 18 | } 19 | 20 | resource "azurerm_lb" "this" { 21 | name = "${var.name}-cp" 22 | 23 | resource_group_name = data.azurerm_resource_group.rg.name 24 | location = data.azurerm_resource_group.rg.location 25 | 26 | sku = "Standard" 27 | 28 | frontend_ip_configuration { 29 | name = "${var.name}-lb-fe" 30 | public_ip_address_id = var.type == "public" ? azurerm_public_ip.pip[0].id : null 31 | subnet_id = var.type == "public" ? null : var.subnet_id 32 | private_ip_address = var.private_ip_address 33 | private_ip_address_allocation = var.private_ip_address_allocation 34 | availability_zone = var.zone 35 | } 36 | 37 | tags = merge({}, var.tags) 38 | } 39 | 40 | # 41 | # Load Balancer backend address pool 42 | # 43 | //noinspection MissingProperty 44 | resource "azurerm_lb_backend_address_pool" "bepool" { 45 | name = "${var.name}-lbe-be-pool" 46 | loadbalancer_id = azurerm_lb.this.id 47 | } 48 | 49 | # 50 | # Load Balancer health probe 51 | # 52 | resource "azurerm_lb_probe" "this" { 53 | name = "${var.name}-lb-cp-probe" 54 | loadbalancer_id = azurerm_lb.this.id 55 | resource_group_name = data.azurerm_resource_group.rg.name 56 | 57 | protocol = "Tcp" 58 | interval_in_seconds = 10 59 | number_of_probes = 3 60 | 61 | port = 6443 62 | } 63 | 64 | resource "azurerm_lb_rule" "controlplane" { 65 | name = "${var.name}-cp" 66 | loadbalancer_id = azurerm_lb.this.id 67 | resource_group_name = data.azurerm_resource_group.rg.name 68 | 69 | protocol = "Tcp" 70 | frontend_port = 6443 71 | backend_port = 6443 72 | 73 | frontend_ip_configuration_name = azurerm_lb.this.frontend_ip_configuration.0.name 74 | backend_address_pool_id = azurerm_lb_backend_address_pool.bepool.id 75 | probe_id = azurerm_lb_probe.this.id 76 | } 77 | 78 | resource "azurerm_lb_rule" "supervisor" { 79 | name = "${var.name}-supervisor" 80 | loadbalancer_id = azurerm_lb.this.id 81 | resource_group_name = data.azurerm_resource_group.rg.name 82 | 83 | protocol = "Tcp" 84 | backend_port = 9345 85 | frontend_port = 9345 86 | 87 | frontend_ip_configuration_name = azurerm_lb.this.frontend_ip_configuration.0.name 88 | backend_address_pool_id = azurerm_lb_backend_address_pool.bepool.id 89 | probe_id = azurerm_lb_probe.this.id 90 | } 91 | 92 | 93 | resource "azurerm_lb_nat_pool" "ssh" { 94 | resource_group_name = data.azurerm_resource_group.rg.name 95 | loadbalancer_id = azurerm_lb.this.id 96 | name = "SSHNatPool" 97 | protocol = "Tcp" 98 | frontend_port_start = 5000 99 | frontend_port_end = 5100 100 | backend_port = 22 101 | frontend_ip_configuration_name = "${var.name}-lb-fe" 102 | } 103 | 104 | -------------------------------------------------------------------------------- /modules/nodepool/main.tf: -------------------------------------------------------------------------------- 1 | locals {} 2 | 3 | data "azurerm_resource_group" "rg" { 4 | name = var.resource_group_name 5 | } 6 | 7 | resource "azurerm_linux_virtual_machine_scale_set" "this" { 8 | name = format("vm-%s", lower(replace(var.name, "/[[:^alnum:]]/", ""))) 9 | 10 | resource_group_name = data.azurerm_resource_group.rg.name 11 | location = data.azurerm_resource_group.rg.location 12 | 13 | sku = var.vm_size 14 | instances = var.instances 15 | overprovision = var.overprovision 16 | zones = var.zones 17 | zone_balance = var.zone_balance 18 | single_placement_group = var.single_placement_group 19 | upgrade_mode = var.upgrade_mode 20 | priority = var.priority 21 | eviction_policy = var.eviction_policy 22 | health_probe_id = var.health_probe_id 23 | disable_password_authentication = true 24 | 25 | custom_data = var.custom_data 26 | 27 | admin_username = var.admin_username 28 | admin_ssh_key { 29 | username = var.admin_username 30 | public_key = var.admin_ssh_public_key 31 | } 32 | 33 | source_image_id = var.source_image_id != null ? var.source_image_id : null 34 | //noinspection ConflictingProperties 35 | dynamic "source_image_reference" { 36 | for_each = var.source_image_id != null ? [] : [1] 37 | content { 38 | offer = lookup(var.source_image_reference, "offer") 39 | publisher = lookup(var.source_image_reference, "publisher") 40 | sku = lookup(var.source_image_reference, "sku") 41 | version = lookup(var.source_image_reference, "version") 42 | } 43 | } 44 | 45 | os_disk { 46 | caching = "ReadWrite" 47 | storage_account_type = var.os_disk_storage_account_type 48 | disk_encryption_set_id = var.os_disk_encryption_set_id 49 | disk_size_gb = var.os_disk_size_gb 50 | } 51 | 52 | dynamic "data_disk" { 53 | for_each = var.additional_data_disks 54 | content { 55 | lun = lookup(data_disk, "lun") 56 | disk_size_gb = lookup(data_disk, "disk_size_gb", 20) 57 | caching = lookup(data_disk, "caching", "ReadWrite") 58 | storage_account_type = lookup(data_disk, "storage_account_type", "Standard_LRS") 59 | } 60 | } 61 | 62 | network_interface { 63 | name = "nic-${format("vm-%s", lower(replace(var.name, "/[[:^alnum:]]/", "")))}" 64 | primary = true 65 | network_security_group_id = var.nsg_id 66 | dns_servers = var.dns_servers 67 | enable_accelerated_networking = var.enable_accelerated_networking 68 | 69 | ip_configuration { 70 | name = "ipconfig-${format("vm-%s", lower(replace(var.name, "/[[:^alnum:]]/", "")))}" 71 | primary = true 72 | subnet_id = var.subnet_id 73 | 74 | load_balancer_backend_address_pool_ids = var.load_balancer_backend_address_pool_ids 75 | load_balancer_inbound_nat_rules_ids = var.load_balancer_inbound_nat_rules_ids 76 | 77 | dynamic "public_ip_address" { 78 | for_each = var.assign_public_ips ? [{}] : [] 79 | content { 80 | name = "pip-${format("vm-%s", lower(replace(var.name, "/[[:^alnum:]]/", "")))}" 81 | } 82 | } 83 | } 84 | } 85 | 86 | identity { 87 | type = "UserAssigned" 88 | identity_ids = var.identity_ids 89 | } 90 | 91 | automatic_instance_repair { 92 | enabled = var.enable_automatic_instance_repair 93 | grace_period = var.automatic_instance_repair_grace_period 94 | } 95 | 96 | tags = merge({}, var.tags) 97 | } 98 | -------------------------------------------------------------------------------- /modules/rke2-cluster/main.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | nodepool_nsgs = [module.rke2.network_security_group_name] 3 | } 4 | 5 | resource "tls_private_key" "default" { 6 | algorithm = "RSA" 7 | rsa_bits = 4096 8 | } 9 | 10 | data "azurerm_resource_group" "rg" { 11 | name = var.resource_group_name 12 | } 13 | 14 | resource "azurerm_network_security_group" "k8s" { 15 | name = "${var.cluster_name}-k8s-nsg" 16 | 17 | resource_group_name = data.azurerm_resource_group.rg.name 18 | location = data.azurerm_resource_group.rg.location 19 | 20 | tags = merge({}, var.tags) 21 | } 22 | 23 | module "rke2" { 24 | source = "../rke2-server" 25 | 26 | cluster_name = var.cluster_name 27 | resource_group_name = var.resource_group_name 28 | 29 | virtual_network_id = var.vnet_id 30 | subnet_id = var.subnet_id 31 | virtual_network_name = var.vnet_name 32 | subnet_name = var.subnet_name 33 | k8s_nsg_name = azurerm_network_security_group.k8s.name 34 | 35 | admin_ssh_public_key = tls_private_key.default.public_key_openssh 36 | 37 | servers = var.server_instance_count 38 | vm_size = length(var.server_vm_size) > 0 ? var.server_vm_size : var.vm_size 39 | priority = "Regular" #"Spot" 40 | 41 | enable_ccm = true 42 | cloud = var.cloud 43 | public_ip = var.server_public_ip 44 | open_ssh_public = var.server_open_ssh_public 45 | 46 | # OS tuning 47 | pre_userdata = < 0 ? var.agent_vm_size : var.vm_size 74 | priority = "Regular" #"Spot" 75 | cloud = var.cloud 76 | 77 | # OS tuning 78 | pre_userdata = <&2 15 | } 16 | 17 | # fatal logs the given argument at fatal log level. 18 | fatal() { 19 | echo "[ERROR] " "$@" >&2 20 | exit 1 21 | } 22 | 23 | config() { 24 | mkdir -p "/etc/rancher/rke2" 25 | cat < "/etc/rancher/rke2/config.yaml" 26 | # Additional user defined configuration 27 | ${config} 28 | EOF 29 | } 30 | 31 | append_config() { 32 | echo "$1" >> "/etc/rancher/rke2/config.yaml" 33 | } 34 | 35 | get_azure_domain() { 36 | if [ "$CLOUD" = "AzureUSGovernmentCloud" ]; then 37 | echo 'usgovcloudapi.net' 38 | else 39 | echo 'azure.com' 40 | fi 41 | } 42 | 43 | get_azure_vault() { 44 | if [ "$CLOUD" = "AzureUSGovernmentCloud" ]; then 45 | echo 'usgovcloudapi.net' 46 | else 47 | echo 'azure.net' 48 | fi 49 | } 50 | 51 | # The most simple "leader election" you've ever seen in your life 52 | elect_leader() { 53 | 54 | azure_domain=$(get_azure_domain) 55 | 56 | access_token=$(curl -s "http://169.254.169.254/metadata/identity/oauth2/token?api-version=2018-02-01&resource=https%3A%2F%2Fmanagement.$${azure_domain}" -H Metadata:true | jq -r ".access_token") 57 | 58 | read subscriptionId resourceGroupName virtualMachineScaleSetName < \ 59 | <(echo $(curl -s -H Metadata:true --noproxy "*" "http://169.254.169.254/metadata/instance?api-version=2020-09-01" | jq -r ".compute | .subscriptionId, .resourceGroupName, .vmScaleSetName")) 60 | 61 | first=$(curl -s https://management.$${azure_domain}/subscriptions/$${subscriptionId}/resourceGroups/$${resourceGroupName}/providers/Microsoft.Compute/virtualMachineScaleSets/$${virtualMachineScaleSetName}/virtualMachines?api-version=2020-12-01 \ 62 | -H "Authorization: Bearer $${access_token}" | jq -ej "[.value[]] | sort_by(.instanceId | tonumber) | .[0].properties.osProfile.computerName") 63 | 64 | 65 | if [ $(hostname) = $${first} ]; then 66 | SERVER_TYPE="leader" 67 | info "Electing as cluster leader" 68 | else 69 | info "Electing as joining server" 70 | fi 71 | } 72 | 73 | identify() { 74 | info "Identifying server type..." 75 | 76 | # Default to server 77 | SERVER_TYPE="server" 78 | 79 | supervisor_status=$(curl --max-time 5.0 --write-out '%%{http_code}' -sk --output /dev/null https://"${server_url}":9345/ping) 80 | 81 | if [ "$supervisor_status" -ne 200 ]; then 82 | info "API server unavailable, performing simple leader election" 83 | elect_leader 84 | else 85 | info "API server available, identifying as server joining existing cluster" 86 | fi 87 | } 88 | 89 | cp_wait() { 90 | while true; do 91 | supervisor_status=$(curl --max-time 5.0 --write-out '%%{http_code}' -sk --output /dev/null https://"${server_url}":9345/ping) 92 | if [ "$supervisor_status" -eq 200 ]; then 93 | info "Cluster is ready" 94 | 95 | # Let things settle down for a bit, without this HA cluster creation is very unreliable 96 | sleep 10 97 | break 98 | fi 99 | info "Waiting for cluster to be ready..." 100 | sleep 10 101 | done 102 | } 103 | 104 | fetch_token() { 105 | info "Fetching rke2 join token..." 106 | 107 | azure_vault=$(get_azure_vault) 108 | 109 | access_token=$(curl "http://169.254.169.254/metadata/identity/oauth2/token?api-version=2018-02-01&resource=https%3A%2F%2Fvault.$${azure_vault}" -H Metadata:true | jq -r ".access_token") 110 | token=$(curl '${vault_url}secrets/${token_secret}?api-version=2016-10-01' -H "Authorization: Bearer $${access_token}" | jq -r ".value") 111 | 112 | echo "token: $${token}" >> "/etc/rancher/rke2/config.yaml" 113 | } 114 | 115 | upload() { 116 | # Wait for kubeconfig to exist, then upload to secrets 117 | retries=10 118 | 119 | while [ ! -f /etc/rancher/rke2/rke2.yaml ]; do 120 | sleep 10 121 | if [ "$retries" = 0 ]; then 122 | fatal "Failed to create kubeconfig" 123 | fi 124 | ((retries--)) 125 | done 126 | 127 | azure_vault=$(get_azure_vault) 128 | 129 | access_token=$(curl "http://169.254.169.254/metadata/identity/oauth2/token?api-version=2018-02-01&resource=https%3A%2F%2Fvault.$${azure_vault}" -H Metadata:true | jq -r ".access_token") 130 | 131 | curl -v -X PUT \ 132 | -H "Content-Type: application/json" \ 133 | -H "Authorization: Bearer $${access_token}" \ 134 | "${vault_url}secrets/kubeconfig?api-version=7.1" \ 135 | --data-binary @- << EOF 136 | { 137 | "value": "$(sed "s/127.0.0.1/${server_url}/g" /etc/rancher/rke2/rke2.yaml)" 138 | } 139 | EOF 140 | } 141 | 142 | pre_userdata() { 143 | info "Beginning user defined pre userdata" 144 | ${pre_userdata} 145 | info "Beginning user defined pre userdata" 146 | } 147 | 148 | post_userdata() { 149 | info "Beginning user defined post userdata" 150 | ${post_userdata} 151 | info "Ending user defined post userdata" 152 | } 153 | 154 | { 155 | pre_userdata 156 | 157 | config 158 | fetch_token 159 | 160 | if [ "$CCM" = "true" ]; then 161 | append_config 'cloud-provider-name: azure' 162 | append_config 'cloud-provider-config: /etc/rancher/rke2/cloud.conf' 163 | fi 164 | 165 | # Config which is applied to both agent and server nodes 166 | append_config 'node-label: ${node_labels}' 167 | append_config 'node-taint: ${node_taints}' 168 | 169 | if [ "$TYPE" = "server" ]; then 170 | # Initialize server 171 | identify 172 | 173 | cat <> "/etc/rancher/rke2/config.yaml" 174 | tls-san: 175 | - ${server_url} 176 | EOF 177 | 178 | if [ $SERVER_TYPE = "server" ]; then 179 | append_config 'server: https://${server_url}:9345' 180 | # Wait for cluster to exist, then init another server 181 | cp_wait 182 | fi 183 | 184 | # This attempts to stagger the times when servers try to join the cluster 185 | # We rely on the well ordered cardinal host names that VMSS will assign 186 | host=$(hostname) 187 | hostNum=$${host: -2} 188 | sleepTime=$(( hostNum * 80 )) 189 | info "Staggering process, waiting extra $sleepTime seconds before joining..." 190 | sleep $sleepTime 191 | 192 | systemctl enable rke2-server 193 | systemctl daemon-reload 194 | systemctl start rke2-server 195 | 196 | export KUBECONFIG=/etc/rancher/rke2/rke2.yaml 197 | export PATH=$PATH:/var/lib/rancher/rke2/bin 198 | 199 | # Upload kubeconfig to s3 bucket 200 | upload 201 | 202 | else 203 | append_config 'server: https://${server_url}:9345' 204 | 205 | # Default to agent 206 | systemctl enable rke2-agent 207 | systemctl daemon-reload 208 | systemctl start rke2-agent 209 | fi 210 | 211 | post_userdata 212 | } 213 | -------------------------------------------------------------------------------- /modules/rke2-server/variables.tf: -------------------------------------------------------------------------------- 1 | variable "cluster_name" { 2 | type = string 3 | description = "Name of the server cluster" 4 | } 5 | 6 | variable "resource_group_name" { 7 | type = string 8 | description = "Name of the resource group to put the server cluster on" 9 | } 10 | variable "virtual_network_id" { 11 | type = string 12 | description = "Id of the virtual network to put the server cluster on" 13 | } 14 | variable "virtual_network_name" { 15 | type = string 16 | description = "Name of the virtual network to put the server cluster on" 17 | } 18 | variable "subnet_id" { 19 | type = string 20 | description = "Id of the subnet to put the server cluster on" 21 | } 22 | variable "subnet_name" { 23 | type = string 24 | description = "Name of the subnet to put the server cluster on" 25 | } 26 | 27 | variable "k8s_nsg_name" { 28 | type = string 29 | description = "Name of the NSG to add to the server cluster" 30 | } 31 | 32 | variable "admin_username" { 33 | type = string 34 | description = "Name of the admin user of the server cluster" 35 | default = "rke2" 36 | } 37 | 38 | variable "admin_ssh_public_key" { 39 | type = string 40 | description = "SSH public key of the admin user of the server cluster" 41 | default = "" 42 | } 43 | 44 | variable "assign_public_ips" { 45 | type = string 46 | description = "If true assign public IPs to nodes in the cluster" 47 | default = false 48 | } 49 | 50 | variable "servers" { 51 | description = "Number of servers to create" 52 | type = number 53 | default = 1 54 | } 55 | 56 | variable "spot" { 57 | description = "Toggle spot requests for server pool" 58 | type = bool 59 | default = false 60 | } 61 | 62 | variable "controlplane_loadbalancer_type" { 63 | description = "Type of load balancer to use for the control plane" 64 | type = string 65 | default = "private" 66 | } 67 | 68 | variable "controlplane_loadbalancer_private_ip_address" { 69 | description = "IP address of the private load balancer for the control plane" 70 | type = string 71 | default = null 72 | } 73 | 74 | variable "controlplane_loadbalancer_private_ip_address_allocation" { 75 | description = "IP address allocation of the private load balancer for the control plane" 76 | type = string 77 | default = null 78 | } 79 | 80 | # 81 | # Server pool variables 82 | # 83 | variable "source_image_reference" { 84 | description = "Source image query parameters" 85 | type = object({ 86 | publisher = string 87 | offer = string 88 | sku = string 89 | version = string 90 | }) 91 | 92 | default = { 93 | offer = "UbuntuServer" 94 | publisher = "Canonical" 95 | sku = "18.04-LTS" 96 | version = "latest" 97 | } 98 | } 99 | 100 | variable "vm_size" { 101 | type = string 102 | default = "Standard_DS4_v2" 103 | description = "Server pool vm size" 104 | } 105 | 106 | variable "rke2_version" { 107 | default = "v1.21.5+rke2r2" 108 | } 109 | 110 | variable "pre_userdata" { 111 | description = "(Optional) Additional userdata to be ran immediately before cluster bootstrapping." 112 | type = string 113 | default = "" 114 | } 115 | 116 | variable "post_userdata" { 117 | description = "(Optional) Additional userdata to be ran post cluster bootstrapping." 118 | type = string 119 | default = "" 120 | } 121 | 122 | variable "rke2_config" { 123 | description = "(Optional) Additional RKE2 configuration in config file format: https://docs.rke2.io/install/install_options/install_options/#configuration-file" 124 | type = string 125 | default = "" 126 | } 127 | 128 | variable "enable_ccm" { 129 | description = "(Optional) Enable in tree Azure Cloud Controller Manager in RKE2." 130 | type = bool 131 | default = false 132 | } 133 | 134 | variable "tags" { 135 | description = "(Optional) Tags to add to the server pool" 136 | type = map(string) 137 | default = {} 138 | } 139 | 140 | variable "zones" { 141 | description = "(Optional) List of availability zones servers should be created in." 142 | type = list(number) 143 | default = [] 144 | } 145 | variable "zone_balance" { 146 | description = "(Optional) Toggle server balance within availability zones specified." 147 | default = null 148 | } 149 | 150 | variable "single_placement_group" { 151 | description = "TODO: (Optional) Toggle single placement group." 152 | default = null 153 | } 154 | 155 | variable "upgrade_mode" { 156 | description = "(Optional) Specify how upgrades should happen. Possible values are Automatic, Manual and Rolling. Defaults to Automatic." 157 | default = "Automatic" 158 | } 159 | 160 | variable "priority" { 161 | description = "(Optional) Specify the priority of the VMSS. Possible values are Regular and Spot. Defaults to Regular" 162 | default = "Regular" 163 | } 164 | 165 | variable "eviction_policy" { 166 | description = "(Optional) Specify how server instances should be evicted. Possible values are Delete and Deallocate." 167 | default = "Delete" 168 | } 169 | 170 | variable "dns_servers" { 171 | description = "(Optional) Specify any additional dns servers applied to server scale set." 172 | type = list(string) 173 | default = [] 174 | } 175 | 176 | variable "enable_accelerated_networking" { 177 | description = "(Optional) Toggle accelerated networking for server scale set." 178 | type = bool 179 | default = false 180 | } 181 | 182 | variable "enable_automatic_instance_repair" { 183 | description = "(Optional) Toggle automatic instance repair." 184 | type = bool 185 | default = true 186 | } 187 | 188 | variable "automatic_instance_repair_grace_period" { 189 | description = "TODO: (Optional) Toggle accelerated networking for server scale set." 190 | type = string 191 | default = "PT50M" 192 | } 193 | 194 | variable "os_disk_storage_account_type" { 195 | description = "(Optional) Storage Account used for OS Disk. Possible values include Standard_LRS or Premium_LRS." 196 | type = string 197 | default = "Standard_LRS" 198 | } 199 | 200 | variable "os_disk_size_gb" { 201 | description = "(Optional) Storage disk size for OS in GB. Defaults to 30Gb" 202 | type = number 203 | default = 30 204 | } 205 | 206 | variable "os_disk_encryption_set_id" { 207 | description = "TODO: Docs" 208 | type = string 209 | default = null 210 | } 211 | 212 | variable "additional_data_disks" { 213 | description = "TODO: Docs" 214 | type = list(object({ 215 | lun = number 216 | disk_size_gb = number 217 | caching = string 218 | storage_account_type = string 219 | })) 220 | default = [] 221 | } 222 | 223 | variable "cloud" { 224 | description = "(Optional) Cloud provider to use. Possible values are AzureUSGovernmentCloud, AzurePublicCloud" 225 | type = string 226 | default = "AzureUSGovernmentCloud" 227 | validation { 228 | condition = contains(["AzureUSGovernmentCloud", "AzurePublicCloud"], var.cloud) 229 | error_message = "Allowed values for cloud are \"AzureUSGovernmentCloud\" or \"AzurePublicCloud\"." 230 | } 231 | } 232 | 233 | variable "public_ip" { 234 | description = "(Optional) if true, assign public IPs to nodes in the cluster" 235 | type = bool 236 | default = false 237 | } 238 | 239 | variable "open_ssh_public" { 240 | description = "(Optional) if true, allow ssh access to nodes in the cluster" 241 | type = bool 242 | default = false 243 | } 244 | -------------------------------------------------------------------------------- /modules/rke2-server/main.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | # Create a unique cluster name we'll prefix to all resources created and ensure it's lowercase 3 | uname = lower("${var.cluster_name}-${random_string.uid.result}") 4 | 5 | ccm_tags = { 6 | "kubernetes.io_cluster_${local.uname}" = "owned" 7 | } 8 | } 9 | 10 | data "azurerm_resource_group" "rg" { 11 | name = var.resource_group_name 12 | } 13 | 14 | resource "random_string" "uid" { 15 | length = 3 16 | special = false 17 | lower = true 18 | upper = false 19 | number = true 20 | } 21 | 22 | resource "random_password" "token" { 23 | length = 40 24 | special = false 25 | } 26 | 27 | module "statestore" { 28 | source = "../statestore" 29 | 30 | name = local.uname 31 | resource_group_name = data.azurerm_resource_group.rg.name 32 | location = data.azurerm_resource_group.rg.location 33 | 34 | token = random_password.token.result 35 | reader_object_id = azurerm_user_assigned_identity.cluster.principal_id 36 | } 37 | 38 | resource "azurerm_user_assigned_identity" "cluster" { 39 | name = "${local.uname}-cluster" 40 | 41 | resource_group_name = data.azurerm_resource_group.rg.name 42 | location = data.azurerm_resource_group.rg.location 43 | 44 | tags = merge({}, var.tags) 45 | } 46 | 47 | resource "azurerm_role_assignment" "cluster_vault" { 48 | scope = data.azurerm_resource_group.rg.id 49 | principal_id = azurerm_user_assigned_identity.cluster.principal_id 50 | role_definition_name = "Key Vault Secrets User" 51 | } 52 | 53 | resource "azurerm_role_assignment" "cluster_reader" { 54 | scope = module.servers.scale_set_id 55 | principal_id = azurerm_user_assigned_identity.cluster.principal_id 56 | role_definition_name = "Reader" 57 | } 58 | 59 | 60 | resource "azurerm_role_assignment" "role1" { 61 | scope = data.azurerm_resource_group.rg.id #module.servers.scale_set_id 62 | role_definition_name = "Contributor" 63 | principal_id = azurerm_user_assigned_identity.cluster.principal_id 64 | skip_service_principal_aad_check = true 65 | 66 | } 67 | 68 | resource "azurerm_role_assignment" "role2" { 69 | scope = data.azurerm_resource_group.rg.id #module.servers.scale_set_id 70 | role_definition_name = "Network Contributor" 71 | principal_id = azurerm_user_assigned_identity.cluster.principal_id 72 | skip_service_principal_aad_check = true 73 | 74 | } 75 | 76 | # 77 | # Server Network Security Group 78 | # 79 | resource "azurerm_network_security_group" "server" { 80 | name = "${local.uname}-rke2-server-nsg" 81 | 82 | resource_group_name = data.azurerm_resource_group.rg.name 83 | location = data.azurerm_resource_group.rg.location 84 | 85 | tags = merge({}, var.tags) 86 | } 87 | 88 | resource "azurerm_network_security_rule" "server_cp" { 89 | name = "${local.uname}-rke2-server-controlplane" 90 | network_security_group_name = azurerm_network_security_group.server.name 91 | access = "Allow" 92 | direction = "Inbound" 93 | priority = 101 94 | protocol = "Tcp" 95 | resource_group_name = data.azurerm_resource_group.rg.name 96 | 97 | source_port_range = "*" 98 | destination_port_range = "6443" 99 | source_address_prefix = "*" 100 | destination_address_prefix = "*" 101 | } 102 | 103 | resource "azurerm_network_security_rule" "server_supervisor" { 104 | name = "${local.uname}-rke2-server-supervisor" 105 | network_security_group_name = azurerm_network_security_group.server.name 106 | access = "Allow" 107 | direction = "Inbound" 108 | priority = 102 109 | protocol = "Tcp" 110 | resource_group_name = data.azurerm_resource_group.rg.name 111 | 112 | source_port_range = "*" 113 | destination_port_range = "9345" 114 | source_address_prefix = "*" 115 | destination_address_prefix = "*" 116 | } 117 | 118 | # 119 | # Server Nodepool 120 | # 121 | module "init" { 122 | source = "../custom_data" 123 | 124 | server_url = module.cp_lb.lb_url 125 | vault_url = module.statestore.vault_url 126 | token_secret = module.statestore.token_secret_name 127 | 128 | config = var.rke2_config 129 | pre_userdata = var.pre_userdata 130 | post_userdata = var.post_userdata 131 | ccm = var.enable_ccm 132 | node_labels = "[]" 133 | node_taints = "[\"CriticalAddonsOnly=true:NoExecute\"]" 134 | cloud = var.cloud 135 | agent = false 136 | } 137 | 138 | data "azurerm_client_config" "current" {} 139 | 140 | data "template_cloudinit_config" "init" { 141 | base64_encode = true 142 | 143 | part { 144 | filename = "00_download.sh" 145 | content_type = "text/x-shellscript" 146 | content = templatefile("${path.module}/../common/download.sh", { 147 | rke2_version = var.rke2_version 148 | type = "server" 149 | }) 150 | } 151 | 152 | part { 153 | filename = "01_rke2.sh" 154 | content_type = "text/x-shellscript" 155 | content = module.init.templated 156 | } 157 | 158 | part { 159 | filename = "azure-cloud.tpl" 160 | content_type = "text/cloud-config" 161 | content = jsonencode({ 162 | write_files = [ 163 | { 164 | content = "vm.max_map_count=262144\nsysctl -w fs.file-max=131072" 165 | path = "/etc/sysctl.d/10-vm-map-count.conf" 166 | permissions = "5555" 167 | }, 168 | { 169 | content = templatefile("${path.module}/../custom_data/files/azure-cloud.conf.template", { 170 | tenant_id = data.azurerm_client_config.current.tenant_id 171 | user_assigned_identity_id = azurerm_user_assigned_identity.cluster.client_id 172 | subscription_id = data.azurerm_client_config.current.subscription_id 173 | rg_name = data.azurerm_resource_group.rg.name 174 | location = data.azurerm_resource_group.rg.location 175 | subnet_name = var.subnet_name 176 | virtual_network_name = var.virtual_network_name 177 | nsg_name = var.k8s_nsg_name 178 | cloud = var.cloud 179 | }) 180 | path = "/etc/rancher/rke2/cloud.conf" 181 | permissions = "5555" 182 | }, 183 | { 184 | content = templatefile("${path.module}/../custom_data/files/default-storageclass.yaml", {}) 185 | path = "/var/lib/rancher/rke2/server/manifests/default-storageclass.yaml" 186 | permissions = "5555" 187 | } 188 | ] 189 | }) 190 | } 191 | } 192 | 193 | module "cp_lb" { 194 | source = "../lb" 195 | 196 | name = local.uname 197 | resource_group_name = data.azurerm_resource_group.rg.name 198 | 199 | subnet_id = var.subnet_id 200 | private_ip_address = var.controlplane_loadbalancer_private_ip_address 201 | private_ip_address_allocation = var.controlplane_loadbalancer_private_ip_address_allocation 202 | 203 | tags = merge({}, var.tags) 204 | 205 | type = var.public_ip ? "public" : "private" 206 | } 207 | 208 | module "servers" { 209 | source = "../nodepool" 210 | 211 | name = "${local.uname}-server" 212 | 213 | resource_group_name = data.azurerm_resource_group.rg.name 214 | virtual_network_id = var.virtual_network_id 215 | subnet_id = var.subnet_id 216 | 217 | admin_username = var.admin_username 218 | admin_ssh_public_key = var.admin_ssh_public_key 219 | 220 | vm_size = var.vm_size 221 | instances = var.servers 222 | # Forcing this to false, as the RKE2 bootstrap now relies on well ordered hostnames to stagger the join process 223 | overprovision = false 224 | zones = var.zones 225 | zone_balance = var.zone_balance 226 | single_placement_group = var.single_placement_group 227 | upgrade_mode = var.upgrade_mode 228 | priority = var.priority 229 | eviction_policy = var.priority == "Spot" ? var.eviction_policy : null 230 | dns_servers = var.dns_servers 231 | enable_accelerated_networking = var.enable_accelerated_networking 232 | 233 | source_image_reference = var.source_image_reference 234 | assign_public_ips = var.assign_public_ips 235 | nsg_id = azurerm_network_security_group.server.id 236 | 237 | health_probe_id = module.cp_lb.controlplane_probe_id 238 | load_balancer_backend_address_pool_ids = [module.cp_lb.backend_pool_id] 239 | load_balancer_inbound_nat_rules_ids = var.open_ssh_public ? [module.cp_lb.azurerm_lb_nat_pool_ssh_id] : [] 240 | 241 | identity_ids = [azurerm_user_assigned_identity.cluster.id] 242 | custom_data = data.template_cloudinit_config.init.rendered 243 | 244 | enable_automatic_instance_repair = var.enable_automatic_instance_repair 245 | automatic_instance_repair_grace_period = var.enable_automatic_instance_repair ? var.automatic_instance_repair_grace_period : null 246 | 247 | os_disk_size_gb = var.os_disk_size_gb 248 | os_disk_storage_account_type = var.os_disk_storage_account_type 249 | os_disk_encryption_set_id = var.os_disk_encryption_set_id 250 | 251 | additional_data_disks = var.additional_data_disks 252 | 253 | tags = merge({ 254 | "Role" = "server", 255 | }, local.ccm_tags, var.tags) 256 | 257 | # Fix bug with dependency upon resource deletions 258 | depends_on = [module.cp_lb] 259 | } 260 | --------------------------------------------------------------------------------