├── example ├── .gitignore ├── main.tf └── README.md ├── modules └── agent_group │ ├── outputs.tf │ ├── versions.tf │ ├── main.tf │ ├── variables.tf │ └── agent.tf ├── addons.tf ├── .pre-commit-config.yaml ├── versions.tf ├── CONTRIBUTING.md ├── ssh.tf ├── network.tf ├── main.tf ├── LICENSE ├── agents.tf ├── output.tf ├── kubeconfig.tf ├── manifests ├── hello-kubernetes.yaml ├── hcloud-ccm-net.yaml └── upgrade-controller.yaml ├── control_plane.tf ├── control_plane_primary.tf ├── .gitignore ├── upgrades.tf ├── variables.tf ├── logo.svg └── README.md /example/.gitignore: -------------------------------------------------------------------------------- 1 | .terraform 2 | .envrc 3 | kubeconfig* 4 | *.lock.hcl -------------------------------------------------------------------------------- /modules/agent_group/outputs.tf: -------------------------------------------------------------------------------- 1 | output "public_ips" { 2 | value = [for server in hcloud_server.agent : server.ipv4_address] 3 | } -------------------------------------------------------------------------------- /addons.tf: -------------------------------------------------------------------------------- 1 | data "http" "hcloud_csi_driver_manifest" { 2 | url = "https://raw.githubusercontent.com/hetznercloud/csi-driver/${var.hcloud_csi_driver_version}/deploy/kubernetes/hcloud-csi.yml" 3 | } 4 | 5 | -------------------------------------------------------------------------------- /modules/agent_group/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | hcloud = { 4 | source = "hetznercloud/hcloud" 5 | version = "~> 1.27" 6 | } 7 | } 8 | required_version = ">= 0.13" 9 | } 10 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/antonbabenko/pre-commit-terraform 3 | rev: v1.50.0 4 | hooks: 5 | - id: terraform_fmt 6 | - id: terraform_validate 7 | - id: terraform_docs 8 | args: ['--args=--indent 3 --hide modules --hide providers --hide requirements --hide resources --hide data-sources'] -------------------------------------------------------------------------------- /modules/agent_group/main.tf: -------------------------------------------------------------------------------- 1 | data "hcloud_image" "ubuntu" { 2 | name = "ubuntu-20.04" 3 | } 4 | 5 | resource "random_pet" "agent_suffix" { 6 | count = var.server_count 7 | } 8 | 9 | locals { 10 | agent_pet_names = [for pet in random_pet.agent_suffix : pet.id] 11 | agent_name_map = { for i in range(0, var.server_count) : random_pet.agent_suffix[i].id => i } 12 | } 13 | -------------------------------------------------------------------------------- /versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | hcloud = { 4 | source = "hetznercloud/hcloud" 5 | version = "~> 1.27" 6 | } 7 | kubectl = { 8 | source = "gavinbunney/kubectl" 9 | version = "~> 1.13" 10 | } 11 | remote = { 12 | source = "tenstad/remote" 13 | version = "~> 0.0.23" 14 | } 15 | } 16 | required_version = ">= 0.13" 17 | } 18 | 19 | provider "hcloud" { 20 | token = var.hcloud_token 21 | } 22 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # 
Contribution Guide 2 | 3 | This guide describes necessary tools and processes to contribute to this project. 4 | 5 | ## Development Setup 6 | 7 | ### Required tools 8 | 9 | The following tools are necessary for working with this repository: 10 | 11 | - [terraform](https://www.terraform.io) (for obvious reasons) 12 | - [pre-commit](https://pre-commit.com/#install) (to run linter/docs) 13 | - this requires a python installation 14 | - [terraform-docs](https://terraform-docs.io/user-guide/installation/) (to create the inputs/outputs table) 15 | 16 | ### pre-commit 17 | 18 | Before you commit, run `pre-commit run -a`. 19 | 20 | You can also do that automatically before each commit with `pre-commit install`. 21 | 22 | -------------------------------------------------------------------------------- /ssh.tf: -------------------------------------------------------------------------------- 1 | resource "tls_private_key" "ssh" { 2 | count = var.ssh_private_key_location == null ? 1 : 0 3 | algorithm = "RSA" 4 | rsa_bits = 4096 5 | } 6 | 7 | data "local_file" "custom_ssh_private_key" { 8 | count = var.ssh_private_key_location == null ? 0 : 1 9 | filename = var.ssh_private_key_location 10 | } 11 | 12 | data "tls_public_key" "custom_ssh" { 13 | count = var.ssh_private_key_location == null ? 0 : 1 14 | private_key_pem = data.local_file.custom_ssh_private_key[0].content 15 | } 16 | 17 | locals { 18 | ssh_private_key = var.ssh_private_key_location != null ? data.local_file.custom_ssh_private_key[0].content : tls_private_key.ssh[0].private_key_pem 19 | ssh_public_key = var.ssh_private_key_location != null ? data.tls_public_key.custom_ssh[0].public_key_openssh : tls_private_key.ssh[0].public_key_openssh 20 | } -------------------------------------------------------------------------------- /network.tf: -------------------------------------------------------------------------------- 1 | resource "hcloud_network" "k3s" { 2 | count = var.network_id == null ? 1 : 0 3 | name = "${var.name}-k3s-network" 4 | ip_range = var.network_cidr 5 | labels = local.common_labels 6 | } 7 | 8 | data "hcloud_network" "k3s" { 9 | count = var.network_id == null ? 0 : 1 10 | id = var.network_id 11 | } 12 | 13 | locals { 14 | network_id = var.network_id == null ? hcloud_network.k3s[0].id : var.network_id 15 | network_name = var.network_id == null ? 
hcloud_network.k3s[0].name : data.hcloud_network.k3s[0].name 16 | primary_control_plane_ip = cidrhost(hcloud_network_subnet.k3s_nodes.ip_range, var.control_plane_primary_index) 17 | } 18 | 19 | resource "hcloud_network_subnet" "k3s_nodes" { 20 | type = "cloud" 21 | network_id = local.network_id 22 | network_zone = "eu-central" 23 | ip_range = var.subnet_cidr 24 | } 25 | -------------------------------------------------------------------------------- /main.tf: -------------------------------------------------------------------------------- 1 | resource "random_password" "k3s_cluster_secret" { 2 | length = 48 3 | special = false 4 | } 5 | 6 | resource "hcloud_ssh_key" "provision_public" { 7 | name = "${var.name} - provisioning SSH key" 8 | public_key = local.ssh_public_key 9 | labels = local.common_labels 10 | } 11 | 12 | data "hcloud_image" "ubuntu" { 13 | name = "ubuntu-20.04" 14 | } 15 | 16 | locals { 17 | server_base_packages = ["wireguard"] 18 | cluster_dns_ip = cidrhost(var.service_cidr, 10) 19 | k3s_setup_args = "--cluster-cidr ${var.cluster_cidr} --service-cidr ${var.service_cidr} --cluster-dns ${local.cluster_dns_ip} --disable local-storage --disable-cloud-controller --disable traefik --disable servicelb --flannel-backend=wireguard --kubelet-arg='cloud-provider=external'" 20 | k3s_server_join_cmd = "sh -s - server --server 'https://${local.primary_control_plane_ip}:6443' ${local.k3s_setup_args}" 21 | k3s_server_init_cmd = "sh -s - server --cluster-init ${local.k3s_setup_args}" 22 | } 23 | -------------------------------------------------------------------------------- /example/main.tf: -------------------------------------------------------------------------------- 1 | variable "hcloud_token" {} 2 | 3 | module "demo_cluster" { 4 | source = "./.." 5 | # Can also point to a git repository, e.g. git::https://github.com/StarpTech/k-andy.git?ref=main 6 | hcloud_token = var.hcloud_token 7 | name = "k-andy-demo" 8 | k3s_version = "v1.21.10+k3s1" 9 | server_locations = ["nbg1", "fsn1"] 10 | agent_groups = { 11 | "storage" = { 12 | count = 2 13 | type = "cpx31" 14 | ip_offset = 13 15 | taints = [ 16 | "component=storage:NoSchedule" 17 | ] 18 | } 19 | "small" = { 20 | count = 2 21 | type = "cx21" 22 | ip_offset = 24 23 | taints = [] 24 | } 25 | "medium" = { 26 | count = 1 27 | type = "cx31" 28 | ip_offset = 32 29 | taints = [] 30 | } 31 | } 32 | } 33 | 34 | output "control_plane_ips" { 35 | value = module.demo_cluster.control_planes_public_ips 36 | } 37 | 38 | output "k3s_token" { 39 | value = module.demo_cluster.k3s_token 40 | sensitive = true 41 | } 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Dustin Deus 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /agents.tf: -------------------------------------------------------------------------------- 1 | module "agent_group" { 2 | for_each = var.agent_groups 3 | 4 | source = "./modules/agent_group" 5 | 6 | k3s_cluster_secret = random_password.k3s_cluster_secret.result 7 | k3s_version = var.k3s_version 8 | 9 | taints = each.value.taints 10 | 11 | cluster_name = var.name 12 | group_name = each.key 13 | 14 | server_locations = var.server_locations 15 | 16 | provisioning_ssh_key_id = hcloud_ssh_key.provision_public.id 17 | ssh_private_key = local.ssh_private_key 18 | 19 | control_plane_ip = local.primary_control_plane_ip 20 | network_id = local.network_id 21 | public_control_plane_ip = hcloud_server.first_control_plane.ipv4_address 22 | 23 | subnet_id = hcloud_network_subnet.k3s_nodes.id 24 | subnet_ip_range = hcloud_network_subnet.k3s_nodes.ip_range 25 | 26 | ip_offset = each.value.ip_offset 27 | 28 | server_count = each.value.count 29 | server_type = each.value.type 30 | common_labels = local.common_labels 31 | 32 | additional_packages = concat(local.server_base_packages, var.server_additional_packages) 33 | 34 | depends_on = [hcloud_server.first_control_plane] 35 | } 36 | -------------------------------------------------------------------------------- /output.tf: -------------------------------------------------------------------------------- 1 | output "control_planes_public_ips" { 2 | value = concat([hcloud_server.first_control_plane.ipv4_address], [for server in hcloud_server.control_plane : server.ipv4_address]) 3 | description = "The public IP addresses of the control plane servers" 4 | } 5 | 6 | output "agents_public_ips" { 7 | value = flatten([for agents in module.agent_group : agents.public_ips]) 8 | description = "The public IP addresses of the agent servers" 9 | } 10 | 11 | output "ssh_private_key" { 12 | description = "Key to SSH into nodes" 13 | value = local.ssh_private_key 14 | sensitive = true 15 | } 16 | 17 | output "k3s_token" { 18 | description = "Secret k3s authentication token" 19 | value = random_password.k3s_cluster_secret.result 20 | sensitive = true 21 | } 22 | 23 | output "network_id" { 24 | value = local.network_id 25 | } 26 | 27 | output "subnet_id" { 28 | value = hcloud_network_subnet.k3s_nodes.id 29 | } 30 | 31 | output "cidr_block" { 32 | value = hcloud_network_subnet.k3s_nodes.ip_range 33 | } 34 | 35 | output "server_locations" { 36 | description = "Array of hetzner server locations we deploy to" 37 | value = var.server_locations 38 | } 39 | -------------------------------------------------------------------------------- /example/README.md: -------------------------------------------------------------------------------- 1 | # Example Usage 2 | 3 | ## Environment Setup 4 | 5 | It's easiest if you set up some environment variables. 6 | 7 | This example uses [direnv](https://direnv.net) with an `.envrc` for that, 8 | but you can also just execute the `export` statements in your session. 
9 | 10 | Our `.envrc` here looks like this: 11 | 12 | ```shell 13 | export TF_VAR_hcloud_token=THETOKENYOUGETFROMTHECLOUDCONSOLE 14 | export KUBECONFIG=$(pwd)/kubeconfig-k-andy-demo.yaml 15 | ``` 16 | 17 | The `hcloud_token` is an API Token from a [Hetzner Cloud](https://console.hetzner.cloud/projects) project. 18 | 19 | We also set `KUBECONFIG` so we can later just run `kubectl` in here to interact with the created cluster. 20 | 21 | If you use `direnv`, don't forget to run `direnv allow`. 22 | 23 | ## Cluster Creation 24 | 25 | Now you bring up the cluster with terraform and then test if it's there and looking good. 26 | 27 | ```shell 28 | terraform init 29 | terraform apply 30 | kubectl cluster-info 31 | kubectl get node 32 | ``` 33 | 34 | ## Demo Application 35 | 36 | A demo application can be found in [manifests](manifests/hello-kubernetes.yaml). Run: 37 | 38 | ```sh 39 | kubectl apply -f ../manifests/hello-kubernetes.yaml 40 | ``` 41 | 42 | and try to access `http://:8080`. 43 | 44 | You can find the public IP of the service with `kubectl get service hello-kubernetes -o jsonpath='{.status.loadBalancer.ingress}'` 45 | 46 | ## Destroy your cluster 47 | 48 | If you no longer need the cluster don't forget to destroy it. Load-Balancers and volumes must be deleted manually. 49 | 50 | ```sh 51 | terraform destroy 52 | ``` 53 | -------------------------------------------------------------------------------- /kubeconfig.tf: -------------------------------------------------------------------------------- 1 | data "remote_file" "kubeconfig" { 2 | conn { 3 | host = hcloud_server.first_control_plane.ipv4_address 4 | port = 22 5 | user = "root" 6 | private_key = local.ssh_private_key 7 | } 8 | path = "/etc/rancher/k3s/k3s.yaml" 9 | } 10 | 11 | locals { 12 | kubeconfig_external = replace(data.remote_file.kubeconfig.content, "127.0.0.1", hcloud_server.first_control_plane.ipv4_address) 13 | } 14 | 15 | resource "local_file" "kubeconfig" { 16 | count = var.create_kubeconfig ? 1 : 0 17 | sensitive_content = local.kubeconfig_external 18 | filename = var.kubeconfig_filename == null ? 
"./kubeconfig-${var.name}.yaml" : var.kubeconfig_filename 19 | file_permission = "400" 20 | } 21 | 22 | locals { 23 | kubeconfig_parsed = yamldecode(local.kubeconfig_external) 24 | kubeconfig_data = { 25 | host = local.kubeconfig_parsed["clusters"][0]["cluster"]["server"] 26 | client_certificate = base64decode(local.kubeconfig_parsed["users"][0]["user"]["client-certificate-data"]) 27 | client_key = base64decode(local.kubeconfig_parsed["users"][0]["user"]["client-key-data"]) 28 | cluster_ca_certificate = base64decode(local.kubeconfig_parsed["clusters"][0]["cluster"]["certificate-authority-data"]) 29 | } 30 | } 31 | 32 | output "kubeconfig_file" { 33 | value = local.kubeconfig_external 34 | description = "Kubeconfig file content with external IP address" 35 | sensitive = true 36 | } 37 | 38 | output "kubeconfig" { 39 | description = "Structured kubeconfig data to supply to other providers" 40 | value = local.kubeconfig_data 41 | sensitive = true 42 | } 43 | -------------------------------------------------------------------------------- /manifests/hello-kubernetes.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: hello-kubernetes 5 | spec: 6 | replicas: 1 7 | selector: 8 | matchLabels: 9 | app: hello-kubernetes 10 | template: 11 | metadata: 12 | labels: 13 | app: hello-kubernetes 14 | spec: 15 | containers: 16 | - name: hello-kubernetes 17 | image: paulbouwer/hello-kubernetes:1.10 18 | ports: 19 | - name: http 20 | containerPort: 8080 21 | livenessProbe: 22 | httpGet: 23 | path: / 24 | port: http 25 | readinessProbe: 26 | httpGet: 27 | path: / 28 | port: http 29 | env: 30 | - name: KUBERNETES_NAMESPACE 31 | valueFrom: 32 | fieldRef: 33 | fieldPath: metadata.namespace 34 | - name: KUBERNETES_POD_NAME 35 | valueFrom: 36 | fieldRef: 37 | fieldPath: metadata.name 38 | - name: KUBERNETES_NODE_NAME 39 | valueFrom: 40 | fieldRef: 41 | fieldPath: spec.nodeName 42 | volumes: 43 | - name: my-csi-volume 44 | persistentVolumeClaim: 45 | claimName: csi-pvc 46 | --- 47 | apiVersion: v1 48 | kind: Service 49 | metadata: 50 | name: hello-kubernetes 51 | annotations: 52 | load-balancer.hetzner.cloud/location: nbg1 53 | spec: 54 | type: LoadBalancer 55 | ports: 56 | - port: 8080 57 | targetPort: 8080 58 | selector: 59 | app: hello-kubernetes 60 | --- 61 | apiVersion: v1 62 | kind: PersistentVolumeClaim 63 | metadata: 64 | name: csi-pvc 65 | spec: 66 | accessModes: 67 | - ReadWriteOnce 68 | resources: 69 | requests: 70 | storage: 10Gi 71 | storageClassName: hcloud-volumes -------------------------------------------------------------------------------- /modules/agent_group/variables.tf: -------------------------------------------------------------------------------- 1 | variable "cluster_name" { 2 | description = "Cluster name (used in naming the servers)" 3 | } 4 | 5 | variable "group_name" { 6 | description = "Name of the agent group" 7 | } 8 | 9 | variable "server_locations" { 10 | description = "Server locations to create agents in" 11 | } 12 | 13 | variable "additional_packages" { 14 | default = [] 15 | } 16 | 17 | variable "server_count" { 18 | description = "Number of agent nodes" 19 | default = 2 20 | } 21 | 22 | variable "server_type" { 23 | description = "Server type of agent server group" 24 | default = "cx21" 25 | } 26 | 27 | variable "provisioning_ssh_key_id" { 28 | description = "ID of the hcloud SSH key to provision the node group with" 29 | } 30 | 31 | variable "control_plane_ip" { 32 | 
description = "Control plane IP to connect to" 33 | } 34 | 35 | variable "public_control_plane_ip" { 36 | description = "Public control plane IP" 37 | } 38 | 39 | variable "taints" { 40 | description = "Taints each worker gets" 41 | type = list(string) 42 | } 43 | 44 | variable "k3s_version" { 45 | description = "K3S version, should match the control plane" 46 | } 47 | 48 | variable "k3s_cluster_secret" { 49 | description = "K3S cluster token to authenticate against control plane" 50 | } 51 | 52 | variable "network_id" { 53 | description = "Network ID to place agents in" 54 | } 55 | 56 | variable "subnet_id" { 57 | description = "ID of the subnet in which agents are started" 58 | } 59 | 60 | variable "subnet_ip_range" { 61 | description = "CIDR block of the subnet" 62 | } 63 | 64 | variable "ip_offset" { 65 | description = "Offset from which agents are IPs are counted upwards. Needs to be adjusted to not cause collisions!" 66 | } 67 | 68 | 69 | variable "ssh_private_key" { 70 | description = "SSH private key to connect directly to server (used for remote-exec)" 71 | } 72 | 73 | variable "common_labels" { 74 | description = "Additional labels to add to server instances" 75 | default = {} 76 | } -------------------------------------------------------------------------------- /control_plane.tf: -------------------------------------------------------------------------------- 1 | resource "hcloud_server" "control_plane" { 2 | for_each = { for i in range(1, var.control_plane_server_count) : "#${i}" => i } 3 | name = "${var.name}-control-plane-${each.value}" 4 | 5 | image = data.hcloud_image.ubuntu.name 6 | server_type = var.control_plane_server_type 7 | location = element(var.server_locations, each.value) 8 | 9 | ssh_keys = [hcloud_ssh_key.provision_public.id] 10 | labels = merge({ 11 | node_type = "control-plane" 12 | }, local.common_labels) 13 | 14 | # Join cluster as server after first boot 15 | user_data = format("%s\n%s", "#cloud-config", yamlencode( 16 | { 17 | runcmd = [ 18 | "curl -sfL https://get.k3s.io | K3S_TOKEN='${random_password.k3s_cluster_secret.result}' INSTALL_K3S_VERSION='${var.k3s_version}' ${local.k3s_server_join_cmd}" 19 | ] 20 | packages = concat(local.server_base_packages, var.server_additional_packages) 21 | } 22 | )) 23 | 24 | network { 25 | network_id = local.network_id 26 | ip = cidrhost(hcloud_network_subnet.k3s_nodes.ip_range, each.value + 1) 27 | } 28 | 29 | provisioner "remote-exec" { 30 | inline = [ 31 | "until systemctl is-active --quiet k3s.service; do sleep 1; done", 32 | "until kubectl get node ${self.name}; do sleep 1; done", 33 | # Disable workloads on master node 34 | "kubectl taint node ${self.name} node-role.kubernetes.io/master=true:NoSchedule", 35 | "kubectl taint node ${self.name} CriticalAddonsOnly=true:NoExecute", 36 | ] 37 | 38 | connection { 39 | host = self.ipv4_address 40 | type = "ssh" 41 | user = "root" 42 | private_key = local.ssh_private_key 43 | } 44 | } 45 | 46 | // Otherwise we would be in a case where this would always be recreated because we switch the primary control plane IP 47 | lifecycle { 48 | ignore_changes = [user_data] 49 | } 50 | 51 | depends_on = [ 52 | hcloud_server.first_control_plane 53 | ] 54 | } 55 | 56 | resource "hcloud_server_network" "control_plane" { 57 | for_each = { for i in range(1, var.control_plane_server_count) : "#${i}" => i } // starts at 1 because master was 0 58 | subnet_id = hcloud_network_subnet.k3s_nodes.id 59 | server_id = hcloud_server.control_plane[each.key].id 60 | ip = 
cidrhost(hcloud_network_subnet.k3s_nodes.ip_range, each.value + 1) 61 | } 62 | -------------------------------------------------------------------------------- /modules/agent_group/agent.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | node_taint_args_raw = join(" ", [for taint in var.taints : "--node-taint ${taint}"]) 3 | node_taint_args = length(var.taints) == 0 ? "" : "${local.node_taint_args_raw} " // has to end with space to not conflict with next arg 4 | } 5 | 6 | resource "hcloud_server" "agent" { 7 | for_each = { for i in range(0, var.server_count) : "#${i}" => i } 8 | name = "${var.cluster_name}-${var.group_name}-${each.value}-${local.agent_pet_names[each.value]}" 9 | 10 | image = data.hcloud_image.ubuntu.name 11 | server_type = var.server_type 12 | location = element(var.server_locations, each.value) 13 | 14 | ssh_keys = [var.provisioning_ssh_key_id] 15 | labels = merge({ 16 | node_type = "worker" 17 | cluster = var.cluster_name 18 | }, var.common_labels) 19 | 20 | # Join cluster as agent after first boot 21 | # Adding the random pet name as comment is a trick to recreate the server on pet-name change 22 | user_data = format("%s\n#%s\n%s", "#cloud-config", local.agent_pet_names[each.value], yamlencode( 23 | { 24 | runcmd = [ 25 | "curl -sfL https://get.k3s.io | K3S_URL='https://${var.control_plane_ip}:6443' INSTALL_K3S_VERSION='${var.k3s_version}' K3S_TOKEN='${var.k3s_cluster_secret}' sh -s - agent --node-ip='${cidrhost(var.subnet_ip_range, var.ip_offset + each.value)}' ${local.node_taint_args}--kubelet-arg='cloud-provider=external' --kubelet-arg='node-labels=agent-group=${var.group_name},agent-index=${each.value}'" 26 | ] 27 | packages = var.additional_packages 28 | } 29 | )) 30 | 31 | network { 32 | network_id = var.network_id 33 | ip = cidrhost(var.subnet_ip_range, var.ip_offset + each.value) 34 | } 35 | 36 | provisioner "remote-exec" { 37 | inline = [ 38 | "until systemctl is-active --quiet k3s-agent.service; do sleep 1; done" 39 | ] 40 | 41 | connection { 42 | host = self.ipv4_address 43 | type = "ssh" 44 | user = "root" 45 | private_key = var.ssh_private_key 46 | } 47 | } 48 | } 49 | 50 | resource "hcloud_server_network" "agent" { 51 | for_each = { for i in range(0, var.server_count) : "#${i}" => i } 52 | subnet_id = var.subnet_id 53 | server_id = hcloud_server.agent[each.key].id 54 | ip = cidrhost(var.subnet_ip_range, var.ip_offset + each.value) // start at x.y.z.OFFSET 55 | } 56 | -------------------------------------------------------------------------------- /control_plane_primary.tf: -------------------------------------------------------------------------------- 1 | resource "hcloud_server" "first_control_plane" { 2 | name = "${var.name}-control-plane-0" 3 | 4 | image = data.hcloud_image.ubuntu.name 5 | server_type = var.control_plane_server_type 6 | location = var.server_locations[0] 7 | 8 | ssh_keys = [hcloud_ssh_key.provision_public.id] 9 | labels = merge({ 10 | node_type = "control-plane" 11 | }, local.common_labels) 12 | 13 | user_data = format("%s\n%s", "#cloud-config", yamlencode( 14 | { 15 | runcmd = [ 16 | "curl -sfL https://get.k3s.io | K3S_TOKEN='${random_password.k3s_cluster_secret.result}' INSTALL_K3S_VERSION='${var.k3s_version}' ${var.control_plane_already_initialized ? 
local.k3s_server_join_cmd : local.k3s_server_init_cmd}" 17 | ] 18 | packages = concat(local.server_base_packages, var.server_additional_packages) 19 | } 20 | )) 21 | 22 | provisioner "remote-exec" { 23 | inline = [ 24 | "until systemctl is-active --quiet k3s.service; do sleep 1; done", 25 | "until kubectl get node ${self.name}; do sleep 1; done", 26 | # Disable workloads on master node 27 | "kubectl taint node ${self.name} node-role.kubernetes.io/master=true:NoSchedule", 28 | "kubectl taint node ${self.name} CriticalAddonsOnly=true:NoExecute", 29 | # Install hetzner CCM 30 | "kubectl -n kube-system create secret generic hcloud --from-literal=token=${var.hcloud_token} --from-literal=network=${local.network_name}", 31 | "kubectl apply -f -< 2 | 3 | 5 | 6 | 34 | 36 | 38 | 40 | 42 | 45 | 76 | 79 | 82 | 86 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # k-andy 2 | 3 | 4 | 5 | ### Zero friction Kubernetes stack on Hetzner Cloud 6 | 7 | This [terraform](https://www.terraform.io/) module will install a High Availability [K3s](https://k3s.io/) Cluster with Embedded DB in a private network on [Hetzner Cloud](https://www.hetzner.com/de/cloud). The following resources are provisionised by default (**20€/mo**): 8 | 9 | - 3x Control-plane: _CX11_, 2GB RAM, 1VCPU, 20GB NVMe, 20TB Traffic. 10 | - 2x Worker: _CX21_, 4GB RAM, 2VCPU, 40GB NVMe, 20TB Traffic. 11 | - Network: Private network with one subnet. 12 | - Server and agent nodes are distributed across 3 Datacenters (nbg1, fsn1, hel1) for high availability. 13 | 14 |
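A minimal sketch of how the module can be consumed (the module label `k3s_cluster` and the `var.hcloud_token` variable are illustrative; every input shown is documented in the Inputs table below, and a complete walk-through lives in the [example folder](./example)):

```hcl
module "k3s_cluster" {
  # The source can also point to a local checkout of this repository
  source = "git::https://github.com/StarpTech/k-andy.git?ref=main"

  hcloud_token = var.hcloud_token # Hetzner Cloud API token
  name         = "demo"           # prefix used for all created resources

  # One worker group named "default": two cx21 servers, no taints
  agent_groups = {
    "default" = {
      count     = 2
      type      = "cx21"
      ip_offset = 33
      taints    = []
    }
  }
}
```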
15 |
16 | 17 | --- 18 | 19 | > **Note**: Are you looking for the next generation API Developer Platform? 🔎 Have a look at: [WunderGraph](https://github.com/wundergraph/wundergraph) 20 | Turn your services, databases and 3rd party APIs into a secure unified API in just a few minutes. 🪄 21 | 22 | --- 23 | 24 | **What is K3s?** 25 | 26 | K3s is a lightweight certified kubernetes distribution. It's packaged as single binary and comes with solid defaults for storage and networking but we replaced [local-path-provisioner](https://github.com/rancher/local-path-provisioner) with hetzner [CSI-driver](https://github.com/hetznercloud/csi-driver) and [klipper load-balancer](https://github.com/k3s-io/klipper-lb) with hetzner [Cloud Controller Manager](https://github.com/hetznercloud/hcloud-cloud-controller-manager). The default ingress controller (traefik) has been disabled. 27 | 28 | **Hetzner Cloud integration**: 29 | 30 | - Preinstalled [CSI-driver](https://github.com/hetznercloud/csi-driver) for volume support. 31 | - Preinstalled [Cloud Controller Manager for Hetzner Cloud](https://github.com/hetznercloud/hcloud-cloud-controller-manager) for Load Balancer support. 32 | 33 | **Auto-K3s-Upgrades** 34 | 35 | Enable the upgrade-controller (`enable_upgrade_controller = true`) and specify your target k3s version (`upgrade_k3s_target_version`). See [here](https://github.com/k3s-io/k3s/releases) for possible versions. 36 | 37 | Label the nodes you want to upgrade, e.g. `kubectl label nodes core-control-plane-1 k3s-upgrade=true`. The concurrency 38 | of the upgrade plan is set to 1, so you can also label them all at once. Agent nodes will be drained one by one during 39 | the upgrade. 40 | 41 | You can label all control-plane nodes by using `kubectl label nodes -l node-role.kubernetes.io/control-plane=true k3s-upgrade=true`. 42 | All agent nodes can be labelled using `kubectl label nodes -l !node-role.kubernetes.io/control-plane k3s-upgrade=true`. 43 | 44 | To remove the label from all nodes you can run `kubectl label nodes --all k3s-upgrade-`. 45 | 46 | After a successful update you can also remove the upgrade controller and the plans again, setting `enable_upgrade_controller` to `false`. 47 | 48 | ## Usage 49 | 50 | See a more detailed example with walk-through in the [example folder](./example). 51 | 52 | 53 | ### Inputs 54 | 55 | | Name | Description | Type | Default | Required | 56 | |------|-------------|------|---------|:--------:| 57 | | [agent\_groups](#input\_agent\_groups) | Configuration of agent groups |
<pre>map(object({<br>    type      = string<br>    count     = number<br>    ip_offset = number<br>    taints    = list(string)<br>  }))</pre> | <pre>{<br>  "default": {<br>    "count": 2,<br>    "ip_offset": 33,<br>    "taints": [],<br>    "type": "cx21"<br>  }<br>}</pre>
| no | 58 | | [cluster\_cidr](#input\_cluster\_cidr) | Network CIDR to use for pod IPs | `string` | `"10.42.0.0/16"` | no | 59 | | [control\_plane\_already\_initialized](#input\_control\_plane\_already\_initialized) | Use this if you have to replace the first control plane and want the primary to join other already existing ones and not do an init anymore. You have to update `control_plane_primary_index` to something else too. | `bool` | `false` | no | 60 | | [control\_plane\_primary\_index](#input\_control\_plane\_primary\_index) | Which of the servers should be the primary to connect to? If you change it from 1, also set `control_plane_already_initialized` to true. (1-indexed!) | `number` | `1` | no | 61 | | [control\_plane\_server\_count](#input\_control\_plane\_server\_count) | Number of control plane nodes | `number` | `3` | no | 62 | | [control\_plane\_server\_type](#input\_control\_plane\_server\_type) | Server type of control plane servers | `string` | `"cx11"` | no | 63 | | [create\_kubeconfig](#input\_create\_kubeconfig) | Create a local kubeconfig file to connect to the cluster | `bool` | `true` | no | 64 | | [enable\_upgrade\_controller](#input\_enable\_upgrade\_controller) | Install the rancher system-upgrade-controller | `bool` | `false` | no | 65 | | [hcloud\_csi\_driver\_version](#input\_hcloud\_csi\_driver\_version) | n/a | `string` | `"v1.6.0"` | no | 66 | | [hcloud\_token](#input\_hcloud\_token) | Token to authenticate against Hetzner Cloud | `any` | n/a | yes | 67 | | [k3s\_version](#input\_k3s\_version) | K3s version | `string` | `"v1.21.3+k3s1"` | no | 68 | | [kubeconfig\_filename](#input\_kubeconfig\_filename) | Specify the filename of the created kubeconfig file (defaults to kubeconfig-${var.name}.yaml | `any` | `null` | no | 69 | | [name](#input\_name) | Cluster name (used in various places, don't use special chars) | `any` | n/a | yes | 70 | | [network\_cidr](#input\_network\_cidr) | Network in which the cluster will be placed. Ignored if network\_id is defined | `string` | `"10.0.0.0/16"` | no | 71 | | [network\_id](#input\_network\_id) | If specified, no new network will be created. Make sure cluster\_cidr and service\_cidr don't collide with anything in the existing network. | `any` | `null` | no | 72 | | [server\_additional\_packages](#input\_server\_additional\_packages) | Additional packages which will be installed on node creation | `list(string)` | `[]` | no | 73 | | [server\_locations](#input\_server\_locations) | Server locations in which servers will be distributed | `list(string)` |
<pre>[<br>  "nbg1",<br>  "fsn1",<br>  "hel1"<br>]</pre>
| no | 74 | | [service\_cidr](#input\_service\_cidr) | Network CIDR to use for services IPs | `string` | `"10.43.0.0/16"` | no | 75 | | [ssh\_private\_key\_location](#input\_ssh\_private\_key\_location) | Use this private SSH key instead of generating a new one (Attention: Encrypted keys are not supported) | `string` | `null` | no | 76 | | [subnet\_cidr](#input\_subnet\_cidr) | Subnet in which all nodes are placed | `string` | `"10.0.1.0/24"` | no | 77 | | [upgrade\_controller\_image\_tag](#input\_upgrade\_controller\_image\_tag) | The image tag of the upgrade controller (See https://github.com/rancher/system-upgrade-controller/releases) | `string` | `"v0.8.0"` | no | 78 | | [upgrade\_controller\_kubectl\_image\_tag](#input\_upgrade\_controller\_kubectl\_image\_tag) | rancher/kubectl image tag | `string` | `"v1.21.5"` | no | 79 | | [upgrade\_k3s\_target\_version](#input\_upgrade\_k3s\_target\_version) | Target version of k3s (See https://github.com/k3s-io/k3s/releases) | `string` | `null` | no | 80 | | [upgrade\_node\_additional\_tolerations](#input\_upgrade\_node\_additional\_tolerations) | List of tolerations which upgrade jobs must have to run on every node (for control-plane and agents) | `list(map(any))` | `[]` | no | 81 | 82 | ### Outputs 83 | 84 | | Name | Description | 85 | |------|-------------| 86 | | [agents\_public\_ips](#output\_agents\_public\_ips) | The public IP addresses of the agent servers | 87 | | [cidr\_block](#output\_cidr\_block) | n/a | 88 | | [control\_planes\_public\_ips](#output\_control\_planes\_public\_ips) | The public IP addresses of the control plane servers | 89 | | [k3s\_token](#output\_k3s\_token) | Secret k3s authentication token | 90 | | [kubeconfig](#output\_kubeconfig) | Structured kubeconfig data to supply to other providers | 91 | | [kubeconfig\_file](#output\_kubeconfig\_file) | Kubeconfig file content with external IP address | 92 | | [network\_id](#output\_network\_id) | n/a | 93 | | [server\_locations](#output\_server\_locations) | Array of hetzner server locations we deploy to | 94 | | [ssh\_private\_key](#output\_ssh\_private\_key) | Key to SSH into nodes | 95 | | [subnet\_id](#output\_subnet\_id) | n/a | 96 | 97 | 98 | ## Common Operations 99 | 100 | ### Agent server replacement (common case) 101 | 102 | If you need to cycle an agent, you can do that with a single node following this procedure. 103 | Replace the group name and number with the server you want to recreate! 104 | 105 | Make sure you drain the nodes first. 106 | 107 | ```shell 108 | kubectl drain that-agent 109 | terraform taint 'module.my_cluster.module.agent_group["GROUP_NAME"].random_pet.agent_suffix[1]' 110 | terraform apply 111 | ``` 112 | 113 | This will recreate the agent in that group on next apply. 114 | 115 | ### Sophisticated agent server replacement 116 | 117 | If you did some weird config change or recreate them by changing the base k3s version in the terraform configuration and 118 | terraform wants to replace all your agents at once you can do this. Replacing all by one is probably not a good idea. 
119 | 120 | Example for replacement of one agent (the first one of that group): 121 | 122 | ```shell 123 | kubectl drain that-agent 124 | terragrunt taint 'module.agent_group["GROUP_NAME"].random_pet.agent_suffix[0]' 125 | terraform apply --target='module.agent_group["GROUP_NAME"].hcloud_server.agent["#0"]' --target='module.agent_group["GROUP_NAME"].hcloud_server_network.agent["#0"]' --target='module.agent_group["GROUP_NAME"].random_pet.agent_suffix[0]' 126 | ``` 127 | 128 | ### Control Plane server replacement 129 | 130 | Control plane servers do not get recreated when the user-data for cloud-init changes. If you want to recreate one after 131 | you changed something which would change the cloud-init, you need to taint it. 132 | 133 | #### Primary server 134 | 135 | If you for some reason need to replace the primary control plane, you'll need to tell it to join the others. 136 | 137 | Set the variable `control_plane_primary_index` to one of the other control plane nodes (e.g. 2 or 3). 138 | Also set `control_plane_already_initialized` to `true` so it won't run a `cluster-init` again. This will make the primary 139 | connect to control-plane 2 or 3 after recreation. 140 | 141 | #### Secondary servers 142 | 143 | This is how you can replace the servers which didn't initialize the cluster. 144 | 145 | ```shell 146 | terraform taint 'module.my_cluster.hcloud_server.control_plane["#1"]' 147 | terraform apply 148 | ``` 149 | 150 | ## Auto-Upgrade 151 | 152 | ### Prerequisite 153 | 154 | Install the system-upgrade-controller in your cluster. 155 | 156 | ``` 157 | KUBECONFIG=kubeconfig.yaml kubectl apply -f ./upgrade/controller.yaml 158 | ``` 159 | 160 | ## Upgrade procedure 161 | 162 | 1. Mark the nodes you want to upgrade (the command below marks all nodes). 163 | 164 | ``` 165 | KUBECONFIG=kubeconfig.yaml kubectl label --all node k3s-upgrade=true 166 | ``` 167 | 168 | 2. Run the plan for the **servers**. 169 | 170 | ``` 171 | KUBECONFIG=kubeconfig.yaml kubectl apply -f ./upgrade/server-plan.yaml 172 | ``` 173 | 174 | > **Warning:** Wait for completion [before you start upgrading your agents](https://github.com/k3s-io/k3s/issues/2996#issuecomment-788352375). 175 | 176 | 3. Run the plan for the **agents**. 177 | 178 | ``` 179 | KUBECONFIG=kubeconfig.yaml kubectl apply -f ./upgrade/agent-plan.yaml 180 | ``` 181 | 182 | ## Backups 183 | 184 | K3s will automatically back up your embedded etcd datastore every 12 hours to `/var/lib/rancher/k3s/server/db/snapshots/`. 185 | You can reset the cluster by pointing to a specific snapshot. 186 | 187 | 1. Stop the master server. 188 | 189 | ```sh 190 | sudo systemctl stop k3s 191 | ``` 192 | 193 | 2. Restore the master server from a snapshot. 194 | 195 | ```sh 196 | ./k3s server \ 197 | --cluster-reset \ 198 | --cluster-reset-restore-path= 199 | ``` 200 | 201 | > **Warning:** This forgets all peers and the server becomes the sole member of a new cluster. You have to manually rejoin all other servers. 202 | 203 | 3. Connect to each of the other servers. Back up and delete `/var/lib/rancher/k3s/server/db` on each of them. 204 | 205 | ```sh 206 | sudo systemctl stop k3s 207 | rm -rf /var/lib/rancher/k3s/server/db 208 | sudo systemctl start k3s 209 | ``` 210 | 211 | This will rejoin the servers one after another. After some time, all servers should be in sync again. Run `kubectl get node` to verify it. 212 | 213 | > **Info:** No official tool exists to automate this procedure. 
In the future, Rancher might provide an operator to handle this ([issue](https://github.com/k3s-io/k3s/issues/3174)). 214 | 215 | ## Debugging 216 | 217 | Cloud-init and k3s logs can be found on the remote machines via: 218 | 219 | - `/var/log/cloud-init-output.log` 220 | - `/var/log/cloud-init.log` 221 | - `journalctl -u k3s.service -e` for the last logs of the server 222 | - `journalctl -u k3s-agent.service -e` for the last logs of the agent 223 | 224 | ## Credits 225 | 226 | - [terraform-hcloud-k3s](https://github.com/cicdteam/terraform-hcloud-k3s) Terraform module which creates a single node cluster. 227 | - [terraform-module-k3](https://github.com/xunleii/terraform-module-k3s) Terraform module which creates a k3s cluster, with multi-server and management features. 228 | - Icon created by [Freepik](https://www.freepik.com) from [www.flaticon.com](https://www.flaticon.com/de/) 229 | --------------------------------------------------------------------------------
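As a closing note on the `kubeconfig` output listed in the Outputs table above, here is a hedged sketch of feeding it into the `kubernetes` provider; the module label `my_cluster` is illustrative, while the attribute names match the structured `kubeconfig` output defined in `kubeconfig.tf`:

```hcl
# Reuse the cluster credentials exported by this module in other providers.
provider "kubernetes" {
  host                   = module.my_cluster.kubeconfig.host
  client_certificate     = module.my_cluster.kubeconfig.client_certificate
  client_key             = module.my_cluster.kubeconfig.client_key
  cluster_ca_certificate = module.my_cluster.kubeconfig.cluster_ca_certificate
}
```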