├── LICENSE
├── iam.tf
├── main.tf
├── auto-scaling.tf
├── files
    └── userdata.template
├── security-groups.tf
└── README.md


/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 Tom Hill
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/iam.tf:
--------------------------------------------------------------------------------
 1 | # Autoscaling lifecycle hook role
 2 | # Allows lifecycle hooks to add messages to the SQS queue
 3 | resource "aws_iam_role" "lifecycle_role" {
 4 | 
 5 |     name = "${var.cluster_name}-lifecycle-hooks"
 6 |     assume_role_policy = <<EOF
 7 | {
 8 |   "Version": "2012-10-17",
 9 |   "Statement": [
10 |     {
11 |       "Sid": "",
12 |       "Effect": "Allow",
13 |       "Principal": {
14 |         "Service": "autoscaling.amazonaws.com"
15 |       },
16 |       "Action": "sts:AssumeRole"
17 |     }
18 |   ]
19 | }
20 | EOF
21 | 
22 |     lifecycle {
23 |         create_before_destroy = true
24 |     }
25 | 
26 | }
27 | 
28 | # Attach policy document for access to the sqs queue
29 | resource "aws_iam_role_policy" "lifecycle_role_policy" {
30 |     name = "${var.cluster_name}-lifecycle-hooks-policy"
31 |     role = "${aws_iam_role.lifecycle_role.id}"
32 |     policy = <<EOF
33 | {
34 |   "Version": "2012-10-17",
35 |   "Statement": [{
36 |     "Effect": "Allow",
37 |     "Resource": "${var.lifecycle_hooks_sqs_queue_arn}",
38 |     "Action": [
39 |       "sqs:SendMessage",
40 |       "sqs:GetQueueUrl",
41 |       "sns:Publish"
42 |     ]
43 |   }]
44 | }
45 | EOF
46 | 
47 |     lifecycle {
48 |         create_before_destroy = true
49 |     }
50 |     
51 | }
52 | 


--------------------------------------------------------------------------------
/main.tf:
--------------------------------------------------------------------------------
 1 | # Rancher server details
 2 | variable "server_security_group_id" {
 3 |     description = "Security group id of the Rancher server so we can restrict incoming traffic."
 4 | }
 5 | variable "server_hostname" {
 6 |     description = "Hostname of the Rancher server."
 7 | }
 8 | 
 9 | # Target server environment
10 | variable "environment_id" {
11 |     description = "Target environment id for host registration."
12 | }
13 | variable "environment_access_key" {
14 |     description = "API access key for target environment"
15 | }
16 | variable "environment_secret_key" {
17 |     description = "API secret key for target environment"
18 | }
19 | 
20 | # Cluster setup
21 | variable "cluster_name" {
22 |     description = "The name of the cluster. Best not to include non-alphanumeric characters. Will be used to name resources and tag instances."
23 | }
24 | variable "cluster_autoscaling_group_name" {
25 |     description = "Name of the target autoscaling group."
26 | }
27 | variable "cluster_instance_security_group_id" {
28 |     description = "ID of the security group used for host instances. Will be modified to include rancher specific rules."
29 | }
30 | variable "cluster_instance_labels" {
31 |     description = "Additional labels to attach to host instances. Should be in the format: key=value&key2=value2"
32 |     default = ""
33 | }
34 | 
35 | # Docker options
36 | variable "docker_daemon_options" {
37 |     description = "Docker daemon options to write to the docker config file before startup."
38 |     default = ""
39 | }
40 | 
41 | # Lifecycle hooks queue arn
42 | variable "lifecycle_hooks_sqs_queue_arn" {
43 |     description = "ARN of the SQS queue used to receive autoscaling lifecycle hooks."
44 | }
45 | 


--------------------------------------------------------------------------------
/auto-scaling.tf:
--------------------------------------------------------------------------------
 1 | # User-data template
 2 | # Registers the instance with the rancher server environment
 3 | resource "template_file" "user_data" {
 4 | 
 5 |     template = "${file("${path.module}/files/userdata.template")}"
 6 |     vars {
 7 |         cluster_name            = "${var.cluster_name}"
 8 |         cluster_instance_labels = "${var.cluster_instance_labels}"
 9 |         environment_id          = "${var.environment_id}"
10 |         environment_access_key  = "${var.environment_access_key}"
11 |         environment_secret_key  = "${var.environment_secret_key}"
12 |         server_hostname         = "${var.server_hostname}"
13 |         docker_daemon_options   = "${var.docker_daemon_options}"
14 |     }
15 | 
16 |     lifecycle {
17 |         create_before_destroy = true
18 |     }
19 | 
20 | }
21 | 
22 | # Lifecycle hook
23 | # Triggered when an instance should be removed from the autoscaling
24 | # group. Publishes a message to the supplied SQS queue so that the host
25 | # can be removed from the Rancher server before shutting down.
26 | resource "aws_autoscaling_lifecycle_hook" "cluster_instance_terminating_hook" {
27 | 
28 |     name = "cluster_instance_terminating_hook"
29 |     autoscaling_group_name = "${var.cluster_autoscaling_group_name}"
30 |     lifecycle_transition = "autoscaling:EC2_INSTANCE_TERMINATING"
31 |     default_result = "CONTINUE"
32 | 
33 |     # 10 mins for rancher server to remove instance
34 |     heartbeat_timeout = 600
35 | 
36 |     # Notification SQS queue
37 |     notification_target_arn = "${var.lifecycle_hooks_sqs_queue_arn}"
38 | 
39 |     role_arn = "${aws_iam_role.lifecycle_role.arn}"
40 | 
41 |     lifecycle {
42 |         create_before_destroy = true
43 |     }
44 | 
45 | }
46 | 
47 | output "host_user_data" {
48 |     value = "${template_file.user_data.rendered}"
49 | }
50 | 


--------------------------------------------------------------------------------
/files/userdata.template:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Install jq
 4 | yum install -y jq docker-17.06.2ce-1.94.amzn1
 5 | 
 6 | # Add any docker daemon options
 7 | if [ -n "${docker_daemon_options}" ]; then
 8 | 	echo 'OPTIONS="${docker_daemon_options}"' >> /etc/sysconfig/docker
 9 | fi
10 | 
11 | # Start docker
12 | service docker start
13 | 
14 | # Setup initial vars
15 | serverUrl=https://${environment_access_key}:${environment_secret_key}@${server_hostname}
16 | projectId=${environment_id}
17 | 
18 | # Make initial POST request for a registration token and record the id
19 | response=$(curl -s -X POST $serverUrl/v1/registrationtokens?projectId=$projectId)
20 | requestId=$(echo $response | jq -r '.id')
21 | requestState=$(echo $response | jq -r '.state')
22 | 
23 | # The registration token request is async so keep checking until it's complete
24 | while [[ "$requestState" != "active" ]]; do
25 | 	sleep 2
26 | 	response=$(curl -s $serverUrl/v1/registrationtokens/$requestId)
27 | 	requestState=$(echo $response | jq -r '.state')
28 | done
29 | 
30 | # Get the instance id from metadata
31 | instanceId=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
32 | instancePrivateIp=$(curl -s http://169.254.169.254/latest/meta-data/local-ipv4)
33 | 
34 | # Labels
35 | instanceLabels="HOSTID=$instanceId&CLOUD=aws&CLUSTER=${cluster_name}"
36 | customLabels="${cluster_instance_labels}"
37 | if [ -n "$customLabels" ]; then
38 | 	instanceLabels="$instanceLabels&$customLabels"
39 | fi
40 | 
41 | # Add external DNS label if there's a public IP address
42 | instancePublicIp=$(curl -f -s http://169.254.169.254/latest/meta-data/public-ipv4)
43 | if [ -n "$instancePublicIp" ]; then
44 | 	instanceLabels="$instanceLabels&io.rancher.host.external_dns_ip=$instancePublicIp"
45 | fi
46 | 
47 | # Use the command in the response to start the rancher agent
48 | cmd=$(echo $response | jq -r '.command')
49 | eval $${cmd/sudo docker run /docker run -e CATTLE_AGENT_IP=$instancePrivateIp -e CATTLE_HOST_LABELS=\"$instanceLabels\" }
50 | 
51 | # Fix to allow rancher-nfs
52 | ln -s /var/run/rancher/storage/rancher-nfs.sock /run/docker/plugins/rancher-nfs.sock
53 | 


--------------------------------------------------------------------------------
/security-groups.tf:
--------------------------------------------------------------------------------
  1 | # Attach IPSEC rules to host instance security group.
  2 | # Enables the rancher overlay network for connected hosts.
  3 | # Traffic only allowed to and from other machines with this security group.
  4 | resource "aws_security_group_rule" "ipsec_ingress_1" {
  5 | 
  6 |     security_group_id = "${var.cluster_instance_security_group_id}"
  7 |     type = "ingress"
  8 |     from_port = 4500
  9 |     to_port = 4500
 10 |     protocol = "udp"
 11 |     source_security_group_id = "${var.cluster_instance_security_group_id}"
 12 | 
 13 |     lifecycle {
 14 |         create_before_destroy = true
 15 |     }
 16 | 
 17 | }
 18 | 
 19 | resource "aws_security_group_rule" "ipsec_egress_1" {
 20 | 
 21 |     security_group_id = "${var.cluster_instance_security_group_id}"
 22 |     type = "egress"
 23 |     from_port = 4500
 24 |     to_port = 4500
 25 |     protocol = "udp"
 26 |     source_security_group_id = "${var.cluster_instance_security_group_id}"
 27 | 
 28 |     lifecycle {
 29 |         create_before_destroy = true
 30 |     }
 31 | 
 32 | }
 33 | 
 34 | resource "aws_security_group_rule" "ipsec_ingress_2" {
 35 | 
 36 |     security_group_id = "${var.cluster_instance_security_group_id}"
 37 |     type = "ingress"
 38 |     from_port = 500
 39 |     to_port = 500
 40 |     protocol = "udp"
 41 |     source_security_group_id = "${var.cluster_instance_security_group_id}"
 42 | 
 43 |     lifecycle {
 44 |         create_before_destroy = true
 45 |     }
 46 | 
 47 | }
 48 | 
 49 | resource "aws_security_group_rule" "ipsec_egress_2" {
 50 | 
 51 |     security_group_id = "${var.cluster_instance_security_group_id}"
 52 |     type = "egress"
 53 |     from_port = 500
 54 |     to_port = 500
 55 |     protocol = "udp"
 56 |     source_security_group_id = "${var.cluster_instance_security_group_id}"
 57 | 
 58 |     lifecycle {
 59 |         create_before_destroy = true
 60 |     }
 61 | 
 62 | }
 63 | 
 64 | # SSH ingress
 65 | # Required for the server to connect & configure the host.
 66 | resource "aws_security_group_rule" "ssh_ingress" {
 67 | 
 68 |     security_group_id = "${var.cluster_instance_security_group_id}"
 69 |     type = "ingress"
 70 |     from_port = 22
 71 |     to_port = 22
 72 |     protocol = "tcp"
 73 |     source_security_group_id = "${var.server_security_group_id}"
 74 | 
 75 |     lifecycle {
 76 |         create_before_destroy = true
 77 |     }
 78 | 
 79 | }
 80 | 
 81 | # Outgoing HTTP
 82 | # Allows pulling of remote docker images, installing packages, etc.
 83 | resource "aws_security_group_rule" "http_egress" {
 84 | 
 85 |     security_group_id = "${var.cluster_instance_security_group_id}"
 86 |     type = "egress"
 87 |     from_port = 80
 88 |     to_port = 80
 89 |     protocol = "tcp"
 90 |     cidr_blocks = ["0.0.0.0/0"]
 91 | 
 92 |     lifecycle {
 93 |         create_before_destroy = true
 94 |     }
 95 | 
 96 | }
 97 | 
 98 | # Outgoing HTTPS
 99 | # Allows pulling of remote docker images, installing packages, etc.
100 | resource "aws_security_group_rule" "https_egress" {
101 | 
102 |     security_group_id = "${var.cluster_instance_security_group_id}"
103 |     type = "egress"
104 |     from_port = 443
105 |     to_port = 443
106 |     protocol = "tcp"
107 |     cidr_blocks = ["0.0.0.0/0"]
108 | 
109 |     lifecycle {
110 |         create_before_destroy = true
111 |     }
112 | 
113 | }
114 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Rancher host cluster Terraform module
  2 | 
  3 | This is a terraform module to help with creating a rancher host cluster. It is intended for use in combination with [my Rancher server module](https://github.com/greensheep/terraform-aws-rancher-server).
  4 | 
  5 | ### Features
  6 | 
  7 | - Flexible for use with different deployment scenarios.
  8 | - Automatically adds hosts launched by autoscaling to the Rancher server.
  9 | - Registers autoscaling lifecycle hook used to automatically remove instances from the Rancher server on scale down (see [my Rancher server module](https://github.com/greensheep/terraform-aws-rancher-server)).
 10 | - Designed for use in VPC private subnets so can be used for private, backend services or proxy traffic from an ELB for public services.
 11 | - Can be used unlimited times in a terraform config. Allows creation of separate clusters for dev, staging, production, etc.
 12 | 
 13 | ### Requirements
 14 | 
 15 | Terraform 0.6.6 is required.
 16 | 
 17 | On it's own this doesn't do very much. It needs to be included in a Terraform config that creates the following resources:
 18 | 
 19 | - Security group
 20 | - Autoscaling launch configuration
 21 | - Autoscaling group
 22 | 
 23 | Because these resources may vary significantly for your deployment (eg, the type of app you're deploying, expected workload, etc), you need to create these yourself and pass in the necessary variables.
 24 | 
 25 | You'll also need to have your Rancher server setup & configured (did I mention [my Rancher server module](https://github.com/greensheep/terraform-aws-rancher-server)!). Don't be tempted to use this as part of some mega-config that also creates the server.. you need to specify an environment id and API access keys for it to work!
 26 | 
 27 | ### Usage
 28 | 
 29 | Include the following in your existing terraform config:
 30 | 
 31 |     module "staging_cluster" {
 32 | 
 33 |         # Import the module from Github
 34 |         # It's probably better to fork or clone this repo if you intend to use in production
 35 |         # so any future changes dont mess up your existing infrastructure.
 36 |         source = "github.com/greensheep/terraform-aws-rancher-hosts"
 37 | 
 38 |         # Add Rancher server details
 39 |         server_security_group_id = "sg-XXXXXXXX"
 40 |         server_hostname          = "rancher-server.yourdomain.tld"
 41 | 
 42 |         # Rancher environment
 43 |         # In your Rancher server, create an environment and an API keypair. You can have
 44 |         # multiple host clusters per environment if necessary. Instances will be labelled
 45 |         # with the cluster name so you can differentiate between multiple clusters.
 46 |         environment_id         = "1a7"
 47 |         environment_access_key = "ACCESS-KEY"
 48 |         environment_secret_key = "SECRET-KET"
 49 | 
 50 |         # Name your cluster and provide the autoscaling group name and security group id.
 51 |         # See examples below.
 52 |         cluster_name                       = "${var.cluster_name}"
 53 |         cluster_autoscaling_group_name     = "${aws_autoscaling_group.cluster_autoscale_group.id}"
 54 |         cluster_instance_security_group_id = "${aws_security_group.rancher_host_sg.id}"
 55 | 
 56 |         # Lifecycle hooks queue ARN
 57 |         # This is specific to my Rancher server module which creates the SQS queue used to
 58 |         # received autoscaling lifecycle hooks. This module creates a lifecycle hook for the
 59 |         # provided autoscaling group so that instances can be removed from the Rancher
 60 |         # server before they are terminated.
 61 |         lifecycle_hooks_sqs_queue_arn = "${var.lifecycle_hooks_sqs_queue_arn}"
 62 | 
 63 |     }
 64 | 
 65 | ### Examples of required resources
 66 | 
 67 | ##### Security group
 68 | 
 69 |     # Cluster instance security group
 70 |     resource "aws_security_group" "cluster_instance_sg" {
 71 | 
 72 |         name = "Cluster-Instances"
 73 |         description = "Rules for connected Rancher host machines. These are the hosts that run containers placed on the cluster."
 74 |         vpc_id = "${TARGET-VPC-ID}"
 75 | 
 76 |         # NOTE: To allow ELB proxied traffic to private VPC
 77 |         #       hosts, open the necessary ports here..
 78 | 
 79 |         lifecycle {
 80 |             create_before_destroy = true
 81 |         }
 82 | 
 83 |     }
 84 | 
 85 | 
 86 | ##### Autoscaling
 87 | 
 88 |     # Autoscaling launch configuration
 89 |     resource "aws_launch_configuration" "cluster_launch_conf" {
 90 | 
 91 |         name = "Launch-Config"
 92 | 
 93 |         # Amazon linux, eu-west-1
 94 |         image_id = "ami-69b9941e"
 95 | 
 96 |         # No public ip when instances are placed in private subnets. See notes
 97 |         # about creating an ELB to proxy public traffic into the cluster.
 98 |         associate_public_ip_address = false
 99 | 
100 |         # Security groups
101 |         security_groups = [
102 |             "${aws_security_group.cluster_instance_sg.id}"
103 |         ]
104 | 
105 |         # Key
106 |         # NOTE: It's a good idea to use the same key as the Rancher server here.
107 |         key_name = "${UPLOADED-KEY-NAME}"
108 | 
109 |         # Add rendered userdata template
110 |         user_data = "${module.staging_cluster.host_user_data}"
111 | 
112 |         # Misc
113 |         instance_type = "t2.micro"
114 |         enable_monitoring = true
115 | 
116 |         lifecycle {
117 |             create_before_destroy = true
118 |         }
119 | 
120 |     }
121 | 
122 |     # Autoscaling group
123 |     resource "aws_autoscaling_group" "cluster_autoscale_group" {
124 | 
125 |         name = "Cluster-ASG"
126 |         launch_configuration = "${aws_launch_configuration.cluster_launch_conf.name}"
127 |         min_size = "2"
128 |         max_size = "2"
129 |         desired_capacity = "2"
130 |         health_check_grace_period = 180
131 |         health_check_type = "EC2"
132 |         force_delete = false
133 |         termination_policies = ["OldestInstance"]
134 | 
135 |         # Add ELB's here if you're proxying public traffic into the cluster
136 |         # load_balancers = ["${var.instance_cluster_load_balancers}"]
137 | 
138 |         # Target subnets
139 |         vpc_zone_identifier = ["${LIST-OF-VPC-PRIVATE-SUBNET-IDS}"]
140 | 
141 |         tag {
142 |             key = "Name"
143 |             value = "Test-Cluster-Instance"
144 |             propagate_at_launch = true
145 |         }
146 | 
147 |         lifecycle {
148 |             create_before_destroy = true
149 |         }
150 | 
151 |     }
152 | 


--------------------------------------------------------------------------------