├── .gitignore ├── README.md ├── index └── index.py ├── main.tf └── variables.tf /.gitignore: -------------------------------------------------------------------------------- 1 | files -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ECS Instance Draining on Scale In 2 | =================================== 3 | 4 | Heavily inspired by this AWS [blog post](https://aws.amazon.com/blogs/compute/how-to-automate-container-instance-draining-in-amazon-ecs/), this module deploys the resources and code needed to support ECS instance draining through an ASG lifecycle hook, ensuring that running tasks are not obliterated by ASG scale-in events. 5 | 6 | Further details about [AutoScaling Lifecycle Hooks](http://docs.aws.amazon.com/autoscaling/latest/userguide/lifecycle-hooks.html) are available. 7 | 8 | ![alt tag](https://s3.amazonaws.com/chrisb/Architecture.png) 9 | 10 | 11 | Module Input Variables 12 | ---------------------- 13 | 14 | - `region` - The AWS Region where the resources reside. 15 | - `autoscaling_group_name` - The Name of the AutoScaling Group used by the ECS Cluster. 16 | - `cluster_name` - The Name of the ECS Cluster to target. 17 | - `function_sleep_time` - Number of seconds the Lambda function should sleep before checking ECS Instance Task Count again. Defaults to 15 seconds. 18 | - `hook_heartbeat_timeout` - Amount of time, in seconds, the lifecycle hook should wait before giving up and moving on to the default result. Defaults to 900 (15 mins). 19 | - `hook_default_result` - Can be one of either ABANDON or CONTINUE. ABANDON stops any remaining actions, such as other lifecycle hooks, while CONTINUE allows any other lifecycle hooks to complete. Default is ABANDON. 20 | - `enabled` - boolean expression. If false, the Lifecycle Hook is removed from the AutoScaling Group. Defaults to `true`. 
"""Lambda entry point that drains an ECS container instance on ASG scale-in.

The function is invoked through an SNS topic that receives Auto Scaling
lifecycle notifications.  For a terminating instance it puts the matching
ECS container instance into DRAINING and, while tasks are still running,
re-publishes the same message to the topic so that it is invoked again.
Once no tasks are running it completes the lifecycle action.
"""
import json
import os
import time

import boto3

CLUSTER = os.environ['CLUSTER_NAME']
REGION = os.environ['REGION']
# Kept as a string (converted with int() where it is used) so the constant
# has the same type as before; falls back to 15 seconds when unset.
SLEEP_TIME = os.environ.get('SLEEP_TIME', '15')

# Lifecycle transition this function reacts to; everything else is ignored.
TERMINATING_TRANSITION = 'autoscaling:EC2_INSTANCE_TERMINATING'

ECS = boto3.client('ecs', region_name=REGION)
ASG = boto3.client('autoscaling', region_name=REGION)
SNS = boto3.client('sns', region_name=REGION)


def find_ecs_instance_info(instance_id):
    """Look up the ECS container instance backed by an EC2 instance.

    Pages through every container instance registered in CLUSTER and
    describes each page until one whose ``ec2InstanceId`` equals
    *instance_id* is found.

    Args:
        instance_id: EC2 instance id to search for.

    Returns:
        A ``(containerInstanceArn, status, runningTasksCount)`` tuple for the
        matching container instance, or ``(None, None, 0)`` when no container
        instance in the cluster matches.
    """
    paginator = ECS.get_paginator('list_container_instances')
    for list_resp in paginator.paginate(cluster=CLUSTER):
        arns = list_resp['containerInstanceArns']
        if not arns:
            # An empty cluster yields a page without ARNs; describing an
            # empty list is rejected by the ECS API, so skip the call.
            continue
        desc_resp = ECS.describe_container_instances(cluster=CLUSTER,
                                                     containerInstances=arns)
        for container_instance in desc_resp['containerInstances']:
            if container_instance['ec2InstanceId'] != instance_id:
                continue
            print('Found instance: id=%s, arn=%s, status=%s, runningTasksCount=%s' %
                  (instance_id, container_instance['containerInstanceArn'],
                   container_instance['status'], container_instance['runningTasksCount']))
            return (container_instance['containerInstanceArn'],
                    container_instance['status'], container_instance['runningTasksCount'])
    return None, None, 0


def instance_has_running_tasks(instance_id):
    """Ensure the instance is DRAINING and report whether tasks still run.

    Args:
        instance_id: EC2 instance id that is being terminated.

    Returns:
        ``True`` when the container instance reported a running task count
        above zero, ``False`` when it reported none or when the instance is
        not registered in CLUSTER.
    """
    (instance_arn, container_status, running_tasks) = find_ecs_instance_info(instance_id)
    if instance_arn is None:
        print('Could not find instance ID %s. Letting autoscaling kill the instance.' %
              (instance_id))
        return False
    if container_status != 'DRAINING':
        print('Setting container instance %s (%s) to DRAINING' %
              (instance_id, instance_arn))
        ECS.update_container_instances_state(cluster=CLUSTER,
                                             containerInstances=[instance_arn],
                                             status='DRAINING')
    # The count was read before the state change above; the next invocation
    # re-reads it.
    return running_tasks > 0


def lambda_handler(event, context):
    """Handle one SNS-delivered Auto Scaling lifecycle notification.

    Only the first record of *event* is processed.  Messages whose
    ``LifecycleTransition`` is not the terminating transition are ignored.

    Args:
        event: Lambda event carrying the SNS record(s).
        context: Lambda context object (unused).
    """
    sns_record = event['Records'][0]['Sns']
    topic_arn = sns_record['TopicArn']
    msg = json.loads(sns_record['Message'])

    # `or ''` also covers an explicit null transition in the message.
    if TERMINATING_TRANSITION not in (msg.get('LifecycleTransition') or ''):
        print('Exiting since the lifecycle transition is not EC2_INSTANCE_TERMINATING.')
        return

    instance_id = msg['EC2InstanceId']
    if instance_has_running_tasks(instance_id):
        print('Tasks are still running on instance %s; posting msg to SNS topic %s' %
              (instance_id, topic_arn))
        # Wait before re-publishing so the re-invocation loop polls at
        # SLEEP_TIME intervals rather than continuously.
        time.sleep(int(SLEEP_TIME))
        sns_resp = SNS.publish(TopicArn=topic_arn,
                               Message=json.dumps(msg),
                               Subject='Publishing SNS msg to invoke Lambda again.')
        print('Posted msg %s to SNS topic.' % (sns_resp['MessageId']))
    else:
        print('No tasks are running on instance %s; setting lifecycle to complete' %
              (instance_id))
        ASG.complete_lifecycle_action(LifecycleHookName=msg['LifecycleHookName'],
                                      AutoScalingGroupName=msg['AutoScalingGroupName'],
                                      LifecycleActionResult='CONTINUE',
                                      InstanceId=instance_id)