├── .gitignore ├── LICENSE ├── README.md ├── ecscale.py └── policy.json /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Omer Hamerman 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # EC*SCALE* 2 | ### A serverless app removing underutilized hosts from ECS clusters 3 | 4 | #### Scaling ECS down is not a straightforward task;Based on one metric solely, an instance could be taken down causing 2 effects: 5 | 1. 
Forcefully removing a host when a container is running will cut off active connections causing service downtime 6 | 2. Removing an instance based on utilization / capacity metric may cause an endless loop of scale 7 | 8 | 9 | #### To such an end, this tool will look for scalable clusters based on multiple metrics 10 | Once identified, the target is moved to "draining" state, where a new instance of the same task is raised on an available host. Once the new containers are ready, the draining instance will start draining connections from active tasks. 11 | Once the draining process is complete, the instance will be terminated. 12 | 13 | 14 | #### Usage: 15 | 1. Throw `ecscale.py` code to AWS Lambda providing relevant role to handle ECS and autoscaling (Instructions ahead) 16 | 2. Set repeated run (recommended every 60 minutes using a cloudwatch events trigger for Lambda) 17 | 3. That's it... Your ECS hosts are being gracefully removed if needed. No metrics/alarms needed 18 | 19 | #### Changeable Parameters: 20 | * SCALE_IN_CPU_TH = 30 `# Below this EC2 average metric scaling would take action` 21 | * SCALE_IN_MEM_TH = 60 `# Below this cluster average metric scaling would take action` 22 | * FUTURE_MEM_TH = 70 `# Below this future metric scaling would take action` 23 | * DRAIN_ALL_EMPTY_INSTANCES = True `# Set to False to prevent scaling in more than one instance at a time` 24 | * ASG_PREFIX = '' `# Use this when your ASG naming convention requires a prefix (e.g. 'ecs-')` 25 | * ASG_SUFFIX = '' `# Use this when your ASG naming convention requires a suffix (e.g. '-live')` 26 | * ECS_AVOID_STR = 'awseb' `# Use this to avoid clusters containing a specific string (i.e. ElasticBeanstalk clusters)` 27 | 28 | ##### How to create a role to run ecscale: 29 | 1. When creating the Lambda function, you'll be asked to select a role or create a new one, choose a new role 30 | 2. Provide the json from `policy.json` to the role policy 31 | 3. 
All set to allow ecscale to do its work 32 | 33 | ##### Creating a Lambda function step by step: 34 | 35 | #### Flow logic 36 | * Iterate over existing ECS clusters 37 | * Check a cluster's ability to scale-in based on predicted future memory reservation capacity 38 | * Look for empty hosts the can be scaled 39 | * Look for least utilized host 40 | * Choose a candidate and put in draining state 41 | * Terminate a draining host that has no running tasks and decrease the desired number of instances 42 | 43 | [Read about it some more on Medium](https://medium.com/@omerxx/how-to-scale-in-ecs-hosts-2d0906d2ba) 44 | -------------------------------------------------------------------------------- /ecscale.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import datetime 3 | from optparse import OptionParser 4 | import os 5 | 6 | SCALE_IN_CPU_TH = 30 7 | SCALE_IN_MEM_TH = 60 8 | FUTURE_MEM_TH = 70 9 | DRAIN_ALL_EMPTY_INSTANCES = True 10 | ASG_PREFIX = '' 11 | ASG_SUFFIX = '' 12 | ECS_AVOID_STR = 'awseb' 13 | logline = {} 14 | 15 | def clusters(ecsClient): 16 | # Returns an iterable list of cluster names 17 | response = ecsClient.list_clusters() 18 | if not response['clusterArns']: 19 | print 'No ECS cluster found' 20 | return 21 | 22 | return [cluster for cluster in response['clusterArns'] if ECS_AVOID_STR not in cluster] 23 | 24 | 25 | def cluster_memory_reservation(cwClient, clusterName): 26 | # Return cluster mem reservation average per minute cloudwatch metric 27 | try: 28 | response = cwClient.get_metric_statistics( 29 | Namespace='AWS/ECS', 30 | MetricName='MemoryReservation', 31 | Dimensions=[ 32 | { 33 | 'Name': 'ClusterName', 34 | 'Value': clusterName 35 | }, 36 | ], 37 | StartTime=datetime.datetime.utcnow() - datetime.timedelta(seconds=120), 38 | EndTime=datetime.datetime.utcnow(), 39 | Period=60, 40 | Statistics=['Average'] 41 | ) 42 | return response['Datapoints'][0]['Average'] 43 | 44 | except Exception: 
45 | logger({'ClusterMemoryError': 'Could not retrieve mem reservation for {}'.format(clusterName)}) 46 | 47 | 48 | def find_asg(clusterName, asgData): 49 | # Returns auto scaling group resourceId based on name 50 | for asg in asgData['AutoScalingGroups']: 51 | for tag in asg['Tags']: 52 | if tag['Key'] == 'Name': 53 | if tag['Value'].split(' ')[0] == '{}{}{}'.format(ASG_PREFIX, clusterName, ASG_SUFFIX): 54 | return tag['ResourceId'] 55 | 56 | else: 57 | logger({'ASGError': 'Auto scaling group for {} not found'.format(clusterName)}) 58 | 59 | 60 | def ec2_avg_cpu_utilization(clusterName, asgData, cwclient): 61 | asg = find_asg(clusterName, asgData) 62 | response = cwclient.get_metric_statistics( 63 | Namespace='AWS/EC2', 64 | MetricName='CPUUtilization', 65 | Dimensions=[ 66 | { 67 | 'Name': 'AutoScalingGroupName', 68 | 'Value': asg 69 | }, 70 | ], 71 | StartTime=datetime.datetime.utcnow() - datetime.timedelta(seconds=120), 72 | EndTime=datetime.datetime.utcnow(), 73 | Period=60, 74 | Statistics=['Average'] 75 | ) 76 | return response['Datapoints'][0]['Average'] 77 | 78 | 79 | def asg_on_min_state(clusterName, asgData, asgClient, activeInstanceCount): 80 | asg = find_asg(clusterName, asgData) 81 | for sg in asgData['AutoScalingGroups']: 82 | if sg['AutoScalingGroupName'] == asg: 83 | if activeInstanceCount <= sg['MinSize']: 84 | return True 85 | 86 | return False 87 | 88 | 89 | def empty_instances(clusterArn, activeContainerDescribed): 90 | # returns a object of empty instances in cluster 91 | instances = [] 92 | empty_instances = {} 93 | 94 | for inst in activeContainerDescribed['containerInstances']: 95 | if inst['runningTasksCount'] == 0 and inst['pendingTasksCount'] == 0: 96 | empty_instances.update({inst['ec2InstanceId']: inst['containerInstanceArn']}) 97 | 98 | return empty_instances 99 | 100 | 101 | def draining_instances(clusterArn, drainingContainerDescribed): 102 | # returns an object of draining instances in cluster 103 | instances = [] 104 | 
draining_instances = {} 105 | 106 | for inst in drainingContainerDescribed['containerInstances']: 107 | draining_instances.update({inst['ec2InstanceId']: inst['containerInstanceArn']}) 108 | 109 | return draining_instances 110 | 111 | 112 | def terminate_decrease(instanceId, asgClient): 113 | # terminates an instance and decreases the desired number in its auto scaling group 114 | # [ only if desired > minimum ] 115 | try: 116 | response = asgClient.terminate_instance_in_auto_scaling_group( 117 | InstanceId=instanceId, 118 | ShouldDecrementDesiredCapacity=True 119 | ) 120 | logger({'Action': 'Terminate', 'Message': response['Activity']['Cause']}) 121 | 122 | except Exception as e: 123 | logger({'Error': e}) 124 | 125 | 126 | def scale_in_instance(clusterArn, activeContainerDescribed): 127 | # iterates over hosts, finds the least utilized: 128 | # The most under-utilized memory and minimum running tasks 129 | # return instance obj {instanceId, runningInstances, containerinstanceArn} 130 | instanceToScale = {'id': '', 'running': 0, 'freemem': 0} 131 | for inst in activeContainerDescribed['containerInstances']: 132 | for res in inst['remainingResources']: 133 | if res['name'] == 'MEMORY': 134 | if res['integerValue'] > instanceToScale['freemem']: 135 | instanceToScale['freemem'] = res['integerValue'] 136 | instanceToScale['id'] = inst['ec2InstanceId'] 137 | instanceToScale['running'] = inst['runningTasksCount'] 138 | instanceToScale['containerInstanceArn'] = inst['containerInstanceArn'] 139 | 140 | elif res['integerValue'] == instanceToScale['freemem']: 141 | # Two instances with same free memory level, choose the one with less running tasks 142 | if inst['runningTasksCount'] < instanceToScale['running']: 143 | instanceToScale['freemem'] = res['integerValue'] 144 | instanceToScale['id'] = inst['ec2InstanceId'] 145 | instanceToScale['running'] = inst['runningTasksCount'] 146 | instanceToScale['containerInstanceArn'] = inst['containerInstanceArn'] 147 | break 148 | 149 
| logger({'Scale candidate': '{} with free {}'.format(instanceToScale['id'], instanceToScale['freemem'])}) 150 | return instanceToScale 151 | 152 | 153 | def running_tasks(instanceId, containerDescribed): 154 | # return a number of running tasks on a given ecs host 155 | for inst in containerDescribed['containerInstances']: 156 | if inst['ec2InstanceId'] == instanceId: 157 | return int(inst['runningTasksCount']) + int(inst['pendingTasksCount']) 158 | 159 | 160 | def drain_instance(containerInstanceId, ecsClient, clusterArn): 161 | # put a given ec2 into draining state 162 | try: 163 | response = ecsClient.update_container_instances_state( 164 | cluster=clusterArn, 165 | containerInstances=[containerInstanceId], 166 | status='DRAINING' 167 | ) 168 | 169 | except Exception as e: 170 | logger({'DrainingError': e}) 171 | 172 | 173 | def future_reservation(activeInstanceCount, clusterMemReservation): 174 | # If the cluster were to scale in an instance, calculate the effect on mem reservation 175 | # return cluster_mem_reserve*active_instance_count / active_instance_count-1 176 | if activeInstanceCount > 1: 177 | futureMem = (clusterMemReservation*activeInstanceCount) / (activeInstanceCount-1) 178 | else: 179 | return 100 180 | 181 | print '*** Current: {} | Future : {}'.format(clusterMemReservation, futureMem) 182 | 183 | return futureMem 184 | 185 | 186 | def retrieve_cluster_data(ecsClient, cwClient, asgClient, cluster): 187 | clusterName = cluster.split('/')[1] 188 | print '*** {} ***'.format(clusterName) 189 | activeContainerInstances = ecsClient.list_container_instances(cluster=cluster, status='ACTIVE') 190 | clusterMemReservation = cluster_memory_reservation(cwClient, clusterName) 191 | 192 | if activeContainerInstances['containerInstanceArns']: 193 | activeContainerDescribed = ecsClient.describe_container_instances(cluster=cluster, containerInstances=activeContainerInstances['containerInstanceArns']) 194 | else: 195 | print 'No active instances in cluster' 196 | 
return False 197 | drainingContainerInstances = ecsClient.list_container_instances(cluster=cluster, status='DRAINING') 198 | if drainingContainerInstances['containerInstanceArns']: 199 | drainingContainerDescribed = ecsClient.describe_container_instances(cluster=cluster, containerInstances=drainingContainerInstances['containerInstanceArns']) 200 | drainingInstances = draining_instances(cluster, drainingContainerDescribed) 201 | else: 202 | drainingInstances = {} 203 | drainingContainerDescribed = [] 204 | emptyInstances = empty_instances(cluster, activeContainerDescribed) 205 | 206 | dataObj = { 207 | 'clusterName': clusterName, 208 | 'clusterMemReservation': clusterMemReservation, 209 | 'activeContainerDescribed': activeContainerDescribed, 210 | 'drainingInstances': drainingInstances, 211 | 'emptyInstances': emptyInstances, 212 | 'drainingContainerDescribed': drainingContainerDescribed 213 | } 214 | 215 | return dataObj 216 | 217 | 218 | def logger(entry, action='log'): 219 | # print log as one-line json from cloudwatch integration 220 | if action == 'log': 221 | global logline 222 | logline.update(entry) 223 | elif action == 'print': 224 | print logline 225 | 226 | 227 | def main(run='normal'): 228 | ecsClient = boto3.client('ecs') 229 | cwClient = boto3.client('cloudwatch') 230 | asgClient = boto3.client('autoscaling') 231 | asgData = asgClient.describe_auto_scaling_groups() 232 | clusterList = clusters(ecsClient) 233 | 234 | for cluster in clusterList: 235 | ########### Cluster data retrival ########## 236 | clusterData = retrieve_cluster_data(ecsClient, cwClient, asgClient, cluster) 237 | if not clusterData: 238 | continue 239 | else: 240 | clusterName = clusterData['clusterName'] 241 | clusterMemReservation = clusterData['clusterMemReservation'] 242 | activeContainerDescribed = clusterData['activeContainerDescribed'] 243 | activeInstanceCount = len(activeContainerDescribed['containerInstances']) 244 | drainingInstances = clusterData['drainingInstances'] 245 | 
emptyInstances = clusterData['emptyInstances'] 246 | ########## Cluster scaling rules ########### 247 | 248 | if drainingInstances.keys(): 249 | # There are draining instances to terminate 250 | for instanceId, containerInstId in drainingInstances.iteritems(): 251 | if not running_tasks(instanceId, clusterData['drainingContainerDescribed']): 252 | if run == 'dry': 253 | print 'Would have terminated {}'.format(instanceId) 254 | else: 255 | print 'Terminating draining instance with no containers {}'.format(instanceId) 256 | terminate_decrease(instanceId, asgClient) 257 | else: 258 | print 'Draining instance not empty' 259 | 260 | if asg_on_min_state(clusterName, asgData, asgClient, activeInstanceCount): 261 | print '{}: in Minimum state, skipping'.format(clusterName) 262 | continue 263 | 264 | if (clusterMemReservation < FUTURE_MEM_TH and 265 | future_reservation(activeInstanceCount, clusterMemReservation) < FUTURE_MEM_TH): 266 | # Future memory levels allow scale 267 | if DRAIN_ALL_EMPTY_INSTANCES and emptyInstances.keys(): 268 | # There are empty instances 269 | for instanceId, containerInstId in emptyInstances.iteritems(): 270 | if run == 'dry': 271 | print 'Would have drained {}'.format(instanceId) 272 | else: 273 | print 'Draining empty instance {}'.format(instanceId) 274 | drain_instance(containerInstId, ecsClient, cluster) 275 | 276 | if (clusterMemReservation < SCALE_IN_MEM_TH): 277 | # Cluster mem reservation level requires scale 278 | if (ec2_avg_cpu_utilization(clusterName, asgData, cwClient) < SCALE_IN_CPU_TH): 279 | instanceToScale = scale_in_instance(cluster, activeContainerDescribed)['containerInstanceArn'] 280 | if run == 'dry': 281 | print 'Would have scaled {}'.format(instanceToScale) 282 | else: 283 | print 'Draining least utilized instanced {}'.format(instanceToScale) 284 | drain_instance(instanceToScale, ecsClient, cluster) 285 | else: 286 | print 'CPU higher than TH, cannot scale' 287 | 288 | print '***' 289 | 290 | def lambda_handler(event, 
context): 291 | parser = OptionParser() 292 | parser.add_option("-a", "--access-key", dest="AWS_ACCESS_KEY_ID", help="Provide AWS access key") 293 | parser.add_option("-s", "--secret-key", dest="AWS_SECRET_ACCESS_KEY", help="Provide AWS secret key") 294 | parser.add_option("-d", "--dry-run", action="store_true", dest="DRY_RUN", default=False, help="Dry run the process") 295 | (options, args) = parser.parse_args() 296 | 297 | if options.AWS_ACCESS_KEY_ID and options.AWS_SECRET_ACCESS_KEY: 298 | os.environ['AWS_ACCESS_KEY_ID'] = options.AWS_ACCESS_KEY_ID 299 | os.environ['AWS_SECRET_ACCESS_KEY'] = options.AWS_SECRET_ACCESS_KEY 300 | elif options.AWS_ACCESS_KEY_ID or options.AWS_SECRET_ACCESS_KEY: 301 | print 'AWS key or secret are missing' 302 | 303 | runType = 'dry' if options.DRY_RUN else 'normal' 304 | main(run=runType) 305 | 306 | 307 | if __name__ == '__main__': 308 | # lambda_handler({}, '') 309 | main() 310 | 311 | -------------------------------------------------------------------------------- /policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Sid": "Stmt1000000000009", 6 | "Effect": "Allow", 7 | "Action": [ 8 | "ecs:*" 9 | ], 10 | "Resource": [ 11 | "*" 12 | ] 13 | }, 14 | { 15 | "Sid": "Stmt1000000000008", 16 | "Effect": "Allow", 17 | "Action": [ 18 | "ec2:*" 19 | ], 20 | "Resource": [ 21 | "*" 22 | ] 23 | }, 24 | { 25 | "Sid": "Stmt1000000000007", 26 | "Effect": "Allow", 27 | "Action": [ 28 | "cloudwatch:*" 29 | ], 30 | "Resource": [ 31 | "*" 32 | ] 33 | }, 34 | { 35 | "Sid": "Stmt10000000000006", 36 | "Effect": "Allow", 37 | "Action": [ 38 | "autoscaling:*" 39 | ], 40 | "Resource": [ 41 | "*" 42 | ] 43 | } 44 | ] 45 | } 46 | --------------------------------------------------------------------------------