├── .github └── workflows │ └── test.yml ├── .gitignore ├── CHANGELOG.md ├── Gemfile ├── README.md ├── Rakefile ├── bin ├── console └── setup ├── ecs_deploy.gemspec ├── exe └── ecs_auto_scaler ├── lib ├── ecs_deploy.rb └── ecs_deploy │ ├── auto_scaler.rb │ ├── auto_scaler │ ├── auto_scaling_group_config.rb │ ├── cluster_resource_manager.rb │ ├── config_base.rb │ ├── instance_drainer.rb │ ├── service_config.rb │ ├── spot_fleet_request_config.rb │ └── trigger_config.rb │ ├── capistrano.rb │ ├── configuration.rb │ ├── instance_fluctuation_manager.rb │ ├── scheduled_task.rb │ ├── service.rb │ ├── task_definition.rb │ └── version.rb ├── renovate.json └── spec ├── ecs_deploy ├── auto_scaler │ ├── auto_scaling_group_config_spec.rb │ ├── cluster_resource_manager_spec.rb │ ├── instance_drainer_spec.rb │ └── service_config_spec.rb ├── auto_scaler_spec.rb └── instance_fluctuation_manager_spec.rb ├── fixtures └── files │ ├── ecs_auto_scaler_config_in_new_format.yaml │ └── ecs_auto_scaler_config_in_old_format.yaml └── spec_helper.rb /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | pull_request: 6 | 7 | jobs: 8 | test: 9 | 10 | runs-on: ubuntu-latest 11 | strategy: 12 | matrix: 13 | ruby-version: ['2.5', '2.6', '2.7', '3.0', '3.1', '3.2'] 14 | 15 | steps: 16 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 17 | - name: Set up Ruby 18 | uses: ruby/setup-ruby@13e7a03dc3ac6c3798f4570bfead2aed4d96abfb # v1.244.0 19 | with: 20 | ruby-version: ${{ matrix.ruby-version }} 21 | bundler-cache: true 22 | - name: Run tests 23 | run: bundle exec rake 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.bundle/ 2 | /.yardoc 3 | /Gemfile.lock 4 | /_yardoc/ 5 | /coverage/ 6 | /doc/ 7 | /pkg/ 8 | /spec/reports/ 9 | /tmp/ 10 | 11 | 
.rspec_status 12 | .envrc 13 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # v1.0 2 | 3 | ## Release v1.0.7 - 2024/08/08 4 | 5 | ### Bug fixes 6 | 7 | - Fix Aws::AutoScaling::Errors::ValidationError https://github.com/reproio/ecs_deploy/pull/85 8 | 9 | ## Release v1.0.6 - 2024/03/19 10 | 11 | ### Enhancement 12 | 13 | - Make region fallback logic consistent in Capistrano tasks https://github.com/reproio/ecs_deploy/pull/91 14 | - Add runtime_platform to task_definition. https://github.com/reproio/ecs_deploy/pull/92 15 | 16 | ### Bug fixes 17 | 18 | - Delete option :placement_strategy if DAEMON service https://github.com/reproio/ecs_deploy/pull/93 19 | 20 | ## Release v1.0.5 - 2023/03/15 21 | 22 | ### Enhancement 23 | 24 | - Add variable of capistrano `ecs_client_retry_params` to override parameter of ECS::Client#initialize https://github.com/reproio/ecs_deploy/pull/88 25 | 26 | ## Release v1.0.4 - 2023/02/10 27 | 28 | ### Bug fixes 29 | 30 | - Fix Aws::AutoScaling::Errors::ValidationError https://github.com/reproio/ecs_deploy/pull/85 31 | 32 | - Fix Timeout::Error that occurs in trigger_capacity_update https://github.com/reproio/ecs_deploy/pull/80 33 | 34 | - use force a new deployment, when switching from launch type to capacity provider strategy on an existing service https://github.com/reproio/ecs_deploy/pull/75 35 | 36 | ### Enhancement 37 | 38 | - Run test with Ruby 3.2 https://github.com/reproio/ecs_deploy/pull/83 39 | 40 | - Merge `propagate_tags` to service_options when updating service https://github.com/reproio/ecs_deploy/pull/82 41 | 42 | - Show service event logs while waiting for services https://github.com/reproio/ecs_deploy/pull/81 43 | 44 | - Stop supporting ruby 2.4 https://github.com/reproio/ecs_deploy/pull/79 45 | 46 | - Display warning that desired count has reached max value 
https://github.com/reproio/ecs_deploy/pull/78 47 | 48 | - Make draining feature opt-outable https://github.com/reproio/ecs_deploy/pull/77 49 | 50 | - Add capacity_provider_strategy options to Service https://github.com/reproio/ecs_deploy/pull/74 51 | 52 | ## Release v1.0.3 - 2021/11/17 53 | 54 | ### Bug fixes 55 | * Fix bug that `InstanceFluctuationManager#decrement` tries to detach instances whose status is 'DEREGISTERING' 56 | https://github.com/reproio/ecs_deploy/pull/72 57 | 58 | ### Enhancement 59 | * Add a cluster name to deployment logs 60 | https://github.com/reproio/ecs_deploy/pull/71 61 | 62 | 63 | ## Release v1.0.2 - 2021/05/26 64 | 65 | ### Enhancement 66 | 67 | * add option enable_execute_command to support ECS Exec 68 | https://github.com/reproio/ecs_deploy/pull/69 69 | 70 | ## Release v1.0.1 - 2021/05/19 71 | 72 | ### Enhancement 73 | 74 | * retry register_task_definition by AWS SDK feature 75 | https://github.com/reproio/ecs_deploy/pull/67 76 | * Support Ruby 3.0 77 | https://github.com/reproio/ecs_deploy/pull/66 78 | * Wait until stop old tasks 79 | https://github.com/reproio/ecs_deploy/pull/65 80 | * Add prioritized_over_upscale_triggers option to triggers 81 | https://github.com/reproio/ecs_deploy/pull/62 82 | * Display only unstable services in EcsDeploy::Service#wait_all_running 83 | https://github.com/reproio/ecs_deploy/pull/61 84 | 85 | ## Release v1.0.0 - 2019/12/24 86 | 87 | ### New feature 88 | 89 | * Add tasks to deploy the application faster 90 | https://github.com/reproio/ecs_deploy/pull/57 91 | 92 | ### Enhancement 93 | 94 | * Add parameters `ecs_wait_until_services_stable_max_attempts` and `ecs_wait_until_services_stable_delay` 95 | https://github.com/reproio/ecs_deploy/pull/30 96 | * Detect region automatically according to AWS SDK 97 | https://github.com/reproio/ecs_deploy/pull/31 98 | * Support new features of ECS to support Fargate 99 | https://github.com/reproio/ecs_deploy/pull/32 100 | * Ignore running tasks which don't belong 
to the ECS services on deregistering container instances 101 | https://github.com/reproio/ecs_deploy/pull/33 102 | * Drop AWS SDK 2 support 103 | https://github.com/reproio/ecs_deploy/pull/34 104 | * Support scheduling_strategy option 105 | https://github.com/reproio/ecs_deploy/pull/35 106 | * Support execution_role_arn on task_definition 107 | https://github.com/reproio/ecs_deploy/pull/36 108 | * Support spot fleet requests and container instance draining 109 | https://github.com/reproio/ecs_deploy/pull/40 110 | * Add network_configuration parameters to ScheduledTask 111 | https://github.com/reproio/ecs_deploy/pull/46 112 | * Support tagging ECS resources 113 | https://github.com/reproio/ecs_deploy/pull/48 114 | https://github.com/reproio/ecs_deploy/pull/49 115 | * Wait for stopping tasks until tasks stop 116 | https://github.com/reproio/ecs_deploy/pull/50 117 | * Improve performance when starting tasks 118 | https://github.com/reproio/ecs_deploy/pull/53 119 | * Improve stability of auto scaling groups managed by ecs_auto_scaler 120 | https://github.com/reproio/ecs_deploy/pull/55 121 | 122 | ### Bug fixes 123 | 124 | * Fix infinite loop that occurs when there are more than 100 container instances 125 | https://github.com/reproio/ecs_deploy/pull/38 126 | * Fix errors that occur on decreasing more than 20 container instances 127 | https://github.com/reproio/ecs_deploy/pull/39 128 | 129 | # Ancient releases 130 | 131 | ## Release v0.3.2 - 2017/23/10 132 | 133 | ### Enhancement 134 | 135 | * Remove execution feature 136 | https://github.com/reproio/ecs_deploy/pull/24 137 | * Support container overrides in scheduled tasks 138 | https://github.com/reproio/ecs_deploy/pull/26 139 | 140 | ### Bug fixes 141 | 142 | * Fix deployment errors that occur when `ecs_scheduled_tasks` is not set 143 | https://github.com/reproio/ecs_deploy/pull/27 144 | 145 | ## Release v0.3.1 - 2017/04/08 146 | 147 | ### Bug fixes 148 | 149 | * Fix block parameter name 150 | 151 | ## Release v0.3.0 - 
2017/03/08 152 | 153 | ### New feature 154 | 155 | * Support ScheduledTask deployment 156 | https://github.com/reproio/ecs_deploy/pull/22 157 | 158 | ### Enhancement 159 | 160 | * Support network_mode and placement_constraints 161 | * Introduce `ecs_registered_tasks` capistrano variable 162 | https://github.com/reproio/ecs_deploy/pull/23 163 | 164 | ### Bug fixes 165 | 166 | * Filter inactive services 167 | https://github.com/reproio/ecs_deploy/pull/19 168 | * Wait 10 services at once 169 | https://github.com/reproio/ecs_deploy/pull/20 170 | https://github.com/reproio/ecs_deploy/pull/21 171 | 172 | ## Release v0.2.0 - 2016/31/10 173 | 174 | ### Enhancement 175 | 176 | * Support task role arn 177 | https://github.com/reproio/ecs_deploy/pull/13 178 | * Make the scale-in process safe 179 | https://github.com/reproio/ecs_deploy/pull/14 180 | * Support ALB 181 | https://github.com/reproio/ecs_deploy/pull/15 182 | 183 | ## Release v0.1.2 - 2016/28/07 184 | 185 | ### Bug fixes 186 | 187 | * Fix rollback bug 188 | https://github.com/reproio/ecs_deploy/pull/11 189 | 190 | ## Release v0.1.1 - 2016/03/07 191 | 192 | ### Bug fixes 193 | 194 | * Add missing desired_count for backend services 195 | https://github.com/reproio/ecs_deploy/pull/9 196 | 197 | ## Release v0.1.0 - 2016/27/06 198 | 199 | Initial release. 200 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | # Specify your gem's dependencies in ecs_deploy.gemspec 4 | gemspec 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # EcsDeploy 2 | 3 | Helper script for deployment to Amazon ECS, designed to be compatible with `capistrano`. 4 | 5 | This gem is experimental. 
6 | 7 | ## Installation 8 | 9 | Add this line to your application's Gemfile: 10 | 11 | ```ruby 12 | gem "ecs_deploy", github: "reproio/ecs_deploy" 13 | ``` 14 | 15 | And then execute: 16 | 17 | $ bundle 18 | 19 | ## Configuration 20 | 21 | ```ruby 22 | # Capfile 23 | require "ecs_deploy/capistrano" 24 | 25 | # deploy.rb 26 | set :ecs_default_cluster, "ecs-cluster-name" 27 | set :ecs_region, %w(ap-northeast-1) # optional, if nil, use environment variable 28 | set :ecs_service_role, "customEcsServiceRole" # default: ecsServiceRole 29 | set :ecs_deploy_wait_timeout, 600 # default: 300 30 | set :ecs_wait_until_services_stable_max_attempts, 40 # optional 31 | set :ecs_wait_until_services_stable_delay, 15 # optional 32 | set :ecs_client_params, { retry_mode: "standard", max_attempts: 10 } # default: {} 33 | 34 | set :ecs_tasks, [ 35 | { 36 | name: "myapp-#{fetch(:rails_env)}", 37 | container_definitions: [ 38 | { 39 | name: "myapp", 40 | image: "#{fetch(:docker_registry_host_with_port)}/myapp:#{fetch(:sha1)}", 41 | cpu: 1024, 42 | memory: 512, 43 | port_mappings: [], 44 | essential: true, 45 | environment: [ 46 | {name: "RAILS_ENV", value: fetch(:rails_env)}, 47 | ], 48 | mount_points: [ 49 | { 50 | source_volume: "sockets_path", 51 | container_path: "/app/tmp/sockets", 52 | read_only: false, 53 | }, 54 | ], 55 | volumes_from: [], 56 | log_configuration: { 57 | log_driver: "fluentd", 58 | options: { 59 | "tag" => "docker.#{fetch(:rails_env)}.#{name}.{{.ID}}", 60 | }, 61 | }, 62 | }, 63 | { 64 | name: "nginx", 65 | image: "#{fetch(:docker_registry_host_with_port)}/my-nginx", 66 | cpu: 256, 67 | memory: 256, 68 | links: [], 69 | port_mappings: [ 70 | {container_port: 443, host_port: 443, protocol: "tcp"}, 71 | ], 72 | essential: true, 73 | environment: {}, 74 | mount_points: [], 75 | volumes_from: [ 76 | {source_container: "myapp-#{fetch(:rails_env)}", read_only: false}, 77 | ], 78 | log_configuration: { 79 | log_driver: "fluentd", 80 | options: { 81 | "tag" => 
"docker.#{fetch(:rails_env)}.#{name}.{{.ID}}", 82 | }, 83 | }, 84 | } 85 | ], 86 | volumes: [{name: "sockets_path", host: {}}], 87 | }, 88 | ] 89 | 90 | set :ecs_scheduled_tasks, [ 91 | { 92 | cluster: "default", # Defaults to fetch(:ecs_default_cluster) 93 | rule_name: "schedule_name", 94 | schedule_expression: "cron(0 12 * * ? *)", 95 | description: "schedule_description", # Optional 96 | target_id: "task_name", # Defaults to the task_definition_name 97 | task_definition_name: "myapp-#{fetch(:rails_env)}", 98 | task_count: 2, # Default 1 99 | revision: 12, # Optional 100 | role_arn: "TaskRoleArn", # Optional 101 | container_overrides: [ # Optional 102 | name: "myapp-main", 103 | command: ["ls"], 104 | ] 105 | } 106 | ] 107 | 108 | set :ecs_services, [ 109 | { 110 | name: "myapp-#{fetch(:rails_env)}", 111 | load_balancers: [ 112 | { 113 | load_balancer_name: "service-elb-name", 114 | container_port: 443, 115 | container_name: "nginx", 116 | }, 117 | { 118 | target_group_arn: "alb_target_group_arn", 119 | container_port: 443, 120 | container_name: "nginx", 121 | } 122 | ], 123 | desired_count: 1, 124 | deployment_configuration: {maximum_percent: 200, minimum_healthy_percent: 50}, 125 | }, 126 | ] 127 | ``` 128 | 129 | ## Usage 130 | 131 | ```sh 132 | bundle exec cap ecs:register_task_definition # register ecs_tasks as TaskDefinition 133 | bundle exec cap ecs:deploy_scheduled_task # register ecs_scheduled_tasks to CloudWatchEvent 134 | bundle exec cap ecs:deploy # create or update Service by ecs_services info 135 | 136 | bundle exec cap ecs:rollback # deregister current task definition and update Service by previous revision of current task definition 137 | ``` 138 | 139 | ### Rollback example 140 | 141 | | sequence | taskdef | service | desc | 142 | | -------- | -------- | ------------- | ------ | 143 | | 1 | myapp:12 | myapp-service | | 144 | | 2 | myapp:13 | myapp-service | | 145 | | 3 | myapp:14 | myapp-service | current | 146 | 147 | After rollback 148 | 149 | 
| sequence | taskdef | service | desc | 150 | | -------- | -------- | ------------- | ------ | 151 | | 1 | myapp:12 | myapp-service | | 152 | | 2 | myapp:13 | myapp-service | | 153 | | 3 | myapp:14 | myapp-service | deregister | 154 | | 4 | myapp:13 | myapp-service | current | 155 | 156 | And rollback again 157 | 158 | | sequence | taskdef | service | desc | 159 | | -------- | -------- | ------------- | ------ | 160 | | 1 | myapp:12 | myapp-service | | 161 | | 2 | myapp:13 | myapp-service | previous | 162 | | 3 | myapp:14 | myapp-service | deregister | 163 | | 4 | myapp:13 | myapp-service | deregister | 164 | | 5 | myapp:12 | myapp-service | current | 165 | 166 | And deploy new version 167 | 168 | | sequence | taskdef | service | desc | 169 | | -------- | -------- | ------------- | ------ | 170 | | 1 | myapp:12 | myapp-service | | 171 | | 2 | myapp:13 | myapp-service | | 172 | | 3 | myapp:14 | myapp-service | deregister | 173 | | 4 | myapp:13 | myapp-service | deregister | 174 | | 5 | myapp:12 | myapp-service | | 175 | | 6 | myapp:15 | myapp-service | current | 176 | 177 | And rollback 178 | 179 | | sequence | taskdef | service | desc | 180 | | -------- | -------- | ------------- | ------ | 181 | | 1 | myapp:12 | myapp-service | | 182 | | 2 | myapp:13 | myapp-service | | 183 | | 3 | myapp:14 | myapp-service | deregister | 184 | | 4 | myapp:13 | myapp-service | deregister | 185 | | 5 | myapp:12 | myapp-service | | 186 | | 6 | myapp:15 | myapp-service | deregister | 187 | | 7 | myapp:12 | myapp-service | current | 188 | 189 | ## Autoscaler 190 | 191 | The autoscaler of `ecs_deploy` supports auto scaling of ECS services and clusters. 
192 | 193 | ### Prerequisites 194 | 195 | * An ECS cluster whose instances belong to either an Auto Scaling group or a Spot Fleet request 196 | * You have CloudWatch alarms and you want to scale services when their state changes 197 | 198 | ### How to use autoscaler 199 | 200 | First, write a configuration file (YAML format) like below: 201 | 202 | ```yaml 203 | # ポーリング時にupscale_triggersに指定した状態のalarmがあればstep分serviceとinstanceを増やす (max_task_countまで) 204 | # ポーリング時にdownscale_triggersに指定した状態のalarmがあればstep分serviceとinstanceを減らす (min_task_countまで) 205 | # max_task_countは段階的にリミットを設けられるようにする 206 | # 一回リミットに到達するとcooldown_for_reach_maxを越えても状態が継続したら再開するようにする 207 | 208 | polling_interval: 60 209 | 210 | auto_scaling_groups: 211 | - name: ecs-cluster-nodes 212 | region: ap-northeast-1 213 | cluster: ecs-cluster 214 | # autoscaler will set the capacity to (buffer + desired_tasks * required_capacity). 215 | # Adjust this value if it takes much time to prepare ECS instances and launch new tasks. 216 | buffer: 1 217 | disable_draining: false # cf. spot_instance_intrp_warns_queue_urls 218 | services: 219 | - name: repro-api-production 220 | step: 1 221 | idle_time: 240 222 | max_task_count: [10, 25] 223 | scheduled_min_task_count: 224 | - {from: "1:45", to: "4:30", count: 8} 225 | cooldown_time_for_reach_max: 600 226 | min_task_count: 0 227 | # Required capacity per task (default: 1) 228 | # You should specify "binpack" as task placement strategy if the value is less than 1 and you use an auto scaling group. 
229 | required_capacity: 0.5 230 | upscale_triggers: 231 | - alarm_name: "ECS [repro-api-production] CPUUtilization" 232 | state: ALARM 233 | - alarm_name: "ELB repro-api-a HTTPCode_Backend_5XX" 234 | state: ALARM 235 | step: 2 236 | downscale_triggers: 237 | - alarm_name: "ECS [repro-api-production] CPUUtilization (low)" 238 | state: OK 239 | 240 | spot_fleet_requests: 241 | - id: sfr-354de735-2c17-4565-88c9-10ada5b957e5 242 | region: ap-northeast-1 243 | cluster: ecs-cluster-for-worker 244 | buffer: 1 245 | disable_draining: false # cf. spot_instance_intrp_warns_queue_urls 246 | services: 247 | - name: repro-worker-production 248 | step: 1 249 | idle_time: 240 250 | cooldown_time_for_reach_max: 600 251 | min_task_count: 0 252 | # Required capacity per task (default: 1) 253 | # The capacity assumes that WeightedCapacity is equal to the number of vCPUs. 254 | required_capacity: 2 255 | upscale_triggers: 256 | - alarm_name: "ECS [repro-worker-production] CPUUtilization" 257 | state: ALARM 258 | downscale_triggers: 259 | - alarm_name: "ECS [repro-worker-production] CPUUtilization (low)" 260 | state: OK 261 | - alarm_name: "Aurora DMLLatency is high" 262 | state: ALARM 263 | prioritized_over_upscale_triggers: true 264 | 265 | # When you use spot instances, instances that receive interruption warnings should be drained. 266 | # If you set URLs of SQS queues for spot instance interruption warnings to `spot_instance_intrp_warns_queue_urls`, 267 | # autoscaler drains instances to interrupt and detaches the instances from the auto scaling groups with 268 | # should_decrement_desired_capacity false. 269 | # If you set ECS_ENABLE_SPOT_INSTANCE_DRAINING to true, we recommend that you opt out of the draining feature 270 | # by setting disable_draining to true in the configurations of auto scaling groups and spot fleet requests. 271 | # Otherwise, instances don't seem to be drained on rare occasions. 
272 | # Even if you opt out of the feature, you still have the advantage of setting `spot_instance_intrp_warns_queue_urls` 273 | # because instances to interrupt are replaced with new instances as soon as possible. 274 | spot_instance_intrp_warns_queue_urls: 275 | - https://sqs.ap-northeast-1.amazonaws.com/<account-id>/spot-instance-intrp-warns 276 | ``` 277 | 278 | Then, execute the following command: 279 | 280 | ```sh 281 | ecs_auto_scaler 282 | ``` 283 | 284 | It is recommended to run the `ecs_auto_scaler` via a container on ECS. 285 | 286 | ### Signals 287 | 288 | Signal | Description 289 | -----------|------------------------------------------------------------ 290 | TERM, INT | Shut down gracefully 291 | CONT | Resume auto scaling 292 | TSTP | Pause auto scaling (Run only container instance draining) 293 | 294 | ### IAM policy for autoscaler 295 | 296 | The following permissions are required for the preceding configuration of "repro-api-production" service: 297 | 298 | ``` 299 | { 300 | "Version": "2012-10-17", 301 | "Statement": [ 302 | { 303 | "Effect": "Allow", 304 | "Action": [ 305 | "autoscaling:DescribeAutoScalingGroups", 306 | "cloudwatch:DescribeAlarms", 307 | "ec2:DescribeInstances", 308 | "ec2:TerminateInstances", 309 | "ecs:ListTasks" 310 | ], 311 | "Resource": "*" 312 | }, 313 | { 314 | "Effect": "Allow", 315 | "Action": [ 316 | "ecs:DescribeServices", 317 | "ecs:UpdateService" 318 | ], 319 | "Resource": [ 320 | "arn:aws:ecs:ap-northeast-1:<account-id>:service/ecs-cluster/repro-api-production" 321 | ] 322 | }, 323 | { 324 | "Effect": "Allow", 325 | "Action": [ 326 | "ecs:DescribeTasks" 327 | ], 328 | "Resource": [ 329 | "arn:aws:ecs:ap-northeast-1:<account-id>:task/ecs-cluster/*" 330 | ] 331 | }, 332 | { 333 | "Effect": "Allow", 334 | "Action": [ 335 | "autoscaling:DetachInstances", 336 | "autoscaling:UpdateAutoScalingGroup" 337 | ], 338 | "Resource": [ 339 | "arn:aws:autoscaling:ap-northeast-1:<account-id>:autoScalingGroup:<group-id>:autoScalingGroupName/ecs-cluster-nodes" 340 | ] 341 | }, 342 | { 343 | 
"Effect": "Allow", 344 | "Action": [ 345 | "ecs:DescribeContainerInstances" 346 | ], 347 | "Resource": [ 348 | "arn:aws:ecs:ap-northeast-1:<account-id>:container-instance/ecs-cluster/*" 349 | ] 350 | }, 351 | { 352 | "Effect": "Allow", 353 | "Action": [ 354 | "ecs:DeregisterContainerInstance", 355 | "ecs:ListContainerInstances" 356 | ], 357 | "Resource": [ 358 | "arn:aws:ecs:ap-northeast-1:<account-id>:cluster/ecs-cluster" 359 | ] 360 | } 361 | ] 362 | } 363 | ``` 364 | 365 | If you use spot instances, additional permissions are required like below: 366 | 367 | ``` 368 | { 369 | "Version": "2012-10-17", 370 | "Statement": [ 371 | { 372 | "Effect": "Allow", 373 | "Action": "ecs:UpdateContainerInstancesState", 374 | "Resource": "arn:aws:ecs:ap-northeast-1:<account-id>:container-instance/ecs-cluster/*" 375 | }, 376 | { 377 | "Effect": "Allow", 378 | "Action": [ 379 | "sqs:DeleteMessage", 380 | "sqs:DeleteMessageBatch", 381 | "sqs:ReceiveMessage" 382 | ], 383 | "Resource": "arn:aws:sqs:ap-northeast-1:<account-id>:spot-instance-intrp-warns" 384 | } 385 | ] 386 | } 387 | ``` 388 | 389 | The following permissions are required for the preceding configuration of "repro-worker-production" service: 390 | 391 | ``` 392 | { 393 | "Version": "2012-10-17", 394 | "Statement": [ 395 | { 396 | "Effect": "Allow", 397 | "Action": [ 398 | "sqs:DeleteMessage", 399 | "sqs:DeleteMessageBatch", 400 | "sqs:ReceiveMessage" 401 | ], 402 | "Resource": "arn:aws:sqs:ap-northeast-1:<account-id>:spot-instance-intrp-warns" 403 | }, 404 | { 405 | "Effect": "Allow", 406 | "Action": [ 407 | "cloudwatch:DescribeAlarms", 408 | "ec2:DescribeInstances", 409 | "ec2:DescribeSpotFleetInstances", 410 | "ec2:DescribeSpotFleetRequests", 411 | "ec2:ModifySpotFleetRequest", 412 | "ec2:TerminateInstances", 413 | "ecs:ListTasks" 414 | ], 415 | "Resource": "*" 416 | }, 417 | { 418 | "Effect": "Allow", 419 | "Action": [ 420 | "ecs:DescribeServices", 421 | "ecs:UpdateService" 422 | ], 423 | "Resource": [ 424 | 
"arn:aws:ecs:ap-northeast-1:<account-id>:service/ecs-cluster-for-worker/repro-worker-production" 425 | ] 426 | }, 427 | { 428 | "Effect": "Allow", 429 | "Action": [ 430 | "ecs:DescribeTasks" 431 | ], 432 | "Resource": [ 433 | "arn:aws:ecs:ap-northeast-1:<account-id>:task/ecs-cluster-for-worker/*" 434 | ] 435 | }, 436 | { 437 | "Effect": "Allow", 438 | "Action": [ 439 | "ecs:DescribeContainerInstances", 440 | "ecs:UpdateContainerInstancesState" 441 | ], 442 | "Resource": [ 443 | "arn:aws:ecs:ap-northeast-1:<account-id>:container-instance/ecs-cluster-for-worker/*" 444 | ] 445 | }, 446 | { 447 | "Effect": "Allow", 448 | "Action": [ 449 | "ecs:ListContainerInstances" 450 | ], 451 | "Resource": [ 452 | "arn:aws:ecs:ap-northeast-1:<account-id>:cluster/ecs-cluster-for-worker" 453 | ] 454 | } 455 | ] 456 | } 457 | ``` 458 | 459 | ### How to deploy faster with Auto Scaling Group 460 | 461 | Add the following configuration and hooks to your `config/deploy.rb`: 462 | 463 | ```ruby 464 | # deploy.rb 465 | set :ecs_instance_fluctuation_manager_configs, [ 466 | { 467 | region: "ap-northeast-1", 468 | cluster: "CLUSTER_NAME", 469 | auto_scaling_group_name: "AUTO_SCALING_GROUP_NAME", 470 | desired_capacity: 20, # original capacity of auto scaling group 471 | } 472 | ] 473 | ``` 474 | 475 | This configuration enables tasks `ecs:increase_instances_to_max_size` and `ecs:terminate_redundant_instances`. 476 | If this configuration is not set, the above tasks do nothing. 477 | The task `ecs:increase_instances_to_max_size` will increase ECS instances. 478 | The task `ecs:terminate_redundant_instances` will decrease ECS instances considering AZ balance. 479 | 480 | Hook configuration example: 481 | 482 | ```ruby 483 | after "deploy:updating", "ecs:increase_instances_to_max_size" 484 | after "deploy:finished", "ecs:terminate_redundant_instances" 485 | after "deploy:failed", "ecs:terminate_redundant_instances" 486 | ``` 487 | 488 | ## Development 489 | 490 | After checking out the repo, run `bin/setup` to install dependencies. 
You can also run `bin/console` for an interactive prompt that will allow you to experiment. 491 | 492 | To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org). 493 | 494 | ## Contributing 495 | 496 | Bug reports and pull requests are welcome on GitHub at https://github.com/reproio/ecs_deploy. 497 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | require "rspec/core/rake_task" 3 | 4 | RSpec::Core::RakeTask.new(:spec) 5 | 6 | task :default => :spec 7 | -------------------------------------------------------------------------------- /bin/console: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require "bundler/setup" 4 | require "ecs_deploy" 5 | 6 | # You can add fixtures and/or initialization code here to make experimenting 7 | # with your gem easier. You can also use a different console, if you like. 8 | 9 | # (If you use this, don't forget to add pry to your Gemfile!) 
10 | # require "pry" 11 | # Pry.start 12 | 13 | require "irb" 14 | IRB.start 15 | -------------------------------------------------------------------------------- /bin/setup: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | IFS=$'\n\t' 4 | set -vx 5 | 6 | bundle install 7 | 8 | # Do any other automated setup that you need to do here 9 | -------------------------------------------------------------------------------- /ecs_deploy.gemspec: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | lib = File.expand_path('../lib', __FILE__) 3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 4 | require 'ecs_deploy/version' 5 | 6 | Gem::Specification.new do |spec| 7 | spec.name = "ecs_deploy" 8 | spec.version = EcsDeploy::VERSION 9 | spec.authors = ["joker1007"] 10 | spec.email = ["kakyoin.hierophant@gmail.com"] 11 | 12 | spec.summary = %q{AWS ECS deploy helper} 13 | spec.description = %q{AWS ECS deploy helper} 14 | spec.homepage = "https://github.com/reproio/ecs_deploy" 15 | 16 | spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) } 17 | spec.bindir = "exe" 18 | spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) } 19 | spec.require_paths = ["lib"] 20 | 21 | spec.add_runtime_dependency "aws-sdk-autoscaling", "~> 1" 22 | spec.add_runtime_dependency "aws-sdk-cloudwatch", "~> 1" 23 | spec.add_runtime_dependency "aws-sdk-cloudwatchevents", "~> 1" 24 | spec.add_runtime_dependency "aws-sdk-ec2", "~> 1" 25 | spec.add_runtime_dependency "aws-sdk-ecs", "~> 1" 26 | spec.add_runtime_dependency "aws-sdk-sqs", "~> 1" 27 | spec.add_runtime_dependency "terminal-table" 28 | spec.add_runtime_dependency "paint" 29 | 30 | spec.add_development_dependency "bundler", ">= 1.11", "< 3" 31 | spec.add_development_dependency "rake", ">= 10.0" 32 | spec.add_development_dependency "rspec", "~> 3.0" 33 | 
spec.add_development_dependency "rexml" # For aws-sdk-* 34 | end 35 | -------------------------------------------------------------------------------- /exe/ecs_auto_scaler: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require "ecs_deploy" 4 | require "ecs_deploy/auto_scaler" 5 | 6 | EcsDeploy::AutoScaler.run(*ARGV) 7 | -------------------------------------------------------------------------------- /lib/ecs_deploy.rb: -------------------------------------------------------------------------------- 1 | require "ecs_deploy/version" 2 | require "ecs_deploy/configuration" 3 | 4 | require 'aws-sdk-ecs' 5 | require 'logger' 6 | require 'terminal-table' 7 | require 'paint' 8 | 9 | module EcsDeploy 10 | def self.logger 11 | @logger ||= Logger.new(STDOUT).tap do |l| 12 | l.level = Logger.const_get(config.log_level.to_s.upcase) 13 | end 14 | end 15 | 16 | def self.config 17 | @config ||= Configuration.new 18 | end 19 | 20 | def self.configure(&block) 21 | if block_given? 
22 | yield config 23 | @logger = nil 24 | end 25 | end 26 | end 27 | 28 | require "ecs_deploy/task_definition" 29 | require "ecs_deploy/service" 30 | require "ecs_deploy/scheduled_task" 31 | -------------------------------------------------------------------------------- /lib/ecs_deploy/auto_scaler.rb: -------------------------------------------------------------------------------- 1 | require "logger" 2 | require "time" 3 | require "yaml" 4 | 5 | require "ecs_deploy/auto_scaler/auto_scaling_group_config" 6 | require "ecs_deploy/auto_scaler/instance_drainer" 7 | require "ecs_deploy/auto_scaler/service_config" 8 | require "ecs_deploy/auto_scaler/spot_fleet_request_config" 9 | 10 | module EcsDeploy 11 | module AutoScaler 12 | class << self 13 | attr_reader :logger, :error_logger 14 | 15 | def run(yaml_path, log_file = nil, error_log_file = nil) 16 | @enable_auto_scaling = true 17 | setup_signal_handlers 18 | @logger = Logger.new(log_file || STDOUT) 19 | @logger.level = Logger.const_get(ENV["ECS_AUTO_SCALER_LOG_LEVEL"].upcase) if ENV["ECS_AUTO_SCALER_LOG_LEVEL"] 20 | STDOUT.sync = true unless log_file 21 | @error_logger = Logger.new(error_log_file || STDERR) 22 | @error_logger.level = Logger.const_get(ENV["ECS_AUTO_SCALER_LOG_LEVEL"].upcase) if ENV["ECS_AUTO_SCALER_LOG_LEVEL"] 23 | STDERR.sync = true unless error_log_file 24 | load_config(yaml_path) 25 | 26 | ths = (auto_scaling_group_configs + spot_fleet_request_configs).map do |cluster_scaling_config| 27 | Thread.new(cluster_scaling_config, &method(:main_loop)).tap { |th| th.abort_on_exception = true } 28 | end 29 | 30 | if @config["spot_instance_intrp_warns_queue_urls"] 31 | drainer = EcsDeploy::AutoScaler::InstanceDrainer.new( 32 | auto_scaling_group_configs: auto_scaling_group_configs, 33 | spot_fleet_request_configs: spot_fleet_request_configs, 34 | logger: logger, 35 | ) 36 | polling_ths = @config["spot_instance_intrp_warns_queue_urls"].map do |queue_url| 37 | Thread.new(queue_url) do |url| 38 | 
drainer.poll_spot_instance_interruption_warnings(url) 39 | end.tap { |th| th.abort_on_exception = true } 40 | end 41 | end 42 | 43 | ths.each(&:join) 44 | 45 | drainer&.stop 46 | polling_ths&.each(&:join) 47 | end 48 | 49 | def main_loop(cluster_scaling_config) 50 | loop_with_polling_interval("loop of #{cluster_scaling_config.name}") do 51 | ths = cluster_scaling_config.service_configs.map do |service_config| 52 | Thread.new(service_config) do |s| 53 | @logger.debug "Scaling service #{s.name}" 54 | s.adjust_desired_count(cluster_scaling_config.cluster_resource_manager) 55 | end 56 | end 57 | ths.each { |th| th.abort_on_exception = true } 58 | 59 | ths.each(&:join) 60 | 61 | @logger.debug "Scaling cluster #{cluster_scaling_config.name}" 62 | 63 | required_capacity = cluster_scaling_config.service_configs.sum { |s| s.desired_count * s.required_capacity } 64 | cluster_scaling_config.update_desired_capacity(required_capacity) 65 | 66 | cluster_scaling_config.service_configs.each(&:wait_until_desired_count_updated) 67 | end 68 | end 69 | 70 | def load_config(yaml_path) 71 | @config = YAML.load_file(yaml_path) 72 | @polling_interval = @config["polling_interval"] || 30 73 | if @config["services"] 74 | @error_logger&.warn('"services" property in root-level is deprecated. 
Please define it in "auto_scaling_groups" property or "spot_fleet_requests" property.') 75 | @config.delete("services").each do |svc| 76 | if svc["auto_scaling_group_name"] && svc["spot_fleet_request_id"] 77 | raise "You can specify only one of 'auto_scaling_group_name' or 'spot_fleet_request_name'" 78 | end 79 | 80 | svc_region = svc.delete("region") 81 | if svc["auto_scaling_group_name"] 82 | asg_name = svc.delete("auto_scaling_group_name") 83 | asg = @config["auto_scaling_groups"].find { |g| g["region"] == svc_region && g["name"] == asg_name } 84 | asg["services"] ||= [] 85 | asg["services"] << svc 86 | asg["cluster"] = svc.delete("cluster") 87 | end 88 | 89 | if svc["spot_fleet_request_id"] 90 | sfr_id = svc.delete("spot_fleet_request_id") 91 | sfr = @config["spot_fleet_requests"].find { |r| r["region"] == svc_region && r["id"] == sfr_id } 92 | sfr["services"] ||= [] 93 | sfr["services"] << svc 94 | sfr["cluster"] = svc.delete("cluster") 95 | end 96 | end 97 | end 98 | end 99 | 100 | def auto_scaling_group_configs 101 | @auto_scaling_group_configs ||= (@config["auto_scaling_groups"] || []).each.with_object({}) do |c, configs| 102 | configs[c["name"]] ||= {} 103 | if configs[c["name"]][c["region"]] 104 | raise "Duplicate entry in auto_scaling_groups (name: #{c["name"]}, region: #{c["region"]})" 105 | end 106 | configs[c["name"]][c["region"]] = AutoScalingGroupConfig.new(c, @logger) 107 | end.values.flat_map(&:values) 108 | end 109 | 110 | def spot_fleet_request_configs 111 | @spot_fleet_request_configs ||= (@config["spot_fleet_requests"] || []).each.with_object({}) do |c, configs| 112 | configs[c["id"]] ||= {} 113 | if configs[c["id"]][c["region"]] 114 | raise "Duplicate entry in spot_fleet_requests (id: #{c["id"]}, region: #{c["region"]})" 115 | end 116 | configs[c["id"]][c["region"]] = SpotFleetRequestConfig.new(c, @logger) 117 | end.values.flat_map(&:values) 118 | end 119 | 120 | private 121 | 122 | def setup_signal_handlers 123 | # Use a thread and a queue 
to avoid "log writing failed. can't be called from trap context" 124 | # cf. https://bugs.ruby-lang.org/issues/14222#note-3 125 | signals = Queue.new 126 | %i(TERM INT CONT TSTP).each do |sig| 127 | trap(sig) { signals << sig } 128 | end 129 | 130 | Thread.new do 131 | loop do 132 | sig = signals.pop 133 | case sig 134 | when :INT, :TERM 135 | @logger.info "Received SIG#{sig}, shutting down gracefully" 136 | @stop = true 137 | when :CONT 138 | @logger.info "Received SIGCONT, resume auto scaling" 139 | @enable_auto_scaling = true 140 | when :TSTP 141 | @logger.info "Received SIGTSTP, pause auto scaling. Send SIGCONT to resume it." 142 | @enable_auto_scaling = false 143 | end 144 | end 145 | end 146 | end 147 | 148 | def wait_polling_interval?(last_executed_at) 149 | current = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second) 150 | diff = current - last_executed_at 151 | diff <= @polling_interval 152 | end 153 | 154 | def loop_with_polling_interval(name) 155 | @logger.debug "Start #{name}" 156 | 157 | last_executed_at = 0 158 | loop do 159 | break if @stop 160 | sleep 1 161 | next unless @enable_auto_scaling 162 | next if wait_polling_interval?(last_executed_at) 163 | yield 164 | last_executed_at = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second) 165 | @logger.debug "#{name} is last executed at #{last_executed_at}" 166 | end 167 | 168 | @logger.debug "Stop #{name}" 169 | end 170 | end 171 | end 172 | end 173 | -------------------------------------------------------------------------------- /lib/ecs_deploy/auto_scaler/auto_scaling_group_config.rb: -------------------------------------------------------------------------------- 1 | require "aws-sdk-autoscaling" 2 | require "aws-sdk-ec2" 3 | 4 | require "ecs_deploy" 5 | require "ecs_deploy/auto_scaler/config_base" 6 | require "ecs_deploy/auto_scaler/cluster_resource_manager" 7 | 8 | module EcsDeploy 9 | module AutoScaler 10 | AutoScalingGroupConfig = Struct.new(:name, :region, :cluster, :buffer, 
:service_configs, :disable_draining) do 11 | include ConfigBase 12 | 13 | MAX_DETACHABLE_INSTANCE_COUNT = 20 14 | 15 | def initialize(attributes = {}, logger) 16 | attributes = attributes.dup 17 | services = attributes.delete("services") 18 | super(attributes, logger) 19 | self.service_configs = services.map do |s| 20 | ServiceConfig.new(s.merge("cluster" => cluster, "region" => region), logger) 21 | end 22 | end 23 | 24 | def update_desired_capacity(required_capacity) 25 | detach_and_terminate_orphan_instances 26 | 27 | desired_capacity = (required_capacity + buffer.to_f).ceil 28 | 29 | current_asg = client.describe_auto_scaling_groups({ 30 | auto_scaling_group_names: [name], 31 | }).auto_scaling_groups[0] 32 | 33 | if current_asg.desired_capacity > desired_capacity 34 | decreased_capacity = decrease_desired_capacity(current_asg.desired_capacity - desired_capacity) 35 | if decreased_capacity > 0 36 | new_desired_capacity = current_asg.desired_capacity - decreased_capacity 37 | cluster_resource_manager.trigger_capacity_update(current_asg.desired_capacity, new_desired_capacity) 38 | @logger.info "#{log_prefix} Updated desired_capacity to #{new_desired_capacity}" 39 | else 40 | @logger.info "#{log_prefix} Tried to Update desired_capacity but there were no deregisterable instances" 41 | end 42 | elsif current_asg.desired_capacity < desired_capacity 43 | client.update_auto_scaling_group( 44 | auto_scaling_group_name: name, 45 | min_size: 0, 46 | max_size: [current_asg.max_size, desired_capacity].max, 47 | desired_capacity: desired_capacity, 48 | ) 49 | cluster_resource_manager.trigger_capacity_update(current_asg.desired_capacity, desired_capacity) 50 | @logger.info "#{log_prefix} Updated desired_capacity to #{desired_capacity}" 51 | end 52 | rescue => e 53 | AutoScaler.error_logger.error(e) 54 | end 55 | 56 | def cluster_resource_manager 57 | @cluster_resource_manager ||= EcsDeploy::AutoScaler::ClusterResourceManager.new( 58 | region: region, 59 | cluster: cluster, 60 
| service_configs: service_configs, 61 | capacity_based_on: "instances", 62 | logger: @logger, 63 | ) 64 | end 65 | 66 | # NOTE: InstanceDrainer calls this method when it receives spot instance interruption warnings 67 | def detach_instances(instance_ids:, should_decrement_desired_capacity:) 68 | return if instance_ids.empty? 69 | 70 | # detach only detachable instances 71 | detachable_instance_ids = instance_ids & describe_detachable_instances.map(&:instance_id) 72 | 73 | detachable_instance_ids.each_slice(MAX_DETACHABLE_INSTANCE_COUNT) do |ids| 74 | client.detach_instances( 75 | auto_scaling_group_name: name, 76 | instance_ids: ids, 77 | should_decrement_desired_capacity: should_decrement_desired_capacity, 78 | ) 79 | end 80 | 81 | @logger.info "#{log_prefix} Detached instances from ASG: #{instance_ids.inspect}" 82 | end 83 | 84 | private 85 | 86 | def decrease_desired_capacity(count) 87 | container_instance_arns_in_service = cluster_resource_manager.fetch_container_instance_arns_in_service 88 | container_instances_in_cluster = cluster_resource_manager.fetch_container_instances_in_cluster 89 | auto_scaling_group_instances = describe_detachable_instances 90 | deregisterable_instances = container_instances_in_cluster.select do |i| 91 | i.pending_tasks_count == 0 && 92 | !running_essential_task?(i, container_instance_arns_in_service) && 93 | auto_scaling_group_instances.any? 
{|instance| instance.instance_id == i.ec2_instance_id } 94 | end 95 | 96 | @logger.info "#{log_prefix} Fetched deregisterable instances: #{deregisterable_instances.map(&:ec2_instance_id).inspect}" 97 | 98 | az_to_instance_count = auto_scaling_group_instances.each_with_object(Hash.new(0)) { |i, h| h[i.availability_zone] += 1 } 99 | az_to_deregisterable_instances = deregisterable_instances.group_by do |i| 100 | i.attributes.find { |a| a.name == "ecs.availability-zone" }.value 101 | end 102 | 103 | deregistered_instance_ids = [] 104 | prev_max_count = nil 105 | # Select instances to be deregistered keeping the balance of instance count per availability zone 106 | while deregistered_instance_ids.size < count 107 | max_count = az_to_instance_count.each_value.max 108 | break if max_count == prev_max_count # No more deregistable instances with keeping the balance 109 | 110 | azs = az_to_instance_count.select { |_, c| c == max_count }.keys 111 | azs.each do |az| 112 | instance = az_to_deregisterable_instances[az]&.pop 113 | next if instance.nil? 114 | begin 115 | cluster_resource_manager.deregister_container_instance(instance.container_instance_arn) 116 | deregistered_instance_ids << instance.ec2_instance_id 117 | az_to_instance_count[az] -= 1 118 | rescue EcsDeploy::AutoScaler::ClusterResourceManager::DeregisterContainerInstanceFailed 119 | end 120 | break if deregistered_instance_ids.size >= count 121 | end 122 | prev_max_count = max_count 123 | end 124 | 125 | @logger.info "#{log_prefix} Deregistered instances: #{deregistered_instance_ids.inspect}" 126 | 127 | detach_and_terminate_instances(deregistered_instance_ids) 128 | 129 | deregistered_instance_ids.size 130 | end 131 | 132 | def detach_and_terminate_instances(instance_ids) 133 | return if instance_ids.empty? 
134 | 135 | detach_instances( 136 | instance_ids: instance_ids, 137 | should_decrement_desired_capacity: true 138 | ) 139 | 140 | sleep 3 141 | 142 | ec2_client.terminate_instances(instance_ids: instance_ids) 143 | 144 | @logger.info "#{log_prefix} Terminated instances: #{instance_ids.inspect}" 145 | rescue => e 146 | AutoScaler.error_logger.error(e) 147 | end 148 | 149 | def detach_and_terminate_orphan_instances 150 | container_instance_ids = cluster_resource_manager.fetch_container_instances_in_cluster.map(&:ec2_instance_id) 151 | orphans = describe_detachable_instances.reject do |i| 152 | next true if container_instance_ids.include?(i.instance_id) 153 | end.map(&:instance_id) 154 | 155 | return if orphans.empty? 156 | 157 | targets = ec2_client.describe_instances(instance_ids: orphans).reservations.flat_map(&:instances).select do |i| 158 | (Time.now - i.launch_time) > 600 159 | end 160 | 161 | detach_and_terminate_instances(targets.map(&:instance_id)) 162 | rescue => e 163 | AutoScaler.error_logger.error(e) 164 | end 165 | 166 | def client 167 | Aws::AutoScaling::Client.new( 168 | access_key_id: EcsDeploy.config.access_key_id, 169 | secret_access_key: EcsDeploy.config.secret_access_key, 170 | region: region, 171 | logger: logger 172 | ) 173 | end 174 | 175 | def ec2_client 176 | Aws::EC2::Client.new( 177 | access_key_id: EcsDeploy.config.access_key_id, 178 | secret_access_key: EcsDeploy.config.secret_access_key, 179 | region: region, 180 | logger: logger 181 | ) 182 | end 183 | 184 | def describe_detachable_instances 185 | client.describe_auto_scaling_groups({ auto_scaling_group_names: [name] }).auto_scaling_groups[0].instances.reject do |i| 186 | # The lifecycle state of terminated instances becomes "Detaching", "Terminating", "Terminating:Wait", or "Terminating:Proceed", 187 | # and we can't detach instances in such a state. 
188 | i.lifecycle_state.start_with?("Terminating") || i.lifecycle_state == "Detaching" || 189 | # EC2 instance sometimes stays in Pending state for more than 10 minutes 190 | i.lifecycle_state == "Pending" 191 | end 192 | end 193 | 194 | def running_essential_task?(instance, container_instance_arns_in_service) 195 | return false if instance.running_tasks_count == 0 196 | 197 | container_instance_arns_in_service.include?(instance.container_instance_arn) 198 | end 199 | 200 | def log_prefix 201 | "[#{self.class.to_s.sub(/\AEcsDeploy::AutoScaler::/, "")} #{name} #{region}]" 202 | end 203 | end 204 | end 205 | end 206 | -------------------------------------------------------------------------------- /lib/ecs_deploy/auto_scaler/cluster_resource_manager.rb: -------------------------------------------------------------------------------- 1 | require "timeout" 2 | 3 | require "aws-sdk-ecs" 4 | 5 | module EcsDeploy 6 | module AutoScaler 7 | class ClusterResourceManager 8 | class DeregisterContainerInstanceFailed < StandardError; end 9 | 10 | MAX_DESCRIBABLE_SERVICE_COUNT = 10 11 | 12 | def initialize(region:, cluster:, service_configs:, logger: nil, capacity_based_on:) 13 | @region = region 14 | @cluster = cluster 15 | @logger = logger 16 | @service_configs = service_configs 17 | @capacity_based_on = capacity_based_on 18 | if @capacity_based_on != "instances" && @capacity_based_on != "vCPUs" 19 | raise ArgumentError, 'capacity_based_on should be either "instances" or "vCPUs"' 20 | end 21 | 22 | @mutex = Mutex.new 23 | @resource = ConditionVariable.new 24 | @used_capacity = @service_configs.sum { |s| s.desired_count * s.required_capacity } 25 | @capacity = calculate_active_instance_capacity 26 | end 27 | 28 | def acquire(capacity, timeout: nil) 29 | @mutex.synchronize do 30 | @logger&.debug("#{log_prefix} Trying to acquire #{capacity} capacity (capacity: #{@capacity}, used_capacity: #{@used_capacity})") 31 | Timeout.timeout(timeout) do 32 | while @capacity - @used_capacity < 
capacity 33 | @resource.wait(@mutex) 34 | end 35 | end 36 | @used_capacity += capacity 37 | @logger&.debug("#{log_prefix} Acquired #{capacity} capacity (capacity: #{@capacity}, used_capacity: #{@used_capacity})") 38 | end 39 | true 40 | rescue Timeout::Error 41 | false 42 | end 43 | 44 | def release(capacity) 45 | @mutex.synchronize do 46 | @used_capacity -= capacity 47 | @resource.broadcast 48 | end 49 | @logger&.debug("#{log_prefix} Released #{capacity} capacity (capacity: #{@capacity}, used_capacity: #{@used_capacity})") 50 | true 51 | end 52 | 53 | def fetch_container_instances_in_cluster 54 | cl = ecs_client 55 | resp = cl.list_container_instances(cluster: @cluster) 56 | if resp.container_instance_arns.empty? 57 | [] 58 | else 59 | resp.flat_map do |resp| 60 | cl.describe_container_instances(cluster: @cluster, container_instances: resp.container_instance_arns).container_instances 61 | end 62 | end 63 | end 64 | 65 | def fetch_container_instance_arns_in_service 66 | task_groups = @service_configs.map { |s| "service:#{s.name}" } 67 | ecs_client.list_container_instances(cluster: @cluster, filter: "task:group in [#{task_groups.join(",")}]").flat_map(&:container_instance_arns) 68 | end 69 | 70 | def deregister_container_instance(container_instance_arn) 71 | ecs_client.deregister_container_instance(cluster: @cluster, container_instance: container_instance_arn, force: true) 72 | rescue Aws::ECS::Errors::InvalidParameterException 73 | raise DeregisterContainerInstanceFailed 74 | end 75 | 76 | def trigger_capacity_update(old_desired_capacity, new_desired_capacity, interval: 5, wait_until_capacity_updated: false) 77 | return if new_desired_capacity == old_desired_capacity 78 | 79 | th = Thread.new do 80 | @logger&.info "#{log_prefix} Updating capacity: #{old_desired_capacity} -> #{new_desired_capacity}" 81 | Timeout.timeout(180) do 82 | until @capacity == new_desired_capacity || 83 | (new_desired_capacity > old_desired_capacity && @capacity > new_desired_capacity) || 84 
| (new_desired_capacity < old_desired_capacity && @capacity < new_desired_capacity) 85 | @mutex.synchronize do 86 | @capacity = calculate_active_instance_capacity 87 | @resource.broadcast 88 | rescue => e 89 | AutoScaler.error_logger.warn("#{log_prefix} `#{__method__}': #{e} (#{e.class})") 90 | end 91 | 92 | sleep interval 93 | end 94 | @logger&.info "#{log_prefix} updated capacity to #{@capacity}" 95 | end 96 | rescue Timeout::Error => e 97 | msg = "#{log_prefix} `#{__method__}': #{e} (#{e.class})" 98 | if @capacity_based_on == "vCPUs" 99 | # Timeout::Error sometimes occurs. 100 | # For example, the following case never meets the condition of `until` 101 | # * old_desired_capacity is 102 102 | # * new_desired_capacity is 101 103 | # * all instances have 2 vCPUs 104 | AutoScaler.error_logger.warn(msg) 105 | else 106 | AutoScaler.error_logger.error(msg) 107 | end 108 | end 109 | 110 | if wait_until_capacity_updated 111 | @logger&.info "#{log_prefix} Waiting for the number of active instances to reach #{new_desired_capacity} (from #{old_desired_capacity})" 112 | th.join 113 | end 114 | end 115 | 116 | def calculate_active_instance_capacity 117 | cl = ecs_client 118 | 119 | if @capacity_based_on == "instances" 120 | return cl.list_container_instances(cluster: @cluster, status: "ACTIVE").sum do |resp| 121 | resp.container_instance_arns.size 122 | end 123 | end 124 | 125 | total_cpu = cl.list_container_instances(cluster: @cluster, status: "ACTIVE").sum do |resp| 126 | next 0 if resp.container_instance_arns.empty? 
127 | ecs_client.describe_container_instances( 128 | cluster: @cluster, 129 | container_instances: resp.container_instance_arns, 130 | ).container_instances.sum { |ci| ci.registered_resources.find { |r| r.name == "CPU" }.integer_value } 131 | end 132 | 133 | total_cpu / 1024 134 | end 135 | 136 | private 137 | 138 | def ecs_client 139 | Aws::ECS::Client.new( 140 | access_key_id: EcsDeploy.config.access_key_id, 141 | secret_access_key: EcsDeploy.config.secret_access_key, 142 | region: @region, 143 | logger: @logger, 144 | ) 145 | end 146 | 147 | def log_prefix 148 | "[#{self.class.to_s.gsub(/\AEcsDeploy::AutoScaler::/, "")} #{@region} #{@cluster}]" 149 | end 150 | end 151 | end 152 | end 153 | -------------------------------------------------------------------------------- /lib/ecs_deploy/auto_scaler/config_base.rb: -------------------------------------------------------------------------------- 1 | module EcsDeploy 2 | module AutoScaler 3 | module ConfigBase 4 | def initialize(attributes = {}, logger) 5 | attributes.each do |key, val| 6 | send("#{key}=", val) 7 | end 8 | @logger = logger 9 | end 10 | 11 | def logger 12 | @logger 13 | end 14 | end 15 | end 16 | end 17 | -------------------------------------------------------------------------------- /lib/ecs_deploy/auto_scaler/instance_drainer.rb: -------------------------------------------------------------------------------- 1 | require "aws-sdk-ec2" 2 | require "aws-sdk-ecs" 3 | require "aws-sdk-sqs" 4 | 5 | require "ecs_deploy" 6 | 7 | module EcsDeploy 8 | module AutoScaler 9 | class InstanceDrainer 10 | def initialize(auto_scaling_group_configs:, spot_fleet_request_configs:, logger:) 11 | @auto_scaling_group_configs = auto_scaling_group_configs || [] 12 | @spot_fleet_request_configs = spot_fleet_request_configs || [] 13 | @logger = logger 14 | @stop = false 15 | end 16 | 17 | def poll_spot_instance_interruption_warnings(queue_url) 18 | @logger.debug "Start polling spot instance interruption warnings of 
#{queue_url}" 19 | 20 | # cf. https://docs.aws.amazon.com/general/latest/gr/rande.html#sqs_region 21 | region = URI.parse(queue_url).host.split(".")[1] 22 | 23 | poller = Aws::SQS::QueuePoller.new(queue_url, client: sqs_client(region)) 24 | poller.before_request do |stats| 25 | throw :stop_polling if @stop 26 | end 27 | 28 | until @stop 29 | begin 30 | poller.poll(max_number_of_messages: 10, visibility_timeout: 15) do |messages, _| 31 | instance_ids = messages.map do |msg| 32 | JSON.parse(msg.body).dig("detail", "instance-id") 33 | end 34 | 35 | config_to_instance_ids = build_config_to_instance_ids(instance_ids, region) 36 | set_instance_state_to_draining(config_to_instance_ids, region) 37 | # Detach the instances to launch other instances 38 | detach_instances_from_auto_scaling_groups(config_to_instance_ids, region) 39 | end 40 | rescue => e 41 | AutoScaler.error_logger.error(e) 42 | end 43 | end 44 | 45 | @logger.debug "Stop polling spot instance interruption warnings of #{queue_url}" 46 | end 47 | 48 | def stop 49 | @stop = true 50 | end 51 | 52 | private 53 | 54 | def build_config_to_instance_ids(instance_ids, region) 55 | config_to_instance_ids = Hash.new{ |h, k| h[k] = [] } 56 | ec2_client(region).describe_instances(instance_ids: instance_ids).each do |resp| 57 | resp.reservations.each do |reservation| 58 | reservation.instances.each do |i| 59 | sfr_id = i.tags.find { |t| t.key == "aws:ec2spot:fleet-request-id" }&.value 60 | if sfr_id 61 | config = @spot_fleet_request_configs.find { |c| c.id == sfr_id && c.region == region } 62 | config_to_instance_ids[config] << i.instance_id if config 63 | next 64 | end 65 | 66 | asg_name = i.tags.find { |t| t.key == "aws:autoscaling:groupName" }&.value 67 | if asg_name 68 | config = @auto_scaling_group_configs.find { |c| c.name == asg_name && c.region == region } 69 | config_to_instance_ids[config] << i.instance_id if config 70 | end 71 | end 72 | end 73 | end 74 | 75 | config_to_instance_ids 76 | end 77 | 78 | def 
set_instance_state_to_draining(config_to_instance_ids, region) 79 | cl = ecs_client(region) 80 | config_to_instance_ids.each do |config, instance_ids| 81 | if config.disable_draining == true || config.disable_draining == "true" 82 | @logger.info "Skipped draining instances: region: #{region}, cluster: #{config.cluster}, instance_ids: #{instance_ids.inspect}" 83 | next 84 | end 85 | 86 | arns = cl.list_container_instances( 87 | cluster: config.cluster, 88 | filter: "ec2InstanceId in [#{instance_ids.join(",")}]", 89 | ).container_instance_arns 90 | 91 | if instance_ids.size != arns.size 92 | AutoScaler.error_logger.warn("The number of ARNs differs from the number of instance IDs: instance_ids: #{instance_ids.inspect}, container_instance_arns: #{arns.inspect}") 93 | end 94 | next if arns.empty? 95 | 96 | cl.update_container_instances_state( 97 | cluster: config.cluster, 98 | container_instances: arns, 99 | status: "DRAINING", 100 | ) 101 | @logger.info "Draining instances: region: #{region}, cluster: #{config.cluster}, instance_ids: #{instance_ids.inspect}, container_instance_arns: #{arns.inspect}" 102 | end 103 | end 104 | 105 | def detach_instances_from_auto_scaling_groups(config_to_instance_ids, region) 106 | @auto_scaling_group_configs.each do |config| 107 | config.detach_instances(instance_ids: config_to_instance_ids[config], should_decrement_desired_capacity: false) 108 | end 109 | end 110 | 111 | def ec2_client(region) 112 | Aws::EC2::Client.new( 113 | access_key_id: EcsDeploy.config.access_key_id, 114 | secret_access_key: EcsDeploy.config.secret_access_key, 115 | region: region, 116 | logger: @logger, 117 | ) 118 | end 119 | 120 | def ecs_client(region) 121 | Aws::ECS::Client.new( 122 | access_key_id: EcsDeploy.config.access_key_id, 123 | secret_access_key: EcsDeploy.config.secret_access_key, 124 | region: region, 125 | logger: @logger, 126 | ) 127 | end 128 | 129 | def sqs_client(region) 130 | Aws::SQS::Client.new( 131 | access_key_id: 
EcsDeploy.config.access_key_id, 132 | secret_access_key: EcsDeploy.config.secret_access_key, 133 | region: region, 134 | logger: @logger, 135 | ) 136 | end 137 | end 138 | end 139 | end 140 | -------------------------------------------------------------------------------- /lib/ecs_deploy/auto_scaler/service_config.rb: -------------------------------------------------------------------------------- 1 | require "aws-sdk-ecs" 2 | require "ecs_deploy" 3 | require "ecs_deploy/auto_scaler/config_base" 4 | require "ecs_deploy/auto_scaler/trigger_config" 5 | 6 | module EcsDeploy 7 | module AutoScaler 8 | SERVICE_CONFIG_ATTRIBUTES = %i(name cluster region step max_task_count min_task_count idle_time scheduled_min_task_count cooldown_time_for_reach_max upscale_triggers downscale_triggers desired_count required_capacity) 9 | ServiceConfig = Struct.new(*SERVICE_CONFIG_ATTRIBUTES) do 10 | include ConfigBase 11 | 12 | MAX_DESCRIBABLE_TASK_COUNT = 100 13 | 14 | def initialize(attributes = {}, logger) 15 | super 16 | self.idle_time ||= 60 17 | self.max_task_count = Array(max_task_count) 18 | self.upscale_triggers = upscale_triggers.to_a.map do |t| 19 | TriggerConfig.new({"region" => region, "step" => step}.merge(t), logger) 20 | end 21 | self.downscale_triggers = downscale_triggers.to_a.map do |t| 22 | TriggerConfig.new({"region" => region, "step" => step}.merge(t), logger) 23 | end 24 | self.max_task_count.sort! 25 | self.desired_count = fetch_service.desired_count 26 | self.required_capacity ||= 1 27 | @reach_max_at = nil 28 | @last_updated_at = nil 29 | @logger = logger 30 | end 31 | 32 | def adjust_desired_count(cluster_resource_manager) 33 | if idle? 34 | @logger.debug "#{name} is idling" 35 | return 36 | end 37 | 38 | difference = 0 39 | upscale_triggers.each do |trigger| 40 | next if difference >= trigger.step 41 | 42 | if trigger.match? 
43 | @logger.info "#{log_prefix} Firing upscale trigger by #{trigger.alarm_name} #{trigger.state}" 44 | difference = trigger.step 45 | end 46 | end 47 | 48 | if desired_count > current_min_task_count 49 | downscale_triggers.each do |trigger| 50 | next if difference > 0 && !trigger.prioritized_over_upscale_triggers? 51 | next unless trigger.match? 52 | 53 | @logger.info "#{log_prefix} Firing downscale trigger by #{trigger.alarm_name} #{trigger.state}" 54 | difference = [difference, -trigger.step].min 55 | end 56 | end 57 | 58 | if current_min_task_count > desired_count + difference 59 | difference = current_min_task_count - desired_count 60 | end 61 | 62 | if difference >= 0 && desired_count > max_task_count.max 63 | difference = max_task_count.max - desired_count 64 | end 65 | 66 | if difference != 0 67 | update_service(difference, cluster_resource_manager) 68 | end 69 | end 70 | 71 | def wait_until_desired_count_updated 72 | @increase_desired_count_thread&.join 73 | rescue => e 74 | AutoScaler.error_logger.warn("`#{__method__}': #{e} (#{e.class})") 75 | ensure 76 | @increase_desired_count_thread = nil 77 | end 78 | 79 | private 80 | 81 | def client 82 | Aws::ECS::Client.new( 83 | access_key_id: EcsDeploy.config.access_key_id, 84 | secret_access_key: EcsDeploy.config.secret_access_key, 85 | region: region, 86 | logger: logger 87 | ) 88 | end 89 | 90 | def idle? 91 | return false unless @last_updated_at 92 | 93 | diff = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second) - @last_updated_at 94 | diff < idle_time 95 | end 96 | 97 | def current_min_task_count 98 | return min_task_count if scheduled_min_task_count.nil? || scheduled_min_task_count.empty? 99 | 100 | scheduled_min_task_count.find(-> { {"count" => min_task_count} }) { |s| 101 | from = Time.parse(s["from"]) 102 | to = Time.parse(s["to"]) 103 | (from..to).cover?(Time.now) 104 | }["count"] 105 | end 106 | 107 | def overheat? 
108 | return false unless @reach_max_at 109 | (Process.clock_gettime(Process::CLOCK_MONOTONIC, :second) - @reach_max_at) > cooldown_time_for_reach_max 110 | end 111 | 112 | def fetch_service 113 | res = client.describe_services(cluster: cluster, services: [name]) 114 | raise "Service \"#{name}\" is not found" if res.services.empty? 115 | res.services[0] 116 | rescue => e 117 | AutoScaler.error_logger.error(e) 118 | end 119 | 120 | def update_service(difference, cluster_resource_manager) 121 | next_desired_count = desired_count + difference 122 | current_level = max_task_level(desired_count) 123 | next_level = max_task_level(next_desired_count) 124 | if current_level < next_level && overheat? # next max 125 | level = next_level 126 | @reach_max_at = nil 127 | @logger.info "#{log_prefix} Service is overheated, uses next max count" 128 | elsif current_level < next_level && !overheat? # wait cooldown 129 | level = current_level 130 | now = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second) 131 | @reach_max_at ||= now 132 | @logger.info "#{log_prefix} Service waiting for cooldown period to elapse #{(now - @reach_max_at).to_i}sec" 133 | elsif current_level == next_level && next_desired_count >= max_task_count[current_level] # reach current max 134 | level = current_level 135 | now = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second) 136 | @reach_max_at ||= now 137 | @logger.info "#{log_prefix} Service waiting for cooldown period to elapse #{(now - @reach_max_at).to_i}sec" 138 | if next_desired_count > max_task_count[current_level] && current_level == max_task_count.size - 1 139 | @logger.warn "#{log_prefix} Desired count has reached the maximum value and couldn't be increased" 140 | end 141 | elsif current_level == next_level && next_desired_count < max_task_count[current_level] 142 | level = current_level 143 | @reach_max_at = nil 144 | @logger.info "#{log_prefix} Service has finished cooling down" 145 | elsif current_level > next_level 146 | level = 
next_level 147 | @reach_max_at = nil 148 | @logger.info "#{log_prefix} Service has finished cooling down" 149 | end 150 | 151 | next_desired_count = [next_desired_count, max_task_count[level]].min 152 | if next_desired_count > desired_count 153 | increase_desired_count(next_desired_count - desired_count, cluster_resource_manager) 154 | else 155 | decrease_desired_count(desired_count - next_desired_count, cluster_resource_manager) 156 | end 157 | 158 | @last_updated_at = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second) 159 | @logger.info "#{log_prefix} Updated desired_count to #{next_desired_count}" 160 | rescue => e 161 | AutoScaler.error_logger.error(e) 162 | end 163 | 164 | def increase_desired_count(by, cluster_resource_manager) 165 | applied_desired_count = desired_count 166 | self.desired_count += by 167 | 168 | wait_until = Process.clock_gettime(Process::CLOCK_MONOTONIC) + 180 169 | @increase_desired_count_thread = Thread.new do 170 | cl = client 171 | by.times do 172 | timeout = wait_until - Process.clock_gettime(Process::CLOCK_MONOTONIC) 173 | break if timeout <= 0 174 | break unless cluster_resource_manager.acquire(required_capacity, timeout: timeout) 175 | begin 176 | cl.update_service(cluster: cluster, service: name, desired_count: applied_desired_count + 1) 177 | applied_desired_count += 1 178 | rescue => e 179 | cluster_resource_manager.release(required_capacity) 180 | AutoScaler.error_logger.error(e) 181 | break 182 | end 183 | end 184 | 185 | if applied_desired_count != desired_count 186 | self.desired_count = applied_desired_count 187 | @logger.info "#{log_prefix} Failed to update service and set desired_count to #{desired_count}" 188 | end 189 | end 190 | end 191 | 192 | def decrease_desired_count(by, cluster_resource_manager) 193 | cl = client 194 | running_task_arns = cl.list_tasks(cluster: cluster, service_name: name, desired_status: "RUNNING").flat_map(&:task_arns) 195 | 196 | cl.update_service(cluster: cluster, service: name, 
desired_count: desired_count - by) 197 | 198 | cl.wait_until(:services_stable, cluster: cluster, services: [name]) do |w| 199 | w.before_wait do 200 | @logger.debug "#{log_prefix} waiting for service to stabilize" 201 | end 202 | end 203 | 204 | stopping_task_arns = running_task_arns - cl.list_tasks(cluster: cluster, service_name: name, desired_status: "RUNNING").flat_map(&:task_arns) 205 | stopping_task_arns.each_slice(MAX_DESCRIBABLE_TASK_COUNT) do |arns| 206 | cl.wait_until(:tasks_stopped, cluster: cluster, tasks: arns) do |w| 207 | w.before_wait do 208 | @logger.debug "#{log_prefix} waiting for tasks to finish stopping" 209 | end 210 | end 211 | end 212 | 213 | cluster_resource_manager.release(required_capacity * by) 214 | self.desired_count -= by 215 | end 216 | 217 | def max_task_level(count) 218 | max_task_count.index { |i| count <= i } || max_task_count.size - 1 219 | end 220 | 221 | def log_prefix 222 | "[#{self.class.to_s.sub(/\AEcsDeploy::AutoScaler::/, "")} #{name} #{region}]" 223 | end 224 | end 225 | end 226 | end 227 | -------------------------------------------------------------------------------- /lib/ecs_deploy/auto_scaler/spot_fleet_request_config.rb: -------------------------------------------------------------------------------- 1 | require "json" 2 | require "timeout" 3 | 4 | require "aws-sdk-ec2" 5 | require "ecs_deploy" 6 | require "ecs_deploy/auto_scaler/config_base" 7 | require "ecs_deploy/auto_scaler/cluster_resource_manager" 8 | 9 | module EcsDeploy 10 | module AutoScaler 11 | SpotFleetRequestConfig = Struct.new(:id, :region, :cluster, :buffer, :service_configs, :disable_draining) do 12 | include ConfigBase 13 | 14 | def initialize(attributes = {}, logger) 15 | attributes = attributes.dup 16 | services = attributes.delete("services") 17 | super(attributes, logger) 18 | self.service_configs = services.map do |s| 19 | ServiceConfig.new(s.merge("cluster" => cluster, "region" => region), logger) 20 | end 21 | end 22 | 23 | def name 24 | id 
25 | end 26 | 27 | def update_desired_capacity(required_capacity) 28 | terminate_orphan_instances 29 | 30 | desired_capacity = (required_capacity + buffer.to_f).ceil 31 | 32 | request_config = ec2_client.describe_spot_fleet_requests( 33 | spot_fleet_request_ids: [id] 34 | ).spot_fleet_request_configs[0].spot_fleet_request_config 35 | 36 | return if desired_capacity == request_config.target_capacity 37 | 38 | ec2_client.modify_spot_fleet_request(spot_fleet_request_id: id, target_capacity: desired_capacity) 39 | 40 | cluster_resource_manager.trigger_capacity_update( 41 | request_config.target_capacity, 42 | desired_capacity, 43 | # Wait until the capacity is updated to prevent the process from terminating before container draining is completed 44 | wait_until_capacity_updated: desired_capacity < request_config.target_capacity, 45 | ) 46 | @logger.info "#{log_prefix} Updated desired_capacity to #{desired_capacity}" 47 | rescue => e 48 | AutoScaler.error_logger.error(e) 49 | end 50 | 51 | def cluster_resource_manager 52 | @cluster_resource_manager ||= EcsDeploy::AutoScaler::ClusterResourceManager.new( 53 | region: region, 54 | cluster: cluster, 55 | service_configs: service_configs, 56 | capacity_based_on: "vCPUs", 57 | logger: @logger, 58 | ) 59 | end 60 | 61 | private 62 | 63 | def terminate_orphan_instances 64 | container_instance_ids = cluster_resource_manager.fetch_container_instances_in_cluster.map(&:ec2_instance_id) 65 | spot_fleet_instances = ec2_client.describe_spot_fleet_instances(spot_fleet_request_id: id).active_instances 66 | orphans = spot_fleet_instances.reject { |i| container_instance_ids.include?(i.instance_id) }.map(&:instance_id) 67 | 68 | return if orphans.empty? 69 | 70 | running_instances = ec2_client.describe_instances( 71 | instance_ids: orphans, 72 | filters: [{ name: "instance-state-name", values: ["running"] }], 73 | ).reservations.flat_map(&:instances) 74 | # instances which have just launched might not be registered to the cluster yet. 
75 | instance_ids = running_instances.select { |i| (Time.now - i.launch_time) > 600 }.map(&:instance_id) 76 | 77 | return if instance_ids.empty? 78 | 79 | # Terminate orpahns without canceling spot instance request 80 | # because we can't terminate canceled spot instances by decreasing the capacity 81 | ec2_client.terminate_instances(instance_ids: instance_ids) 82 | 83 | @logger.info "#{log_prefix} Terminated instances: #{instance_ids.inspect}" 84 | rescue => e 85 | AutoScaler.error_logger.error(e) 86 | end 87 | 88 | def ec2_client 89 | Aws::EC2::Client.new( 90 | access_key_id: EcsDeploy.config.access_key_id, 91 | secret_access_key: EcsDeploy.config.secret_access_key, 92 | region: region, 93 | logger: logger, 94 | ) 95 | end 96 | 97 | def log_prefix 98 | "[#{self.class.to_s.sub(/\AEcsDeploy::AutoScaler::/, "")} #{name} #{region}]" 99 | end 100 | end 101 | end 102 | end 103 | -------------------------------------------------------------------------------- /lib/ecs_deploy/auto_scaler/trigger_config.rb: -------------------------------------------------------------------------------- 1 | require "aws-sdk-cloudwatch" 2 | require "ecs_deploy" 3 | require "ecs_deploy/auto_scaler" 4 | require "ecs_deploy/auto_scaler/config_base" 5 | 6 | module EcsDeploy 7 | module AutoScaler 8 | TriggerConfig = Struct.new(:alarm_name, :region, :state, :step, :prioritized_over_upscale_triggers) do 9 | include ConfigBase 10 | 11 | def match? 12 | fetch_alarm.state_value == state 13 | end 14 | 15 | def prioritized_over_upscale_triggers? 16 | !!prioritized_over_upscale_triggers 17 | end 18 | 19 | private 20 | 21 | def client 22 | Aws::CloudWatch::Client.new( 23 | access_key_id: EcsDeploy.config.access_key_id, 24 | secret_access_key: EcsDeploy.config.secret_access_key, 25 | region: region, 26 | logger: logger 27 | ) 28 | end 29 | 30 | def fetch_alarm 31 | res = client.describe_alarms(alarm_names: [alarm_name]) 32 | 33 | raise "Alarm \"#{alarm_name}\" is not found" if res.metric_alarms.empty? 
34 | res.metric_alarms[0].tap do |alarm| 35 | AutoScaler.logger.debug("#{alarm.alarm_name} state is #{alarm.state_value}") 36 | end 37 | rescue => e 38 | AutoScaler.error_logger.error(e) 39 | end 40 | end 41 | end 42 | end 43 | -------------------------------------------------------------------------------- /lib/ecs_deploy/capistrano.rb: -------------------------------------------------------------------------------- 1 | require 'ecs_deploy' 2 | require 'ecs_deploy/instance_fluctuation_manager' 3 | 4 | namespace :ecs do 5 | task :configure do 6 | EcsDeploy.configure do |c| 7 | c.log_level = fetch(:ecs_log_level) if fetch(:ecs_log_level) 8 | c.deploy_wait_timeout = fetch(:ecs_deploy_wait_timeout) if fetch(:ecs_deploy_wait_timeout) 9 | c.ecs_service_role = fetch(:ecs_service_role) if fetch(:ecs_service_role) 10 | c.default_region = Array(fetch(:ecs_region))[0] if fetch(:ecs_region) 11 | c.ecs_wait_until_services_stable_max_attempts = fetch(:ecs_wait_until_services_stable_max_attempts) if fetch(:ecs_wait_until_services_stable_max_attempts) 12 | c.ecs_wait_until_services_stable_delay = fetch(:ecs_wait_until_services_stable_delay) if fetch(:ecs_wait_until_services_stable_delay) 13 | c.ecs_client_params = fetch(:ecs_client_params) if fetch(:ecs_client_params) 14 | end 15 | 16 | if ENV["TARGET_CLUSTER"] 17 | set :target_cluster, ENV["TARGET_CLUSTER"].split(",").map(&:strip) 18 | end 19 | if ENV["TARGET_TASK_DEFINITION"] 20 | set :target_task_definition, ENV["TARGET_TASK_DEFINITION"].split(",").map(&:strip) 21 | end 22 | end 23 | 24 | task register_task_definition: [:configure] do 25 | if fetch(:ecs_tasks) 26 | regions = Array(fetch(:ecs_region)) 27 | regions = [EcsDeploy.config.default_region] if regions.empty? 
28 | ecs_registered_tasks = {} 29 | regions.each do |region| 30 | ecs_registered_tasks[region] = {} 31 | fetch(:ecs_tasks).each do |t| 32 | task_definition = EcsDeploy::TaskDefinition.new( 33 | region: region, 34 | task_definition_name: t[:name], 35 | container_definitions: t[:container_definitions], 36 | task_role_arn: t[:task_role_arn], 37 | execution_role_arn: t[:execution_role_arn], 38 | volumes: t[:volumes], 39 | network_mode: t[:network_mode], 40 | placement_constraints: t[:placement_constraints], 41 | requires_compatibilities: t[:requires_compatibilities], 42 | cpu: t[:cpu], 43 | memory: t[:memory], 44 | tags: t[:tags], 45 | runtime_platform: t[:runtime_platform], 46 | ) 47 | result = task_definition.register 48 | ecs_registered_tasks[region][t[:name]] = result 49 | end 50 | end 51 | 52 | set :ecs_registered_tasks, ecs_registered_tasks 53 | end 54 | end 55 | 56 | task deploy_scheduled_task: [:configure, :register_task_definition] do 57 | if fetch(:ecs_scheduled_tasks) 58 | regions = Array(fetch(:ecs_region)) 59 | regions = [EcsDeploy.config.default_region] if regions.empty? 
60 | regions.each do |r| 61 | fetch(:ecs_scheduled_tasks).each do |t| 62 | scheduled_task = EcsDeploy::ScheduledTask.new( 63 | region: r, 64 | cluster: t[:cluster] || fetch(:ecs_default_cluster), 65 | rule_name: t[:rule_name], 66 | schedule_expression: t[:schedule_expression], 67 | enabled: t[:enabled] != false, 68 | description: t[:description], 69 | target_id: t[:target_id], 70 | task_definition_name: t[:task_definition_name], 71 | network_configuration: t[:network_configuration], 72 | launch_type: t[:launch_type], 73 | platform_version: t[:platform_version], 74 | group: t[:group], 75 | revision: t[:revision], 76 | task_count: t[:task_count], 77 | role_arn: t[:role_arn], 78 | container_overrides: t[:container_overrides], 79 | ) 80 | scheduled_task.deploy 81 | end 82 | end 83 | end 84 | end 85 | 86 | task deploy: [:configure, :register_task_definition] do 87 | if fetch(:ecs_services) 88 | regions = Array(fetch(:ecs_region)) 89 | regions = [EcsDeploy.config.default_region] if regions.empty? 
90 | regions.each do |r| 91 | services = fetch(:ecs_services).map do |service| 92 | if fetch(:target_cluster) && fetch(:target_cluster).size > 0 93 | next unless fetch(:target_cluster).include?(service[:cluster]) 94 | end 95 | if fetch(:target_task_definition) && fetch(:target_task_definition).size > 0 96 | next unless fetch(:target_task_definition).include?(service[:task_definition_name]) 97 | end 98 | 99 | service_options = { 100 | region: r, 101 | cluster: service[:cluster] || fetch(:ecs_default_cluster), 102 | service_name: service[:name], 103 | task_definition_name: service[:task_definition_name], 104 | load_balancers: service[:load_balancers], 105 | desired_count: service[:desired_count], 106 | launch_type: service[:launch_type], 107 | network_configuration: service[:network_configuration], 108 | health_check_grace_period_seconds: service[:health_check_grace_period_seconds], 109 | delete: service[:delete], 110 | enable_ecs_managed_tags: service[:enable_ecs_managed_tags], 111 | tags: service[:tags], 112 | propagate_tags: service[:propagate_tags], 113 | enable_execute_command: service[:enable_execute_command], 114 | } 115 | service_options[:deployment_configuration] = service[:deployment_configuration] if service[:deployment_configuration] 116 | service_options[:placement_constraints] = service[:placement_constraints] if service[:placement_constraints] 117 | service_options[:placement_strategy] = service[:placement_strategy] if service[:placement_strategy] 118 | service_options[:capacity_provider_strategy] = service[:capacity_provider_strategy] if service[:capacity_provider_strategy] 119 | service_options[:scheduling_strategy] = service[:scheduling_strategy] if service[:scheduling_strategy] 120 | s = EcsDeploy::Service.new(**service_options) 121 | s.deploy 122 | s 123 | end 124 | EcsDeploy::Service.wait_all_running(services) 125 | end 126 | end 127 | end 128 | 129 | task rollback: [:configure] do 130 | if fetch(:ecs_services) 131 | regions = 
Array(fetch(:ecs_region)) 132 | regions = [EcsDeploy.config.default_region] if regions.empty? 133 | 134 | rollback_routes = {} 135 | regions.each do |r| 136 | services = fetch(:ecs_services).map do |service| 137 | if fetch(:target_cluster) && fetch(:target_cluster).size > 0 138 | next unless fetch(:target_cluster).include?(service[:cluster]) 139 | end 140 | if fetch(:target_task_definition) && fetch(:target_task_definition).size > 0 141 | next unless fetch(:target_task_definition).include?(service[:task_definition_name]) 142 | end 143 | 144 | task_definition_arns = EcsDeploy::TaskDefinition.new( 145 | region: r, 146 | task_definition_name: service[:task_definition_name] || service[:name], 147 | ).recent_task_definition_arns 148 | 149 | rollback_step = (ENV["STEP"] || 1).to_i 150 | 151 | current_task_definition_arn = EcsDeploy::Service.new( 152 | region: r, 153 | cluster: service[:cluster] || fetch(:ecs_default_cluster), 154 | service_name: service[:name], 155 | ).current_task_definition_arn 156 | 157 | unless (rollback_arn = rollback_routes[current_task_definition_arn]) 158 | current_arn_index = task_definition_arns.index do |arn| 159 | arn == current_task_definition_arn 160 | end 161 | 162 | rollback_arn = task_definition_arns[current_arn_index + rollback_step] 163 | 164 | rollback_routes[current_task_definition_arn] = rollback_arn 165 | end 166 | 167 | EcsDeploy.logger.info "#{current_task_definition_arn} -> #{rollback_arn}" 168 | 169 | raise "Past task_definition_arns is empty" unless rollback_arn 170 | 171 | service_options = { 172 | region: r, 173 | cluster: service[:cluster] || fetch(:ecs_default_cluster), 174 | service_name: service[:name], 175 | task_definition_name: rollback_arn, 176 | load_balancers: service[:load_balancers], 177 | desired_count: service[:desired_count], 178 | launch_type: service[:launch_type], 179 | network_configuration: service[:network_configuration], 180 | health_check_grace_period_seconds: 
service[:health_check_grace_period_seconds], 181 | } 182 | service_options[:deployment_configuration] = service[:deployment_configuration] if service[:deployment_configuration] 183 | service_options[:placement_constraints] = service[:placement_constraints] if service[:placement_constraints] 184 | service_options[:placement_strategy] = service[:placement_strategy] if service[:placement_strategy] 185 | service_options[:capacity_provider_strategy] = service[:capacity_provider_strategy] if service[:capacity_provider_strategy] 186 | s = EcsDeploy::Service.new(**service_options) 187 | s.deploy 188 | EcsDeploy::TaskDefinition.deregister(current_task_definition_arn, region: r) 189 | s 190 | end 191 | EcsDeploy::Service.wait_all_running(services) 192 | end 193 | end 194 | end 195 | 196 | task increase_instances_to_max_size: [:configure] do 197 | configs = fetch(:ecs_instance_fluctuation_manager_configs, []) 198 | unless configs.empty? 199 | regions = Array(fetch(:ecs_region)) 200 | regions = [EcsDeploy.config.default_region] if regions.empty? 201 | regions.each do |region| 202 | configs.each do |config| 203 | logger = config.fetch(:logger, EcsDeploy.logger) 204 | m = EcsDeploy::InstanceFluctuationManager.new( 205 | region: config[:region] || region, 206 | cluster: config[:cluster] || fetch(:ecs_default_cluster), 207 | auto_scaling_group_name: config[:auto_scaling_group_name], 208 | desired_capacity: config[:desired_capacity], 209 | logger: logger 210 | ) 211 | m.increase 212 | end 213 | end 214 | end 215 | end 216 | 217 | task terminate_redundant_instances: [:configure] do 218 | configs = fetch(:ecs_instance_fluctuation_manager_configs, []) 219 | unless configs.empty? 220 | regions = Array(fetch(:ecs_region)) 221 | regions = [EcsDeploy.config.default_region] if regions.empty? 
222 | regions.each do |region| 223 | configs.each do |config| 224 | logger = config.fetch(:logger, EcsDeploy.logger) 225 | m = EcsDeploy::InstanceFluctuationManager.new( 226 | region: config[:region] || region, 227 | cluster: config[:cluster] || fetch(:ecs_default_cluster), 228 | auto_scaling_group_name: config[:auto_scaling_group_name], 229 | desired_capacity: config[:desired_capacity], 230 | logger: logger 231 | ) 232 | m.decrease 233 | end 234 | end 235 | end 236 | end 237 | end 238 | -------------------------------------------------------------------------------- /lib/ecs_deploy/configuration.rb: -------------------------------------------------------------------------------- 1 | module EcsDeploy 2 | class Configuration 3 | attr_accessor \ 4 | :log_level, 5 | :access_key_id, 6 | :secret_access_key, 7 | :default_region, 8 | :deploy_wait_timeout, 9 | :ecs_service_role, 10 | :ecs_wait_until_services_stable_max_attempts, 11 | :ecs_wait_until_services_stable_delay, 12 | :ecs_client_params 13 | 14 | def initialize 15 | @log_level = :info 16 | @deploy_wait_timeout = 300 17 | # The following values are the default values of Aws::ECS::Waiters::ServicesStable 18 | @ecs_wait_until_services_stable_max_attempts = 40 19 | @ecs_wait_until_services_stable_delay = 15 20 | @ecs_client_params = {} 21 | end 22 | end 23 | end 24 | -------------------------------------------------------------------------------- /lib/ecs_deploy/instance_fluctuation_manager.rb: -------------------------------------------------------------------------------- 1 | require "aws-sdk-autoscaling" 2 | require "aws-sdk-ec2" 3 | require "aws-sdk-ecs" 4 | 5 | module EcsDeploy 6 | class InstanceFluctuationManager 7 | attr_reader :logger 8 | 9 | MAX_UPDATABLE_ECS_CONTAINER_COUNT = 10 10 | MAX_DETACHEABLE_EC2_INSTACE_COUNT = 20 11 | MAX_DESCRIBABLE_ECS_TASK_COUNT = 100 12 | 13 | def initialize(region:, cluster:, auto_scaling_group_name:, desired_capacity:, logger:) 14 | @region = region 15 | @cluster = cluster 16 
| @auto_scaling_group_name = auto_scaling_group_name 17 | @desired_capacity = desired_capacity 18 | @logger = logger 19 | end 20 | 21 | def increase 22 | asg = fetch_auto_scaling_group 23 | 24 | @logger.info("Increasing desired capacity of #{@auto_scaling_group_name}: #{asg.desired_capacity} => #{asg.max_size}") 25 | as_client.update_auto_scaling_group(auto_scaling_group_name: @auto_scaling_group_name, desired_capacity: asg.max_size) 26 | 27 | # Run in background because increasing instances may take time 28 | Thread.new do 29 | loop do 30 | cluster = ecs_client.describe_clusters(clusters: [@cluster]).clusters.first 31 | instance_count = cluster.registered_container_instances_count 32 | if instance_count == asg.max_size 33 | @logger.info("Succeeded in increasing instances!") 34 | break 35 | end 36 | @logger.info("Current registered instance count: #{instance_count}") 37 | sleep 5 38 | end 39 | end 40 | end 41 | 42 | def decrease 43 | asg = fetch_auto_scaling_group 44 | 45 | decrease_count = asg.desired_capacity - @desired_capacity 46 | if decrease_count <= 0 47 | @logger.info("The capacity is already #{asg.desired_capacity}") 48 | return 49 | end 50 | @logger.info("Decreasing desired capacity of #{@auto_scaling_group_name}: #{asg.desired_capacity} => #{@desired_capacity}") 51 | 52 | container_instances = ecs_client.list_container_instances(cluster: @cluster).flat_map do |resp| 53 | ecs_client.describe_container_instances( 54 | cluster: @cluster, 55 | container_instances: resp.container_instance_arns 56 | ).container_instances 57 | end 58 | 59 | # The status of ECS instances sometimes seems to remain 'DEREGISTERING' for a few minutes after they are terminated. 60 | container_instances.reject! 
{ |ci| ci.status == 'DEREGISTERING' } 61 | 62 | az_to_container_instances = container_instances.sort_by {|ci| - ci.running_tasks_count }.group_by do |ci| 63 | ci.attributes.find {|attribute| attribute.name == "ecs.availability-zone" }.value 64 | end 65 | if az_to_container_instances.empty? 66 | @logger.info("There are no instances to terminate.") 67 | return 68 | end 69 | 70 | target_container_instances = extract_target_container_instances(decrease_count, az_to_container_instances) 71 | 72 | @logger.info("running tasks: #{ecs_client.list_tasks(cluster: @cluster).task_arns.size}") 73 | all_running_task_arns = [] 74 | target_container_instances.map(&:container_instance_arn).each_slice(MAX_UPDATABLE_ECS_CONTAINER_COUNT) do |arns| 75 | @logger.info(arns) 76 | ecs_client.update_container_instances_state( 77 | cluster: @cluster, 78 | container_instances: arns, 79 | status: "DRAINING" 80 | ) 81 | arns.each do |arn| 82 | all_running_task_arns.concat(list_running_task_arns(arn)) 83 | end 84 | end 85 | 86 | stop_tasks_not_belonging_service(all_running_task_arns) 87 | wait_until_tasks_stopped(all_running_task_arns) 88 | 89 | instance_ids = target_container_instances.map(&:ec2_instance_id) 90 | terminate_instances(instance_ids) 91 | @logger.info("Succeeded in decreasing instances!") 92 | end 93 | 94 | private 95 | 96 | def aws_params 97 | { 98 | access_key_id: EcsDeploy.config.access_key_id, 99 | secret_access_key: EcsDeploy.config.secret_access_key, 100 | region: @region, 101 | logger: @logger 102 | }.reject do |_key, value| 103 | value.nil? 
104 | end 105 | end 106 | 107 | def as_client 108 | @as_client ||= Aws::AutoScaling::Client.new(aws_params) 109 | end 110 | 111 | def ec2_client 112 | @ec2_client ||= Aws::EC2::Client.new(aws_params) 113 | end 114 | 115 | def ecs_client 116 | @ecs_client ||= Aws::ECS::Client.new(aws_params.merge(EcsDeploy.config.ecs_client_params)) 117 | end 118 | 119 | def fetch_auto_scaling_group 120 | as_client.describe_auto_scaling_groups(auto_scaling_group_names: [@auto_scaling_group_name]).auto_scaling_groups.first 121 | end 122 | 123 | # Extract container instances to terminate considering AZ balance 124 | def extract_target_container_instances(decrease_count, az_to_container_instances) 125 | target_container_instances = [] 126 | decrease_count.times do 127 | @logger.debug do 128 | "AZ balance: #{az_to_container_instances.sort_by {|az, _| az }.map {|az, instances| [az, instances.size] }.to_h}" 129 | end 130 | az = az_to_container_instances.max_by {|_az, instances| instances.size }.first 131 | target_container_instances << az_to_container_instances[az].pop 132 | end 133 | @logger.info do 134 | "AZ balance: #{az_to_container_instances.sort_by {|az, _| az }.map {|az, instances| [az, instances.size] }.to_h}" 135 | end 136 | 137 | target_container_instances 138 | end 139 | 140 | # list tasks whose desired_status is "RUNNING" or 141 | # whoose desired_status is "STOPPED" but last_status is "RUNNING" on the ECS container 142 | def list_running_task_arns(container_instance_arn) 143 | running_tasks_arn = ecs_client.list_tasks(cluster: @cluster, container_instance: container_instance_arn).flat_map(&:task_arns) 144 | stopped_tasks_arn = ecs_client.list_tasks(cluster: @cluster, container_instance: container_instance_arn, desired_status: "STOPPED").flat_map(&:task_arns) 145 | stopped_running_task_arns = stopped_tasks_arn.each_slice(MAX_DESCRIBABLE_ECS_TASK_COUNT).flat_map do |arns| 146 | ecs_client.describe_tasks(cluster: @cluster, tasks: arns).tasks.select do |task| 147 | 
task.desired_status == "STOPPED" && task.last_status == "RUNNING" 148 | end 149 | end.map(&:task_arn) 150 | running_tasks_arn + stopped_running_task_arns 151 | end 152 | 153 | def wait_until_tasks_stopped(task_arns) 154 | @logger.info("All old tasks: #{task_arns.size}") 155 | task_arns.each_slice(MAX_DESCRIBABLE_ECS_TASK_COUNT).each do |arns| 156 | ecs_client.wait_until(:tasks_stopped, cluster: @cluster, tasks: arns) 157 | end 158 | @logger.info("All old tasks are stopped") 159 | end 160 | 161 | def stop_tasks_not_belonging_service(running_task_arns) 162 | @logger.info("Running tasks: #{running_task_arns.size}") 163 | unless running_task_arns.empty? 164 | running_task_arns.each_slice(MAX_DESCRIBABLE_ECS_TASK_COUNT).each do |arns| 165 | ecs_client.describe_tasks(cluster: @cluster, tasks: arns).tasks.each do |task| 166 | ecs_client.stop_task(cluster: @cluster, task: task.task_arn) if task.group.start_with?("family:") 167 | end 168 | end 169 | end 170 | end 171 | 172 | def terminate_instances(instance_ids) 173 | if instance_ids.empty? 
174 | @logger.info("There are no instances to terminate.") 175 | return 176 | end 177 | instance_ids.each_slice(MAX_DETACHEABLE_EC2_INSTACE_COUNT) do |ids| 178 | as_client.detach_instances( 179 | auto_scaling_group_name: @auto_scaling_group_name, 180 | instance_ids: ids, 181 | should_decrement_desired_capacity: true 182 | ) 183 | end 184 | 185 | ec2_client.terminate_instances(instance_ids: instance_ids) 186 | 187 | ec2_client.wait_until(:instance_terminated, instance_ids: instance_ids) do |w| 188 | w.before_wait do |attempts, response| 189 | @logger.info("Waiting for stopping all instances...#{attempts}") 190 | instances = response.reservations.flat_map(&:instances) 191 | instances.sort_by(&:instance_id).each do |instance| 192 | @logger.info("#{instance.instance_id}\t#{instance.state.name}") 193 | end 194 | end 195 | end 196 | end 197 | end 198 | end 199 | -------------------------------------------------------------------------------- /lib/ecs_deploy/scheduled_task.rb: -------------------------------------------------------------------------------- 1 | require 'aws-sdk-cloudwatchevents' 2 | require 'timeout' 3 | 4 | module EcsDeploy 5 | class ScheduledTask 6 | class PutTargetsFailure < StandardError; end 7 | 8 | attr_reader :cluster, :region, :schedule_rule_name 9 | 10 | def initialize( 11 | cluster:, rule_name:, schedule_expression:, enabled: true, description: nil, target_id: nil, 12 | task_definition_name:, revision: nil, task_count: nil, role_arn:, network_configuration: nil, launch_type: nil, platform_version: nil, group: nil, 13 | region: nil, container_overrides: nil 14 | ) 15 | @cluster = cluster 16 | @rule_name = rule_name 17 | @schedule_expression = schedule_expression 18 | @enabled = enabled 19 | @description = description 20 | @target_id = target_id || task_definition_name 21 | @task_definition_name = task_definition_name 22 | @task_count = task_count || 1 23 | @revision = revision 24 | @role_arn = role_arn 25 | @network_configuration = 
network_configuration 26 | @launch_type = launch_type || "EC2" 27 | @platform_version = platform_version 28 | @group = group 29 | region ||= EcsDeploy.config.default_region 30 | params ||= EcsDeploy.config.ecs_client_params 31 | @container_overrides = container_overrides 32 | 33 | @client = region ? Aws::ECS::Client.new(params.merge(region: region)) : Aws::ECS::Client.new(params) 34 | @region = @client.config.region 35 | @cloud_watch_events = Aws::CloudWatchEvents::Client.new(region: @region) 36 | end 37 | 38 | def deploy 39 | put_rule 40 | put_targets 41 | end 42 | 43 | private 44 | 45 | def cluster_arn 46 | cl = @client.describe_clusters(clusters: [@cluster]).clusters[0] 47 | if cl 48 | cl.cluster_arn 49 | end 50 | end 51 | 52 | def task_definition_arn 53 | suffix = @revision ? ":#{@revision}" : "" 54 | name = "#{@task_definition_name}#{suffix}" 55 | @client.describe_task_definition(task_definition: name).task_definition.task_definition_arn 56 | end 57 | 58 | def put_rule 59 | res = @cloud_watch_events.put_rule( 60 | name: @rule_name, 61 | schedule_expression: @schedule_expression, 62 | state: @enabled ? "ENABLED" : "DISABLED", 63 | description: @description, 64 | ) 65 | EcsDeploy.logger.info "created cloudwatch event rule [#{res.rule_arn}] [#{@region}] [#{Paint['OK', :green]}]" 66 | end 67 | 68 | def put_targets 69 | target = { 70 | id: @target_id, 71 | arn: cluster_arn, 72 | role_arn: @role_arn, 73 | ecs_parameters: { 74 | task_definition_arn: task_definition_arn, 75 | task_count: @task_count, 76 | network_configuration: @network_configuration, 77 | launch_type: @launch_type, 78 | platform_version: @platform_version, 79 | group: @group, 80 | }, 81 | } 82 | target[:ecs_parameters].compact! 83 | 84 | if @container_overrides 85 | target.merge!(input: { containerOverrides: @container_overrides }.to_json) 86 | end 87 | 88 | res = @cloud_watch_events.put_targets( 89 | rule: @rule_name, 90 | targets: [target] 91 | ) 92 | if res.failed_entry_count.zero? 
93 | EcsDeploy.logger.info "created cloudwatch event target [#{@target_id}] [#{@region}] [#{Paint['OK', :green]}]" 94 | else 95 | res.failed_entries.each do |entry| 96 | EcsDeploy.logger.error "failed to create cloudwatch event target [#{@region}] target_id=#{entry.target_id} error_code=#{entry.error_code} error_message=#{entry.error_message}" 97 | end 98 | raise PutTargetsFailure 99 | end 100 | end 101 | end 102 | end 103 | -------------------------------------------------------------------------------- /lib/ecs_deploy/service.rb: -------------------------------------------------------------------------------- 1 | require 'timeout' 2 | 3 | module EcsDeploy 4 | class Service 5 | CHECK_INTERVAL = 5 6 | MAX_DESCRIBE_SERVICES = 10 7 | 8 | class TooManyAttemptsError < StandardError; end 9 | 10 | attr_reader :cluster, :region, :service_name, :delete, :deploy_started_at 11 | 12 | def initialize( 13 | cluster:, service_name:, task_definition_name: nil, revision: nil, 14 | load_balancers: nil, 15 | desired_count: nil, deployment_configuration: {maximum_percent: 200, minimum_healthy_percent: 100}, 16 | launch_type: nil, 17 | placement_constraints: [], 18 | placement_strategy: [], 19 | capacity_provider_strategy: nil, 20 | network_configuration: nil, 21 | health_check_grace_period_seconds: nil, 22 | scheduling_strategy: 'REPLICA', 23 | enable_ecs_managed_tags: nil, 24 | tags: nil, 25 | propagate_tags: nil, 26 | region: nil, 27 | delete: false, 28 | enable_execute_command: false 29 | ) 30 | @cluster = cluster 31 | @service_name = service_name 32 | @task_definition_name = task_definition_name || service_name 33 | @load_balancers = load_balancers 34 | @desired_count = desired_count 35 | @deployment_configuration = deployment_configuration 36 | @launch_type = launch_type 37 | @placement_constraints = placement_constraints 38 | @placement_strategy = placement_strategy 39 | @capacity_provider_strategy = capacity_provider_strategy 40 | @network_configuration = network_configuration 
41 | @health_check_grace_period_seconds = health_check_grace_period_seconds 42 | @scheduling_strategy = scheduling_strategy 43 | @revision = revision 44 | @enable_ecs_managed_tags = enable_ecs_managed_tags 45 | @tags = tags 46 | @propagate_tags = propagate_tags 47 | @enable_execute_command = enable_execute_command 48 | 49 | @response = nil 50 | 51 | region ||= EcsDeploy.config.default_region 52 | params ||= EcsDeploy.config.ecs_client_params 53 | @client = region ? Aws::ECS::Client.new(params.merge(region: region)) : Aws::ECS::Client.new(params) 54 | @region = @client.config.region 55 | 56 | @delete = delete 57 | end 58 | 59 | def current_task_definition_arn 60 | res = @client.describe_services(cluster: @cluster, services: [@service_name]) 61 | res.services[0].task_definition 62 | end 63 | 64 | def deploy 65 | @deploy_started_at = Time.now 66 | res = @client.describe_services(cluster: @cluster, services: [@service_name]) 67 | service_options = { 68 | cluster: @cluster, 69 | task_definition: task_definition_name_with_revision, 70 | deployment_configuration: @deployment_configuration, 71 | network_configuration: @network_configuration, 72 | health_check_grace_period_seconds: @health_check_grace_period_seconds, 73 | capacity_provider_strategy: @capacity_provider_strategy, 74 | enable_execute_command: @enable_execute_command, 75 | enable_ecs_managed_tags: @enable_ecs_managed_tags, 76 | placement_constraints: @placement_constraints, 77 | placement_strategy: @placement_strategy, 78 | } 79 | 80 | if @load_balancers && EcsDeploy.config.ecs_service_role 81 | service_options.merge!({ 82 | role: EcsDeploy.config.ecs_service_role, 83 | }) 84 | end 85 | 86 | if @load_balancers 87 | service_options.merge!({ 88 | load_balancers: @load_balancers, 89 | }) 90 | end 91 | 92 | if res.services.select{ |s| s.status == 'ACTIVE' }.empty? 
93 | return if @delete 94 | 95 | service_options.merge!({ 96 | service_name: @service_name, 97 | desired_count: @desired_count.to_i, 98 | launch_type: @launch_type, 99 | tags: @tags, 100 | propagate_tags: @propagate_tags, 101 | }) 102 | 103 | if @scheduling_strategy == 'DAEMON' 104 | service_options[:scheduling_strategy] = @scheduling_strategy 105 | service_options.delete(:desired_count) 106 | service_options.delete(:placement_strategy) 107 | end 108 | @response = @client.create_service(service_options) 109 | EcsDeploy.logger.info "created service [#{@service_name}] [#{@cluster}] [#{@region}] [#{Paint['OK', :green]}]" 110 | else 111 | return delete_service if @delete 112 | 113 | service_options.merge!({service: @service_name}) 114 | service_options.merge!({desired_count: @desired_count}) if @desired_count 115 | service_options.merge!({propagate_tags: @propagate_tags}) if @propagate_tags 116 | 117 | current_service = res.services[0] 118 | service_options.merge!({force_new_deployment: true}) if need_force_new_deployment?(current_service) 119 | 120 | update_tags(@service_name, @tags) 121 | if @scheduling_strategy == 'DAEMON' 122 | service_options.delete(:placement_strategy) 123 | end 124 | @response = @client.update_service(service_options) 125 | EcsDeploy.logger.info "updated service [#{@service_name}] [#{@cluster}] [#{@region}] [#{Paint['OK', :green]}]" 126 | end 127 | end 128 | 129 | private def need_force_new_deployment?(service) 130 | return false unless @capacity_provider_strategy 131 | return true unless service.capacity_provider_strategy 132 | 133 | return true if @capacity_provider_strategy.size != service.capacity_provider_strategy.size 134 | 135 | match_array = @capacity_provider_strategy.all? 
do |strategy| 136 | service.capacity_provider_strategy.find do |current_strategy| 137 | strategy[:capacity_provider] == current_strategy.capacity_provider && 138 | strategy[:weight] == current_strategy.weight && 139 | strategy[:base] == current_strategy.base 140 | end 141 | end 142 | 143 | !match_array 144 | end 145 | 146 | def delete_service 147 | if @scheduling_strategy != 'DAEMON' 148 | @client.update_service(cluster: @cluster, service: @service_name, desired_count: 0) 149 | sleep 1 150 | end 151 | @client.delete_service(cluster: @cluster, service: @service_name) 152 | EcsDeploy.logger.info "deleted service [#{@service_name}] [#{@cluster}] [#{@region}] [#{Paint['OK', :green]}]" 153 | end 154 | 155 | def update_tags(service_name, tags) 156 | service_arn = @client.describe_services(cluster: @cluster, services: [service_name]).services.first.service_arn 157 | if service_arn.split('/').size == 2 158 | if tags 159 | EcsDeploy.logger.warn "#{service_name} doesn't support tagging operations, so tags are ignored. Long arn format must be used for tagging operations." 160 | end 161 | return 162 | end 163 | 164 | tags ||= [] 165 | current_tag_keys = @client.list_tags_for_resource(resource_arn: service_arn).tags.map(&:key) 166 | deleted_tag_keys = current_tag_keys - tags.map { |t| t[:key] } 167 | 168 | unless deleted_tag_keys.empty? 169 | @client.untag_resource(resource_arn: service_arn, tag_keys: deleted_tag_keys) 170 | end 171 | 172 | unless tags.empty? 
# Logs the messages of the service events that were created after
# `deploy_started_at` and after the last event logged by an earlier call.
# Events are processed in ascending `created_at` order and the last one
# logged is remembered in @last_event.
#
# @param ecs_service [#events] service description whose events respond to
#   `created_at` and `message`
def log_events(ecs_service)
  ecs_service.events.sort_by(&:created_at).each do |e|
    next if e.created_at <= deploy_started_at
    next if @last_event && e.created_at <= @last_event.created_at

    EcsDeploy.logger.info e.message
    @last_event = e
  end
end

# Blocks until every given service that is not marked for deletion is stable,
# i.e. has exactly one deployment and running_count == desired_count.
#
# Services are grouped by cluster and region; each group is polled with its
# own client in slices of MAX_DESCRIBE_SERVICES names, one thread per slice.
# A slice is polled at most `ecs_wait_until_services_stable_max_attempts`
# times with `ecs_wait_until_services_stable_delay` seconds between polls.
# After each poll the service events are passed to the matching service
# object's #log_events.
#
# @param services [Array] objects responding to `cluster`, `region`,
#   `service_name`, `delete` and `log_events`
# @raise [TooManyAttemptsError] when a slice is still unstable after the last
#   attempt (raised in the polling thread and surfaced by the join)
def self.wait_all_running(services)
  services.group_by { |s| [s.cluster, s.region] }.flat_map do |(cl, region), ss|
    params = EcsDeploy.config.ecs_client_params
    client = Aws::ECS::Client.new(params.merge(region: region))
    # First service object per name, so that a response entry can be mapped
    # back without scanning the whole group each time.
    services_by_name = ss.each_with_object({}) { |sc, by_name| by_name[sc.service_name] ||= sc }
    ss.reject(&:delete).map(&:service_name).each_slice(MAX_DESCRIBE_SERVICES).map do |chunked_service_names|
      Thread.new do
        EcsDeploy.config.ecs_wait_until_services_stable_max_attempts.times do
          EcsDeploy.logger.info "waiting for services to stabilize [#{chunked_service_names.join(", ")}] [#{cl}]"
          resp = client.describe_services(cluster: cl, services: chunked_service_names)
          resp.services.each do |s|
            # cf. https://github.com/aws/aws-sdk-ruby/blob/master/gems/aws-sdk-ecs/lib/aws-sdk-ecs/waiters.rb#L91-L96
            if s.deployments.size == 1 && s.running_count == s.desired_count
              chunked_service_names.delete(s.service_name)
            end
            # A response entry with an unknown name is skipped instead of
            # aborting the polling thread with NoMethodError.
            services_by_name[s.service_name]&.log_events(s)
          end
          break if chunked_service_names.empty?
          sleep EcsDeploy.config.ecs_wait_until_services_stable_delay
        end
        raise TooManyAttemptsError unless chunked_service_names.empty?
      end
    end
  end.each(&:join)
end

private

# @return [String] @task_definition_name, followed by ":<revision>" when
#   @revision is set
def task_definition_name_with_revision
  suffix = @revision ? ":#{@revision}" : ""
  "#{@task_definition_name}#{suffix}"
end
module EcsDeploy
  # Registers, lists and deregisters ECS task definitions of one family
  # through Aws::ECS::Client.
  class TaskDefinition
    # Deregisters the task definition +arn+.
    #
    # @param arn [String] task definition to deregister
    # @param region [String, nil] falls back to
    #   EcsDeploy.config.default_region; when that is nil too, the client is
    #   built from EcsDeploy.config.ecs_client_params alone
    def self.deregister(arn, region: nil)
      region ||= EcsDeploy.config.default_region
      params = EcsDeploy.config.ecs_client_params
      client = region ? Aws::ECS::Client.new(params.merge(region: region)) : Aws::ECS::Client.new(params)
      client.deregister_task_definition({
        task_definition: arn,
      })
      EcsDeploy.logger.info "deregistered task definition [#{arn}] [#{client.config.region}] [#{Paint['OK', :green]}]"
    end

    # Stores the attributes that #register sends and builds the ECS client.
    #
    # The keys of each container definition's :docker_labels and
    # :log_configuration/:options are converted to strings. The conversion is
    # done on copies, so the hashes passed in by the caller are left untouched.
    # +cpu+ and +memory+ are stored as strings when given.
    #
    # @param region [String, nil] same fallback as in .deregister
    def initialize(
      task_definition_name:, region: nil,
      network_mode: "bridge", volumes: [], container_definitions: [], placement_constraints: [],
      task_role_arn: nil,
      execution_role_arn: nil,
      requires_compatibilities: nil,
      cpu: nil, memory: nil,
      tags: nil,
      runtime_platform: {}
    )
      @task_definition_name = task_definition_name
      @task_role_arn = task_role_arn
      @execution_role_arn = execution_role_arn
      region ||= EcsDeploy.config.default_region
      params = EcsDeploy.config.ecs_client_params

      @container_definitions = container_definitions.map { |cd| stringify_nested_keys(cd) }
      @volumes = volumes
      @network_mode = network_mode
      @placement_constraints = placement_constraints
      @requires_compatibilities = requires_compatibilities
      @cpu = cpu&.to_s
      @memory = memory&.to_s
      @tags = tags
      @client = region ? Aws::ECS::Client.new(params.merge(region: region)) : Aws::ECS::Client.new(params)
      @region = @client.config.region
      @runtime_platform = runtime_platform
    end

    # Lists the task definition ARNs of this family, sorted "DESC".
    #
    # This is best effort: when the request fails the error is logged as a
    # warning and an empty list is returned.
    #
    # @return [Array<String>]
    def recent_task_definition_arns
      resp = @client.list_task_definitions(
        family_prefix: @task_definition_name,
        sort: "DESC"
      )
      resp.task_definition_arns
    rescue => e
      EcsDeploy.logger.warn "failed to list task definitions [#{@task_definition_name}] [#{@region}]: #{e.class}: #{e.message}"
      []
    end

    # Registers a new task definition from the stored attributes.
    #
    # @return the `task_definition` member of the register response
    def register
      res = @client.register_task_definition({
        family: @task_definition_name,
        network_mode: @network_mode,
        container_definitions: @container_definitions,
        volumes: @volumes,
        placement_constraints: @placement_constraints,
        task_role_arn: @task_role_arn,
        execution_role_arn: @execution_role_arn,
        requires_compatibilities: @requires_compatibilities,
        cpu: @cpu, memory: @memory,
        tags: @tags,
        runtime_platform: @runtime_platform
      })
      EcsDeploy.logger.info "registered task definition [#{@task_definition_name}] [#{@region}] [#{Paint['OK', :green]}]"
      res.task_definition
    end

    private

    # Returns a copy of +container_definition+ in which the keys of
    # :docker_labels and of :log_configuration/:options are strings.
    def stringify_nested_keys(container_definition)
      normalized = container_definition.dup
      if normalized[:docker_labels]
        normalized[:docker_labels] = normalized[:docker_labels].transform_keys(&:to_s)
      end
      options = normalized.dig(:log_configuration, :options)
      if options
        # merge builds a new :log_configuration hash instead of writing into
        # the caller's one.
        normalized[:log_configuration] =
          normalized[:log_configuration].merge(options: options.transform_keys(&:to_s))
      end
      normalized
    end
  end
end
| RSpec.describe EcsDeploy::AutoScaler::AutoScalingGroupConfig do 7 | describe "#update_desired_capacity" do 8 | subject(:auto_scaling_group_config) do 9 | described_class.new({ 10 | "name" => asg_name, 11 | "region" => "ap-northeast-1", 12 | "buffer" => buffer, 13 | "services" => [], 14 | }, Logger.new(nil)) 15 | end 16 | 17 | let(:asg_name) { "asg_name" } 18 | let(:buffer) { 1 } 19 | let(:cluster_resource_manager) { instance_double("EcsDeploy::AutoScaler::ClusterResourceManager") } 20 | 21 | before do 22 | allow(auto_scaling_group_config).to receive(:cluster_resource_manager) { cluster_resource_manager } 23 | end 24 | 25 | context "when the current desired capacity is greater than expected" do 26 | before do 27 | allow_any_instance_of(Aws::AutoScaling::Client).to receive(:describe_auto_scaling_groups).with( 28 | auto_scaling_group_names: [asg_name], 29 | ).and_return( 30 | double( 31 | auto_scaling_groups: [ 32 | double( 33 | desired_capacity: container_instances.size, 34 | instances: container_instances.map do |i| 35 | double( 36 | availability_zone: i.attributes.find { |a| a.name == "ecs.availability-zone" }.value, 37 | instance_id: i.ec2_instance_id, 38 | lifecycle_state: "InService", 39 | ) 40 | end, 41 | ) 42 | ] 43 | ) 44 | ) 45 | 46 | allow(cluster_resource_manager).to receive(:fetch_container_instances_in_cluster).and_return(container_instances) 47 | allow(auto_scaling_group_config).to receive(:sleep).and_return(nil) 48 | end 49 | 50 | context "when there are deregistable instances in all availability zones" do 51 | let(:container_instances) do 52 | [ 53 | Aws::ECS::Types::ContainerInstance.new( 54 | pending_tasks_count: 1, 55 | running_tasks_count: 0, 56 | ec2_instance_id: "i-000000", 57 | container_instance_arn: "with_pending_task", 58 | attributes: [Aws::ECS::Types::Attribute.new(name: "ecs.availability-zone", value: "ap-notrheast-1a")], 59 | ), 60 | Aws::ECS::Types::ContainerInstance.new( 61 | pending_tasks_count: 0, 62 | running_tasks_count: 0, 63 | 
ec2_instance_id: "i-111111", 64 | container_instance_arn: "with_no_task_in_ap_notrheast_1a", 65 | attributes: [Aws::ECS::Types::Attribute.new(name: "ecs.availability-zone", value: "ap-notrheast-1a")], 66 | ), 67 | Aws::ECS::Types::ContainerInstance.new( 68 | pending_tasks_count: 0, 69 | running_tasks_count: 1, 70 | ec2_instance_id: "i-222222", 71 | container_instance_arn: "with_essential_running_task", 72 | attributes: [Aws::ECS::Types::Attribute.new(name: "ecs.availability-zone", value: "ap-notrheast-1a")], 73 | ), 74 | Aws::ECS::Types::ContainerInstance.new( 75 | pending_tasks_count: 0, 76 | running_tasks_count: 0, 77 | ec2_instance_id: "i-333333", 78 | container_instance_arn: "with_no_task_in_ap_notrheast_1c", 79 | attributes: [Aws::ECS::Types::Attribute.new(name: "ecs.availability-zone", value: "ap-notrheast-1c")], 80 | ), 81 | Aws::ECS::Types::ContainerInstance.new( 82 | pending_tasks_count: 0, 83 | running_tasks_count: 1, 84 | ec2_instance_id: "i-444444", 85 | container_instance_arn: "with_no_essential_running_task", 86 | attributes: [Aws::ECS::Types::Attribute.new(name: "ecs.availability-zone", value: "ap-notrheast-1c")], 87 | ), 88 | Aws::ECS::Types::ContainerInstance.new( 89 | pending_tasks_count: 0, 90 | running_tasks_count: 0, 91 | ec2_instance_id: "i-555555", 92 | container_instance_arn: "with_no_task_in_ap_notrheast_1a_2", 93 | attributes: [Aws::ECS::Types::Attribute.new(name: "ecs.availability-zone", value: "ap-notrheast-1a")], 94 | ), 95 | ] 96 | end 97 | 98 | before do 99 | allow(cluster_resource_manager).to receive(:fetch_container_instance_arns_in_service).and_return(["with_essential_running_task"]) 100 | end 101 | 102 | it "terminates instances without esesstial running tasks" do 103 | expect(auto_scaling_group_config).to receive(:detach_and_terminate_orphan_instances) 104 | expect(cluster_resource_manager).to receive(:deregister_container_instance).with("with_no_task_in_ap_notrheast_1a") 105 | expect(cluster_resource_manager).to 
receive(:deregister_container_instance).with("with_no_essential_running_task") 106 | expect(cluster_resource_manager).to receive(:deregister_container_instance).with("with_no_task_in_ap_notrheast_1a_2") 107 | expect(cluster_resource_manager).to receive(:trigger_capacity_update).with(container_instances.size, 3) 108 | expect_any_instance_of(Aws::AutoScaling::Client).to receive(:detach_instances).with( 109 | auto_scaling_group_name: asg_name, 110 | instance_ids: ["i-555555", "i-111111", "i-444444"], 111 | should_decrement_desired_capacity: true, 112 | ) 113 | expect_any_instance_of(Aws::EC2::Client).to receive(:terminate_instances).with(instance_ids: ["i-555555", "i-111111", "i-444444"]) 114 | 115 | auto_scaling_group_config.update_desired_capacity(2) 116 | end 117 | end 118 | 119 | context "when there are deregisterable instances only in one availability zone where there are fewer instances" do 120 | let(:container_instances) do 121 | [ 122 | Aws::ECS::Types::ContainerInstance.new( 123 | pending_tasks_count: 0, 124 | running_tasks_count: 1, 125 | ec2_instance_id: "i-000000", 126 | container_instance_arn: "with_essential_running_task_1a_0", 127 | attributes: [Aws::ECS::Types::Attribute.new(name: "ecs.availability-zone", value: "ap-notrheast-1a")], 128 | ), 129 | Aws::ECS::Types::ContainerInstance.new( 130 | pending_tasks_count: 0, 131 | running_tasks_count: 1, 132 | ec2_instance_id: "i-111111", 133 | container_instance_arn: "with_essential_running_task_1a_1", 134 | attributes: [Aws::ECS::Types::Attribute.new(name: "ecs.availability-zone", value: "ap-notrheast-1a")], 135 | ), 136 | Aws::ECS::Types::ContainerInstance.new( 137 | pending_tasks_count: 0, 138 | running_tasks_count: 0, 139 | ec2_instance_id: "i-222222", 140 | container_instance_arn: "with_no_essential_running_task_1c", 141 | attributes: [Aws::ECS::Types::Attribute.new(name: "ecs.availability-zone", value: "ap-notrheast-1c")], 142 | ), 143 | ] 144 | end 145 | 146 | before do 147 | 
allow(cluster_resource_manager).to receive(:fetch_container_instance_arns_in_service).and_return([ 148 | "with_essential_running_task_1a_0", 149 | "with_essential_running_task_1a_1", 150 | ]) 151 | end 152 | 153 | it "dosen't terminates any instances" do 154 | expect(auto_scaling_group_config).to receive(:detach_and_terminate_orphan_instances) 155 | expect(cluster_resource_manager).to_not receive(:deregister_container_instance) 156 | expect(cluster_resource_manager).to_not receive(:trigger_capacity_update) 157 | expect_any_instance_of(Aws::AutoScaling::Client).to_not receive(:detach_instances) 158 | expect_any_instance_of(Aws::EC2::Client).to_not receive(:terminate_instances) 159 | 160 | auto_scaling_group_config.update_desired_capacity(1) 161 | end 162 | end 163 | end 164 | 165 | context "when the current desired capacity is less than expected" do 166 | let(:current_capacity) { 2 } 167 | let(:desired_capacity) { current_capacity + buffer } 168 | 169 | before do 170 | allow_any_instance_of(Aws::AutoScaling::Client).to receive(:describe_auto_scaling_groups).with( 171 | auto_scaling_group_names: [asg_name] 172 | ).and_return(double(auto_scaling_groups: [double(desired_capacity: current_capacity, max_size: 100)])) 173 | end 174 | 175 | it "updates the desired capacity of the auto scaling group" do 176 | expect(auto_scaling_group_config).to receive(:detach_and_terminate_orphan_instances) 177 | expect(cluster_resource_manager).to receive(:trigger_capacity_update).with(current_capacity, desired_capacity) 178 | expect_any_instance_of(Aws::AutoScaling::Client).to receive(:update_auto_scaling_group).with( 179 | auto_scaling_group_name: asg_name, 180 | min_size: 0, 181 | max_size: 100, 182 | desired_capacity: desired_capacity, 183 | ) 184 | 185 | auto_scaling_group_config.update_desired_capacity(current_capacity) 186 | end 187 | end 188 | 189 | context "when the current desired capacity is expected" do 190 | let(:current_capacity) { 2 + buffer } 191 | 192 | before do 193 | 
# The ECS cluster still lists i-111111, but the auto scaling group stub below
# does not: only the instances that are in the group may be deregistered,
# detached and terminated.
context "when detached instance is still in the ecs cluster" do
  # Shared by the fixture and the negative expectation so that both always
  # refer to the same container instance.
  let(:detached_container_instance_arn) { "already_detached_by_drainer_but_still_in_the_cluster" }
  let(:container_instances) do
    [
      Aws::ECS::Types::ContainerInstance.new(
        pending_tasks_count: 0,
        running_tasks_count: 0,
        ec2_instance_id: "i-000000",
        container_instance_arn: "with_no_pending_and_running_task_1a",
        attributes: [Aws::ECS::Types::Attribute.new(name: "ecs.availability-zone", value: "ap-notrheast-1a")],
      ),
      Aws::ECS::Types::ContainerInstance.new(
        pending_tasks_count: 0,
        running_tasks_count: 0,
        ec2_instance_id: "i-111111",
        container_instance_arn: detached_container_instance_arn,
        attributes: [Aws::ECS::Types::Attribute.new(name: "ecs.availability-zone", value: "ap-notrheast-1a")],
      ),
      Aws::ECS::Types::ContainerInstance.new(
        pending_tasks_count: 0,
        running_tasks_count: 1,
        ec2_instance_id: "i-222222",
        container_instance_arn: "with_running_task",
        attributes: [Aws::ECS::Types::Attribute.new(name: "ecs.availability-zone", value: "ap-notrheast-1c")],
      ),
      Aws::ECS::Types::ContainerInstance.new(
        pending_tasks_count: 0,
        running_tasks_count: 0,
        ec2_instance_id: "i-333333",
        container_instance_arn: "with_no_pending_and_running_task_1c",
        attributes: [Aws::ECS::Types::Attribute.new(name: "ecs.availability-zone", value: "ap-notrheast-1c")],
      ),
    ]
  end
  # i-111111 is intentionally missing here.
  let(:auto_scaling_group_instances) do
    [
      Aws::AutoScaling::Types::Instance.new(
        instance_id: "i-000000",
        availability_zone: "ap-notrheast-1a",
        lifecycle_state: "InService",
        health_status: "Healthy",
        launch_template: "launch_template",
        protected_from_scale_in: true,
      ),
      Aws::AutoScaling::Types::Instance.new(
        instance_id: "i-222222",
        availability_zone: "ap-notrheast-1c",
        lifecycle_state: "InService",
        health_status: "Healthy",
        launch_template: "launch_template",
        protected_from_scale_in: true,
      ),
      Aws::AutoScaling::Types::Instance.new(
        instance_id: "i-333333",
        availability_zone: "ap-notrheast-1c",
        lifecycle_state: "InService",
        health_status: "Healthy",
        launch_template: "launch_template",
        protected_from_scale_in: true,
      ),
    ]
  end

  before do
    allow_any_instance_of(Aws::AutoScaling::Client).to receive(:describe_auto_scaling_groups).with(
      auto_scaling_group_names: [asg_name],
    ).and_return(
      double(
        auto_scaling_groups: [
          double(
            desired_capacity: container_instances.size,
            instances: auto_scaling_group_instances.map do |i|
              double(
                availability_zone: i.availability_zone,
                instance_id: i.instance_id,
                lifecycle_state: "InService",
              )
            end,
          )
        ]
      )
    )

    allow(cluster_resource_manager).to receive(:fetch_container_instances_in_cluster).and_return(container_instances)
    allow(auto_scaling_group_config).to receive(:sleep).and_return(nil)
    allow(cluster_resource_manager).to receive(:fetch_container_instance_arns_in_service).and_return(["with_running_task"])
  end

  it "terminates auto scaliing group instances without esesstial running tasks" do
    expect(auto_scaling_group_config).to receive(:detach_and_terminate_orphan_instances)
    expect(cluster_resource_manager).to receive(:deregister_container_instance).with("with_no_pending_and_running_task_1c")
    # Must name the ARN used in the fixture; an ARN that no fixture has would
    # make this expectation pass unconditionally.
    expect(cluster_resource_manager).not_to receive(:deregister_container_instance).with(detached_container_instance_arn)
    expect_any_instance_of(Aws::AutoScaling::Client).to receive(:detach_instances).with(
      auto_scaling_group_name: asg_name,
      instance_ids: ["i-333333"],
      should_decrement_desired_capacity: true,
    )
    expect_any_instance_of(Aws::EC2::Client).to receive(:terminate_instances).with(instance_ids: ["i-333333"])
    expect(cluster_resource_manager).to receive(:trigger_capacity_update).with(container_instances.size, 3)

    auto_scaling_group_config.update_desired_capacity(2)
  end
end
"Terminating", 345 | health_status: "", 346 | launch_template: "launch_template", 347 | protected_from_scale_in: true, 348 | ), 349 | Aws::AutoScaling::Types::Instance.new( 350 | instance_id: "i-444444", 351 | availability_zone: "ap-notrheast-1c", 352 | lifecycle_state: "Pending", 353 | health_status: "", 354 | launch_template: "launch_template", 355 | protected_from_scale_in: true, 356 | ), 357 | ] 358 | end 359 | 360 | before do 361 | allow_any_instance_of(Aws::AutoScaling::Client).to receive(:describe_auto_scaling_groups).with( 362 | auto_scaling_group_names: [asg_name], 363 | ).and_return( 364 | double( 365 | auto_scaling_groups: [ 366 | double( 367 | desired_capacity: auto_scaling_group_instances.size, 368 | instances: auto_scaling_group_instances.map do |i| 369 | double( 370 | availability_zone: i.availability_zone, 371 | instance_id: i.instance_id, 372 | lifecycle_state: i.lifecycle_state, 373 | ) 374 | end, 375 | ) 376 | ] 377 | ) 378 | ) 379 | end 380 | 381 | it "detaches only detachable instances" do 382 | expect_any_instance_of(Aws::AutoScaling::Client).to receive(:detach_instances).with( 383 | auto_scaling_group_name: asg_name, 384 | instance_ids: ["i-000000", "i-222222"], 385 | should_decrement_desired_capacity: false, 386 | ) 387 | 388 | auto_scaling_group_config.detach_instances(instance_ids: ["i-000000", "i-222222", "i-333333"], should_decrement_desired_capacity: false) 389 | end 390 | end 391 | 392 | end 393 | -------------------------------------------------------------------------------- /spec/ecs_deploy/auto_scaler/cluster_resource_manager_spec.rb: -------------------------------------------------------------------------------- 1 | require "spec_helper" 2 | 3 | require "ecs_deploy/auto_scaler/cluster_resource_manager" 4 | 5 | RSpec.describe EcsDeploy::AutoScaler::ClusterResourceManager do 6 | let(:cluster_resource_manager) do 7 | described_class.new( 8 | region: "ap-northeast-1", 9 | cluster: "cluster", 10 | service_configs: service_configs, 11 
# Covers both values of capacity_based_on used by these examples: with
# "instances" the expected capacity is the number of listed container
# instances, with "vCPUs" it is derived from the registered "CPU" resources
# (2048 + 4096 registered units are expected to yield 6).
describe "#calculate_active_instance_capacity" do
  context "when capacity_based_on is 'instances'" do
    let(:capacity_based_on) { "instances" }

    before do
      Aws.config[:ecs] = {
        stub_responses: {
          list_container_instances: {
            container_instance_arns: %w[arn1 arn2],
          }
        }
      }
    end

    it do
      expect(cluster_resource_manager.calculate_active_instance_capacity).to eq 2
    end
  end

  context "when capacity_based_on is 'vCPUs'" do
    let(:capacity_based_on) { "vCPUs" }

    let(:container_instances) do
      [
        Aws::ECS::Types::ContainerInstance.new(
          container_instance_arn: "2vCPUs_instance_arn",
          registered_resources: [
            {
              integer_value: 2048,
              name: "CPU",
            },
          ],
        ),
        Aws::ECS::Types::ContainerInstance.new(
          container_instance_arn: "4vCPUs_instance_arn",
          registered_resources: [
            {
              integer_value: 4096,
              name: "CPU",
            },
          ],
        ),
      ]
    end

    before do
      ecs_client = Aws::ECS::Client.new(stub_responses: true)
      ecs_client.stub_responses(:list_container_instances, {
        container_instance_arns: container_instances.map(&:container_instance_arn),
      })
      ecs_client.stub_responses(:describe_container_instances, {
        container_instances: container_instances,
      })
      allow(cluster_resource_manager).to receive(:ecs_client) { ecs_client }
    end

    it do
      expect(cluster_resource_manager.calculate_active_instance_capacity).to eq 6
    end
  end
end
# Values of disable_draining for which the interrupted instances are expected
# to be set to DRAINING.
[nil, false, "false"].each do |disable_draining|
  context "with disable_draining #{disable_draining.inspect}" do
    let(:disable_draining) { disable_draining }

    it "updates the state of interrupted instances to 'DRAINING'" do
      expect(asg_config).to receive(:detach_instances).with(instance_ids: ["i-222222"], should_decrement_desired_capacity: false)

      drainer.poll_spot_instance_interruption_warnings("https://sqs.ap-northeast-1.amazonaws.com/account_id/queue_name")

      expect(ecs_client.api_requests).to include({
        operation_name: :update_container_instances_state,
        params: { cluster: nil, container_instances: ["arn:i-000000"], status: "DRAINING" },
        context: a_kind_of(Seahorse::Client::RequestContext),
      })
      expect(ecs_client.api_requests).to include({
        operation_name: :update_container_instances_state,
        params: { cluster: "ecs-cluster", container_instances: ["arn:i-222222"], status: "DRAINING" },
        context: a_kind_of(Seahorse::Client::RequestContext),
      })
    end
  end
end

# Values of disable_draining for which no ECS request at all is expected; the
# auto scaling group instance is still detached.
[true, "true"].each do |disable_draining|
  context "with disable_draining #{disable_draining.inspect}" do
    let(:disable_draining) { disable_draining }

    it "detaches the interrupted instances without updating their state to 'DRAINING'" do
      expect(asg_config).to receive(:detach_instances).with(instance_ids: ["i-222222"], should_decrement_desired_capacity: false)

      drainer.poll_spot_instance_interruption_warnings("https://sqs.ap-northeast-1.amazonaws.com/account_id/queue_name")

      expect(ecs_client.api_requests).to eq []
    end
  end
end
allow(ecs_client).to receive(:describe_services).and_return(double(services: [double(desired_count: initial_desired_count)])) 10 | end 11 | 12 | subject(:service_config) do 13 | described_class.new({ 14 | "name" => "service_name", 15 | "cluster" => "cluster", 16 | "region" => "ap-northeast-1", 17 | "step" => 1, 18 | "max_task_count" => 100, 19 | "min_task_count" => 1, 20 | "cooldown_time_for_reach_max" => 300, 21 | "upscale_triggers" => [ 22 | { 23 | "alarm_name" => "upscale_trigger_with_default_step", 24 | "region" => "ap-northeast-1", 25 | "state" => "ALARM", 26 | }, 27 | { 28 | "alarm_name" => "upscale_trigger_with_step_2", 29 | "region" => "ap-northeast-1", 30 | "state" => "ALARM", 31 | "step" => 2, 32 | }, 33 | { 34 | "alarm_name" => "upscale_trigger_with_step_1", 35 | "region" => "ap-northeast-1", 36 | "state" => "ALARM", 37 | "step" => 1, 38 | }, 39 | ], 40 | "downscale_triggers" => downscale_triggers, 41 | }, Logger.new(nil)) 42 | end 43 | let(:downscale_triggers) do 44 | [ 45 | { 46 | "alarm_name" => "downscale_trigger_with_step_2", 47 | "region" => "ap-northeast-1", 48 | "state" => "ALARM", 49 | "step" => 2, 50 | }, 51 | { 52 | "alarm_name" => "downscale_trigger_with_step_1", 53 | "region" => "ap-northeast-1", 54 | "state" => "ALARM", 55 | "step" => 1, 56 | }, 57 | ] 58 | end 59 | 60 | let(:initial_desired_count) { 1 } 61 | let(:ecs_client) { instance_double("Aws::ECS::Client") } 62 | 63 | let(:cluster_resource_manager) { instance_double("EcsDeploy::AutoScaler::ClusterResourceManager") } 64 | 65 | context "when all triggers match" do 66 | before do 67 | (service_config.upscale_triggers + service_config.downscale_triggers).each do |trigger| 68 | allow(trigger).to receive(:match?).and_return(true) 69 | end 70 | end 71 | 72 | it "uses the maximum step of upscale triggers" do 73 | expect(cluster_resource_manager).to receive(:acquire).with(1, timeout: kind_of(Float)).twice { true } 74 | expect(ecs_client).to receive(:update_service).with( 75 | cluster: 
service_config.cluster, 76 | service: service_config.name, 77 | desired_count: initial_desired_count + 1, 78 | ) 79 | expect(ecs_client).to receive(:update_service).with( 80 | cluster: service_config.cluster, 81 | service: service_config.name, 82 | desired_count: initial_desired_count + 2, 83 | ) 84 | 85 | service_config.adjust_desired_count(cluster_resource_manager) 86 | service_config.wait_until_desired_count_updated 87 | end 88 | end 89 | 90 | context "when a downscale trigger exists and all triggers match" do 91 | let(:initial_desired_count) { 3 } 92 | let(:downscale_triggers) do 93 | [ 94 | { 95 | "alarm_name" => "downscale_trigger_with_step_2", 96 | "region" => "ap-northeast-1", 97 | "state" => "ALARM", 98 | "step" => 2, 99 | }, 100 | { 101 | "alarm_name" => "downscale_trigger_with_step_1", 102 | "region" => "ap-northeast-1", 103 | "state" => "ALARM", 104 | "step" => 1, 105 | "prioritized_over_upscale_triggers" => true, 106 | }, 107 | ] 108 | end 109 | 110 | before do 111 | (service_config.upscale_triggers + service_config.downscale_triggers).each do |trigger| 112 | allow(trigger).to receive(:match?).and_return(true) 113 | end 114 | end 115 | 116 | it "uses the maximum step of down triggers with prioritized_over_upscale_triggers true" do 117 | expect(cluster_resource_manager).to receive(:release).with(1) 118 | expect(ecs_client).to receive(:update_service).with( 119 | cluster: service_config.cluster, 120 | service: service_config.name, 121 | desired_count: initial_desired_count - 1, 122 | ) 123 | 124 | expect(ecs_client).to receive(:wait_until).with(:services_stable, cluster: service_config.cluster, services: [service_config.name]) 125 | expect(ecs_client).to receive(:list_tasks).and_return([double(task_arns: ["stopping_task_arn"])], [double(task_arns: [])]) 126 | expect(ecs_client).to receive(:wait_until).with(:tasks_stopped, cluster: service_config.cluster, tasks: ["stopping_task_arn"]) 127 | 128 | 
service_config.adjust_desired_count(cluster_resource_manager) 129 | end 130 | end 131 | 132 | context "when only a downscale trigger matches" do 133 | before do 134 | (service_config.upscale_triggers + service_config.downscale_triggers).each do |trigger| 135 | allow(trigger).to receive(:match?).and_return(false) 136 | end 137 | allow(service_config.downscale_triggers.first).to receive(:match?).and_return(true) 138 | end 139 | 140 | context "when desired_count - step is greater than or equal to min_task_count" do 141 | let(:initial_desired_count) { 3 } 142 | 143 | it "uses the maximum step of down triggers" do 144 | expect(cluster_resource_manager).to receive(:release).with(2) 145 | expect(ecs_client).to receive(:update_service).with( 146 | cluster: service_config.cluster, 147 | service: service_config.name, 148 | desired_count: initial_desired_count - 2, 149 | ) 150 | 151 | expect(ecs_client).to receive(:wait_until).with(:services_stable, cluster: service_config.cluster, services: [service_config.name]) 152 | expect(ecs_client).to receive(:list_tasks).and_return([double(task_arns: ["stopping_task_arn"])], [double(task_arns: [])]) 153 | expect(ecs_client).to receive(:wait_until).with(:tasks_stopped, cluster: service_config.cluster, tasks: ["stopping_task_arn"]) 154 | 155 | service_config.adjust_desired_count(cluster_resource_manager) 156 | end 157 | end 158 | 159 | context "when desired_count - step is less than min_task_count" do 160 | let(:initial_desired_count) { 2 } 161 | 162 | it "decreases desired_count to min_task_count" do 163 | expect(cluster_resource_manager).to receive(:release).with(1) 164 | expect(ecs_client).to receive(:update_service).with( 165 | cluster: service_config.cluster, 166 | service: service_config.name, 167 | desired_count: initial_desired_count - 1, 168 | ) 169 | 170 | expect(ecs_client).to receive(:wait_until).with(:services_stable, cluster: service_config.cluster, services: [service_config.name]) 171 | expect(ecs_client).to 
receive(:list_tasks).and_return([double(task_arns: ["stopping_task_arn"])], [double(task_arns: [])]) 172 | expect(ecs_client).to receive(:wait_until).with(:tasks_stopped, cluster: service_config.cluster, tasks: ["stopping_task_arn"]) 173 | 174 | service_config.adjust_desired_count(cluster_resource_manager) 175 | end 176 | end 177 | end 178 | end 179 | end 180 | -------------------------------------------------------------------------------- /spec/ecs_deploy/auto_scaler_spec.rb: -------------------------------------------------------------------------------- 1 | require "spec_helper" 2 | 3 | require "ecs_deploy/auto_scaler" 4 | 5 | RSpec.describe EcsDeploy::AutoScaler do 6 | describe "#load_config" do 7 | it do 8 | described_class.load_config(File.join(__dir__, "..", "fixtures", "files", "ecs_auto_scaler_config_in_old_format.yaml")) 9 | old_config = described_class.instance_variable_get(:@config) 10 | described_class.load_config(File.join(__dir__, "..", "fixtures", "files", "ecs_auto_scaler_config_in_new_format.yaml")) 11 | new_config = described_class.instance_variable_get(:@config) 12 | expect(old_config).to eq new_config 13 | end 14 | end 15 | end 16 | -------------------------------------------------------------------------------- /spec/ecs_deploy/instance_fluctuation_manager_spec.rb: -------------------------------------------------------------------------------- 1 | require "spec_helper" 2 | 3 | require "logger" 4 | require "stringio" 5 | require "ecs_deploy/instance_fluctuation_manager" 6 | 7 | RSpec.describe EcsDeploy::InstanceFluctuationManager do 8 | let(:logdev) do 9 | StringIO.new 10 | end 11 | let(:instance_fluctuation_manager) do 12 | described_class.new( 13 | region: "ap-northeast-1", 14 | cluster: "cluster", 15 | auto_scaling_group_name: "asg-cluster", 16 | desired_capacity: 50, 17 | logger: ::Logger.new(logdev) 18 | ) 19 | end 20 | 21 | describe "#increase" do 22 | context "w/o error" do 23 | before do 24 | @auto_scaling_groups = [ 25 | 
Aws::AutoScaling::Types::AutoScalingGroup.new( 26 | desired_capacity: 50, 27 | max_size: 100 28 | ) 29 | ] 30 | Aws.config[:autoscaling] = { 31 | stub_responses: { 32 | describe_auto_scaling_groups: lambda do |_| 33 | Aws::AutoScaling::Types::AutoScalingGroupsType.new( 34 | auto_scaling_groups: @auto_scaling_groups, 35 | ) 36 | end, 37 | update_auto_scaling_group: lambda do |_| 38 | # no error 39 | nil 40 | end 41 | } 42 | } 43 | 44 | cluster = Aws::ECS::Types::Cluster.new(registered_container_instances_count: 50) 45 | expect(cluster).to receive(:registered_container_instances_count) 46 | .exactly(5).times.and_return(60, 70, 80, 90, 100) 47 | @clusters = [cluster] 48 | Aws.config[:ecs] = { 49 | stub_responses: { 50 | describe_clusters: lambda do |_| 51 | Aws::ECS::Types::DescribeClustersResponse.new(clusters: @clusters) 52 | end 53 | } 54 | } 55 | 56 | allow(instance_fluctuation_manager).to receive(:sleep) 57 | end 58 | 59 | it "succeeded in increasing instances" do 60 | thread = instance_fluctuation_manager.increase 61 | thread.join 62 | log = logdev.string 63 | expect(log).to include("Increasing desired capacity of asg-cluster: 50 => 100") 64 | [60, 70, 80, 90].each do |count| 65 | expect(log).to include("Current registered instance count: #{count}") 66 | end 67 | expect(log).to include("Succeeded in increasing instances!") 68 | end 69 | end 70 | end 71 | 72 | describe("#decrease") do 73 | context "w/ 2 availability zones" do 74 | before do 75 | @auto_scaling_groups = [ 76 | Aws::AutoScaling::Types::AutoScalingGroup.new( 77 | desired_capacity: 100, 78 | max_size: 100 79 | ) 80 | ] 81 | Aws.config[:autoscaling] = { 82 | stub_responses: { 83 | describe_auto_scaling_groups: lambda do |_| 84 | Aws::AutoScaling::Types::AutoScalingGroupsType.new( 85 | auto_scaling_groups: @auto_scaling_groups, 86 | ) 87 | end, 88 | update_auto_scaling_group: lambda do |_| 89 | # no error 90 | end, 91 | detach_instances: lambda do |_| 92 | # no error 93 | end 94 | } 95 | } 96 | 97 | 
arns = (1..100).to_a.map {|n| sprintf("arn:aws:ecs:ap-northeast-1:xxx:container-instance/%03d", n) } 98 | availability_zones = [ 99 | Aws::ECS::Types::Attribute.new(name: "ecs.availability-zone", value: "zone-a"), 100 | Aws::ECS::Types::Attribute.new(name: "ecs.availability-zone", value: "zone-b"), 101 | ] 102 | container_instances = arns.map do |arn| 103 | Aws::ECS::Types::ContainerInstance.new( 104 | container_instance_arn: arn, 105 | running_tasks_count: rand(1..10), 106 | attributes: [availability_zones.sample], 107 | ec2_instance_id: "ec2-#{arn}" 108 | ) 109 | end 110 | task_arns = (1..10).to_a.map {|n| sprintf("task-arn%02d", n) } 111 | tasks = task_arns.map do |arn| 112 | group = ["family:#{arn}", "dummy:#{arn}"].sample 113 | Aws::ECS::Types::Task.new(task_arn: arn, group: group) 114 | end 115 | Aws.config[:ecs] = { 116 | stub_responses: { 117 | list_container_instances: lambda do |_| 118 | Aws::ECS::Types::ListContainerInstancesResponse.new(container_instance_arns: arns) 119 | end, 120 | describe_container_instances: lambda do |_| 121 | Aws::ECS::Types::DescribeContainerInstancesResponse.new(container_instances: container_instances) 122 | end, 123 | update_container_instances_state: lambda do |_| 124 | # no error 125 | end, 126 | list_tasks: lambda do |_| 127 | Aws::ECS::Types::ListTasksResponse.new(task_arns: task_arns) 128 | end, 129 | describe_tasks: lambda do |_| 130 | Aws::ECS::Types::DescribeTasksResponse.new(tasks: tasks) 131 | end, 132 | stop_task: lambda do |_| 133 | # no error 134 | end 135 | } 136 | } 137 | # Must stub after set :stub_responses to Aws.config[:ecs] 138 | ecs_client = instance_fluctuation_manager.send(:ecs_client) 139 | allow(ecs_client).to receive(:wait_until) 140 | expect(ecs_client).to receive(:stop_task).at_most(arns.size * tasks.size).times 141 | 142 | Aws.config[:ec2] = { 143 | stub_responses: { 144 | terminate_instances: {} 145 | } 146 | } 147 | ec2_client = instance_fluctuation_manager.send(:ec2_client) 148 | 
allow(ec2_client).to receive(:wait_until) 149 | end 150 | 151 | it "succeeded in decreasing instances" do 152 | instance_fluctuation_manager.decrease 153 | log = logdev.string 154 | expect(log).to include("Decreasing desired capacity of asg-cluster: 100 => 50") 155 | expect(log).to include("Succeeded in decreasing instances!") 156 | instance_size_per_az = log.lines.grep(/AZ balance/).last.scan(/AZ balance: \{"zone-a"=>(\d+), "zone-b"=>(\d+)\}/).flatten.map(&:to_i) 157 | expect(instance_size_per_az).to contain_exactly(25, 25) 158 | end 159 | end 160 | 161 | context "w/ 3 availability_zones" do 162 | before do 163 | @auto_scaling_groups = [ 164 | Aws::AutoScaling::Types::AutoScalingGroup.new( 165 | desired_capacity: 100, 166 | max_size: 100 167 | ) 168 | ] 169 | Aws.config[:autoscaling] = { 170 | stub_responses: { 171 | describe_auto_scaling_groups: lambda do |_| 172 | Aws::AutoScaling::Types::AutoScalingGroupsType.new( 173 | auto_scaling_groups: @auto_scaling_groups, 174 | ) 175 | end, 176 | update_auto_scaling_group: lambda do |_| 177 | # no error 178 | end, 179 | detach_instances: lambda do |_| 180 | # no error 181 | end 182 | } 183 | } 184 | 185 | arns = (1..100).to_a.map {|n| sprintf("arn:aws:ecs:ap-northeast-1:xxx:container-instance/%03d", n) } 186 | availability_zones = [ 187 | Aws::ECS::Types::Attribute.new(name: "ecs.availability-zone", value: "zone-a"), 188 | Aws::ECS::Types::Attribute.new(name: "ecs.availability-zone", value: "zone-b"), 189 | Aws::ECS::Types::Attribute.new(name: "ecs.availability-zone", value: "zone-c") 190 | ] 191 | container_instances = arns.map do |arn| 192 | Aws::ECS::Types::ContainerInstance.new( 193 | container_instance_arn: arn, 194 | running_tasks_count: rand(1..10), 195 | attributes: [availability_zones.sample], 196 | ec2_instance_id: "ec2-#{arn}" 197 | ) 198 | end 199 | task_arns = (1..10).to_a.map {|n| sprintf("task-arn%02d", n) } 200 | tasks = task_arns.map do |arn| 201 | group = ["family:#{arn}", "dummy:#{arn}"].sample 202 | 
Aws::ECS::Types::Task.new(task_arn: arn, group: group) 203 | end 204 | Aws.config[:ecs] = { 205 | stub_responses: { 206 | list_container_instances: lambda do |_| 207 | Aws::ECS::Types::ListContainerInstancesResponse.new(container_instance_arns: arns) 208 | end, 209 | describe_container_instances: lambda do |_| 210 | Aws::ECS::Types::DescribeContainerInstancesResponse.new(container_instances: container_instances) 211 | end, 212 | update_container_instances_state: lambda do |_| 213 | # no error 214 | end, 215 | list_tasks: lambda do |_| 216 | Aws::ECS::Types::ListTasksResponse.new(task_arns: task_arns) 217 | end, 218 | describe_tasks: lambda do |_| 219 | Aws::ECS::Types::DescribeTasksResponse.new(tasks: tasks) 220 | end, 221 | stop_task: lambda do |_| 222 | # no error 223 | end 224 | } 225 | } 226 | # Must stub after set :stub_responses to Aws.config[:ecs] 227 | ecs_client = instance_fluctuation_manager.send(:ecs_client) 228 | allow(ecs_client).to receive(:wait_until) 229 | expect(ecs_client).to receive(:stop_task).at_most(arns.size * tasks.size).times 230 | 231 | Aws.config[:ec2] = { 232 | stub_responses: { 233 | terminate_instances: {} 234 | } 235 | } 236 | ec2_client = instance_fluctuation_manager.send(:ec2_client) 237 | allow(ec2_client).to receive(:wait_until) 238 | end 239 | 240 | context "desired capacity is multiple of 3" do 241 | let(:instance_fluctuation_manager) do 242 | described_class.new( 243 | region: "ap-northeast-1", 244 | cluster: "cluster", 245 | auto_scaling_group_name: "asg-cluster", 246 | desired_capacity: 60, 247 | logger: ::Logger.new(logdev) 248 | ) 249 | end 250 | 251 | it "succeeded in decreasing instances" do 252 | instance_fluctuation_manager.decrease 253 | log = logdev.string 254 | expect(log).to include("Decreasing desired capacity of asg-cluster: 100 => 60") 255 | expect(log).to include("Succeeded in decreasing instances!") 256 | instance_size_per_az = log.lines.grep(/AZ balance/).last.scan(/AZ balance: \{"zone-a"=>(\d+), 
"zone-b"=>(\d+), "zone-c"=>(\d+)\}/).flatten.map(&:to_i) 257 | expect(instance_size_per_az).to contain_exactly(20, 20, 20) 258 | end 259 | end 260 | 261 | context "desired capacity is odd number" do 262 | let(:instance_fluctuation_manager) do 263 | described_class.new( 264 | region: "ap-northeast-1", 265 | cluster: "cluster", 266 | auto_scaling_group_name: "asg-cluster", 267 | desired_capacity: 53, 268 | logger: ::Logger.new(logdev) 269 | ) 270 | end 271 | 272 | it "succeeded in decreasing instances" do 273 | instance_fluctuation_manager.decrease 274 | log = logdev.string 275 | expect(log).to include("Decreasing desired capacity of asg-cluster: 100 => 53") 276 | expect(log).to include("Succeeded in decreasing instances!") 277 | instance_size_per_az = log.lines.grep(/AZ balance/).last.scan(/AZ balance: \{"zone-a"=>(\d+), "zone-b"=>(\d+), "zone-c"=>(\d+)\}/).flatten.map(&:to_i) 278 | expect(instance_size_per_az).to contain_exactly(17, 18, 18) 279 | end 280 | end 281 | end 282 | 283 | context "with DEREGISTERING status" do 284 | let(:instance_fluctuation_manager) do 285 | described_class.new( 286 | region: "ap-northeast-1", 287 | cluster: "cluster", 288 | auto_scaling_group_name: "asg-cluster", 289 | desired_capacity: 0, 290 | logger: ::Logger.new(logdev) 291 | ) 292 | end 293 | let(:auto_scaling_groups) do 294 | [ 295 | Aws::AutoScaling::Types::AutoScalingGroup.new( 296 | desired_capacity: 1, 297 | max_size: 5 298 | ) 299 | ] 300 | end 301 | let(:arns) do 302 | 2.times.map { |i| "arn:aws:ecs:ap-northeast-1:xxx:container-instance/00#{i}" } 303 | end 304 | let(:ec2_instance_ids) do 305 | 2.times.map { |i| "ec2-#{arns[i]}" } 306 | end 307 | let(:container_instances) do 308 | [ 309 | Aws::ECS::Types::ContainerInstance.new( 310 | container_instance_arn: arns[0], 311 | running_tasks_count: 1, 312 | attributes: [Aws::ECS::Types::Attribute.new(name: "ecs.availability-zone", value: "zone-a")], 313 | ec2_instance_id: ec2_instance_ids[0], 314 | status: 'ACTIVE', 315 | ), 316 | 
Aws::ECS::Types::ContainerInstance.new( 317 | container_instance_arn: arns[1], 318 | running_tasks_count: 0, 319 | attributes: [Aws::ECS::Types::Attribute.new(name: "ecs.availability-zone", value: "zone-a")], 320 | ec2_instance_id: ec2_instance_ids[1], 321 | status: 'DEREGISTERING', 322 | ) 323 | ] 324 | end 325 | let(:task_arns) do 326 | 2.times.map {|i| sprintf("task-arn%02d", i) } 327 | end 328 | let(:tasks) do 329 | task_arns.map do |arn| 330 | group = ["family:#{arn}", "dummy:#{arn}"].sample 331 | Aws::ECS::Types::Task.new(task_arn: arn, group: group) 332 | end 333 | end 334 | 335 | before do 336 | Aws.config[:autoscaling] = { 337 | stub_responses: { 338 | describe_auto_scaling_groups: lambda do |_| 339 | Aws::AutoScaling::Types::AutoScalingGroupsType.new( 340 | auto_scaling_groups: auto_scaling_groups, 341 | ) 342 | end, 343 | update_auto_scaling_group: lambda do |_| 344 | # no error 345 | end, 346 | detach_instances: lambda do |_| 347 | # no error 348 | end 349 | } 350 | } 351 | 352 | Aws.config[:ecs] = { 353 | stub_responses: { 354 | list_container_instances: lambda do |_| 355 | Aws::ECS::Types::ListContainerInstancesResponse.new(container_instance_arns: arns) 356 | end, 357 | describe_container_instances: lambda do |_| 358 | Aws::ECS::Types::DescribeContainerInstancesResponse.new(container_instances: container_instances) 359 | end, 360 | update_container_instances_state: lambda do |_| 361 | # no error 362 | end, 363 | list_tasks: lambda do |_| 364 | Aws::ECS::Types::ListTasksResponse.new(task_arns: task_arns) 365 | end, 366 | describe_tasks: lambda do |_| 367 | Aws::ECS::Types::DescribeTasksResponse.new(tasks: tasks) 368 | end, 369 | stop_task: lambda do |_| 370 | # no error 371 | end 372 | } 373 | } 374 | Aws.config[:ec2] = { 375 | stub_responses: { 376 | terminate_instances: {} 377 | } 378 | } 379 | 380 | # Must stub after set :stub_responses to Aws.config[:ecs] 381 | ecs_client = instance_fluctuation_manager.send(:ecs_client) 382 | allow(ecs_client).to 
receive(:wait_until) 383 | expect(ecs_client).to receive(:stop_task).at_most(2).times 384 | 385 | 386 | ec2_client = instance_fluctuation_manager.send(:ec2_client) 387 | allow(ec2_client).to receive(:wait_until) 388 | end 389 | 390 | it "succeeded in decreasing instances" do 391 | # terminate instances whose status is not 'DEREGISTERING' 392 | ec2_client = instance_fluctuation_manager.send(:ec2_client) 393 | expect(ec2_client).to receive(:terminate_instances).with(instance_ids: [ec2_instance_ids[0]]) 394 | 395 | instance_fluctuation_manager.decrease 396 | log = logdev.string 397 | expect(log).to include("Decreasing desired capacity of asg-cluster: 1 => 0") 398 | expect(log).to include("Succeeded in decreasing instances!") 399 | end 400 | end 401 | end 402 | end 403 | -------------------------------------------------------------------------------- /spec/fixtures/files/ecs_auto_scaler_config_in_new_format.yaml: -------------------------------------------------------------------------------- 1 | polling_interval: 60 2 | 3 | auto_scaling_groups: 4 | - name: ecs-cluster-nodes 5 | region: ap-northeast-1 6 | cluster: ecs-cluster 7 | buffer: 1 8 | services: 9 | - name: repro-api-production 10 | step: 1 11 | idle_time: 240 12 | max_task_count: [10, 25] 13 | scheduled_min_task_count: 14 | - {from: "1:45", to: "4:30", count: 8} 15 | cooldown_time_for_reach_max: 600 16 | min_task_count: 0 17 | required_capacity: 0.5 18 | upscale_triggers: 19 | - alarm_name: "ECS [repro-api-production] CPUUtilization" 20 | state: ALARM 21 | - alarm_name: "ELB repro-api-a HTTPCode_Backend_5XX" 22 | state: ALARM 23 | step: 2 24 | downscale_triggers: 25 | - alarm_name: "ECS [repro-api-production] CPUUtilization (low)" 26 | state: OK 27 | 28 | spot_fleet_requests: 29 | - id: sfr-354de735-2c17-4565-88c9-10ada5b957e5 30 | region: ap-northeast-1 31 | cluster: ecs-cluster-for-worker 32 | buffer: 1 33 | services: 34 | - name: repro-worker-production 35 | step: 1 36 | idle_time: 240 37 | 
cooldown_time_for_reach_max: 600 38 | min_task_count: 0 39 | required_capacity: 2 40 | upscale_triggers: 41 | - alarm_name: "ECS [repro-worker-production] CPUUtilization" 42 | state: ALARM 43 | downscale_triggers: 44 | - alarm_name: "ECS [repro-worker-production] CPUUtilization (low)" 45 | state: OK 46 | 47 | spot_instance_intrp_warns_queue_urls: 48 | - https://sqs.ap-northeast-1.amazonaws.com//spot-instance-intrp-warns 49 | -------------------------------------------------------------------------------- /spec/fixtures/files/ecs_auto_scaler_config_in_old_format.yaml: -------------------------------------------------------------------------------- 1 | polling_interval: 60 2 | 3 | auto_scaling_groups: 4 | - name: ecs-cluster-nodes 5 | region: ap-northeast-1 6 | buffer: 1 7 | 8 | spot_fleet_requests: 9 | - id: sfr-354de735-2c17-4565-88c9-10ada5b957e5 10 | region: ap-northeast-1 11 | buffer: 1 12 | 13 | spot_instance_intrp_warns_queue_urls: 14 | - https://sqs.ap-northeast-1.amazonaws.com//spot-instance-intrp-warns 15 | 16 | services: 17 | - name: repro-api-production 18 | cluster: ecs-cluster 19 | region: ap-northeast-1 20 | auto_scaling_group_name: ecs-cluster-nodes 21 | step: 1 22 | idle_time: 240 23 | max_task_count: [10, 25] 24 | scheduled_min_task_count: 25 | - {from: "1:45", to: "4:30", count: 8} 26 | cooldown_time_for_reach_max: 600 27 | min_task_count: 0 28 | required_capacity: 0.5 29 | upscale_triggers: 30 | - alarm_name: "ECS [repro-api-production] CPUUtilization" 31 | state: ALARM 32 | - alarm_name: "ELB repro-api-a HTTPCode_Backend_5XX" 33 | state: ALARM 34 | step: 2 35 | downscale_triggers: 36 | - alarm_name: "ECS [repro-api-production] CPUUtilization (low)" 37 | state: OK 38 | 39 | - name: repro-worker-production 40 | cluster: ecs-cluster-for-worker 41 | region: ap-northeast-1 42 | spot_fleet_request_id: sfr-354de735-2c17-4565-88c9-10ada5b957e5 43 | step: 1 44 | idle_time: 240 45 | cooldown_time_for_reach_max: 600 46 | min_task_count: 0 47 | 
required_capacity: 2 48 | upscale_triggers: 49 | - alarm_name: "ECS [repro-worker-production] CPUUtilization" 50 | state: ALARM 51 | downscale_triggers: 52 | - alarm_name: "ECS [repro-worker-production] CPUUtilization (low)" 53 | state: OK 54 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | require "bundler/setup" 2 | require "ecs_deploy" 3 | 4 | RSpec.configure do |config| 5 | # Enable flags like --only-failures and --next-failure 6 | config.example_status_persistence_file_path = ".rspec_status" 7 | 8 | # Disable RSpec exposing methods globally on `Module` and `main` 9 | config.disable_monkey_patching! 10 | 11 | config.expect_with :rspec do |c| 12 | c.syntax = :expect 13 | end 14 | end 15 | --------------------------------------------------------------------------------