├── Gemfile ├── bin └── spot_build ├── spec ├── spec_helper.rb └── buildkite_agent_spec.rb ├── .gitignore ├── Rakefile ├── .github └── workflows │ └── tests.yml ├── lib ├── spot_build │ ├── spot_instance.rb │ ├── sqs_event.rb │ └── buildkite_agents.rb └── spot_build.rb ├── spot_build.gemspec ├── LICENSE └── README.md /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | gemspec 4 | -------------------------------------------------------------------------------- /bin/spot_build: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'spot_build' 4 | 5 | SpotBuild.run 6 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__) 2 | require 'spot_build' 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.gem 2 | .bundle/ 3 | .config 4 | .ruby-version 5 | coverage/ 6 | Gemfile.lock 7 | pkg/ 8 | spec/reports/ 9 | spec/examples.txt 10 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | require "rspec/core/rake_task" 3 | 4 | RSpec::Core::RakeTask.new(:spec) 5 | 6 | task :default => :spec 7 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: tests 3 | on: [ push, pull_request ] 4 | jobs: 5 | test: 6 | name: Test (Ruby ${{ matrix.ruby }}) 7 | runs-on: ubuntu-latest 8 | strategy: 9 | matrix: 10 | ruby: [ '2.5', '2.6', '2.7', '3.0' ] 11 | steps: 12 | - uses: actions/checkout@v2 13 | - uses: ruby/setup-ruby@v1 14 | with: 15 | ruby-version: ${{ matrix.ruby }} 16 | bundler-cache: true 17 | - name: RSpec 18 | run: bundle exec rake spec 19 | -------------------------------------------------------------------------------- /lib/spot_build/spot_instance.rb: -------------------------------------------------------------------------------- 1 | require 'net/http' 2 | require 'time' 3 | 4 | module SpotBuild 5 | class SpotInstance 6 | def shutdown_if_required(&block) 7 | return false unless self.class.scheduled_for_termination? 8 | yield 9 | true 10 | end 11 | 12 | def self.scheduled_for_termination? 13 | !time_until_termination.nil? 14 | end 15 | 16 | def self.time_until_termination 17 | uri = URI('http://169.254.169.254/latest/meta-data/spot/termination-time') 18 | response = Net::HTTP.get_response(uri) 19 | return nil if response.code == "404" 20 | Time.parse(response.body) - Time.now 21 | rescue ArgumentError 22 | nil 23 | end 24 | end 25 | end 26 | -------------------------------------------------------------------------------- /lib/spot_build/sqs_event.rb: -------------------------------------------------------------------------------- 1 | require 'aws-sdk' 2 | require 'timeout' 3 | 4 | module SpotBuild 5 | class SqsEvent 6 | def initialize(url:, timeout:, region: ENV['AWS_REGION']) 7 | @queue = Aws::SQS::Queue.new(url: url, region: region) 8 | @timeout = timeout 9 | end 10 | 11 | def shutdown_if_required(&block) 12 | # Any message to this queue is treated as a "I should shutdown" 13 | message = @queue.receive_messages( 14 | attribute_names: ["All"], 15 | max_number_of_messages: 1, 16 | visibility_timeout: (@timeout - 5), 17 | ).first 18 | return false if message.nil? 19 | yield 20 | message.delete 21 | true 22 | end 23 | end 24 | end 25 | -------------------------------------------------------------------------------- /spot_build.gemspec: -------------------------------------------------------------------------------- 1 | lib = File.expand_path('../lib', __FILE__) 2 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 3 | 4 | Gem::Specification.new do |gem| 5 | gem.name = 'spot_build' 6 | gem.version = '1.1.0' 7 | gem.authors = ['Patrick Robinson'] 8 | gem.email = [] 9 | gem.description = 'Helps manage Buildkite Agents running on EC2 Spot instances' 10 | gem.summary = gem.description 11 | gem.homepage = 'https://github.com/envato/spot_build' 12 | 13 | gem.files = `git ls-files`.split($/) 14 | gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) } 15 | gem.test_files = gem.files.grep(%r{^(test|spec|features)/}) 16 | gem.require_paths = ['lib'] 17 | 18 | gem.add_dependency 'buildkit', '~> 1.4' 19 | gem.add_dependency 'aws-sdk', '~> 2' 20 | gem.add_dependency 'link_header', '~> 0.0.2' 21 | gem.add_development_dependency 'rspec', '~> 3' 22 | gem.add_development_dependency 'rake' 23 | end 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Envato 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Managing Spot Fleets with Buildkite 2 | 3 | [![License MIT](https://img.shields.io/github/license/envato/spot_build)](https://github.com/envato/spot_build/blob/HEAD/LICENSE) 4 | [![Gem Version](https://img.shields.io/gem/v/spot_build)](https://rubygems.org/gems/spot_build) 5 | [![Build Status](https://github.com/envato/spot_build/workflows/tests/badge.svg?branch=master)](https://github.com/envato/spot_build/actions?query=workflow%3Atests+branch%3Amaster) 6 | 7 | AWS EC2 Spot instances are cheaper, Buildkite Agents are a natural fit for Spot instnaces as the workload is interruptable. 8 | 9 | SpotBuild makes it easier to use Spot instances and Spot fleets with Buildkite Agents by providing an agent that will shutdown the agent when the instance is scheduled for termination, preventing it from starting any new jobs and retry the current job it's working on. 10 | 11 | # Running 12 | 13 | Run this gem as a daemon on your buildkite agents and supply it the Organisation Slug and a Buildkite API token with the following permissions: 14 | - read_agents 15 | - read_builds 16 | - write_builds 17 | 18 | ## Development Status 19 | 20 | Very early stages of development 21 | -------------------------------------------------------------------------------- /lib/spot_build.rb: -------------------------------------------------------------------------------- 1 | require 'spot_build/buildkite_agents' 2 | require 'spot_build/spot_instance' 3 | require 'spot_build/sqs_event' 4 | require 'optparse' 5 | 6 | module SpotBuild 7 | DEFAULT_TIMEOUT = 300 8 | 9 | def self.run 10 | options = parse_options 11 | options[:timeout] ||= DEFAULT_TIMEOUT 12 | 13 | checks = [SpotInstance.new] 14 | if options[:queue_url] 15 | checks.push(SqsEvent.new(url: options[:queue_url], timeout: options[:timeout], region: options[:aws_region])) 16 | end 17 | 18 | agents = BuildkiteAgents.new(options[:token], options[:org_slug]) 19 | loop do 20 | checks.each do |check| 21 | terminating = check.shutdown_if_required do 22 | agents.stop 23 | if options[:auto_retries] 24 | timeout = SpotInstance.scheduled_for_termination? ? (SpotInstance.time_until_termination - 30) : options[:timeout] 25 | 26 | Timeout::timeout(timeout) do 27 | while agents.agents_running? 28 | sleep 5 29 | end 30 | end rescue Timeout::Error 31 | agents.the_end_is_nigh 32 | else 33 | while agents.agents_running? 34 | sleep 5 35 | end 36 | end 37 | end 38 | %x(shutdown -h now) if terminating 39 | end 40 | sleep 2 41 | end 42 | end 43 | 44 | def self.parse_options 45 | options = {auto_retries: true} 46 | parser = OptionParser.new do |opts| 47 | opts.banner = "Usage: #{__FILE__} [options]" 48 | opts.on("-t", "--token TOKEN", "Buildkite API token") { |v| options[:token] = v } 49 | opts.on("-o", "--org-slug ORGANISATION-SLUG", "The Buildkite Organisation Slug") { |v| options[:org_slug] = v } 50 | opts.on("-s", "--sqs-queue SQS-QUEUE-URL", "The SQS Queue URL we should monitor for events that tell us to shutdown") { |v| options[:queue_url] = v } 51 | opts.on("--timeout TIMEOUT", "The amount of time to wait for the buildkite agent to stop before shutting down. Only used if --sqs-queue is specified") { |v| options[:timeout] = v.to_i } 52 | opts.on("-r", "--aws-region REGION", "The AWS Region the SQS queue resides in") { |v| options[:aws_region] = v } 53 | opts.on("-n", "--[no-]auto-retry", "Disable automatic retries") { |v| options[:auto_retries] = v } 54 | end 55 | parser.parse! 56 | 57 | if options[:token].nil? || options[:org_slug].nil? 58 | raise OptionParser::MissingArgument, "You must specify Token and Organisational Slug.\n#{parser.help}" 59 | end 60 | 61 | options 62 | end 63 | end 64 | -------------------------------------------------------------------------------- /lib/spot_build/buildkite_agents.rb: -------------------------------------------------------------------------------- 1 | require 'buildkit' 2 | require 'socket' 3 | require 'link_header' 4 | 5 | module SpotBuild 6 | class BuildkiteAgents 7 | def initialize(token, org_slug) 8 | @client = Buildkit.new(token: token) 9 | @org_slug = org_slug 10 | end 11 | 12 | def the_end_is_nigh(host = Socket.gethostname) 13 | agents = agents_on_this_host(host) 14 | agents.each do |agent| 15 | stop_agent(agent, force: true) 16 | end 17 | agents.each do |agent| 18 | reschedule_job(agent.job) 19 | end 20 | agents.count 21 | end 22 | 23 | def stop_agent(agent, force: false) 24 | @client.stop_agent(@org_slug, agent.id, "{\"force\": #{force}}") 25 | rescue Buildkit::UnprocessableEntity 26 | # Swallow the error, this is generally thrown when the agent has already stopped 27 | end 28 | 29 | def agents_running?(host = Socket.gethostname) 30 | !agents_on_this_host(host).empty? 31 | end 32 | 33 | def stop(host = Socket.gethostname) 34 | agents_on_this_host(host).each do |agent| 35 | stop_agent(agent, force: false) 36 | end 37 | end 38 | 39 | private 40 | 41 | RETRY_MESSAGE = /Only failed or timed out jobs can be retried/.freeze 42 | 43 | def reschedule_job(job) 44 | return if job.nil? 45 | retry_error(Buildkit::BadRequest, RETRY_MESSAGE) do 46 | @client.retry_job(@org_slug, job_pipeline(job[:build_url]), job_build(job[:build_url]), job[:id]) 47 | end 48 | end 49 | 50 | def retry_error(error_class, message_regex, sleep_seconds: 1, retries: 20) 51 | begin 52 | yield 53 | rescue error_class => e 54 | if retries > 0 && e.message =~ message_regex 55 | sleep sleep_seconds 56 | retries -= 1 57 | retry 58 | else 59 | raise 60 | end 61 | end 62 | end 63 | 64 | # build_url: https://api.buildkite.com/v2/organizations/my-great-org/pipelines/sleeper/builds/50 65 | def job_pipeline(build_url) 66 | build_url[%r{organizations/#{@org_slug}/pipelines/([^/]*)}, 1] 67 | end 68 | 69 | def job_build(build_url) 70 | build_url[%r{organizations/#{@org_slug}/pipelines/[^/]*/builds/([0-9]*)}, 1] 71 | end 72 | 73 | def agents_on_this_host(host) 74 | all_agents.select { |agent| agent.hostname == host } 75 | end 76 | 77 | def all_agents 78 | with_pagination do |options = {}| 79 | @client.agents(@org_slug, options) 80 | end 81 | end 82 | 83 | # This is definately not thread safe 84 | def with_pagination(&block) 85 | results = yield 86 | while next_ref = next_link_ref(@client.last_response.headers["link"]) 87 | uri = URI.parse(next_ref.href) 88 | next_page = uri.query.split("=")[1] 89 | results.push(yield page: next_page) 90 | end 91 | results.flatten 92 | end 93 | 94 | def next_link_ref(header) 95 | LinkHeader.parse(header).find_link(["rel", "next"]) 96 | end 97 | end 98 | end 99 | -------------------------------------------------------------------------------- /spec/buildkite_agent_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | describe SpotBuild::BuildkiteAgents do 4 | let(:org_slug) { "envato" } 5 | let(:pipeline) { "my-app" } 6 | subject(:buildkite_agent) { described_class.new('deadbeef', org_slug) } 7 | let(:last_response_stub) { instance_double(Sawyer::Response) } 8 | let(:buildkit_stub) { instance_double("Buildkit::Client", :agents => agent_stubs) } 9 | let(:hostname) { "i-1234567890" } 10 | let(:build_id) { "12345678" } 11 | 12 | def agent(id:, build_id: "12345678", job_id: "1") 13 | double("BuildkiteAgent#{id}", 14 | hostname: hostname, 15 | id: id, 16 | job: {build_url: "organizations/#{org_slug}/pipelines/#{pipeline}/builds/#{build_id}", id: job_id} 17 | ) 18 | end 19 | 20 | before do 21 | allow(Buildkit).to receive(:new).and_return(buildkit_stub) 22 | allow(Socket).to receive(:gethostname).and_return(hostname) 23 | allow(buildkit_stub).to receive(:last_response).and_return(last_response_stub) 24 | allow(last_response_stub).to receive(:headers).and_return({"link" => nil}) 25 | end 26 | 27 | describe '#agents_running?' do 28 | context 'when agents are running' do 29 | let(:agent_stubs) { [agent(id: '123', build_id: build_id, job_id: '1')] } 30 | 31 | it 'returns true' do 32 | expect(buildkite_agent.agents_running?).to eq true 33 | end 34 | end 35 | 36 | context "when agents aren't running" do 37 | let(:agent_stubs) { [] } 38 | 39 | it 'returns false' do 40 | expect(buildkite_agent.agents_running?).to eq false 41 | end 42 | end 43 | end 44 | 45 | describe '#the_end_is_nigh' do 46 | context 'the agent is not running' do 47 | let(:agent_stubs) { [] } 48 | 49 | it 'does nothing' do 50 | expect(buildkit_stub).to_not receive(:stop_agent) 51 | expect(buildkit_stub).to_not receive(:retry_job) 52 | buildkite_agent.the_end_is_nigh 53 | end 54 | end 55 | 56 | context 'agents are running' do 57 | let(:agent_1_id) { '9876' } 58 | let(:agent_2_id) { '9877' } 59 | let(:agent_stubs) { [agent(id: agent_1_id, build_id: build_id, job_id: '1'), 60 | agent(id: agent_2_id, build_id: build_id, job_id: '2')] } 61 | 62 | before do 63 | allow(buildkit_stub).to receive(:stop_agent) 64 | allow(buildkit_stub).to receive(:retry_job) 65 | end 66 | 67 | it 'stops each agent forcefully' do 68 | expect(buildkit_stub).to receive(:stop_agent).with(org_slug, agent_1_id, '{"force": true}') 69 | expect(buildkit_stub).to receive(:stop_agent).with(org_slug, agent_2_id, '{"force": true}') 70 | buildkite_agent.the_end_is_nigh 71 | end 72 | 73 | it 'reschedules the job' do 74 | expect(buildkit_stub).to receive(:retry_job).with(org_slug, pipeline, build_id, '1') 75 | expect(buildkit_stub).to receive(:retry_job).with(org_slug, pipeline, build_id, '2') 76 | buildkite_agent.the_end_is_nigh 77 | end 78 | 79 | context "when the jobs aren't retryable yet" do 80 | let(:agent_stubs) { [agent(id: agent_1_id, build_id: build_id, job_id: '1')] } 81 | 82 | it 'retries' do 83 | responses = [ 84 | -> { raise Buildkit::BadRequest, {method: 'PUT', url: 'https://api.buildkite.com/v2/organizations/#{org_slug}/pipelines/#{pipeline}/builds/18961/jobs/1/retry', body: 'Only failed or timed out jobs can be retried'} }, 85 | -> { nil } 86 | ] 87 | allow(buildkit_stub).to receive(:retry_job).with(org_slug, pipeline, build_id, '1') do 88 | response = responses.shift 89 | response.call if response 90 | end 91 | buildkite_agent.the_end_is_nigh 92 | expect(buildkit_stub).to have_received(:retry_job) 93 | .with(org_slug, pipeline, build_id, '1') 94 | .twice 95 | end 96 | end 97 | end 98 | 99 | context 'the agent stops while we are trying to stop it' do 100 | let(:agent_stubs) { [agent(id: '9876')] } 101 | 102 | before do 103 | allow(buildkit_stub).to receive(:stop_agent).and_raise(Buildkit::UnprocessableEntity) 104 | allow(buildkit_stub).to receive(:retry_job) 105 | end 106 | 107 | it 'retries the job' do 108 | expect(buildkit_stub).to receive(:retry_job) 109 | buildkite_agent.the_end_is_nigh 110 | end 111 | end 112 | end 113 | end 114 | --------------------------------------------------------------------------------