├── README.md ├── auto-shutdown.nix ├── default.nix └── hydra-provisioner /README.md: -------------------------------------------------------------------------------- 1 | # Hydra provisioner 2 | 3 | `hydra-provisioner` is a script that automatically creates and destroys build machines (typically EC2 instances) for a Hydra server based on the state of the Hydra queue runner. For instance, if there are many runnable build steps for a particular system type (e.g. `x86_64-linux`), it will create additional build machines, then destroy them when they are no longer needed. The machines are managed using NixOps. 4 | 5 | To run this script: 6 | ```bash 7 | $ hydra-provisioner conf.nix 8 | ``` 9 | This command should be run periodically (e.g. every 5 minutes). `conf.nix` is a Nix expression containing a specification of when and how to create machines. For example: 10 | ```nix 11 | { 12 | 13 | # Tag used for NixOps deployments created by the provisioner. Useful 14 | # if you're running multiple provisioners. 15 | #tag = "hydra-provisioned"; 16 | 17 | # The spec must contain one or more sets named systemTypes.<type>, 18 | # where <type> is a Nix system type such as "x86_64-linux". You 19 | # can also list system features (e.g. "x86_64-linux:benchmark"), in 20 | # which case only build steps that have "requiredSystemFeatures" set to 21 | # the listed features will be executed on the machines created here. 22 | systemTypes.x86_64-linux = { 23 | 24 | # Path to NixOps module defining the deployment for this type. 25 | nixopsExpr = builtins.toPath ./deployment.nix; 26 | 27 | # The minimum number of machines to keep around for this type. 28 | #minMachines = 0; 29 | 30 | # The maximum number of machines to provision for this type. 31 | #maxMachines = 1; 32 | 33 | # Value subtracted from the number of runnables of this type. This 34 | # is the number of runnables to be performed by non-provisioned 35 | # machines, before the provisioner kicks in to create more 36 | # machines. 
37 | #ignoredRunnables = 0; 38 | 39 | # How many machines should be created given the number of 40 | # runnables. For instance, if there are 10 runnables and 41 | # runnablesPerMachine is 5, then 2 machines will be created. 42 | #runnablesPerMachine = 10; 43 | 44 | # How many jobs can be run concurrently on machines of this type. 45 | #maxJobs = 1; 46 | 47 | # The speed factor. 48 | #speedFactor = 1; 49 | 50 | # The path of the SSH private key. 51 | #sshKey = "/var/lib/hydra/queue-runner/.ssh/id_buildfarm"; 52 | 53 | # Whether to stop or destroy the machine when it's idle. 54 | #stopOnIdle = false; 55 | 56 | # Grace period in seconds before an idle machine is stopped or 57 | # destroyed. Thus, if Hydra load increases in the meantime, the 58 | # machine can be put back in action. Note that regardless of this 59 | # setting, EC2 instances are not stopped or destroyed until their 60 | # current hour of execution time has nearly expired. 61 | #gracePeriod = 0; 62 | 63 | }; 64 | 65 | # Command for getting the Hydra queue status. Useful if the provisioner 66 | # runs on a different machine from the queue runner. 67 | #statusCommand = [ "hydra-queue-runner" "--status" ]; 68 | 69 | # Command for writing the queue runner's machines file. The contents are 70 | # passed via stdin. 71 | #updateCommand = [ "/bin/sh" "-c" "cat > /var/lib/hydra/provisioner/machines" ]; 72 | } 73 | ``` 74 | The NixOps specification (e.g. `deployment.nix`) must declare one machine named `machine`. The provisioner will create a separate NixOps deployment for each machine that it creates. A typical NixOps specification looks like this: 75 | ```nix 76 | # The "type" argument corresponds to the system type (such as 77 | # "x86_64-linux:benchmark"), and can be used for creating different 78 | # kinds of machines from the same NixOps specification. 79 | { type, tag, ... }: 80 | 81 | { 82 | 83 | machine = 84 | { config, lib, pkgs, ... 
}: 85 | { 86 | deployment.targetEnv = "virtualbox"; 87 | 88 | # The queue runner will perform build actions via "nix-store --serve" 89 | # on root@<machine>, so this machine needs an authorized key for that. 90 | users.extraUsers.root.openssh.authorizedKeys.keys = lib.singleton '' 91 | command="nix-store --serve --write" ssh-dss AAAAB3NzaC1... 92 | ''; 93 | 94 | # Currently, Hydra works better with the Nix 1.10 prerelease. 95 | nix.package = pkgs.nixUnstable; 96 | 97 | # Frequent garbage collection is a good idea for build machines. 98 | nix.gc.automatic = true; 99 | nix.gc.dates = "*:0/30"; 100 | }; 101 | 102 | } 103 | ``` 104 | 105 | A real-world configuration (for `hydra.nixos.org`) can be found at https://github.com/NixOS/nixos-org-configurations/tree/master/hydra-provisioner. 106 | -------------------------------------------------------------------------------- /auto-shutdown.nix: -------------------------------------------------------------------------------- 1 | { config, lib, ... }: 2 | 3 | let timeout = 3600; in 4 | 5 | { 6 | 7 | system.activationScripts.idle-monitor = 8 | '' 9 | touch /run/keep-alive 10 | ''; 11 | 12 | systemd.services.idle-monitor = 13 | { description = "Idle Monitor"; 14 | script = '' 15 | while true; do 16 | sleep 60 17 | if [ $(($(date +%s) - $(stat -c %Y /run/keep-alive))) -gt ${toString timeout} ]; then 18 | echo "powering off after ${toString timeout} seconds of idleness..." 19 | systemctl poweroff 20 | fi 21 | done 22 | ''; 23 | serviceConfig.Restart = "always"; 24 | wantedBy = [ "multi-user.target" ]; 25 | }; 26 | 27 | } 28 | -------------------------------------------------------------------------------- /default.nix: -------------------------------------------------------------------------------- 1 | { pkgs ? import <nixpkgs> {} 2 | , nixops ? 
pkgs.nixops 3 | }: 4 | 5 | with pkgs; 6 | 7 | stdenv.mkDerivation { 8 | name = "hydra-provisioner"; 9 | 10 | buildInputs = with python2Packages; [ wrapPython python nixops ]; 11 | 12 | pythonPath = [ nixops nixUnstable ] ++ nixops.pythonPath; 13 | 14 | unpackPhase = "true"; 15 | buildPhase = "true"; 16 | 17 | installPhase = 18 | '' 19 | mkdir -p $out/bin $out/share/nix/hydra-provisioner 20 | cp ${./hydra-provisioner} $out/bin/hydra-provisioner 21 | cp ${./auto-shutdown.nix} $out/share/nix/hydra-provisioner/auto-shutdown.nix 22 | wrapPythonPrograms 23 | ''; 24 | } 25 | -------------------------------------------------------------------------------- /hydra-provisioner: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import base64 5 | import json 6 | import math 7 | import nixops.resources 8 | import nixops.statefile 9 | import os 10 | import subprocess 11 | import sys 12 | import tempfile 13 | import time 14 | 15 | own_modules = os.path.realpath(os.path.dirname(__file__) + "/../share/nix/hydra-provisioner") 16 | if not os.path.exists(own_modules): 17 | own_modules = os.path.dirname(__file__) 18 | 19 | def log(s): # Write the message s to stderr, followed by a newline. 20 | sys.stderr.write(s + "\n") 21 | 22 | def get_new_deployment_name(prefix): 23 | """Generate a unique NixOps deployment name with the given prefix: "<prefix>-<i>" for the smallest i >= 0 whose name is not taken by a deployment in the global list 'depls'.""" 24 | names = {depl.name for depl in depls} 25 | i = 0 26 | while True: 27 | name = prefix + "-" + str(i) 28 | if name not in names: break 29 | i += 1 30 | return name 31 | 32 | def get_depl_arg(depl, key, default=""): # Look up deployment argument 'key', falling back to 'default'. 33 | s = depl.args.get(key, default) 34 | return s.replace('"', '') # FIXME: escaping (for now every double quote is simply removed from the value) 35 | 36 | def get_depl_time_left(depl): # Seconds until m.next_charge_time(), clamped at 0; 0 if there is no "machine" or no next charge time. 37 | m = depl.machines.get("machine", None) 38 | if not m: return 0 39 | next_charge_time = m.next_charge_time() 40 | if not next_charge_time: return 0 41 | return max(next_charge_time - int(time.time()), 0) 42 | 43 | def depl_state(depl): # State of the deployment's "machine" resource. 44 | machine = 
depl.machines.get("machine", None) 45 | return machine.state if machine else nixops.resources.ResourceState.MISSING 46 | 47 | # Read the config file: evaluate it to JSON with nix-instantiate. 48 | if len(sys.argv) != 2: 49 | sys.stderr.write("Syntax: hydra-provisioner <config-file>\n") 50 | sys.exit(1) 51 | config_file = sys.argv[1] 52 | 53 | config = json.loads(subprocess.check_output( 54 | ["nix-instantiate", "--eval", "--strict", "--json", config_file])) 55 | 56 | if "systemTypes" not in config: config["systemTypes"] = {} 57 | 58 | tag = config.get("tag", "hydra-provisioned") 59 | 60 | # Get the current deployments; only those carrying our tag are managed. 61 | sf = nixops.statefile.StateFile(nixops.statefile.get_default_state_file()) 62 | all_depls = sf.get_all_deployments() 63 | depls = [depl for depl in all_depls if get_depl_arg(depl, "tag") == tag] 64 | 65 | # Get status info from the Hydra queue runner. A non-zero exit status is treated as the queue runner being down. 66 | # FIXME: other failures (command not found, malformed JSON) are not handled and abort the run. 67 | status_command = config.get("statusCommand", ["hydra-queue-runner", "--status"]) 68 | try: 69 | status = json.loads(subprocess.check_output(status_command)) 70 | except subprocess.CalledProcessError: 71 | status = None 72 | 73 | if not status or status["status"] == "down": 74 | status = {"status": "down", "machineTypes": {}, "machines": {}, "uptime": 0} 75 | 76 | # Squash i686-linux into x86_64-linux. We assume there are no actual 77 | # i686-linux build machines. 
78 | for type_name in status["machineTypes"].keys(): 79 | if type_name.startswith("i686-linux"): 80 | target_name = type_name.replace("i686-linux", "x86_64-linux") 81 | type_status = status["machineTypes"][type_name] 82 | if target_name in status["machineTypes"]: 83 | status["machineTypes"][target_name]["runnable"] += type_status["runnable"] 84 | else: 85 | status["machineTypes"][target_name] = type_status 86 | del status["machineTypes"][type_name] 87 | 88 | system_types = set(status["machineTypes"].keys()).union(set(config["systemTypes"].keys())) 89 | 90 | # For each machine type, determine how many machines are needed, and 91 | # create new machines if necessary. 92 | in_use = set({}) 93 | up_to_date = set({}) 94 | 95 | for type_name in system_types: 96 | type_status = status["machineTypes"].get(type_name, {"runnable": 0}) 97 | type_config = config["systemTypes"].get(type_name, None) 98 | if not type_config: 99 | log("cannot provision machines of type {0}".format(type_name)) 100 | continue 101 | 102 | runnable = type_status["runnable"] 103 | ignored_runnables = type_config.get("ignoredRunnables", 0) 104 | runnables_per_machine = type_config.get("runnablesPerMachine", 10) 105 | wanted = int(math.ceil(max(runnable - ignored_runnables, 0) / float(runnables_per_machine))) 106 | allowed = min(max(wanted, type_config.get("minMachines", 0)), type_config.get("maxMachines", 1)) 107 | log("machine type {0} has {1} runnables, wants {2} machines, will get {3} machines" 108 | .format(type_name, runnable, wanted, allowed)) 109 | 110 | def depl_sort_key(depl): 111 | x = [depl_state(depl) != nixops.resources.ResourceState.UP] 112 | return x 113 | 114 | existing = [depl for depl in depls if get_depl_arg(depl, "type") == type_name] 115 | existing.sort(key=depl_sort_key) 116 | 117 | # FIXME: error handling. 118 | have = 0 119 | created = 0 120 | while have < allowed: 121 | check = False 122 | 123 | if len(existing) == 0: 124 | # Create a new machine. 
125 | # FIXME: make this transactional. 126 | name = get_new_deployment_name(tag) 127 | 128 | depl = sf.create_deployment() 129 | depl.name = name 130 | depl.set_argstr("type", type_name) 131 | depl.set_argstr("tag", tag) 132 | depls.append(depl) 133 | all_depls.append(depl) 134 | 135 | log("created deployment ‘{0}’ of type ‘{1}’".format(name, type_name)) 136 | created += 1 137 | 138 | else: 139 | depl = existing[0] 140 | 141 | if depl_state(depl) == nixops.resources.ResourceState.UP: 142 | # We have an existing machine and it's up. Check 143 | # whether it's really up, and if so, use it. 144 | 145 | depl.machines["machine"].check() # FIXME: only do this periodically 146 | if depl_state(depl) != nixops.resources.ResourceState.UP: 147 | # It's not actually up. Re-sort the candidates and retry. 148 | existing.sort(key=depl_sort_key) 149 | continue 150 | 151 | #up_to_date.add(depl) # FIXME 152 | 153 | elif depl_state(depl) == nixops.resources.ResourceState.MISSING: # nothing to reuse here; drop it from the candidate list 154 | existing.pop(0) 155 | continue 156 | 157 | existing.pop(0) 158 | 159 | depl.nix_exprs = [os.path.abspath(type_config["nixopsExpr"])] 160 | depl.nix_path = [nixops.util.abs_nix_path(x) for x in type_config.get("nixPath", [])] 161 | 162 | in_use.add(depl) 163 | 164 | have += 1 165 | 166 | if created >= 1: break # stop once a deployment has been created in this pass 167 | 168 | # Keep recently used machines in nix.machines. 169 | expired = set({}) 170 | unusable = set({}) 171 | for depl in depls: 172 | if depl in in_use: continue 173 | 174 | if depl_state(depl) not in [nixops.resources.ResourceState.UP, nixops.resources.ResourceState.STARTING]: 175 | expired.add(depl) 176 | continue 177 | 178 | type_name = get_depl_arg(depl, "type") 179 | type_config = config["systemTypes"].get(type_name, None) 180 | type_status = status["machineTypes"].get(type_name, None) 181 | 182 | grace_period = type_config.get("gracePeriod", 0) if type_config else 0 183 | 184 | # Keep machines that still have at least 30 minutes of paid time 185 | # left. 
186 | time_left = get_depl_time_left(depl) 187 | if time_left >= 30 * 60: 188 | log("keeping deployment ‘{0}’ because it has {1}s left".format(depl.name, time_left)) 189 | in_use.add(depl) 190 | continue 191 | 192 | # Keep machines that are currently in use. FIXME: we may want to 193 | # destroy them anyway, in order not to keep an excessive number of 194 | # machines around. Hydra will retry aborted build steps anyway. 195 | m = depl.machines.get("machine", None) 196 | machine_status = status["machines"].get("root@" + m.get_ssh_name(), {}) 197 | if machine_status and machine_status.get("currentJobs", 0) != 0: 198 | log("keeping active deployment ‘{0}’".format(depl.name)) 199 | in_use.add(depl) 200 | 201 | # If this machine doesn't have a grace period, then don't add 202 | # it to the machines list. This prevents new builds from 203 | # starting. NOTE(review): the test below marks the machine unusable when a grace period IS set, which reads as the opposite of this comment; confirm the intended condition. 204 | if grace_period > 0: 205 | unusable.add(depl) 206 | 207 | continue 208 | 209 | # Keep machines that have been used within the last ‘gracePeriod’ 210 | # seconds. 211 | last_active = type_status.get("lastActive", 0) if type_status else 0 212 | if last_active == 0: last_active = int(time.time()) - status["uptime"] + 1800 # no recorded activity: use (now - uptime) + 1800 instead; uptime is presumably in seconds 213 | 214 | if int(time.time()) - last_active < grace_period: 215 | log("keeping recently used deployment ‘{0}’".format(depl.name)) 216 | in_use.add(depl) 217 | continue 218 | 219 | expired.add(depl) 220 | 221 | # Deploy the active machines. FIXME: do in parallel. 222 | deployed = set({}) 223 | for depl in in_use: 224 | if depl not in up_to_date: 225 | log("updating deployment ‘{0}’...".format(depl.name)) 226 | depl.extra_nix_path.append("hydra-provisioner=" + own_modules) 227 | try: 228 | depl.deploy(check=True) 229 | depl.machines["machine"].ssh.run_command(["touch", "/run/keep-alive"]) 230 | deployed.add(depl) 231 | except Exception as e: 232 | log("error deploying ‘{0}’: {1}".format(depl.name, e)) 233 | continue 234 | deployed.add(depl) 235 | 236 | # Generate the new nix.machines: one line per deployed, usable machine. 
237 | machines_list = [] 238 | for depl in deployed: 239 | if depl in unusable: continue 240 | 241 | m = depl.machines.get("machine", None) 242 | assert(m) 243 | 244 | type_name = get_depl_arg(depl, "type") 245 | type_config = config["systemTypes"][type_name] 246 | 247 | if ":" not in type_name: type_name += ":" 248 | (systems, features) = type_name.split(":", 1) 249 | systems_list = systems.split(",") 250 | features_list = features.split(",") if features != "" else [] 251 | if "x86_64-linux" in systems_list and "i686-linux" not in systems_list: 252 | systems_list.append("i686-linux") 253 | 254 | columns = [ 255 | "root@" + m.get_ssh_name(), 256 | ",".join(systems_list), 257 | type_config.get("sshKey", "-"), 258 | str(type_config.get("maxJobs", 1)), 259 | str(type_config.get("speedFactor", 1)), 260 | ",".join(features_list) if features_list else "-", 261 | ",".join(features_list) if features_list else "-", 262 | base64.b64encode(m.public_host_key) if m.public_host_key else "-" 263 | ] 264 | 265 | assert(all(c != "" for c in columns)) 266 | 267 | machines_list.append(" ".join(columns) + "\n") 268 | 269 | machines_file = "".join(machines_list) 270 | update_command = config.get("updateCommand", None) 271 | if update_command: 272 | machines_tmp = tempfile.NamedTemporaryFile() 273 | machines_tmp.write(machines_file) 274 | machines_tmp.seek(0) 275 | subprocess.check_call(update_command, stdin=machines_tmp) 276 | else: 277 | nixops.util.write_file("/var/lib/hydra/provisioner/machines", machines_file) 278 | 279 | # Stop or destroyed unused machines. 280 | for depl in expired: 281 | type_name = get_depl_arg(depl, "type") 282 | type_config = config["systemTypes"].get(type_name, None) 283 | 284 | if depl_state(depl) in [nixops.resources.ResourceState.UP, nixops.resources.ResourceState.STARTING]: 285 | 286 | # Don't stop/destroy machines that still have at least 10 minutes 287 | # of paid time left. 
288 | time_left = get_depl_time_left(depl) 289 | if time_left >= 10 * 60: 290 | log("not stopping/destroying deployment ‘{0}’ because it has {1}s left".format(depl.name, time_left)) 291 | continue 292 | 293 | stop_on_idle = type_config.get("stopOnIdle", False) if type_config else False # types missing from the config fall through to destroy 294 | 295 | if stop_on_idle: 296 | if depl_state(depl) != nixops.resources.ResourceState.STOPPED: 297 | log("stopping deployment ‘{0}’".format(depl.name)) 298 | depl.stop_machines() 299 | 300 | else: 301 | log("destroying deployment ‘{0}’".format(depl.name)) 302 | depl.logger.set_autoresponse("y") # presumably pre-answers NixOps' confirmation prompt so the destroy runs unattended; confirm 303 | depl.destroy_resources() 304 | depl.delete() 305 | --------------------------------------------------------------------------------