├── .gitignore
├── Makefile
├── README.md
├── myocamlbuild.ml
├── opam
│   ├── descr
│   └── opam
├── src
│   ├── app
│   │   └── main.ml
│   ├── ketrew_backend
│   │   └── plugin.ml
│   ├── lib
│   │   ├── aws_batch_job.ml
│   │   ├── aws_batch_queue.ml
│   │   ├── client.ml
│   │   ├── cluster.ml
│   │   ├── cluster.mli
│   │   ├── command_line.ml
│   │   ├── error.ml
│   │   ├── gke_cluster.ml
│   │   ├── gke_cluster.mli
│   │   ├── hyper_shell.ml
│   │   ├── hyper_shell.mli
│   │   ├── internal_pervasives.ml
│   │   ├── job.ml
│   │   ├── job.mli
│   │   ├── job_common.ml
│   │   ├── kube_job.ml
│   │   ├── kube_job.mli
│   │   ├── local_docker_job.ml
│   │   ├── local_docker_job.mli
│   │   ├── log.ml
│   │   ├── log.mli
│   │   ├── server.ml
│   │   ├── server.mli
│   │   ├── storage.ml
│   │   └── storage.mli
│   └── test
│       ├── client_server.ml
│       ├── cocloketrew.ml
│       ├── ketrew_env_config.ml
│       └── workflow_test.ml
└── tools
    └── docker
        ├── Dockerfile
        ├── README.md
        ├── biokepi_machine.ml
        ├── configuration.env
        └── please.sh

/.gitignore:
--------------------------------------------------------------------------------
1 | /.merlin
2 | /.ocamlinit
3 | *.byte
4 | *.cma
5 | _build/
6 | /_test/
7 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 |
2 | OCAMLBUILD=ocamlbuild -use-ocamlfind -plugin-tag "package(solvuu-build,nonstd)"
3 | include _build/project.mk
4 | _build/project.mk:
5 | 	$(OCAMLBUILD) $(notdir $@)
6 |
7 | .PHONY: merlin
8 | merlin:
9 | 	rm -f .merlin _build/.merlin && $(MAKE) .merlin && cat .merlin
10 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Coclobas
2 | ========
3 |
4 | Coclobas is a scheduler for HPC-like jobs accessible through HTTP.
5 |
6 | It can be set up with three kinds of configurations:
7 |
8 | - Using Kubernetes and the *Google Container Engine*,
9 |   i.e. using a Kubernetes “elastic” cluster set up with `gcloud` and submitting
10 |   jobs as Kubernetes “pods”.
11 | - Using the server's machine as a one-node cluster and submitting jobs as Docker
12 |   containers, up to a configured maximum number of concurrent jobs.
13 | - *(Experimental)* using an AWS-Batch job queue with Docker containers.
14 |
15 | Coclobas provides logging facilities (e.g. maintaining logs long after
16 | Kubernetes disposes of them).
17 |
18 | Finally, it makes it easy to submit arbitrary scripts to be run in any Docker
19 | container, which is simpler than driving raw Kubernetes or Docker for
20 | arbitrarily complex jobs.
21 |
22 | If the `ketrew` package is present, Coclobas comes with a Ketrew plugin
23 | (loadable dynamically)
24 | and a build of the Ketrew binary with the plugin already loaded in (for
25 | deployment convenience).
26 |
27 |
28 | Build
29 | -----
30 |
31 | Coclobas 0.0.1 is in `opam` (supports GKE/Kubernetes and local-docker modes).
32 |
33 | You can just use Opam to get things going quickly:
34 |
35 |     opam pin add coclobas https://github.com/hammerlab/coclobas.git
36 |
37 | Or you may clone this repo and:
38 |
39 |     make
40 |
41 | Requirements
42 | ------------
43 |
44 | In GKE/Kubernetes mode, `gcloud` and `kubectl`
45 | must be installed (and authenticated) on the machine running the Coclobas server.
46 |
47 | In Local/Docker mode, `docker` must be present (and accessible to the Coclobas
48 | server's user).
49 |
50 | In AWS-Batch mode, the `aws` command-line application must be present (and
51 | recent enough to have `aws batch` enabled).
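
A quick way to sanity-check these requirements from the Coclobas server's user
(a minimal sketch; adapt the commands to your installation):

    which gcloud kubectl            # GKE/Kubernetes mode
    docker info                     # Local/Docker mode
    aws batch describe-job-queues   # AWS-Batch mode; fails on aws-cli versions without `aws batch`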
52 |
53 | Using Coclobas
54 | --------------
55 |
56 | ### Configuration
57 |
58 | You first need to create a “root” directory, see:
59 |
60 |     coclobas configure --help
61 |
62 | Example 1: GKE/Kubernetes mode:
63 |
64 |     coclobas configure --root $root \
65 |         --cluster-kind gke \
66 |         --gke-cluster-name "my-coclotest-cluster" \
67 |         --gcloud-zone "us-east1-c" \
68 |         --max-nodes 5
69 |
70 | Example 2: Local/Docker mode:
71 |
72 |     coclobas configure --root $root \
73 |         --cluster-kind local-docker \
74 |         --max-nodes 5
75 |
76 | Example 3: AWS-Batch mode:
77 |
78 |     coclobas configure --root $root \
79 |         --cluster-kind aws-batch-queue \
80 |         --aws-queue-name awsuser-jq01 \
81 |         --min-sleep 4 \
82 |         --max-update-errors 4 \
83 |         --max-nodes 4
84 |
85 | ### Start The Server
86 |
87 | Now that Coclobas has a configuration (this information is stored under the
88 | `--root` directory, so you can have many Coclobas servers configured, each in
89 | a different directory), you'll want to start it:
90 |
91 |
92 |     coclobas start-server --root $root --port 8999
93 |
94 |
95 | You can use `curl http://127.0.0.1:8999/status` to see whether Coclobas is
96 | ready to go (if it says `Initializing`, it usually means that it is setting up
97 | a GKE cluster, which takes a few minutes).
98 |
99 | ### Submitting Jobs
100 |
101 | The most common way of submitting jobs is through the Ketrew plugin, see
102 | examples in `src/test/workflow_test.ml`.
103 |
104 | In GKE/Kubernetes mode:
105 |
106 | ```ocaml
107 | workflow_node without_product ~name:"Coclobas job using GKE/Kubernetes"
108 |   ~make:(
109 |     Coclobas_ketrew_backend.Plugin.kubernetes_program
110 |       ~base_url:"http://127.0.0.1:8999/"
111 |       ~image:"ubuntu"
112 |       ~volume_mounts:[
113 |         `Nfs (
114 |           Coclobas.Kube_job.Specification.Nfs_mount.make
115 |             ~host:"nfs-server.example.com"
116 |             ~path:"/path/inside/nfs-server"
117 |             ~point:"/mount/point/" ())
118 |       ]
119 |       Program.(
120 |         chain [
121 |           shf "hostname";
122 |           shf "du -sh /mount/point";
123 |           shf "sleep 60";
124 |         ]
125 |       )
126 |   )
127 | ```
128 |
129 | In Local/Docker mode:
130 |
131 | ```ocaml
132 | workflow_node without_product ~name:"Coclobas test of local-docker jobs"
133 |   ~make:(
134 |     Coclobas_ketrew_backend.Plugin.local_docker_program
135 |       ~base_url:"http://127.0.0.1:8999/"
136 |       ~image:"ubuntu"
137 |       ~volume_mounts:[
138 |         `Local ("/usr/bin", "/hostusrbin")
139 |       ]
140 |       Program.(
141 |         exec ["find"; "/hostusrbin"]
142 |       )
143 |   )
144 | ```
145 |
146 | ### Querying the Server
147 |
148 | All the “job inspection” features are available through the Ketrew UIs as
149 | “backend queries.”
150 |
151 | One can also directly query the server through the command line,
152 | see `coclobas client --help`, for example:
153 |
154 |     coclobas client --server-url http://127.0.0.1:8999 list
155 |
156 | (A few more client invocation sketches are collected at the end of this README.)
157 |
158 | Contact
159 | -------
160 |
161 | If you have any questions, you may submit an
162 | [issue](https://github.com/hammerlab/coclobas/issues), or join
163 | the authors on the public “Slack” channel of the Hammer Lab:
164 | [![Slack Status](http://publicslack.hammerlab.org/badge.svg)](http://publicslack.hammerlab.org)
165 |
166 | License
167 | -------
168 |
169 | It's [Apache 2.0](http://www.apache.org/licenses/LICENSE-2.0).
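
More Client Examples
--------------------

A few more `coclobas client` sketches, following the actions documented in
`coclobas client --help` (`<some-job-id>` is a placeholder for an ID returned
at submission time):

    coclobas client --server-url http://127.0.0.1:8999 status <some-job-id>
    coclobas client --server-url http://127.0.0.1:8999 describe <some-job-id>
    coclobas client --server-url http://127.0.0.1:8999 kill <some-job-id>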
169 | 170 | -------------------------------------------------------------------------------- /myocamlbuild.ml: -------------------------------------------------------------------------------- 1 | open Nonstd 2 | open Solvuu_build.Std 3 | 4 | let project_name = "coclobas" 5 | let version = "0.0.3-dev" 6 | 7 | let build_tests = 8 | try Sys.getenv "WITH_TESTS" = "true" with _ -> false 9 | 10 | let findlib_deps = [ 11 | "nonstd"; 12 | "sosa"; 13 | "pvem_lwt_unix"; 14 | "cohttp.lwt"; 15 | "trakeva_of_uri"; 16 | "cmdliner"; 17 | "ppx_deriving.std"; 18 | "ppx_deriving_yojson"; 19 | "uuidm"; 20 | "base64"; 21 | "odate"; 22 | ] 23 | 24 | let meta_dot_ml = "src/lib/meta.ml" 25 | 26 | let generate_meta_data () = 27 | let git_last_commit () = 28 | try 29 | Some ( 30 | Ocamlbuild_pack.My_unix.run_and_read "git rev-parse HEAD" 31 | |> fun x -> String.sub x 0 (String.length x - 1) 32 | ) 33 | with _ -> None in 34 | Solvuu_build.Util.Rule.rule 35 | ~name:"meta-data-generation" 36 | ~prods:[meta_dot_ml] 37 | ~deps:[] 38 | ~insert:`bottom 39 | begin fun env builder -> 40 | let lines = List.map ~f:(sprintf "%s\n") [ 41 | "(** Metadata Module Generated by the Build System *)"; 42 | ""; 43 | sprintf "let version = %S" version; 44 | ""; 45 | sprintf "let git_commit = %s" 46 | (Option.value_map (git_last_commit ()) ~default:"None" 47 | ~f:(sprintf "Some %S")); 48 | ] in 49 | let open Ocamlbuild_plugin in 50 | Seq [ 51 | Echo (lines, meta_dot_ml); 52 | ] 53 | end 54 | 55 | let lib : Project.item = 56 | Project.lib project_name 57 | ~thread:() 58 | ~bin_annot:() 59 | ~findlib_deps 60 | ~ml_files:(`Add [Filename.basename meta_dot_ml]) 61 | ~dir:"src/lib" 62 | ~style:(`Pack project_name) 63 | 64 | let ketrew_backend : Project.item option = 65 | let item = 66 | Project.lib (project_name ^ "_ketrew_backend") 67 | ~thread:() 68 | ~bin_annot:() 69 | ~findlib_deps:("ketrew" :: findlib_deps) 70 | ~dir:"src/ketrew_backend" 71 | ~style:(`Pack (project_name ^ "_ketrew_backend")) 72 | ~internal_deps:[lib] 73 | ~install:(`Findlib (project_name ^ ".ketrew_backend")) 74 | in 75 | if Project.dep_opts_sat item ["ketrew"] 76 | then Some item 77 | else None 78 | 79 | let app : Project.item = 80 | Project.app project_name 81 | ~thread:() 82 | ~file:"src/app/main.ml" 83 | ~internal_deps:[lib] 84 | 85 | let test_findlib_deps = [ 86 | "ppx_deriving_cmdliner" 87 | ] 88 | 89 | let test : Project.item option = 90 | if build_tests 91 | then Some ( 92 | Project.app (project_name ^ "-test") 93 | ~thread:() 94 | ~bin_annot:() 95 | ~file:"src/test/client_server.ml" 96 | ~install:`No 97 | ~internal_deps:[lib] 98 | ~findlib_deps:test_findlib_deps 99 | ) else None 100 | 101 | let linked_ketrew : Project.item option = 102 | match ketrew_backend with 103 | | Some kb -> 104 | Some ( 105 | Project.app (project_name ^ "-ketrew") 106 | ~thread:() 107 | ~bin_annot:() 108 | ~file:"src/test/cocloketrew.ml" 109 | ~internal_deps:[lib; kb] 110 | ) 111 | | _ -> None 112 | 113 | let test_ketrew_workflow : Project.item option = 114 | match build_tests, ketrew_backend with 115 | | true, Some kb -> 116 | Some ( 117 | Project.app (project_name ^ "-ketrew-workflow-test") 118 | ~thread:() 119 | ~bin_annot:() 120 | ~install:`No 121 | ~file:"src/test/workflow_test.ml" 122 | ~internal_deps:[lib; kb] 123 | ~findlib_deps:test_findlib_deps 124 | ) 125 | | _, _ -> None 126 | 127 | let ocamlinit_postfix = [ 128 | sprintf "open %s" (String.capitalize_ascii project_name); 129 | ] 130 | 131 | let () = 132 | Project.basic1 ~project_name ~version ~ocamlinit_postfix 133 | 
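(* generate_meta_data registers the rule that emits src/lib/meta.ml (version and git-commit) at build time. *)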
~additional_rules:[generate_meta_data]
134 |     (List.filter_opt [
135 |       Some lib;
136 |       Some app;
137 |       test;
138 |       ketrew_backend;
139 |       linked_ketrew;
140 |       test_ketrew_workflow;
141 |     ])
142 |
--------------------------------------------------------------------------------
/opam/descr:
--------------------------------------------------------------------------------
1 | Coclobas is a scheduler for HPC-like jobs accessible through HTTP
2 |
3 | It can be set up with three configurations:
4 |
5 | - Using Kubernetes and the *Google Container Engine*,
6 |   i.e. using a Kubernetes “elastic” cluster set up with `gcloud` and submitting
7 |   jobs as Kubernetes “pods”.
8 | - Using the server's machine as a one-node cluster and submitting jobs as docker
9 |   containers, up to a maximum number of jobs.
10 | - Using AWS-Batch, submitting jobs with aws-cli and optionally using an
11 |   S3 bucket to share data (new in 0.0.2, and a bit experimental).
12 |
13 | Coclobas is mostly used as a Ketrew plugin, but can be driven separately.
14 |
--------------------------------------------------------------------------------
/opam/opam:
--------------------------------------------------------------------------------
1 | opam-version: "1.2"
2 | maintainer: "Seb Mondet "
3 | authors: "Seb Mondet "
4 | homepage: "https://github.com/hammerlab/coclobas"
5 | bug-reports: "https://github.com/hammerlab/coclobas/issues"
6 | dev-repo: "https://github.com/hammerlab/coclobas.git"
7 | license: "Apache 2.0"
8 |
9 | build: [
10 |   [make "byte"]
11 |   [make "native"]
12 |   [make "META"]
13 |   [make "coclobas.install"]
14 | ]
15 |
16 |
17 | depends: [
18 |   "ocamlfind" {build}
19 |   "ocamlbuild" {build}
20 |   "solvuu-build" {build & >= "0.3.0"}
21 |   "nonstd"
22 |   "sosa"
23 |   "pvem_lwt_unix"
24 |   "cohttp" "lwt"
25 |   "trakeva"
26 |   "cmdliner"
27 |   "ppx_deriving"
28 |   "ppx_deriving_yojson" {>= "3.0"}
29 |   "uuidm"
30 |   "base64"
31 |   "odate"
32 | ]
33 | depopts: [
34 |   "ketrew"
35 | ]
36 |
37 | available: [
38 |   ocaml-version >= "4.03.0"
39 | ]
--------------------------------------------------------------------------------
/src/app/main.ml:
--------------------------------------------------------------------------------
1 |
2 | let () =
3 |   Coclobas.Command_line.main ()
4 |
--------------------------------------------------------------------------------
/src/ketrew_backend/plugin.ml:
--------------------------------------------------------------------------------
1 |
2 | open Coclobas
3 | open Internal_pervasives
4 | let (//) = Filename.concat
5 |
6 | let name = "coclobas"
7 |
8 | module Run_parameters = struct
9 |
10 |   type created = {
11 |     client: Client.t;
12 |     specification: Job.Specification.t [@main];
13 |     program: Ketrew_pure.Program.t option;
14 |     playground_path: string option;
15 |   } [@@deriving yojson, make]
16 |   type running = {
17 |     created: created;
18 |     job_id: string;
19 |   } [@@deriving yojson]
20 |   type t = [
21 |     | `Created of created
22 |     | `Running of running
23 |   ] [@@deriving yojson]
24 |
25 |   let show t = to_yojson t |> Yojson.Safe.pretty_to_string
26 |
27 |   let serialize run_parameters =
28 |     to_yojson run_parameters
29 |     |> Yojson.Safe.pretty_to_string ~std:true
30 |
31 |   let deserialize_exn s =
32 |     let open Ppx_deriving_yojson_runtime.Result in
33 |     Yojson.Safe.from_string s |> of_yojson
34 |     |> function
35 |     | Ok o -> o
36 |     | Error e -> failwith e
37 | end
38 |
39 | let create ~base_url ?playground_path ?program specification =
40 |   let created =
41 |     Run_parameters.make_created ?playground_path
42 |       ?program
~client:(Client.make base_url) specification in 43 | `Long_running (name, `Created created |> Run_parameters.serialize) 44 | 45 | let kubernetes_program ~base_url ~image ?(volume_mounts = []) p = 46 | let script_path = "/coclo-kube/mount/script" in 47 | let script = 48 | Kube_job.Specification.File_contents_mount.make 49 | ~path:script_path 50 | Ketrew_pure.Monitored_script.( 51 | create p 52 | ~playground:(Ketrew_pure.Path.absolute_directory_exn 53 | "/tmp/playground") 54 | |> to_string 55 | ) in 56 | let spec = 57 | Kube_job.Specification.make 58 | ~image 59 | ~volume_mounts:(`Constant script :: volume_mounts) 60 | ["bash"; script_path] 61 | |> Job.Specification.kubernetes 62 | in 63 | create ~base_url ~program:p spec 64 | 65 | let extra_mount_container_side = "/coclobas-ketrew-plugin-playground" 66 | let script_filename = "program-monitored-script" 67 | 68 | let local_docker_program 69 | ?(shell = "bash") 70 | ?cpus ?memory ?tmp_dir ~base_url ~image ?(volume_mounts = []) p = 71 | let tmp_dir = 72 | match tmp_dir with 73 | | Some d -> d 74 | | None -> try Sys.getenv "TMPDIR" with _ -> "/tmp/coclolocal" 75 | in 76 | let playground_dir = 77 | let id = Uuidm.(v5 (create `V4) "coclojob" |> to_string ~upper:false) in 78 | sprintf "%s-playground" id in 79 | let extra_mount = 80 | `Local (tmp_dir, extra_mount_container_side) in 81 | let playground_path = tmp_dir // playground_dir in 82 | create ~base_url ~program:p ~playground_path 83 | (Coclobas.Job.Specification.local_docker 84 | Coclobas.Local_docker_job.Specification.( 85 | make ~image 86 | ~volume_mounts:(extra_mount :: volume_mounts) 87 | ?cpus ?memory 88 | [shell; 89 | extra_mount_container_side // playground_dir // script_filename] 90 | )) 91 | 92 | let aws_batch_program ~base_url ~image ?(volume_mounts = []) 93 | ?(shell = "bash") 94 | ?memory ?cpus ?job_role p = 95 | let script_path = "/tmp/coclobas-script.sh" in 96 | let cmd = [shell; script_path ] in 97 | let script = 98 | Ketrew_pure.Monitored_script.( 99 | create p 100 | ~playground:(Ketrew_pure.Path.absolute_directory_exn "/tmp") 101 | |> to_string 102 | ) in 103 | create ~base_url ~program:p (* ~playground_path *) 104 | (Coclobas.Job.Specification.aws_batch 105 | Coclobas.Aws_batch_job.Specification.( 106 | let extra_mount = 107 | `S3_constant (File_contents_mount.make ~path:script_path (script)) in 108 | make ~priviledged:true ?memory ?cpus ?job_role 109 | ~image ~volume_mounts:(extra_mount :: volume_mounts) cmd)) 110 | 111 | module Long_running_implementation : Ketrew.Long_running.LONG_RUNNING = struct 112 | 113 | type run_parameters = Run_parameters.t 114 | include Run_parameters 115 | 116 | let name = "coclobas" 117 | 118 | module KLRU = Ketrew.Long_running_utilities 119 | 120 | let classify_client_error m = 121 | m >>< function 122 | | `Ok o -> return o 123 | | `Error (`Client (`IO_exn _ as cl)) -> fail (`Recoverable (Client.Error.to_string cl)) 124 | | `Error (`Client (`Response _ as cl)) -> fail (`Recoverable (Client.Error.to_string cl)) 125 | | `Error (`Client cl) -> fail (`Fatal (Client.Error.to_string cl)) 126 | | `Error (`Coclo_plugin (`Expecting_one_status l)) -> 127 | fail (`Fatal (sprintf "Expecting one status but got: [%s]" 128 | (List.map l ~f:(fun (id, st) -> id) 129 | |> String.concat ~sep:"; "))) 130 | | `Error (`Coclo_plugin (`Preparing_script msg)) -> 131 | fail (`Fatal (sprintf "Preparing script: %s" msg)) 132 | 133 | let start : 134 | run_parameters -> 135 | host_io:Ketrew.Host_io.t -> 136 | (run_parameters, Ketrew.Host_io.Error.classified) 
Deferred_result.t = 137 | fun rp ~host_io -> 138 | match rp with 139 | | `Running _ -> 140 | ksprintf KLRU.fail_fatal "start on already running: %s" 141 | (Run_parameters.show rp) 142 | | `Created ({client; specification; 143 | program; playground_path} as created) -> 144 | classify_client_error begin 145 | begin match Job.Specification.kind specification, 146 | playground_path, program with 147 | | `Local_docker, None, Some _ -> 148 | fail (`Coclo_plugin (`Preparing_script 149 | "Program provided but missing playground!")) 150 | | `Local_docker, Some playground, Some prog -> 151 | begin 152 | let script = 153 | Ketrew_pure.Monitored_script.( 154 | create prog 155 | ~playground:(Ketrew_pure.Path.absolute_directory_exn 156 | playground) 157 | |> to_string 158 | ) in 159 | (* Permissions set to 0o777 because very often, inside 160 | the docker container, we have a different user. *) 161 | System.ensure_directory_path ~perm:0o777 playground 162 | >>= fun () -> 163 | IO.write_file ~content:script (playground // script_filename) 164 | end >>< begin function 165 | | `Ok () -> return () 166 | | `Error e -> 167 | let msg = 168 | sprintf "I/O/System Error: %s" 169 | (match e with 170 | | `IO _ as e -> IO.error_to_string e 171 | | `System _ as e -> System.error_to_string e) in 172 | fail (`Coclo_plugin (`Preparing_script msg)) 173 | end 174 | | `Aws_batch, _, _ -> 175 | return () 176 | | `Local_docker, _, None (* No program means, no need for playground *) 177 | | `Kube, _, _ -> return () 178 | end 179 | >>= fun () -> 180 | Client.submit_job client specification 181 | >>= fun job_id -> 182 | return (`Running {created; job_id}) 183 | end 184 | 185 | let running rp f = 186 | match rp with 187 | | `Created _ -> 188 | ksprintf KLRU.fail_fatal "update on not started: %s" 189 | (Run_parameters.show rp) 190 | | `Running {created = {client; specification; _}; job_id; _} -> 191 | f job_id client specification 192 | 193 | let update : 194 | run_parameters -> 195 | host_io:Ketrew.Host_io.t -> 196 | ([ `Failed of run_parameters * string 197 | | `Still_running of run_parameters 198 | | `Succeeded of run_parameters ], Ketrew.Host_io.Error.classified) 199 | Deferred_result.t = 200 | fun rp ~host_io -> 201 | running rp begin fun job_id client spec -> 202 | classify_client_error begin 203 | Client.get_job_states client [job_id] 204 | >>= fun jobs -> 205 | begin match jobs with 206 | | [(_, job)] -> 207 | begin match Job.status job with 208 | | `Finished (_,`Succeeded) -> 209 | return (`Succeeded rp) 210 | | `Error e -> 211 | return (`Failed (rp, sprintf "Error-from-coclobas: %s" e)) 212 | | `Finished (_, (`Failed | `Killed)) -> 213 | return (`Failed (rp, "job-failed-or-killed")) 214 | | `Submitted 215 | | `Started _ -> return (`Still_running rp) 216 | end 217 | | other -> 218 | fail (`Coclo_plugin (`Expecting_one_status other)) 219 | end 220 | end 221 | end 222 | 223 | let kill : 224 | run_parameters -> 225 | host_io:Ketrew.Host_io.t -> 226 | ([ `Killed of run_parameters ], Ketrew.Host_io.Error.classified) 227 | Deferred_result.t = 228 | fun rp ~host_io -> 229 | running rp begin fun id client spec -> 230 | classify_client_error begin 231 | Client.kill_jobs client [id] 232 | >>= fun () -> 233 | return (`Killed rp) 234 | end 235 | end 236 | 237 | let rec markup ?status t = 238 | let open Ketrew_pure.Internal_pervasives.Display_markup in 239 | let job_spec js = 240 | let open Coclobas.Job.Specification in 241 | let open Coclobas.Kube_job.Specification in 242 | let open Coclobas.Local_docker_job.Specification in 243 | 
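(* Helpers rendering each kind of volume-mount as a description list: *)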
let nfs_mount nfs = 244 | let open Nfs_mount in 245 | description_list [ 246 | "Host", uri nfs.host; 247 | "Path", command nfs.path; 248 | "Point", command nfs.point; 249 | "Read-only", textf "%b" nfs.read_only; 250 | "Id", command (id nfs); 251 | ] in 252 | let constant_mount cst = 253 | let open File_contents_mount in 254 | description_list [ 255 | "Path", uri cst.path; 256 | "Contents", code_block cst.contents; 257 | ] in 258 | match js with 259 | | Kube kube -> 260 | description_list [ 261 | "Image", uri kube.image; 262 | "Command", command (String.concat ~sep:" " kube.command); 263 | "Memory", (match kube.memory with `GB x -> textf "%d GB" x); 264 | "CPUs", textf "%d" kube.cpus; 265 | "Volumes", 266 | (List.map kube.volume_mounts ~f:(function 267 | | `Nfs nfs -> "NFS", nfs_mount nfs 268 | | `Constant cst -> "Constant", constant_mount cst) 269 | |> description_list); 270 | ] 271 | | Local_docker dock -> 272 | description_list [ 273 | "Image", uri dock.image; 274 | "Command", command (String.concat ~sep:" " dock.command); 275 | "CPUS", option dock.cpus ~f:(textf "%.3f"); 276 | "Memory", option dock.memory ( 277 | function 278 | | `GB x -> textf "%d GiB" x 279 | | `MB x -> textf "%d MiB" x 280 | ); 281 | "Volumes", 282 | (List.map dock.volume_mounts ~f:(function 283 | | `Local (f, t) -> "Local", textf "%s:%s" f t) 284 | |> description_list); 285 | ] 286 | | Aws_batch aws -> 287 | let open Coclobas.Aws_batch_job.Specification in 288 | description_list [ 289 | "Image", uri aws.image; 290 | "Command", command (String.concat ~sep:" " aws.command); 291 | "Memory", (match aws.memory with `MB x -> textf "%d MB" x); 292 | "CPUs", textf "%d" aws.cpus; 293 | "Privileged", textf "%b" aws.priviledged; 294 | "Job-role", option aws.job_role ~f:(function 295 | | `Arn s -> text s); 296 | "Volumes", 297 | (List.map aws.volume_mounts ~f:(function 298 | | `S3_constant c -> "S3-Constant", constant_mount c) 299 | |> description_list); 300 | ] 301 | in 302 | match t with 303 | | `Created c -> 304 | let status_mu = 305 | Option.value status ~default:(text "Created (not sent to Coclobas)") in 306 | [ 307 | "Status", status_mu; 308 | "Client", uri c.client.Coclobas.Client.base_url; 309 | "Program", option ~f:Ketrew_pure.Program.markup c.program; 310 | "Playground-path", option ~f:command c.playground_path; 311 | "Job", job_spec c.specification; 312 | ] 313 | | `Running rp -> 314 | ["Job-ID", command rp.job_id;] 315 | @ markup ?status (`Created rp.created) 316 | 317 | 318 | let log rp = 319 | [ 320 | "Coclobas", 321 | Ketrew_pure.Internal_pervasives.Display_markup.( 322 | description_list (markup rp) |> log) 323 | ] 324 | 325 | module Query_names = struct 326 | let details_status = "ketrew-markup/Job details and status" 327 | let server_status = "ketrew-markup/Server status" 328 | let describe = "ketrew-markup/Call `describe`" 329 | let logs = "ketrew-markup/Call `logs`" 330 | end 331 | 332 | let additional_queries : 333 | run_parameters -> (string * Ketrew_pure.Internal_pervasives.Log.t) list = 334 | fun rp -> 335 | let open Ketrew_pure.Internal_pervasives.Log in 336 | let open Query_names in 337 | let common = [ 338 | details_status, s "Display the contents and the status of the job"; 339 | server_status, s "Get the server status"; 340 | ] in 341 | match rp with 342 | | `Created c -> common 343 | | `Running c -> 344 | List.concat [ 345 | common; 346 | [ 347 | describe, s "Get a description of the job's current state"; 348 | logs, s "Get the `logs` blob"; 349 | ]; 350 | ] 351 | 352 | 353 | let client_query m = 
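(* Turn client-level errors into a displayable Ketrew log value: *)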
354 | m >>< function 355 | | `Ok o -> return o 356 | | `Error (`Client ce) -> 357 | fail (Ketrew_pure.Internal_pervasives.Log.verbatim (Client.Error.to_string ce)) 358 | 359 | let job_query_result_to_markup l = 360 | let open Ketrew_pure.Internal_pervasives.Display_markup in 361 | List.map l ~f:(fun (id, qr) -> 362 | description_list 363 | (("Job-id", command id) 364 | :: List.map qr ~f:(fun (section, result) -> 365 | section, 366 | begin match result with 367 | | `Url u -> uri u 368 | | `Saved_command s -> 369 | let open Hyper_shell.Saved_command in 370 | let code_block_or_empty = 371 | function 372 | | s when String.strip s = "" -> text "Empty" 373 | | other -> code_block other in 374 | let should_display_archived = ref true in 375 | let command_and_result = [ 376 | "Command", command s.command; 377 | begin match s.outcome with 378 | | `Error e -> 379 | "Error", 380 | let open Hyper_shell.Error in 381 | description_list [ 382 | "STDOUT", option ~f:code_block_or_empty e.stdout; 383 | "STDERR", option ~f:code_block_or_empty e.stderr; 384 | "Status", 385 | option e.status 386 | ~f:(text_of_stringable Pvem_lwt_unix.System.Shell.status_to_string); 387 | "Exception", option e.exn ~f:command; 388 | ] 389 | | `Ok (out, err) -> 390 | begin 391 | if Some (out ^ err) = 392 | Option.map ~f:Output_archive.to_string s.archived 393 | then should_display_archived := false 394 | end; 395 | "Success", description_list [ 396 | "STDOUT", code_block_or_empty out; 397 | "STDERR", code_block_or_empty err; 398 | ] 399 | end; 400 | ] in 401 | let archived = 402 | let display_archive a = 403 | let open Output_archive in 404 | description_list [ 405 | "On", date a.date; 406 | "STDOUT", code_block_or_empty a.out; 407 | "STDERR", code_block_or_empty a.err; 408 | ] in 409 | if !should_display_archived then [ 410 | "Archived content", option ~f:display_archive s.archived; 411 | ] else [] in 412 | command_and_result @ archived |> description_list 413 | end))) 414 | |> concat 415 | |> serialize 416 | 417 | 418 | 419 | module Markup = Ketrew_pure.Internal_pervasives.Display_markup 420 | 421 | let markup_job_state : Coclobas.Job.t -> Markup.t = fun job -> 422 | let open Markup in 423 | let status = 424 | match Coclobas.Job.status job with 425 | | `Error ee -> concat [text "Error: "; command ee] 426 | | `Started d -> concat [text "Started on "; date d] 427 | | `Finished (d, `Failed) -> 428 | concat [text "Failed on "; date d] 429 | | `Finished (d, `Succeeded) -> 430 | concat [text "Succeeded on "; date d] 431 | | `Finished (d, `Killed) -> 432 | concat [text "Killed on "; date d] 433 | | `Submitted -> text "Submitted" 434 | in 435 | let error_list = function 436 | | [] -> text "None." 
437 | | more -> 438 | let length_encoding = 439 | List.fold more ~init:[] ~f:(fun prev cur -> 440 | match prev with 441 | | (hn, hv) :: t when hv = cur -> (hn + 1, hv) :: t 442 | | _ -> (1, cur) :: prev) 443 | |> List.rev in 444 | itemize (List.map length_encoding ~f:(fun (x, err) -> 445 | concat [textf "%d × " x; code_block err])) 446 | in 447 | description_list [ 448 | "Engine-status", status; 449 | "Start-errors", Coclobas.Job.start_errors job |> error_list; 450 | "Update-errors", Coclobas.Job.update_errors job |> error_list; 451 | ] 452 | 453 | let query : 454 | run_parameters -> 455 | host_io:Ketrew.Host_io.t -> 456 | string -> 457 | (string, Ketrew_pure.Internal_pervasives.Log.t) Deferred_result.t = 458 | fun rp ~host_io query -> 459 | let open Ketrew_pure.Internal_pervasives.Log in 460 | let created = 461 | match rp with `Created c -> c | `Running {created; _} -> created in 462 | match query, rp with 463 | | ds, `Created _ when ds = Query_names.details_status -> 464 | return Markup.(markup rp |> description_list |> serialize) 465 | | ds, `Running {job_id; _} when ds = Query_names.details_status -> 466 | client_query begin 467 | Client.get_job_states created.client [job_id] 468 | >>= fun l -> 469 | let open Markup in 470 | let status = 471 | match l with 472 | | [_, one] -> markup_job_state one 473 | | other -> 474 | description_list [ 475 | "ERROR-WRONG-NUMBER-OF-STATUSES", 476 | concat ~sep:(text ", ") 477 | (List.map l ~f:(fun (id, s) -> 478 | concat [textf "Job %s: " id; markup_job_state s])); 479 | ] 480 | in 481 | return Markup.(markup rp ~status |> description_list |> serialize) 482 | end 483 | | ds, _ when ds = Query_names.server_status -> 484 | client_query begin 485 | Coclobas.Client.get_server_status_string created.client 486 | end 487 | >>= fun engine_status -> 488 | client_query (Coclobas.Client.get_job_list created.client) 489 | >>= fun job_list -> 490 | client_query (Coclobas.Client.get_cluster_description created.client) 491 | >>= fun cluster_description -> 492 | let status = 493 | let open Markup in 494 | description_list [ 495 | "Engine-status", command engine_status; 496 | "Cluster", code_block cluster_description; 497 | "Jobs", 498 | begin match job_list with 499 | | [] -> text "0 currently active" 500 | | jobs -> 501 | itemize 502 | (List.map jobs ~f:(fun (`Id is, `Status s) -> 503 | ksprintf command "%s: %s" is s)) 504 | end; 505 | ] in 506 | return (status |> Markup.serialize) 507 | | ds, `Running {job_id; _} when ds = Query_names.describe -> 508 | client_query begin 509 | Client.get_job_descriptions created.client [job_id] 510 | >>| job_query_result_to_markup 511 | end 512 | | ds, `Running {job_id; _} when ds = Query_names.logs -> 513 | client_query begin 514 | Client.get_job_logs created.client [job_id] 515 | >>| job_query_result_to_markup 516 | end 517 | | other, _ -> fail (s "Unknown query: " % s other) 518 | 519 | end 520 | let () = 521 | Ketrew.Plugin.register_long_running_plugin ~name 522 | (module Long_running_implementation) 523 | 524 | include Long_running_implementation 525 | -------------------------------------------------------------------------------- /src/lib/aws_batch_job.ml: -------------------------------------------------------------------------------- 1 | 2 | open Internal_pervasives 3 | 4 | module Specification = struct 5 | (* Cf. 
6 | http://docs.aws.amazon.com/batch/latest/APIReference/API_ContainerProperties.html 7 | *) 8 | module File_contents_mount = Kube_job.Specification.File_contents_mount 9 | type t = { 10 | image: string; 11 | memory: [ `MB of int ] [@default `MB 128]; 12 | cpus: int [@default 7]; 13 | job_role: [ `Arn of string ] option; 14 | priviledged: bool [@default false]; 15 | command: string list [@main]; 16 | (* Mounting volumes seems to be only for host directories. *) 17 | volume_mounts: [ `S3_constant of File_contents_mount.t ] list; 18 | } [@@deriving yojson, show, make] 19 | end 20 | 21 | module State = struct 22 | type t = { 23 | aws_id: string; 24 | } [@@deriving yojson, show, make] 25 | end 26 | 27 | module Error = struct 28 | type nonrec exn = exn [@@deriving show] 29 | type start = [ 30 | `Start of string * Specification.t * 31 | [ `Json_parsing of 32 | string * [ `Exn of exn | `String of string ] 33 | | `Invalid_cluster of string * Cluster.t ] 34 | ] [@@deriving show] 35 | let start ~id ~specification e = 36 | `Aws_batch_job (`Start (id, specification, e)) 37 | type status = [ 38 | `Status of [`Id of string] * [ `Aws_id of string ] 39 | * [ `Parsing_status of string ] 40 | ] [@@deriving show] 41 | let status ~id ~aws_id e = 42 | `Aws_batch_job (`Status (`Id id, `Aws_id aws_id, e)) 43 | type t = [ start | status ] [@@deriving show] 44 | end 45 | 46 | let command_must_succeed ~log ?additional_json ~id cmd = 47 | Hyper_shell.command_must_succeed ~log cmd ?additional_json 48 | ~section:["job"; id; "commands"] 49 | let command_must_succeed_with_output ~log ?additional_json ~id cmd = 50 | Hyper_shell.command_must_succeed_with_output ~log cmd ?additional_json 51 | ~section:["job"; id; "commands"] 52 | 53 | let job_definition id = sprintf "coclodef-%s" id 54 | let job_name id = sprintf "coclojob-%s" id 55 | 56 | let canonicalize p = 57 | String.split ~on:(`Character '/') p 58 | |> begin function 59 | | [] | [""] | [""; ""] -> [""; ""] 60 | | "" :: more -> "" :: List.filter more ~f:((<>) "") 61 | | other -> List.filter other ~f:((<>) "") 62 | end 63 | |> String.concat ~sep:"/" 64 | 65 | let start ~cluster ~log ~id ~specification = 66 | let open Specification in 67 | Cluster.( match cluster with 68 | | Kube _ 69 | | Local_docker _ -> 70 | fail (Error.start ~id ~specification (`Invalid_cluster 71 | ("Not an AWS-queue", cluster))) 72 | | Aws_batch_queue q -> 73 | Aws_batch_queue.(queue_name q, s3_bucket q) |> return) 74 | >>= fun (job_queue, s3_bucket_opt) -> 75 | Deferred_list.while_sequential specification.volume_mounts ~f:begin function 76 | | `S3_constant {File_contents_mount. 
path; contents} -> 77 | begin match s3_bucket_opt with 78 | | None -> 79 | fail (Error.start ~id ~specification 80 | (`Invalid_cluster ("No S3 bucket configured", cluster))) 81 | | Some s -> return s 82 | end 83 | >>= fun s3_prefix -> 84 | let s3_filename = 85 | sprintf "Coclo-%s-%s-%d-%s" 86 | id Digest.(path ^ contents ^ id |> string |> to_hex) 87 | Random.(int 100_000_000) 88 | Filename.(basename path) in 89 | let local_path = sprintf "/tmp/%s" s3_filename in 90 | IO.write_file local_path ~content:contents 91 | >>= fun () -> 92 | let s3_uri = Uri.of_string s3_prefix in 93 | let bucket = Uri.host_with_default ~default:"" s3_uri in 94 | let s3_prefix_path = Uri.path s3_uri |> canonicalize in 95 | let s3_full_path = Filename.concat s3_prefix_path s3_filename in 96 | ksprintf 97 | (command_must_succeed ~log ~id) 98 | "aws s3 cp %s s3://%s%s \ 99 | --grants read=uri=http://acs.amazonaws.com/groups/global/AllUsers" 100 | local_path bucket s3_full_path 101 | >>= fun () -> 102 | let download_cmd = 103 | let mkdir = 104 | sprintf " mkdir -m 777 -p %s || echo mkdir-failed " 105 | (Filename.dirname path) in 106 | let curl = 107 | let url_path = 108 | Filename.concat s3_prefix_path s3_filename in 109 | sprintf "curl http://%s.s3-website-us-east-1.amazonaws.com%s -o %s" 110 | bucket url_path path in 111 | sprintf "{ echo 'Downloading %s' ; { %s ; sudo %s ; %s || sudo %s ;} \ 112 | || echo 'Downloading %s failed' ; } " 113 | s3_full_path 114 | mkdir mkdir curl curl 115 | s3_full_path 116 | in 117 | return (download_cmd) 118 | end 119 | >>= fun (s3_commands : string list) -> 120 | let json = 121 | let string s : Yojson.Safe.json = `String s in 122 | let int s : Yojson.Safe.json = `Int s in 123 | let command = 124 | match s3_commands with 125 | | [] -> specification.command 126 | | more -> 127 | [ 128 | "bash"; "-c"; 129 | String.concat ~sep:" && " 130 | (s3_commands @ [List.map ~f:Filename.quote specification.command 131 | |> String.concat ~sep:" "]) 132 | ] 133 | in 134 | let props = 135 | [ 136 | "image", string specification.image; 137 | "vcpus", int specification.cpus; 138 | "memory", int (specification.memory |> fun (`MB i) -> i); 139 | "command", `List (List.map ~f:string command); 140 | "privileged", `Bool specification.priviledged; 141 | ] 142 | @ Option.value_map ~default:[] specification.job_role ~f:(function 143 | | `Arn s -> ["jobRoleArn", string s]) 144 | in 145 | `Assoc props in 146 | ksprintf 147 | (command_must_succeed ~log ~id) 148 | "aws batch register-job-definition --job-definition %s \ 149 | --type container \ 150 | --container-properties %s 151 | " 152 | (job_definition id) 153 | (Filename.quote (Yojson.Safe.to_string ~std:true json)) 154 | >>= fun () -> 155 | ksprintf 156 | (command_must_succeed_with_output ~log ~id) 157 | "aws batch submit-job --job-name %s --job-queue %s --job-definition %s" 158 | (job_name id) job_queue (job_definition id) 159 | >>= fun (out, err) -> 160 | Deferred_result.wrap_deferred 161 | ~on_exn:(fun e -> 162 | Error.start ~id ~specification (`Json_parsing (out, `Exn e))) 163 | (fun () -> Yojson.Safe.from_string out |> Lwt.return) 164 | >>= fun json -> 165 | let job_id_opt = 166 | match json with 167 | | `Assoc l -> 168 | List.find_map l ~f:(function 169 | | ("jobId", `String v) -> Some v 170 | | other -> None) 171 | | other -> None 172 | in 173 | begin match job_id_opt with 174 | | None -> 175 | fail (Error.start ~id ~specification 176 | (`Json_parsing (out, `String "Can't find jobId"))) 177 | | Some v -> return v 178 | end 179 | >>= fun aws_id -> 180 | 
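(* The only job state we keep is the identifier AWS assigned at submission. *)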
return { State.aws_id } 181 | 182 | let get_update ~log ~id ~state = 183 | ksprintf 184 | (command_must_succeed_with_output ~log ~id) 185 | "aws batch describe-jobs --jobs %s --query 'jobs[0].status'" 186 | state.State.aws_id 187 | >>= fun (out, err) -> 188 | let status = 189 | String.filter out ~f:(function '"' | '\n' | ' ' -> false | _ -> true) in 190 | begin match status with 191 | | "SUBMITTED" 192 | | "PENDING" 193 | | "RUNNABLE" 194 | | "STARTING" 195 | | "RUNNING" -> return `Running 196 | | "SUCCEEDED" -> return `Succeeded 197 | | "FAILED" -> return `Failed 198 | | _ -> 199 | fail (Error.status ~id ~aws_id:state.State.aws_id (`Parsing_status status)) 200 | end 201 | 202 | let describe ~storage ~log ~id ~state = 203 | let cmd = 204 | sprintf 205 | "aws batch describe-jobs --jobs %s" 206 | state.State.aws_id in 207 | let save_path = Job_common.save_path id `Describe_output in 208 | Hyper_shell.Saved_command.run 209 | ~storage ~log ~cmd ~path:save_path 210 | ~section:(Job_common.job_section id) 211 | ~keep_the:`Latest 212 | >>= fun logres -> 213 | return (Job_common.Query_result.one_saved "Description" logres) 214 | 215 | let get_logs ~log ~id ~state = 216 | let cloudwatch_url = 217 | sprintf 218 | "https://console.aws.amazon.com/cloudwatch/home?\ 219 | region=us-east-1#logStream:group=/aws/batch/job;prefix=%s/%s/;\ 220 | streamFilter=typeLogStreamPrefix" 221 | (job_name id) state.State.aws_id 222 | in 223 | return (Job_common.Query_result.one_url "Logs-link" cloudwatch_url) 224 | 225 | let kill ~log ~id ~state = 226 | ksprintf 227 | (command_must_succeed ~log ~id) 228 | "aws batch terminate-job --job-id %s --reason %s" 229 | state.State.aws_id 230 | (ksprintf Filename.quote "Killed by Coclobas on %s" 231 | ODate.Unix.(now () |> Printer.to_iso)) -------------------------------------------------------------------------------- /src/lib/aws_batch_queue.ml: -------------------------------------------------------------------------------- 1 | open Internal_pervasives 2 | 3 | type t = { 4 | max_jobs: int; 5 | queue_name: string; 6 | s3_bucket: string option; 7 | } [@@deriving make, yojson, show] 8 | 9 | let max_started_jobs t = t.max_jobs 10 | 11 | let queue_name t = t.queue_name 12 | 13 | let s3_bucket t = t.s3_bucket 14 | 15 | let command_must_succeed ~log cluster cmd = 16 | Hyper_shell.command_must_succeed ~log cmd 17 | ~section:["cluster"; "commands"] 18 | ~additional_json:[ 19 | "cluster", to_yojson cluster 20 | ] 21 | 22 | let command_must_succeed_with_output ~log cluster cmd = 23 | Hyper_shell.command_must_succeed_with_output ~log cmd 24 | ~section:["cluster"; "commands"] 25 | ~additional_json:[ 26 | "cluster", to_yojson cluster 27 | ] 28 | 29 | let ensure_living t ~log = 30 | let cmd = 31 | sprintf "aws batch describe-job-queues \ 32 | --job-queues %s \ 33 | --query jobQueues[0].status --output text" 34 | t.queue_name in 35 | command_must_succeed_with_output ~log t cmd 36 | >>= fun (out, err) -> 37 | begin match out with 38 | | "VALID\n" -> return () 39 | | other -> fail (`Aws_batch_queue (`Check_valid, other, err)) 40 | end 41 | 42 | let describe t ~log = 43 | let cmd = 44 | sprintf "aws batch describe-job-queues --job-queues %s" t.queue_name in 45 | command_must_succeed_with_output ~log t cmd 46 | 47 | -------------------------------------------------------------------------------- /src/lib/client.ml: -------------------------------------------------------------------------------- 1 | open Internal_pervasives 2 | 3 | type t = { 4 | base_url: string [@main]; 5 | } [@@deriving yojson, 
show, make] 6 | 7 | let wrap_io d = 8 | Deferred_result.wrap_deferred d 9 | ~on_exn:(fun e -> `Client (`IO_exn e)) 10 | 11 | let wrap_parsing d = 12 | Deferred_result.wrap_deferred d 13 | ~on_exn:(fun e -> `Client (`Json_exn e)) 14 | 15 | let do_get uri = 16 | wrap_io Lwt.(fun () -> 17 | Cohttp_lwt_unix.Client.get uri 18 | >>= fun (resp, body) -> 19 | Cohttp_lwt_body.to_string body 20 | >>= fun b -> 21 | return (resp, b) 22 | ) 23 | 24 | let uri_of_ids base_url path ids = 25 | Uri.with_query 26 | (Uri.with_path (Uri.of_string base_url) path) 27 | ["id", ids] 28 | 29 | let response_is_ok ~uri ~meth ~body resp = 30 | begin match Cohttp.Response.status resp with 31 | | `OK -> return () 32 | | other -> 33 | fail (`Client (`Response (meth, uri, resp, body))) 34 | end 35 | 36 | let submit_job {base_url} spec = 37 | let uri = 38 | Uri.with_path (Uri.of_string base_url) "job/submit" in 39 | let body = 40 | Cohttp_lwt_body.of_string 41 | (Job.Specification.to_yojson spec 42 | |> Yojson.Safe.pretty_to_string) 43 | in 44 | wrap_io (fun () -> Cohttp_lwt_unix.Client.post uri ~body) 45 | >>= fun (resp, ret_body) -> 46 | wrap_io (fun () -> Cohttp_lwt_body.to_string ret_body) 47 | >>= fun body -> 48 | response_is_ok resp ~meth:`Post ~uri ~body 49 | >>= fun () -> 50 | return body 51 | 52 | let get_job_jsons {base_url} ~path ~ids = 53 | let uri = uri_of_ids base_url path ids in 54 | do_get uri 55 | >>= fun (resp, body) -> 56 | response_is_ok ~body resp ~meth:`Get ~uri 57 | >>= fun () -> 58 | wrap_parsing (fun () -> Lwt.return (Yojson.Safe.from_string body)) 59 | 60 | let get_job_json_one_key t ~path ~ids ~json_key ~of_yojson = 61 | get_job_jsons t ~path ~ids 62 | >>= fun json -> 63 | let uri = uri_of_ids t.base_url path ids in (* Only for error values: *) 64 | begin match json with 65 | | `List l -> 66 | Deferred_list.while_sequential l ~f:(function 67 | | `Assoc ["id", `String id; key, stjson] when key = json_key -> 68 | wrap_parsing Lwt.(fun () -> 69 | let open Ppx_deriving_yojson_runtime.Result in 70 | match of_yojson stjson with 71 | | Ok s -> return (id, s) 72 | | Error e -> fail (Failure e) 73 | ) 74 | | other -> fail (`Client (`Json_parsing (uri, "Not an Assoc", other))) 75 | ) 76 | | other -> fail (`Client (`Json_parsing (uri, "Not a List", other))) 77 | end 78 | 79 | let get_job_states t ids = 80 | get_job_json_one_key t ~path:"job/state" ~ids ~json_key:"state" 81 | ~of_yojson:Job.of_yojson 82 | 83 | let get_json_keys ~uri ~parsers json = 84 | begin match json with 85 | | `List l -> 86 | Deferred_list.while_sequential l ~f:(function 87 | | `Assoc kv as jkv-> 88 | Deferred_list.while_sequential parsers ~f:(fun (key, of_yojson) -> 89 | match List.find kv ~f:(fun (k, v) -> k = key) with 90 | | Some (_, vjson) -> 91 | wrap_parsing Lwt.(fun () -> 92 | match of_yojson vjson with 93 | | `Ok s -> return s 94 | | `Error e -> fail (Failure e) 95 | ) 96 | | None -> 97 | fail (`Client (`Json_parsing (uri, "No key: " ^ key, jkv))) 98 | ) 99 | | other -> fail (`Client (`Json_parsing (uri, "Not an Assoc", other))) 100 | ) 101 | | other -> fail (`Client (`Json_parsing (uri, "Not a List", other))) 102 | end 103 | 104 | (* For describe or logs *) 105 | let get_job_query_result ~path t ids = 106 | get_job_jsons t ~path ~ids 107 | >>= fun json -> 108 | let uri = uri_of_ids t.base_url path ids in (* Only for error values: *) 109 | begin match json with 110 | | `List l -> 111 | Deferred_list.while_sequential l ~f:(function 112 | | `Assoc ["id", `String id; 113 | "output", yoj] -> 114 | begin match 
(Job_common.Query_result.of_yojson yoj) with
115 |             | Ok output ->
116 |               return (id, output)
117 |             | Error e ->
118 |               fail (`Client (`Json_parsing
119 |                                (uri, "Not a query-result", yoj)))
120 |             end
121 |           | other ->
122 |             fail (`Client (`Json_parsing
123 |                              (uri, "Not an {id: ... output: ...}", other)))
124 |         )
125 |     | other -> fail (`Client (`Json_parsing (uri, "Not a List", other)))
126 |     end
127 |
128 | let get_job_descriptions t ids =
129 |   get_job_query_result ~path:"job/describe" t ids
130 |
131 | let get_job_logs t ids =
132 |   get_job_query_result ~path:"job/logs" t ids
133 |
134 | let kill_jobs {base_url} ids =
135 |   let uri = uri_of_ids base_url "job/kill" ids in
136 |   do_get uri
137 |   >>= fun (resp, body) ->
138 |   response_is_ok resp ~body ~meth:`Get ~uri
139 |
140 | let get_server_status_string {base_url} =
141 |   let uri = Uri.with_path (Uri.of_string base_url) "status" in
142 |   do_get uri
143 |   >>= fun (resp, body) ->
144 |   response_is_ok resp ~body ~meth:`Get ~uri
145 |   >>= fun () ->
146 |   return body
147 |
148 | let get_cluster_description {base_url} =
149 |   let uri = Uri.with_path (Uri.of_string base_url) "cluster/describe" in
150 |   do_get uri
151 |   >>= fun (resp, body) ->
152 |   response_is_ok resp ~body ~meth:`Get ~uri
153 |   >>= fun () ->
154 |   return body
155 |
156 | let get_job_list {base_url} =
157 |   let uri = Uri.with_path (Uri.of_string base_url) "jobs" in
158 |   do_get uri
159 |   >>= fun (resp, body) ->
160 |   let json = Yojson.Safe.from_string body in
161 |   let get_string name =
162 |     function
163 |     | `String i -> `Ok i
164 |     | other -> `Error (sprintf "%s not a string" name)
165 |   in
166 |   get_json_keys ~uri json ~parsers:[
167 |     "id", get_string "id";
168 |     "status", get_string "status";
169 |   ]
170 |   >>= fun (res : string list list) ->
171 |   Deferred_list.while_sequential res ~f:(
172 |     function
173 |     | [id; status] ->
174 |       return (`Id id, `Status status)
175 |     | other ->
176 |       ksprintf failwith
177 |         "This should never happen: 2 parsers vs %d results: [%s]"
178 |         (List.length other)
179 |         (String.concat ~sep:", " other)
180 |   )
181 |
182 | module Error = struct
183 |   let to_string =
184 |     function
185 |     | `IO_exn e -> sprintf "Client.IO: %s" (Printexc.to_string e)
186 |     | `Json_exn e -> sprintf "Client.Json-parsing: %s" (Printexc.to_string e)
187 |     | `Json_parsing (uri, problem, json) ->
188 |       sprintf "Client.Json-parsing: URI: %s, problem: %s, content: %s"
189 |         (Uri.to_string uri)
190 |         problem
191 |         (Yojson.Safe.pretty_to_string json)
192 |     | `Response (meth, uri, resp, body) ->
193 |       sprintf "Client.Response: URI: %s, Meth: %s, Resp: %s, Body: %s"
194 |         (Uri.to_string uri)
195 |         begin match meth with
196 |         | `Post -> "POST"
197 |         | `Get -> "GET"
198 |         end
199 |         (Cohttp.Response.sexp_of_t resp |> Sexplib.Sexp.to_string_hum)
200 |         body
201 | end
--------------------------------------------------------------------------------
/src/lib/cluster.ml:
--------------------------------------------------------------------------------
1 | open Internal_pervasives
2 |
3 | type local_docker =
4 |   {max_jobs: int}
5 | [@@deriving yojson, show]
6 |
7 | type t =
8 |   | Kube of Gke_cluster.t
9 |   | Local_docker of local_docker
10 |   | Aws_batch_queue of Aws_batch_queue.t
11 | [@@deriving yojson, show]
12 |
13 | let kind =
14 |   function
15 |   | Kube _ -> `GCloud_kubernetes
16 |   | Local_docker _ -> `Local_docker
17 |   | Aws_batch_queue _ -> `Aws_batch_queue
18 |
19 | let save ~storage:st cluster =
20 |   Storage.Json.save_jsonable
21 |     st (to_yojson cluster)
22 |     ~path:["cluster";
"default"; "definition.json"] 23 | 24 | let get st = 25 | Storage.Json.get_json 26 | st ~path:["cluster"; "default"; "definition.json"] 27 | ~parse:of_yojson 28 | 29 | let gke k = Kube k 30 | 31 | let local_docker ~max_jobs = Local_docker {max_jobs} 32 | 33 | let aws_batch_queue abq = Aws_batch_queue abq 34 | 35 | let max_started_jobs = 36 | function 37 | | Kube k -> Gke_cluster.max_started_jobs k 38 | | Local_docker { max_jobs } -> max_jobs 39 | | Aws_batch_queue a -> Aws_batch_queue.max_started_jobs a 40 | 41 | let ensure_living t ~log = 42 | match t with 43 | | Kube k -> Gke_cluster.ensure_living ~log k 44 | | Local_docker _ -> 45 | (* TODO: check docker exists and is ready *) 46 | return () 47 | | Aws_batch_queue q -> 48 | Aws_batch_queue.ensure_living ~log q 49 | 50 | let display_name = 51 | function 52 | | Kube k -> 53 | sprintf "GCloud-Kube-%s@%s" k.Gke_cluster.name k.Gke_cluster.zone 54 | | Local_docker { max_jobs } -> sprintf "Localdocker_Max-%d" max_jobs 55 | | Aws_batch_queue a -> 56 | sprintf "Aws-batch-queue-%s" (Aws_batch_queue.queue_name a) 57 | 58 | let do_log_t f ~log = 59 | function 60 | | Kube k -> f ~log k 61 | | Local_docker _ -> return () 62 | | Aws_batch_queue _ -> 63 | return () 64 | 65 | let start ~log t = 66 | do_log_t Gke_cluster.gcloud_start ~log t 67 | 68 | let delete ~log t = do_log_t Gke_cluster.gcloud_delete ~log t 69 | 70 | let describe ~log = 71 | function 72 | | Kube k -> Gke_cluster.gcloud_describe ~log k 73 | | Local_docker _ as t -> return (display_name t, "") 74 | | Aws_batch_queue a -> 75 | Aws_batch_queue.describe ~log a 76 | 77 | -------------------------------------------------------------------------------- /src/lib/cluster.mli: -------------------------------------------------------------------------------- 1 | open Internal_pervasives 2 | 3 | type local_docker = private {max_jobs: int} 4 | type t = private 5 | | Kube of Gke_cluster.t 6 | | Local_docker of local_docker 7 | | Aws_batch_queue of Aws_batch_queue.t 8 | [@@deriving show] 9 | 10 | val display_name : t -> string 11 | 12 | val kind : 13 | t -> [> `Aws_batch_queue | `GCloud_kubernetes | `Local_docker ] 14 | 15 | val save : 16 | storage:Storage.t -> 17 | t -> 18 | (unit, [> `Storage of [> Storage.Error.common ] ]) Deferred_result.t 19 | 20 | val get : 21 | Storage.t -> 22 | (t, [> `Storage of [> Storage.Error.common ] ]) Deferred_result.t 23 | 24 | val gke : Gke_cluster.t -> t 25 | 26 | val local_docker: max_jobs:int -> t 27 | 28 | val aws_batch_queue: Aws_batch_queue.t -> t 29 | 30 | val max_started_jobs : t -> int 31 | 32 | val ensure_living : 33 | t -> 34 | log:Log.t -> 35 | (unit, 36 | [> `Aws_batch_queue of [> `Check_valid ] * string * string 37 | | `Log of Log.Error.t 38 | | `Shell_command of Hyper_shell.Error.t ]) 39 | Deferred_result.t 40 | 41 | val start : 42 | log:Log.t -> 43 | t -> 44 | (unit, [> `Log of Log.Error.t | `Shell_command of Hyper_shell.Error.t ]) 45 | Deferred_result.t 46 | 47 | val delete : 48 | log:Log.t -> 49 | t -> 50 | (unit, [> `Log of Log.Error.t | `Shell_command of Hyper_shell.Error.t ]) 51 | Deferred_result.t 52 | 53 | val describe : 54 | log:Log.t -> 55 | t -> 56 | (string * string, 57 | [> `Log of Log.Error.t | `Shell_command of Hyper_shell.Error.t ]) 58 | Deferred_result.t 59 | 60 | -------------------------------------------------------------------------------- /src/lib/command_line.ml: -------------------------------------------------------------------------------- 1 | open Internal_pervasives 2 | 3 | let (//) = Filename.concat 4 | 5 | let log ~root = 
6 |   Log.file_tree (root // "logs")
7 |
8 | let get_storage root =
9 |   IO.read_file (root // "database_parameters")
10 |   >>= fun params ->
11 |   return (Storage.make params)
12 |
13 | let configure ?database root ~cluster ~server =
14 |   let database_parameters =
15 |     Option.value database ~default:(root // "db.sqlite") in
16 |   System.ensure_directory_path root
17 |   >>= fun () ->
18 |   IO.write_file (root // "database_parameters") ~content:database_parameters
19 |   >>= fun () ->
20 |   get_storage root
21 |   >>= fun storage ->
22 |   Cluster.save ~storage cluster
23 |   >>= fun () ->
24 |   Server.Configuration.save ~storage server
25 |
26 | let get_cluster ~root =
27 |   get_storage root
28 |   >>= fun storage ->
29 |   Cluster.get storage
30 |
31 | let cluster ~root action =
32 |   get_cluster ~root
33 |   >>= fun cluster ->
34 |   let log = log ~root in
35 |   begin match action with
36 |   | `Start ->
37 |     Cluster.start ~log cluster
38 |     >>= fun () ->
39 |     printf "Cluster %s: Started\n%!" (Cluster.display_name cluster);
40 |     return ()
41 |   | `Delete ->
42 |     Cluster.delete ~log cluster
43 |     >>= fun () ->
44 |     printf "Cluster %s: Deleted\n%!" (Cluster.display_name cluster);
45 |     return ()
46 |   | `Describe ->
47 |     Cluster.describe ~log cluster
48 |     >>= fun (out, err) ->
49 |     printf "OUT:\n%s\nERR:\n%s\n%!" out err;
50 |     return ()
51 |   end
52 |
53 |
54 | let client ~base_url action ids =
55 |   let client = Client.{base_url} in
56 |   begin match action with
57 |   | `Describe ->
58 |     Client.get_job_descriptions client ids
59 |     >>= fun descs ->
60 |     List.iter descs
61 |       ~f:(fun (id, d) ->
62 |           printf "ID: %s\n\
63 |                   %s\n\n" id (Job_common.Query_result.show d))
64 |     |> return
65 |   | `Status ->
66 |     Client.get_job_states client ids
67 |     >>= fun jobs ->
68 |     List.iter jobs ~f:(fun (r, s) ->
69 |         printf "%s is %s\n" r (Job.Status.show (Job.status s)))
70 |     |> return
71 |   | `List ->
72 |     Client.get_job_list client
73 |     >>= fun jobs ->
74 |     List.iter jobs ~f:(fun (`Id i, `Status s) ->
75 |         printf "%s is %s\n" i s);
76 |     return ()
77 |   | `Kill ->
78 |     Client.kill_jobs client ids
79 |   end
80 |
81 |
82 | let start_server ~root ~port =
83 |   get_storage root
84 |   >>= fun storage ->
85 |   Cluster.get storage
86 |   >>= fun cluster ->
87 |   Server.Configuration.get storage
88 |   >>= fun configuration ->
89 |   let log = log ~root in
90 |   let server =
91 |     Server.create ~configuration ~storage ~log ~root ~cluster ~port in
92 |   Server.start server
93 |
94 |
95 | let run_deferred d =
96 |   match Lwt_main.run d with
97 |   | `Ok () -> ()
98 |   | `Error e ->
99 |     eprintf "Error:\n  %s\n" (Error.to_string e);
100 |     exit 2
101 |
102 | let required_string ~doc optname f =
103 |   let open Cmdliner in
104 |   let open Term in
105 |   pure f
106 |   $ Arg.(
107 |       required & opt (some string) None
108 |       & info [optname] ~doc)
109 |
110 | let optional_string ~doc optname f =
111 |   let open Cmdliner in
112 |   let open Term in
113 |   pure f
114 |   $ Arg.(
115 |       value & opt (some string) None
116 |       & info [optname] ~doc)
117 |
118 | let root_term () =
119 |   required_string "root" (fun s -> `Root s)
120 |     ~doc:"The root of the configuration"
121 |
122 | let client_term =
123 |   let open Cmdliner in
124 |   let term =
125 |     let open Term in
126 |     pure (fun (`Base_url base_url) action ids ->
127 |         client ~base_url action ids
128 |         |> run_deferred
129 |       )
130 |     $ required_string "server-url" (fun v -> `Base_url v)
131 |       ~doc:"URL where the Coclobas server can be found."
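(* Positional arguments: the action name at position 0, then any number of job IDs. *)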
132 |     $ Arg.(
133 |         let actions = [
134 |           "describe", `Describe;
135 |           "status", `Status;
136 |           "list", `List;
137 |           "kill", `Kill;
138 |         ] in
139 |         required
140 |         & pos 0 (some (enum actions)) None
141 |         & info [] ~doc:"Action to do: {describe,status,list,kill}."
142 |           ~docv:"ACTION"
143 |       )
144 |     $ Arg.(
145 |         value & pos_right 0 string []
146 |         & info [] ~doc:"Job IDs to act on."
147 |           ~docv:"ID")
148 |   in
149 |   let info = Term.(info "client" ~doc:"Interact with the server.") in
150 |   (term, info)
151 |
152 |
153 | let main () =
154 |   let open Cmdliner in
155 |   let version =
156 |     Meta.version
157 |     ^ Option.(
158 |         (Meta.git_commit >>= fun g -> String.sub g ~index:0 ~length:8)
159 |         |> value_map ~default:"" ~f:(sprintf "+%s")) in
160 |   let cluster_term =
161 |     let open Term in
162 |     pure begin fun
163 |       cluster_kind
164 |       (`GCloud_kube_name gke_name)
165 |       (`GCloud_zone gzone)
166 |       (`Gke_image_type image_type)
167 |       (`Aws_queue_name queue_name)
168 |       (`Aws_s3_bucket s3_bucket)
169 |       (`Max_nodes max_nodes)
170 |       (`Machine_type machine_type) ->
171 |       let i_need opt msg =
172 |         match opt with
173 |         | None -> eprintf "ERROR: %s\n%!" msg; failwith "Invalid command line"
174 |         | Some o -> o
175 |       in
176 |       match cluster_kind with
177 |       | `GKE ->
178 |         Gke_cluster.make
179 |           (i_need gke_name "A cluster-name is required for GKE clusters.")
180 |           ~zone:(i_need gzone "A GCloud-zone name is required for GKE clusters.")
181 |           ~max_nodes
182 |           ?image_type:(
183 |             Option.map image_type ~f:(function
184 |               | "default" -> `Default
185 |               | s -> `Set s)
186 |           )
187 |           ?machine_type
188 |         |> Cluster.gke
189 |       | `Local_docker ->
190 |         Cluster.local_docker ~max_jobs:max_nodes
191 |       | `Aws_batch_queue ->
192 |         Aws_batch_queue.make () ~max_jobs:max_nodes ?s3_bucket
193 |           ~queue_name:(i_need queue_name "An AWS-Batch queue name is required \
194 |                                           for AWS-Batch-Queue clusters.")
195 |         |> Cluster.aws_batch_queue
196 |     end
197 |     $ Arg.(
198 |         required
199 |         & opt (enum ["gke", `GKE;
200 |                      "local-docker", `Local_docker;
201 |                      "aws-batch-queue", `Aws_batch_queue;
202 |                     ] |> some) None
203 |         & info ["cluster-kind"]
204 |           ~doc:"Kind of cluster." ~docv:"KIND")
205 |     $ optional_string "gke-cluster-name" (fun s -> `GCloud_kube_name s)
206 |       ~doc:"Name of the GCloud-Kubernetes cluster."
207 |     $ optional_string "gcloud-zone" (fun s -> `GCloud_zone s)
208 |       ~doc:"Zone of the GCloud-Kubernetes cluster."
209 |     $ optional_string "gke-image-type" (fun s -> `Gke_image_type s)
210 |       ~doc:"Override the default `--image-type` of \
211 |             the GCloud-Kubernetes cluster (`default` means use the default, \
212 |             i.e. do not add the option)."
213 |     $ optional_string "aws-queue-name" (fun s -> `Aws_queue_name s)
214 |       ~doc:"The name (or ARN) of the AWS-Batch queue."
215 |     $ optional_string "aws-s3-bucket" (fun s -> `Aws_s3_bucket s)
216 |       ~doc:"The prefix URI of an optional S3 bucket used by the AWS-Batch \
217 |             backend to store scripts and other data."
218 |     $ begin
219 |         pure (fun s -> `Max_nodes s)
220 |         $ Arg.(
221 |             required & opt (some int) None
222 |             & info ["max-nodes"]
223 |               ~doc:"Maximum number of nodes in the cluster."
~docv:"NUMBER") 224 | end 225 | $ optional_string "machine-type" (fun s -> `Machine_type s) 226 | ~doc:"The GCloud machine-type (used for the GCloud-kubernetes nodes)" 227 | in 228 | let server_config_term = 229 | let open Term in 230 | pure begin fun 231 | (`Max_update_errors max_update_errors) 232 | (`Concurrent_steps concurrent_steps) 233 | (`Min_sleep min_sleep) 234 | (`Max_sleep max_sleep) 235 | (`Backoff_factor backoff_factor) -> 236 | Server.Configuration.make () 237 | ~min_sleep ~max_sleep ~max_update_errors ~backoff_factor 238 | end 239 | $ begin 240 | pure (fun s -> `Max_update_errors s) 241 | $ Arg.(value & opt int Server.Configuration.Default.max_update_errors & 242 | info ["max-update-errors"] 243 | ~doc:"The number of `kubectl` errors allowed before \ 244 | considering a job dead.") 245 | end 246 | $ begin 247 | pure (fun s -> `Concurrent_steps s) 248 | $ Arg.(value & opt int Server.Configuration.Default.concurrent_steps & 249 | info ["concurrent-steps"] 250 | ~doc:"The maximal number of concurrent actions done by the \ 251 | server loop.") 252 | end 253 | $ begin 254 | pure (fun s -> `Min_sleep s) 255 | $ Arg.(value & opt float Server.Configuration.Default.min_sleep & 256 | info ["min-sleep"] 257 | ~doc:"The minimal time to wait before reentering the \ 258 | “update loop” (events like job submission bypass this \ 259 | timer and wake-up the loop any way).") 260 | end 261 | $ begin 262 | pure (fun s -> `Max_sleep s) 263 | $ Arg.(value & opt float Server.Configuration.Default.max_sleep & 264 | info ["max-sleep"] 265 | ~doc:"The maximal time to wait before reentering the \ 266 | “update loop.”") 267 | end 268 | $ begin 269 | pure (fun s -> `Backoff_factor s) 270 | $ Arg.(value & opt float Server.Configuration.Default.backoff_factor & 271 | info ["backoff-factor"] 272 | ~doc:"The factor used for exponential backoff: \ 273 | (factor * nth-error) in seconds, see also discussion \ 274 | at pull-request #96.") 275 | end 276 | in 277 | let configure = 278 | let term = 279 | let open Term in 280 | pure begin fun 281 | (`Root root) 282 | cluster 283 | server 284 | (`Database_parameters database) 285 | () -> 286 | configure ?database root ~cluster ~server 287 | |> run_deferred 288 | end 289 | $ root_term () 290 | $ cluster_term 291 | $ server_config_term 292 | $ begin 293 | pure (fun s -> `Database_parameters s) 294 | $ Arg.( 295 | value & opt (some string) None 296 | & info ["database-uri"] 297 | ~doc:"Database parameters (Cf. Trakeva backends)." 298 | ~docv:"URI" 299 | ) 300 | end 301 | $ pure () in 302 | let info = Term.(info "configure" ~doc:"Configure an instance") in 303 | (term, info) in 304 | let cluster = 305 | let term = 306 | let open Term in 307 | pure begin fun 308 | (`Root root) 309 | action 310 | -> 311 | cluster ~root action 312 | |> run_deferred 313 | end 314 | $ root_term () 315 | $ Arg.( 316 | let actions = [ 317 | "start", `Start; 318 | "describe", `Describe; 319 | "delete", `Delete; 320 | ] in 321 | required 322 | & pos 0 (some (enum actions)) None 323 | & info [] ~doc:"Action to do on the current cluster:\ 324 | \ {start,describe,delete}." 
325 | ~docv:"ACTION" 326 | ) 327 | in 328 | let info = Term.(info "cluster" ~doc:"Manage the configured cluster") in 329 | (term, info) in 330 | let start_server = 331 | let term = 332 | let open Term in 333 | pure begin fun 334 | (`Root root) 335 | (`Port port) 336 | () -> 337 | run_deferred (start_server ~root ~port) 338 | end 339 | $ root_term () 340 | $ begin 341 | pure (fun s -> `Port s) 342 | $ Arg.( 343 | required & opt (some int) None 344 | & info ["port"; "p"] 345 | ~doc:"The port on which to start the HTTP server." ~docv:"NUMBER") 346 | end 347 | $ pure () 348 | in 349 | let info = Term.(info "start-server" ~doc:"Start the server") in 350 | (term, info) in 351 | let default_cmd = 352 | let doc = "Coclobas, a batch scheduler" in 353 | let man = [ 354 | `S "AUTHORS"; 355 | `P "Sebastien Mondet "; `Noblank; 356 | `S "BUGS"; 357 | `P "Browse and report new issues at"; `Noblank; 358 | `P "<https://github.com/hammerlab/coclobas/issues>."; 359 | ] in 360 | Term.(ret (pure (`Help (`Plain, None)))), 361 | Term.info Sys.argv.(0) ~version ~doc ~man in 362 | let choices = [cluster; start_server; configure; client_term] in 363 | match Term.eval_choice default_cmd choices with 364 | | `Ok f -> f 365 | | `Error _ -> exit 1 366 | | `Version | `Help -> exit 0 367 | -------------------------------------------------------------------------------- /src/lib/error.ml: -------------------------------------------------------------------------------- 1 | 2 | open Internal_pervasives 3 | 4 | let to_string = 5 | let exn = Printexc.to_string in 6 | function 7 | | `Shell_command e -> 8 | sprintf "Shell-command failed:\n%s" (Hyper_shell.Error.to_display_string e) 9 | | `Storage e -> Storage.Error.to_string e 10 | | `Log e -> Log.Error.to_string e 11 | | `IO (`Write_file_exn (path, e)) -> 12 | sprintf "Writing file %S: %s" path (exn e) 13 | | `IO (`Read_file_exn (path, e)) -> 14 | sprintf "Reading file %S: %s" path (exn e) 15 | | `System _ as e -> 16 | Pvem_lwt_unix.System.error_to_string e 17 | | `Job (`Kube_json_parsing _ as e) -> 18 | Kube_job.Error.to_string e 19 | | `Job (`Docker_inspect_json_parsing _ as e) -> 20 | Local_docker_job.Error.to_string e 21 | | `Job (`Missing_aws_state id) -> 22 | sprintf "Job %s is missing AWS-Batch data" id 23 | | `Start_server (`Exn e) -> 24 | sprintf "Starting Cohttp server: %s" (exn e) 25 | | `Client err -> Client.Error.to_string err 26 | | `Invalid_job_submission (`Wrong_backend (job, clu)) -> 27 | let disp = 28 | function 29 | | `Aws_batch_queue -> "AWS-Batch-Queue" 30 | | `Aws_batch -> "AWS-Batch-Job" 31 | | `Kube -> "Kubernetes" 32 | | `GCloud_kubernetes -> "GCloud-kubernetes" 33 | | `Local_docker -> "Local-Docker" in 34 | sprintf "Invalid job submission: backend mismatch: \ 35 | job wants %s, cluster is %s" 36 | (disp job) (disp clu) 37 | | `Aws_batch_queue (`Check_valid, out, err) -> 38 | sprintf "Invalid AWS job-queue: %S (stderr: %S)" out err 39 | | `Aws_batch_job ((`Start _ | `Status _) as ae) -> 40 | sprintf "AWS-Batch-Job error: %s" (Aws_batch_job.Error.show ae) 41 | 42 | -------------------------------------------------------------------------------- /src/lib/gke_cluster.ml: -------------------------------------------------------------------------------- 1 | open Internal_pervasives 2 | 3 | type t = { 4 | name: string [@main]; 5 | zone: string; 6 | min_nodes: int [@default 1]; 7 | max_nodes: int; 8 | machine_type: string [@default "n1-highmem-8"]; 9 | image_type: [ `Set of string | `Default] [@default (`Set "container_vm")]; 10 | } [@@deriving yojson, show, make] 11 | 12 | let save ~storage:st cluster = 13 |
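(* The cluster definition is persisted as JSON under the storage key
   ["cluster"; "default"; "definition.json"] (cf. [save]/[get] below),
   which is how a restarted server finds its cluster again. *)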
Storage.Json.save_jsonable 14 | st (to_yojson cluster) 15 | ~path:["cluster"; "default"; "definition.json"] 16 | 17 | let get st = 18 | Storage.Json.get_json 19 | st ~path:["cluster"; "default"; "definition.json"] 20 | ~parse:of_yojson 21 | 22 | let command_must_succeed ~log cluster cmd = 23 | Hyper_shell.command_must_succeed ~log cmd 24 | ~section:["cluster"; "commands"] 25 | ~additional_json:[ 26 | "cluster", to_yojson cluster 27 | ] 28 | 29 | let command_must_succeed_with_output ~log cluster cmd = 30 | Hyper_shell.command_must_succeed_with_output ~log cmd 31 | ~section:["cluster"; "commands"] 32 | ~additional_json:[ 33 | "cluster", to_yojson cluster 34 | ] 35 | 36 | let max_started_jobs cluster = 37 | cluster.max_nodes + 5 38 | 39 | let gcloud_start ~log t = 40 | let cmd = 41 | let image_type_option = 42 | match t.image_type with 43 | | `Default -> "" 44 | | `Set i -> sprintf "--image-type=%s" i 45 | in 46 | sprintf 47 | "gcloud container clusters create %s %s \ 48 | --quiet \ 49 | --zone %s --num-nodes=%d --min-nodes=%d --max-nodes=%d \ 50 | --machine-type=%s \ 51 | --enable-autoscaling" 52 | t.name image_type_option 53 | t.zone t.min_nodes t.min_nodes t.max_nodes 54 | t.machine_type 55 | in 56 | command_must_succeed ~log t cmd 57 | 58 | let gcloud_delete ~log t = 59 | let cmd = 60 | sprintf 61 | "gcloud container clusters delete --quiet --wait %s --zone %s" t.name t.zone in 62 | command_must_succeed ~log t cmd 63 | 64 | let gcloud_describe ~log t = 65 | let cmd = 66 | sprintf 67 | "gcloud container clusters describe %s --zone %s" t.name t.zone in 68 | command_must_succeed_with_output ~log t cmd 69 | 70 | let gcloud_set_current ~log t = 71 | let cmd = 72 | sprintf 73 | "gcloud container clusters get-credentials %s --zone %s" t.name t.zone in 74 | command_must_succeed ~log t cmd 75 | 76 | let ensure_living ~log t = 77 | gcloud_describe ~log t 78 | >>< begin function 79 | | `Ok _ -> 80 | gcloud_set_current ~log t 81 | | `Error (`Shell_command error) 82 | when error.Hyper_shell.Error.status = Some (`Exited 1) -> 83 | gcloud_start ~log t 84 | | `Error ((`Shell_command _ | `Log _) as e) -> 85 | fail e 86 | end 87 | -------------------------------------------------------------------------------- /src/lib/gke_cluster.mli: -------------------------------------------------------------------------------- 1 | open Internal_pervasives 2 | 3 | type t = private { 4 | name : string; 5 | zone : string; 6 | min_nodes : int; 7 | max_nodes : int; 8 | machine_type : string; 9 | image_type: [ `Set of string | `Default] [@default (`Set "container_vm")]; 10 | } [@@deriving yojson, show] 11 | 12 | val make : 13 | zone:string -> 14 | ?min_nodes:int -> max_nodes:int -> 15 | ?machine_type: string -> 16 | ?image_type: [ `Set of string | `Default] -> 17 | string -> t 18 | 19 | val max_started_jobs: t -> int 20 | (** The maximum number of jobs that Coclobas will attempt to run 21 | simultaneously on the cluster. 
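    For example, with the implementation in [gke_cluster.ml]
    ([max_nodes + 5]), a cluster capped at [max_nodes = 5] will have at
    most 10 jobs in the “started” state at once.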
*) 22 | 23 | 24 | val gcloud_start : 25 | log:Log.t -> 26 | t -> 27 | (unit, 28 | [> `Shell_command of Hyper_shell.Error.t 29 | | `Log of Log.Error.t ]) Deferred_result.t 30 | 31 | val gcloud_delete : 32 | log:Log.t -> 33 | t -> 34 | (unit, 35 | [> `Shell_command of Hyper_shell.Error.t 36 | | `Log of Log.Error.t ]) Deferred_result.t 37 | 38 | val gcloud_describe : 39 | log:Log.t -> 40 | t -> 41 | (string * string, 42 | [> `Shell_command of Hyper_shell.Error.t 43 | | `Log of Log.Error.t ]) Deferred_result.t 44 | 45 | val gcloud_set_current : 46 | log:Log.t -> 47 | t -> 48 | (unit, 49 | [> `Shell_command of Hyper_shell.Error.t 50 | | `Log of Log.Error.t ]) Deferred_result.t 51 | 52 | val ensure_living : 53 | log:Log.t -> 54 | t -> 55 | (unit, 56 | [> `Shell_command of Hyper_shell.Error.t 57 | | `Log of Log.Error.t ]) Deferred_result.t 58 | -------------------------------------------------------------------------------- /src/lib/hyper_shell.ml: -------------------------------------------------------------------------------- 1 | 2 | open Internal_pervasives 3 | 4 | module Error = struct 5 | 6 | type t = { 7 | command: string [@main]; 8 | stdout: string option; 9 | stderr: string option; 10 | status: [`Exited of int | `Signaled of int | `Stopped of int] option; 11 | exn: string option; 12 | } [@@deriving yojson,make,show] 13 | 14 | let of_result cmd = 15 | function 16 | | `Ok (stdout, stderr, status) -> 17 | make ~stdout ~stderr ~status cmd 18 | | `Error (`Shell (_, `Exn e)) -> 19 | make cmd ~exn:(Printexc.to_string e) 20 | 21 | let to_display_string t = 22 | let show_output name = 23 | function 24 | | "" -> sprintf "%s: empty.\n" name 25 | | more -> 26 | sprintf "%s:\n```\n%s%s```\n" name more 27 | String.(if get more (length more - 1) = Some '\n' then "" else "\n") 28 | in 29 | sprintf "Command:\n```\n%s\n```\n%s" 30 | t.command 31 | (List.filter_opt 32 | [ 33 | Option.map t.status 34 | ~f:(fun s -> 35 | sprintf 36 | "Status: %s\n" (Pvem_lwt_unix.System.Shell.status_to_string s)); 37 | Option.map t.exn ~f:(sprintf "Exception: %s\n"); 38 | Option.map t.stdout ~f:(show_output "Standard-output"); 39 | Option.map t.stderr ~f:(show_output "Standard-error"); 40 | ] 41 | |> String.concat ~sep:"\n") 42 | end 43 | 44 | 45 | 46 | let command_must_succeed_with_output 47 | ~log ?(section = ["shell-commands"]) 48 | ?(additional_json = []) 49 | cmd = 50 | Pvem_lwt_unix.System.Shell.execute cmd 51 | >>< begin fun res -> 52 | let error = Error.of_result cmd res in 53 | Log.log log ~section 54 | (`Assoc ( 55 | additional_json 56 | @ [ 57 | "command", Error.to_yojson error; 58 | ])) 59 | >>= fun () -> 60 | match res with 61 | | `Ok (out, err, ex) -> 62 | begin match ex with 63 | | `Exited 0 -> return (out, err) 64 | | `Exited _ 65 | | `Signaled _ 66 | | `Stopped _ -> fail (`Shell_command error) 67 | end 68 | | `Error _ -> fail (`Shell_command error) 69 | end 70 | 71 | let command_must_succeed 72 | ~log ?section ?additional_json cmd = 73 | command_must_succeed_with_output 74 | ~log ?section ?additional_json cmd 75 | >>= fun (_, _) -> 76 | return () 77 | 78 | 79 | module Saved_command = struct 80 | module Output_archive = struct 81 | type t = { 82 | date: float; 83 | out: string; 84 | err: string; 85 | } [@@deriving yojson,show,make] 86 | 87 | let read ~storage ~path = 88 | Storage.Json.get_json_opt storage ~path ~parse:of_yojson 89 | 90 | let length t = String.length t.out + String.length t.err 91 | 92 | let write t ~storage ~path = 93 | Storage.Json.save_jsonable storage ~path (to_yojson t) 94 | 95 | let 
to_string t = t.out ^ t.err 96 | 97 | end 98 | type t = { 99 | command: string; 100 | outcome: [ 101 | | `Ok of string * string 102 | | `Error of Error.t 103 | ]; 104 | archived: Output_archive.t option; 105 | } [@@deriving yojson,show,make] 106 | 107 | 108 | let run ~storage ~log ~section ~cmd ~path ~keep_the = 109 | begin 110 | command_must_succeed_with_output ~log ~section cmd 111 | >>< function 112 | | `Ok (out, err) -> 113 | let new_output = 114 | Output_archive.make ~out ~err ~date:(Unix.gettimeofday ()) in 115 | begin match keep_the with 116 | | `Latest -> 117 | Output_archive.write ~storage ~path new_output 118 | | `Largest -> 119 | Output_archive.read ~storage ~path 120 | >>= begin function 121 | | Some old when Output_archive.(length old > length new_output) -> 122 | return () 123 | | Some _ (* smaller *) | None -> 124 | Output_archive.write ~storage ~path new_output 125 | end 126 | end 127 | >>= fun () -> 128 | Output_archive.read ~storage ~path >>= fun archived -> 129 | return (make 130 | ~command:cmd ~outcome:(`Ok (out, err)) 131 | ?archived ()) 132 | | `Error (`Shell_command e) -> 133 | Output_archive.read ~storage ~path >>= fun archived -> 134 | return (make 135 | ~command:cmd ~outcome:(`Error e) ?archived ()) 136 | | `Error (`Log _ as e) -> fail e 137 | end 138 | end 139 | -------------------------------------------------------------------------------- /src/lib/hyper_shell.mli: -------------------------------------------------------------------------------- 1 | module Error: sig 2 | type t = { 3 | command: string [@main]; 4 | stdout: string option; 5 | stderr: string option; 6 | status: [`Exited of int | `Signaled of int | `Stopped of int] option; 7 | exn: string option; 8 | } [@@deriving yojson,make,show] 9 | val to_display_string: t -> string 10 | end 11 | 12 | val command_must_succeed_with_output : 13 | log:Log.t -> 14 | ?section:string list -> 15 | ?additional_json:(string * Yojson.Safe.json) list -> 16 | string -> 17 | (string * string, 18 | [> `Shell_command of Error.t 19 | | `Log of Log.Error.t ]) 20 | Internal_pervasives.Deferred_result.t 21 | 22 | val command_must_succeed : 23 | log:Log.t -> 24 | ?section:string list -> 25 | ?additional_json:(string * Yojson.Safe.json) list -> 26 | string -> 27 | (unit, 28 | [> `Shell_command of Error.t 29 | | `Log of Log.Error.t ]) 30 | Internal_pervasives.Deferred_result.t 31 | 32 | module Saved_command : sig 33 | module Output_archive : sig 34 | type t = { 35 | date: float; 36 | out: string; 37 | err: string; 38 | } [@@deriving yojson,show,make] 39 | val to_string: t -> string 40 | end 41 | type t = { 42 | command: string; 43 | outcome: [ 44 | | `Ok of string * string 45 | | `Error of Error.t 46 | ]; 47 | archived: Output_archive.t option; 48 | } [@@deriving yojson,show,make] 49 | val run : 50 | storage:Storage.t -> 51 | log:Log.t -> 52 | section:string list -> 53 | cmd:string -> 54 | path:Storage.key -> 55 | keep_the:[ `Largest | `Latest ] -> 56 | (t, [> `Log of Log.Error.t | `Storage of [> Storage.Error.common ] ]) 57 | Internal_pervasives.Deferred_result.t 58 | end -------------------------------------------------------------------------------- /src/lib/internal_pervasives.ml: -------------------------------------------------------------------------------- 1 | 2 | 3 | include Nonstd 4 | module String = Sosa.Native_string 5 | 6 | include Pvem_lwt_unix 7 | let (>>=) = Deferred_result.(>>=) 8 | let (>><) = Deferred_result.(>><) 9 | let (>>|) = Deferred_result.(>>|) 10 | let return = Deferred_result.return 11 | let fail = 
Deferred_result.fail 12 | 13 | 14 | let of_ocaml_result = 15 | function 16 | | Ok o -> return o 17 | | Error s -> fail s 18 | 19 | let dbg fmt = 20 | ksprintf (fun s -> printf "Cocldebug>> %s\n%!" s) fmt 21 | 22 | 23 | module Generic_error = struct 24 | let to_string = 25 | function 26 | | `Exn e -> Printexc.to_string e 27 | end 28 | 29 | -------------------------------------------------------------------------------- /src/lib/job.ml: -------------------------------------------------------------------------------- 1 | open Internal_pervasives 2 | 3 | module Specification = struct 4 | type t = 5 | | Kube of Kube_job.Specification.t 6 | | Local_docker of Local_docker_job.Specification.t 7 | | Aws_batch of Aws_batch_job.Specification.t 8 | [@@deriving yojson, show] 9 | let kind = 10 | function 11 | | Kube _ -> `Kube 12 | | Local_docker _ -> `Local_docker 13 | | Aws_batch _ -> `Aws_batch 14 | let kubernetes spec = Kube spec 15 | let local_docker spec = Local_docker spec 16 | 17 | let aws_batch spec = Aws_batch spec 18 | end 19 | 20 | module Status = struct 21 | type t = [ 22 | | `Submitted 23 | | `Started of float 24 | | `Finished of float * [ `Failed | `Succeeded | `Killed ] 25 | | `Error of string 26 | ] [@@deriving yojson,show ] 27 | end 28 | 29 | module State = struct 30 | type t = 31 | | Empty 32 | | Aws_batch_job of Aws_batch_job.State.t 33 | [@@deriving yojson, show] 34 | end 35 | 36 | type t = { 37 | id: string; 38 | specification: Specification.t [@main ]; 39 | mutable status: Status.t [@default `Submitted]; 40 | mutable update_errors : string list; 41 | mutable start_errors : string list; 42 | mutable latest_error: float option; 43 | mutable saved_state: State.t [@default State.Empty]; 44 | } 45 | [@@deriving yojson, show, make] 46 | 47 | let make ~id spec = make ~id spec 48 | 49 | let id t = t.id 50 | let status t = t.status 51 | let set_status t ~from_error s = 52 | (if not from_error then t.latest_error <- None); 53 | t.status <- s 54 | 55 | let start_errors t = t.start_errors 56 | let set_start_errors t ~time l = 57 | t.latest_error <- Some time; 58 | t.start_errors <- l 59 | 60 | let update_errors t = t.update_errors 61 | let set_update_errors t ~time l = 62 | t.latest_error <- Some time; 63 | t.update_errors <- l 64 | 65 | let latest_error t = t.latest_error 66 | 67 | let fresh spec = 68 | let id = Uuidm.(v5 (create `V4) "coclojobs" |> to_string ~upper:false) in 69 | make ~id spec 70 | 71 | 72 | let save st job = 73 | Storage.Json.save_jsonable st 74 | ~path:(Job_common.save_path (id job) `Saved_state) 75 | (to_yojson job) 76 | 77 | let get st job_id = 78 | Storage.Json.get_json st 79 | ~path:(Job_common.save_path job_id `Saved_state) 80 | ~parse:of_yojson 81 | 82 | let kind t = Specification.kind t.specification 83 | 84 | let aws_state t = 85 | match t.saved_state with 86 | | State.Empty -> fail (`Job (`Missing_aws_state t.id)) 87 | | State.Aws_batch_job s -> return s 88 | 89 | let get_logs ~storage ~log t = 90 | match kind t with 91 | | `Kube -> 92 | Kube_job.get_logs ~storage ~log ~id:t.id 93 | | `Local_docker -> 94 | Local_docker_job.get_logs ~storage ~log ~id:t.id 95 | | `Aws_batch -> 96 | aws_state t 97 | >>= fun state -> 98 | Aws_batch_job.get_logs ~log ~id:t.id ~state 99 | 100 | let describe ~storage ~log t = 101 | match kind t with 102 | | `Kube -> 103 | Kube_job.describe ~storage ~log ~id:t.id 104 | | `Local_docker -> 105 | Local_docker_job.describe ~log ~id:t.id ~storage 106 | | `Aws_batch -> 107 | aws_state t 108 | >>= fun state -> 109 | Aws_batch_job.describe 
~storage ~log ~id:t.id ~state 110 | 111 | let kill ~log t = 112 | match kind t with 113 | | `Kube -> 114 | Kube_job.kill ~log ~id:t.id 115 | | `Local_docker -> 116 | Local_docker_job.kill ~log ~id:t.id 117 | | `Aws_batch -> 118 | aws_state t 119 | >>= fun state -> 120 | Aws_batch_job.kill ~log ~id:t.id ~state 121 | 122 | let start ~log t ~cluster = 123 | match t.specification with 124 | | Specification.Kube specification -> 125 | Kube_job.start ~log ~id:t.id ~specification 126 | | Specification.Local_docker specification -> 127 | Local_docker_job.start ~log ~id:t.id ~specification 128 | | Specification.Aws_batch specification -> 129 | Aws_batch_job.start ~cluster ~log ~id:t.id ~specification 130 | >>= fun aws_state -> 131 | t.saved_state <- State.Aws_batch_job aws_state; 132 | return () 133 | 134 | let get_update ~log t = 135 | match kind t with 136 | | `Kube -> 137 | Kube_job.get_status_json ~log ~id:t.id 138 | >>= fun blob -> 139 | Kube_job.Kube_status.of_json blob 140 | >>= fun stat -> 141 | let open Kube_job.Kube_status in 142 | begin match stat with 143 | | { phase = `Pending } 144 | | { phase = `Unknown } 145 | | { phase = `Running } -> 146 | return `Running 147 | | { phase = (`Failed | `Succeeded as phase)} -> 148 | return phase 149 | end 150 | | `Local_docker -> 151 | Local_docker_job.get_update ~log ~id:t.id 152 | | `Aws_batch -> 153 | aws_state t 154 | >>= fun state -> 155 | Aws_batch_job.get_update ~log ~id:t.id ~state 156 | -------------------------------------------------------------------------------- /src/lib/job.mli: -------------------------------------------------------------------------------- 1 | module Status : sig 2 | type t = [ 3 | | `Submitted 4 | | `Started of float 5 | | `Finished of float * [ `Failed | `Succeeded | `Killed ] 6 | | `Error of string 7 | ] [@@deriving yojson,show ] 8 | end 9 | 10 | module Specification : sig 11 | type t = private 12 | | Kube of Kube_job.Specification.t 13 | | Local_docker of Local_docker_job.Specification.t 14 | | Aws_batch of Aws_batch_job.Specification.t 15 | [@@deriving yojson,show ] 16 | val kind: t -> [ `Kube | `Local_docker | `Aws_batch ] 17 | val kubernetes: Kube_job.Specification.t -> t 18 | val local_docker: Local_docker_job.Specification.t -> t 19 | val aws_batch: Aws_batch_job.Specification.t -> t 20 | end 21 | 22 | type t 23 | [@@deriving yojson,show ] 24 | 25 | val make : id: string -> Specification.t -> t 26 | 27 | 28 | val id : t -> string 29 | val status : t -> Status.t 30 | val fresh : Specification.t -> t 31 | 32 | val set_status : t -> from_error: bool -> Status.t -> unit 33 | 34 | val start_errors : t -> string list 35 | val set_start_errors : t -> time: float -> string list -> unit 36 | 37 | val update_errors : t -> string list 38 | val set_update_errors : t -> time: float -> string list -> unit 39 | 40 | val latest_error: t -> float option 41 | 42 | val save : 43 | Storage.t -> 44 | t -> 45 | (unit, [> `Storage of [> Storage.Error.common ] ]) 46 | Internal_pervasives.Deferred_result.t 47 | val get : 48 | Storage.t -> 49 | string -> 50 | (t, [> `Storage of [> Storage.Error.common ] ]) 51 | Internal_pervasives.Deferred_result.t 52 | 53 | val get_logs : 54 | storage:Storage.t -> 55 | log:Log.t -> 56 | t -> 57 | (Job_common.Query_result.t, 58 | [> `Log of Log.Error.t 59 | | `Shell_command of Hyper_shell.Error.t 60 | | `Job of [> `Missing_aws_state of string ] 61 | | `Storage of [> Storage.Error.common ] ]) 62 | Internal_pervasives.Deferred_result.t 63 | 64 | val describe : 65 | storage:Storage.t -> 66 | log:Log.t 
-> 67 | t -> 68 | (Job_common.Query_result.t, 69 | [> `Log of Log.Error.t 70 | | `Shell_command of Hyper_shell.Error.t 71 | | `Job of [> `Missing_aws_state of string ] 72 | | `Storage of [> Storage.Error.common ] ]) 73 | Internal_pervasives.Deferred_result.t 74 | 75 | val kill : 76 | log:Log.t -> 77 | t -> 78 | (unit, [> `Log of Log.Error.t 79 | | `Job of [> `Missing_aws_state of string ] 80 | | `Shell_command of Hyper_shell.Error.t ]) 81 | Internal_pervasives.Deferred_result.t 82 | 83 | val start : 84 | log:Log.t -> 85 | t -> 86 | cluster: Cluster.t -> 87 | (unit, 88 | [> `IO of [> `Write_file_exn of Pvem_lwt_unix.IO.path * exn ] 89 | | `Aws_batch_job of Aws_batch_job.Error.start 90 | | `Log of Log.Error.t 91 | | `Shell_command of Hyper_shell.Error.t ]) 92 | Internal_pervasives.Deferred_result.t 93 | 94 | val get_update : 95 | log:Log.t -> 96 | t -> 97 | ([> `Failed | `Running | `Succeeded ], 98 | [> `Job of 99 | [> `Docker_inspect_json_parsing of 100 | string * [> `Exn of exn | `String of string ] 101 | | `Kube_json_parsing of 102 | string * [> `Exn of exn | `String of string ] ] 103 | | `Log of Log.Error.t 104 | | `Aws_batch_job of Aws_batch_job.Error.status 105 | | `Job of [> `Missing_aws_state of string ] 106 | | `Shell_command of Hyper_shell.Error.t ]) 107 | Internal_pervasives.Deferred_result.t 108 | -------------------------------------------------------------------------------- /src/lib/job_common.ml: -------------------------------------------------------------------------------- 1 | open Internal_pervasives 2 | 3 | (** Common tools and functions for all [Job] implementations. *) 4 | 5 | 6 | let job_section id = ["job"; id; "commands"] 7 | 8 | let save_path ?tag id = 9 | let name ?(ext = "json") s = 10 | sprintf "%s%s.%s" 11 | s (Option.value_map ~default:"" tag ~f:(sprintf "-%s")) ext in 12 | function 13 | | `Saved_state -> ["job"; id; name "saved_state"] 14 | | `Describe_output -> ["job"; id; name "describe"] 15 | | `Logs_output -> ["job"; id; name "logs"] 16 | 17 | (** The queries [describe] and [logs] have an interesting archival 18 | mechanism for their results; this module packs the results together 19 | (esp. for display in the Ketrew UI within the plugin). *) 20 | module Query_result = struct 21 | type job_call_result_item = [ 22 | | `Saved_command of Hyper_shell.Saved_command.t 23 | | `Url of string 24 | ] 25 | [@@deriving yojson, show] 26 | type t = (string * job_call_result_item) list 27 | [@@deriving yojson, show] 28 | type 'a call_function = 29 | storage:Storage.t -> 30 | log:Log.t -> 31 | id:string -> 32 | (t, 33 | [> `Log of Log.Error.t 34 | | `Shell_command of Hyper_shell.Error.t 35 | | `Storage of [> Storage.Error.common ] ] as 'a) Deferred_result.t 36 | 37 | let one_saved name saved = [name, `Saved_command saved] 38 | let one_url name saved = [name, `Url saved] 39 | end 40 | -------------------------------------------------------------------------------- /src/lib/kube_job.ml: -------------------------------------------------------------------------------- 1 | open Internal_pervasives 2 | 3 | module Specification = struct 4 | module Nfs_mount = struct 5 | type t = { 6 | host: string; 7 | path: string; 8 | point: string; 9 | read_only: bool [@default false]; 10 | } [@@deriving yojson, show, make] 11 | let id m = 12 | let sanitize_dns s = 13 | String.map s ~f:begin 14 | function 15 | | ('a' .. 'z' | '0' .. '9' | 'A' .. 'Z') as c -> c 16 | | other -> '-' 17 | end 18 | |> fun s -> 19 | begin match s.[0] with 20 | | Some ('a' ..
'z') -> s 21 | | None -> "nohost" 22 | | other -> "host-" ^ s 23 | end 24 | in 25 | Hashtbl.hash m |> sprintf "%s-%x" (sanitize_dns m.host) 26 | 27 | let host m = m.host 28 | let path m = m.path 29 | let point m = m.point 30 | let read_only m = m.read_only 31 | end 32 | module File_contents_mount = struct 33 | type t = { 34 | path: string; 35 | contents: string [@main]; 36 | } [@@deriving yojson, show, make] 37 | let path t = t.path 38 | let contents t = t.contents 39 | end 40 | type t = { 41 | image: string; 42 | command: string list [@main]; 43 | volume_mounts: [ `Nfs of Nfs_mount.t | `Constant of File_contents_mount.t ] list; 44 | memory: [ `GB of int ] [@default `GB 50]; 45 | cpus: int [@default 7]; 46 | } [@@deriving yojson, show, make] 47 | end 48 | 49 | let command_must_succeed ~log ?additional_json ~id cmd = 50 | Hyper_shell.command_must_succeed ~log cmd ?additional_json 51 | ~section:(Job_common.job_section id) 52 | let command_must_succeed_with_output ~log ?additional_json ~id cmd = 53 | Hyper_shell.command_must_succeed_with_output ~log cmd ?additional_json 54 | ~section:(Job_common.job_section id) 55 | 56 | let start ~log ~id ~specification = 57 | let spec = specification in 58 | let open Specification in 59 | let secret_name f = 60 | String.take 61 | (id ^ (Digest.string (File_contents_mount.show f) |> Digest.to_hex)) 62 | 55 (* The max is 63, and we want to add "-volume" *) 63 | in 64 | let requests_json = 65 | `Assoc [ 66 | "memory", (let `GB gb = spec.memory in `String (sprintf "%dG" gb)); 67 | "cpu", `String (Int.to_string spec.cpus); 68 | ] in 69 | let json : Yojson.Safe.json = 70 | let secrets = 71 | List.filter_map spec.volume_mounts ~f:(function 72 | | `Nfs _ -> None 73 | | `Constant f -> 74 | Some (`Assoc [ 75 | "apiVersion", `String "v1"; 76 | "kind", `String "Secret"; 77 | "metadata", `Assoc [ 78 | "name", `String (secret_name f); 79 | ]; 80 | "data", `Assoc [ 81 | Filename.basename (File_contents_mount.path f), 82 | `String (File_contents_mount.contents f |> B64.encode); 83 | ]; 84 | ] )) in 85 | let items = 86 | secrets 87 | @ [ 88 | `Assoc [ 89 | "apiVersion", `String "v1"; 90 | "kind", `String "Pod"; 91 | "metadata", `Assoc [ 92 | "name", `String id; 93 | "labels", `Assoc [ 94 | "app", `String id; 95 | ]; 96 | ]; 97 | "spec", `Assoc [ 98 | "restartPolicy", `String "Never"; 99 | "containers", `List [ 100 | `Assoc [ 101 | "name", `String (id ^ "container"); 102 | "image", `String spec.image; 103 | "command", `List (List.map spec.command ~f:(fun s -> `String s)); 104 | "volumeMounts", 105 | `List ( 106 | List.map spec.volume_mounts ~f:(function 107 | | `Constant f -> 108 | `Assoc [ 109 | "name", `String (secret_name f ^ "-volume"); 110 | "readOnly", `Bool true; 111 | "mountPath", `String (Filename.dirname 112 | (File_contents_mount.path f)); 113 | ] 114 | | `Nfs m -> 115 | `Assoc [ 116 | "name", `String (Nfs_mount.id m); 117 | "mountPath", `String (Nfs_mount.point m); 118 | ]) 119 | ); 120 | "resources", `Assoc [ 121 | "requests", requests_json; 122 | ]; 123 | ]; 124 | ]; 125 | "volumes", `List ( 126 | List.map spec.volume_mounts ~f:(function 127 | | `Constant f -> 128 | `Assoc [ 129 | "name", `String (secret_name f ^ "-volume"); 130 | "secret", `Assoc [ 131 | "secretName", `String (secret_name f); 132 | ] 133 | ] 134 | | `Nfs m -> 135 | `Assoc [ 136 | "name", `String (Nfs_mount.id m); 137 | "nfs", `Assoc [ 138 | "server", `String (Nfs_mount.host m); 139 | "path", `String (Nfs_mount.path m); 140 | "readOnly", `Bool (Nfs_mount.read_only m); 141 |
]; 142 | ]) 143 | ); 144 | ]; 145 | ]; 146 | ] 147 | in 148 | `Assoc [ 149 | "apiVersion", `String "v1"; 150 | "kind", `String "List"; 151 | "items", `List items; 152 | ] 153 | in 154 | let json_string = Yojson.Safe.pretty_to_string ~std:true json in 155 | let tmp = Filename.temp_file "coclojob" ".json" in 156 | Pvem_lwt_unix.IO.write_file tmp ~content:json_string 157 | >>= fun () -> 158 | let additional_json = [ 159 | "temp-file", `String tmp; 160 | "contents", json; 161 | ] in 162 | ksprintf 163 | (command_must_succeed ~additional_json ~log ~id) 164 | "kubectl create -f %s" tmp 165 | 166 | 167 | let describe ~storage ~log ~id = 168 | let cmd = sprintf "kubectl describe pod %s" id in 169 | let save_path = Job_common.save_path id `Describe_output in 170 | Hyper_shell.Saved_command.run 171 | ~storage ~log ~cmd ~path:save_path 172 | ~section:(Job_common.job_section id) 173 | ~keep_the:`Latest 174 | >>= fun logres -> 175 | return (Job_common.Query_result.one_saved "Description" logres) 176 | 177 | let get_logs ~storage ~log ~id = 178 | let save_path = Job_common.save_path id `Logs_output in 179 | let cmd = sprintf "kubectl logs %s" id in 180 | Hyper_shell.Saved_command.run 181 | ~storage ~log ~cmd ~path:save_path 182 | ~section:(Job_common.job_section id) 183 | ~keep_the:`Largest 184 | >>= fun logres -> 185 | return (Job_common.Query_result.one_saved "Logs" logres) 186 | 187 | 188 | let kill ~log ~id = 189 | let cmd = sprintf "kubectl delete pod %s" id in 190 | command_must_succeed ~log ~id cmd 191 | 192 | let get_status_json ~log ~id = 193 | let cmd = sprintf "kubectl get pod -a %s -o=json" id in 194 | command_must_succeed_with_output ~log ~id cmd 195 | >>= fun (out, _) -> 196 | return out 197 | 198 | module Kube_status = struct 199 | (* cf. http://kubernetes.io/docs/user-guide/pod-states/ *) 200 | type t = { 201 | phase : [ `Pending | `Running | `Succeeded | `Failed | `Unknown ]; 202 | } 203 | [@@deriving show,yojson] 204 | 205 | let phase_of_string = 206 | function 207 | | "Pending" -> Some `Pending 208 | | "Running" -> Some `Running 209 | | "Succeeded" -> Some `Succeeded 210 | | "Failed" -> Some `Failed 211 | | "Unknown" -> Some `Unknown 212 | | other -> None 213 | 214 | let of_json blob = 215 | Deferred_result.wrap_deferred 216 | ~on_exn:(fun e -> `Job (`Kube_json_parsing (blob, `Exn e))) 217 | (fun () -> Yojson.Safe.from_string blob |> Lwt.return) 218 | >>= fun json -> 219 | let fail_parsing reason = 220 | fail (`Job (`Kube_json_parsing (blob, `String reason))) in 221 | begin match json with 222 | | `Assoc l -> 223 | let phase = 224 | List.find_map l ~f:(function 225 | | "status", `Assoc json_assoc -> 226 | List.find_map json_assoc ~f:(function 227 | | "phase", `String phase -> Some phase 228 | | _ -> None) 229 | | _ -> None) 230 | |> Option.bind ~f:phase_of_string 231 | in 232 | begin match phase with 233 | | None -> fail_parsing "Cannot find field /status/phase" 234 | | Some phase -> return {phase} 235 | end 236 | | _ -> fail_parsing "JSON is not an `Assoc _" 237 | end 238 | end 239 | 240 | module Error = struct 241 | let to_string = 242 | function 243 | | `Kube_json_parsing (blob, `Exn e) -> 244 | sprintf "Parsing JSON output of kube-get-pod: %s, %s" 245 | (Printexc.to_string e) 246 | blob 247 | | `Kube_json_parsing (blob, `String e) -> 248 | sprintf "Parsing JSON output of kube-get-pod: %s, %s" e blob 249 | end 250 | -------------------------------------------------------------------------------- /src/lib/kube_job.mli: 
-------------------------------------------------------------------------------- 1 | open Internal_pervasives 2 | 3 | module Specification : sig 4 | module Nfs_mount : 5 | sig 6 | type t = { 7 | host : string; 8 | path : string; 9 | point : string; 10 | read_only : bool; 11 | } [@@deriving yojson, show, make] 12 | val show : t -> Ppx_deriving_runtime.string 13 | val make : 14 | host:string -> 15 | path:string -> point:string -> ?read_only:bool -> unit -> t 16 | val id : t -> string 17 | val host : t -> string 18 | val path : t -> string 19 | val point : t -> string 20 | val read_only : t -> bool 21 | end 22 | module File_contents_mount : sig 23 | type t = { path : string; contents : string; } 24 | [@@deriving yojson, show, make] 25 | val show : t -> Ppx_deriving_runtime.string 26 | val make : path:string -> string -> t 27 | val path : t -> string 28 | val contents : t -> string 29 | end 30 | type t = { 31 | image: string; 32 | command: string list [@main]; 33 | volume_mounts: [ `Nfs of Nfs_mount.t | `Constant of File_contents_mount.t ] list; 34 | memory: [ `GB of int ] [@default `GB 50]; 35 | cpus: int [@default 7]; 36 | } [@@deriving yojson, show, make] 37 | end 38 | 39 | val start : 40 | log:Log.t -> 41 | id:string -> 42 | specification:Specification.t -> 43 | (unit, 44 | [> `IO of [> `Write_file_exn of Pvem_lwt_unix.IO.path * exn ] 45 | | `Shell_command of Hyper_shell.Error.t 46 | | `Log of Log.Error.t ]) Deferred_result.t 47 | 48 | val describe : 49 | _ Job_common.Query_result.call_function 50 | 51 | val get_logs: 52 | _ Job_common.Query_result.call_function 53 | 54 | val kill : 55 | log:Log.t -> 56 | id: string -> 57 | (unit, 58 | [> `Shell_command of Hyper_shell.Error.t 59 | | `Log of Log.Error.t ]) Deferred_result.t 60 | 61 | val get_status_json : 62 | log:Log.t -> 63 | id: string -> 64 | (string, 65 | [> `Shell_command of Hyper_shell.Error.t 66 | | `Log of Log.Error.t ]) Deferred_result.t 67 | 68 | module Kube_status : sig 69 | type t = { 70 | phase : [ `Failed | `Pending | `Running | `Succeeded | `Unknown ]; 71 | } [@@deriving yojson, show] 72 | 73 | val phase_of_string : 74 | string -> 75 | [> `Failed | `Pending | `Running | `Succeeded | `Unknown ] option 76 | 77 | val of_json : 78 | string -> 79 | (t, 80 | [> `Job of 81 | [> `Kube_json_parsing of 82 | string * [> `Exn of exn | `String of string ] ] ]) Deferred_result.t 83 | end 84 | 85 | 86 | module Error : sig 87 | val to_string : 88 | [< `Kube_json_parsing of 89 | string * [< `Exn of exn | `String of string ] ] -> 90 | string 91 | end 92 | -------------------------------------------------------------------------------- /src/lib/local_docker_job.ml: -------------------------------------------------------------------------------- 1 | open Internal_pervasives 2 | 3 | 4 | module Specification = struct 5 | 6 | type t = { 7 | image: string; 8 | command: string list [@main]; 9 | volume_mounts: [ `Local of string * string ] list; 10 | memory: [ `GB of int | `MB of int ] option; 11 | cpus: float option; 12 | } [@@deriving yojson, show, make] 13 | end 14 | 15 | let job_section id = ["job"; id; "commands"] 16 | 17 | let command_must_succeed ~log ?additional_json ~id cmd = 18 | Hyper_shell.command_must_succeed ~log cmd ?additional_json 19 | ~section:(job_section id) 20 | let command_must_succeed_with_output ~log ?additional_json ~id cmd = 21 | Hyper_shell.command_must_succeed_with_output ~log cmd ?additional_json 22 | ~section:(job_section id) 23 | 24 | let exec c = 25 | List.map c ~f:Filename.quote |> String.concat ~sep:" " 26 | 27 | let 
start ~log ~id ~specification = 28 | let open Specification in 29 | let additional_json = [ 30 | "specification", Specification.to_yojson specification; 31 | ] in 32 | let mounts = 33 | specification.volume_mounts |> List.map ~f:(function 34 | `Local (one, two) -> ["-v"; sprintf "%s:%s" one two] 35 | ) 36 | |> List.concat in 37 | let cpus = 38 | Option.value_map ~default:[] specification.cpus 39 | ~f:(fun c -> 40 | (* [sprintf "--cpus=%f" c] -> This option is for Docker ≥ 1.13 41 | Cf. https://docs.docker.com/engine/admin/resource_constraints/#cpu 42 | *) 43 | let period = 1000 in 44 | [sprintf "--cpu-period=%d" period; 45 | sprintf "--cpu-quota=%d" (c *. float period |> int_of_float)]) in 46 | let memory = 47 | Option.value_map ~default:[] specification.memory 48 | ~f:(function 49 | | `GB g -> [sprintf "--memory=%dg" g] 50 | | `MB m -> [sprintf "--memory=%dm" m]) in 51 | command_must_succeed ~additional_json ~log ~id 52 | (["docker"; "run"; "--name"; id; "-d"] 53 | @ mounts 54 | @ cpus 55 | @ memory 56 | @ [specification.image] 57 | @ specification.command 58 | |> exec) 59 | 60 | let describe ~storage ~log ~id = 61 | let save_path tag = Job_common.save_path ~tag id `Describe_output in 62 | Hyper_shell.Saved_command.run 63 | ~storage ~log ~path:(save_path "stats") 64 | ~section:(job_section id) 65 | ~keep_the:`Latest 66 | ~cmd:(["docker"; "stats"; "--no-stream"; id] |> exec) 67 | >>= fun stat_result -> 68 | Hyper_shell.Saved_command.run 69 | ~storage ~log ~path:(save_path "inspect") 70 | ~section:(job_section id) 71 | ~keep_the:`Latest 72 | ~cmd:(["docker"; "inspect"; id] |> exec) 73 | >>= fun inspect_result -> 74 | return ["Stats", `Saved_command stat_result; 75 | "Inspection", `Saved_command inspect_result] 76 | 77 | let get_logs ~storage ~log ~id = 78 | let save_path = Job_common.save_path id `Logs_output in 79 | Hyper_shell.Saved_command.run 80 | ~storage ~log ~path:save_path 81 | ~section:(job_section id) 82 | ~keep_the:`Latest 83 | ~cmd:(["docker"; "logs"; id] |> exec) 84 | >>= fun logres -> 85 | return (Job_common.Query_result.one_saved "Logs" logres) 86 | 87 | let kill ~log ~id = 88 | command_must_succeed ~log ~id 89 | (["docker"; "kill"; id] |> exec) 90 | 91 | 92 | let get_update ~log ~id = 93 | command_must_succeed_with_output ~log ~id 94 | (["docker"; "inspect"; id] |> exec) 95 | >>= fun (stdout, stderr) -> 96 | Deferred_result.wrap_deferred 97 | ~on_exn:(fun e -> `Job (`Docker_inspect_json_parsing (stdout, `Exn e))) 98 | (fun () -> Yojson.Safe.from_string stdout |> Lwt.return) 99 | >>= fun json -> 100 | let fail_parsing reason = 101 | fail (`Job (`Docker_inspect_json_parsing (stdout, `String reason))) in 102 | begin match json with 103 | | `List [`Assoc l] -> 104 | let status = ref None in 105 | let exit_code = ref None in 106 | List.iter l ~f:begin function 107 | | "State", `Assoc json_assoc -> 108 | List.iter json_assoc ~f:begin function 109 | | "Status", `String phase -> 110 | status := Some phase 111 | | "ExitCode", `Int i -> 112 | exit_code := Some i 113 | | _ -> () 114 | end 115 | | _ -> () 116 | end; 117 | begin match !status, !exit_code with 118 | | Some "running", _ -> return `Running 119 | | Some "exited", Some 0 -> return `Succeeded 120 | | Some "exited", Some other -> return `Failed 121 | | Some other, _ -> 122 | ksprintf fail_parsing "Unknown State/Status: %s" other 123 | | None, _ -> 124 | fail_parsing "Cannot find State/Status in 'inspect' JSON" 125 | end 126 | | _ -> fail_parsing "JSON is not an `Assoc _" 127 | end 128 | 129 | module Error = struct 130 | let 
to_string = 131 | function 132 | | `Docker_inspect_json_parsing (blob, `Exn e) -> 133 | sprintf "Parsing JSON output of docker-inspect: %s, %s" 134 | (Printexc.to_string e) 135 | blob 136 | | `Docker_inspect_json_parsing (blob, `String e) -> 137 | sprintf "Parsing JSON output of docker-inspect: %s, %s" e blob 138 | end 139 | -------------------------------------------------------------------------------- /src/lib/local_docker_job.mli: -------------------------------------------------------------------------------- 1 | 2 | open Internal_pervasives 3 | 4 | module Specification : sig 5 | type t = { 6 | image: string; 7 | command: string list [@main]; 8 | volume_mounts: [ `Local of string * string ] list; 9 | memory: [ `GB of int | `MB of int ] option; 10 | cpus: float option; 11 | } [@@deriving yojson, show, make] 12 | end 13 | 14 | val start : 15 | log:Log.t -> 16 | id:string -> 17 | specification:Specification.t -> 18 | (unit, 19 | [> `IO of [> `Write_file_exn of Pvem_lwt_unix.IO.path * exn ] 20 | | `Shell_command of Hyper_shell.Error.t 21 | | `Log of Log.Error.t ]) Deferred_result.t 22 | 23 | val describe : 24 | _ Job_common.Query_result.call_function 25 | 26 | val get_logs: 27 | _ Job_common.Query_result.call_function 28 | 29 | val kill : 30 | log:Log.t -> 31 | id: string -> 32 | (unit, 33 | [> `Shell_command of Hyper_shell.Error.t 34 | | `Log of Log.Error.t ]) Deferred_result.t 35 | 36 | val get_update : 37 | log: Log.t -> 38 | id: string -> 39 | ([> `Failed | `Running | `Succeeded ], 40 | [> `Job of 41 | [> `Docker_inspect_json_parsing of 42 | string * [> `Exn of exn | `String of string ] 43 | ] 44 | | `Log of Log.Error.t 45 | | `Shell_command of Hyper_shell.Error.t ]) Deferred_result.t 46 | 47 | module Error : sig 48 | val to_string : 49 | [< `Docker_inspect_json_parsing of 50 | string * [< `Exn of exn | `String of string ] ] -> string 51 | end 52 | -------------------------------------------------------------------------------- /src/lib/log.ml: -------------------------------------------------------------------------------- 1 | open Internal_pervasives 2 | 3 | type stored = { 4 | storage: Storage.t; 5 | } 6 | type t = [ 7 | | `Silent 8 | | `Stored of stored 9 | | `File_tree of string (* The root *) 10 | ] 11 | 12 | let silent : t = `Silent 13 | let stored storage : t = `Stored {storage} 14 | let file_tree root : t = `File_tree root 15 | 16 | let debug_sections : string list list ref = ref [] 17 | let () = 18 | try debug_sections := 19 | Sys.getenv "COCLOBAS_DEBUG_SECTIONS" 20 | |> String.split ~on:(`Character ',') 21 | |> List.map ~f:(fun p -> 22 | String.split ~on:(`Character '/') p |> List.filter ~f:((<>) "")) 23 | with _ -> () 24 | 25 | module Error = struct 26 | type t = 27 | [ `IO of [ `Write_file_exn of string * exn ] 28 | | `System of 29 | [ `File_info of string 30 | | `List_directory of string 31 | | `Make_directory of string 32 | | `Remove of string ] * 33 | [ `Exn of exn | `Wrong_access_rights of int ] 34 | | `Storage of Storage.Error.common 35 | ] 36 | let to_string = 37 | function 38 | | `IO _ as e -> Pvem_lwt_unix.IO.error_to_string e 39 | | `System _ as s -> Pvem_lwt_unix.System.error_to_string s 40 | | `Storage s -> Storage.Error.to_string s 41 | 42 | let wrap m = 43 | m >>< function `Ok o -> return o | `Error e -> fail (`Log e) 44 | end 45 | 46 | let empty l = 47 | Error.wrap begin 48 | match l with 49 | | `Silent -> return () 50 | | `Stored s -> Storage.empty s.storage 51 | | `File_tree root -> 52 | let open Pvem_lwt_unix in 53 | System.remove root 54 | >>= fun () -> 55 |
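(* Emptying the logs removes the whole root directory; it is recreated
   right away so that subsequent [log] calls can write under it again. *)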
System.ensure_directory_path root 56 | end 57 | 58 | let log ?(section = ["main"]) t json = 59 | Error.wrap begin 60 | let path = 61 | let name = 62 | let now = Unix.gettimeofday () in 63 | sprintf "%s_%s" 64 | (truncate (1000. *. now) |> Int.to_string) 65 | (Hashtbl.hash json |> sprintf "%x") 66 | in 67 | "logs" :: section @ [name ^ ".json"] in 68 | begin match List.mem section ~set:!debug_sections with 69 | | true -> 70 | printf "<<<< Coclobas.Log %s (%s)\n%s\n>>>>\n%!" 71 | ODate.Unix.(now () |> Printer.to_iso) 72 | (String.concat ~sep:"/" path) 73 | (Yojson.Safe.pretty_to_string json ~std:true); 74 | | false -> () 75 | end; 76 | match t with 77 | | `Silent -> return () 78 | | `File_tree root -> 79 | let path_str = String.concat ~sep:"/" (root :: path) in 80 | Pvem_lwt_unix.System.ensure_directory_path (Filename.dirname path_str) 81 | >>= fun () -> 82 | let content = Yojson.Safe.pretty_to_string json ~std:true in 83 | IO.write_file path_str ~content 84 | | `Stored st -> 85 | Storage.Json.save_jsonable st.storage json ~path 86 | end 87 | -------------------------------------------------------------------------------- /src/lib/log.mli: -------------------------------------------------------------------------------- 1 | open Internal_pervasives 2 | type t 3 | 4 | val silent : t 5 | val stored : Storage.t -> t 6 | 7 | val file_tree: string -> t 8 | (** Store the logs as simple files, under a given root-path. *) 9 | 10 | module Error: sig 11 | 12 | type t = 13 | [ `IO of [ `Write_file_exn of string * exn ] 14 | | `System of 15 | [ `File_info of string 16 | | `List_directory of string 17 | | `Make_directory of string 18 | | `Remove of string ] * 19 | [ `Exn of exn | `Wrong_access_rights of int ] 20 | | `Storage of Storage.Error.common 21 | ] 22 | val to_string: [< t ] -> string 23 | 24 | end 25 | 26 | val log : 27 | ?section:string list -> 28 | t -> 29 | Yojson.Safe.json -> 30 | (unit, [> `Log of Error.t ]) Deferred_result.t 31 | 32 | val empty: t -> 33 | (unit, [> `Log of Error.t ]) Deferred_result.t 34 | 35 | val debug_sections : string list list ref 36 | (** Sections to catch and display on ["stdout"]; cf. the [?section] argument of {!log}. 37 | 38 | At the start of the program, the [Log] module tries to parse 39 | the ["COCLOBAS_DEBUG_SECTIONS"] environment variable to fill this 40 | reference (comma-separated list of paths, e.g. ["/server/loop,cluster//commands"]). 41 | *) 42 | -------------------------------------------------------------------------------- /src/lib/server.ml: -------------------------------------------------------------------------------- 1 | 2 | open Internal_pervasives 3 | 4 | 5 | module Configuration = struct 6 | 7 | module Default = struct 8 | let min_sleep = 3. 9 | let max_sleep = 180. 10 | let max_update_errors = 10 11 | let concurrent_steps = 5 12 | let backoff_factor = 20.
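(* A worked example of the back-off arithmetic (the formula
   [float errors *. backoff_factor] lives in [reupdate_wait], in the
   server loop below): with the default factor of 20., a job with one
   recorded error waits 20 s before its next attempt, one with three
   errors waits 60 s. *)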
13 | end 14 | 15 | type t = { 16 | min_sleep: float [@default Default.min_sleep]; 17 | max_sleep: float [@default Default.max_sleep]; 18 | max_update_errors: int [@default Default.max_update_errors]; 19 | concurrent_steps: int [@default Default.concurrent_steps]; 20 | backoff_factor : float [@default Default.backoff_factor]; 21 | } [@@deriving make, yojson, show] 22 | 23 | let path = ["server"; "configuration.json"] 24 | 25 | let save ~storage conf = 26 | Storage.Json.save_jsonable storage (to_yojson conf) ~path 27 | 28 | let get st = Storage.Json.get_json st ~path ~parse:of_yojson 29 | end 30 | 31 | type t = { 32 | port : int; 33 | mutable status: [ `Initializing | `Ready ] [@default `Initializing]; 34 | root : string; 35 | mutable cluster : Cluster.t; 36 | mutable jobs: Job.t list; 37 | mutable jobs_to_kill: string list; 38 | storage: Storage.t; 39 | log: Log.t; 40 | job_list_mutex: Lwt_mutex.t; 41 | kick_loop: unit Lwt_condition.t; 42 | configuration: Configuration.t; 43 | } [@@deriving make] 44 | 45 | let create ~port ~configuration ~root ~cluster ~storage ~log = 46 | let job_list_mutex = Lwt_mutex.create () in 47 | let kick_loop = Lwt_condition.create () in 48 | make () 49 | ~job_list_mutex 50 | ~kick_loop 51 | ~port ~root ~cluster ~storage ~log 52 | ~configuration 53 | 54 | let log_event t e = 55 | let json_event name moar_json = 56 | `Assoc ([ 57 | "event", `String name; 58 | "date", `String (ODate.Unix.(now () |> Printer.to_iso)); 59 | ] @ moar_json) 60 | in 61 | let stringf fmt = ksprintf (fun s -> `String s) fmt in 62 | let count l f = 63 | List.fold l ~init:0 ~f:(fun p k -> p + (if f k then 1 else 0)) in 64 | let int i = stringf "%d" i in 65 | let current_jobs () = 66 | let count_status f = count t.jobs (fun j -> Job.status j |> f) in 67 | `Assoc [ 68 | "cardinal", List.length t.jobs |> int; 69 | "submitted", count_status (function `Submitted -> true | _ -> false) |> int; 70 | "started", count_status (function `Started _ -> true | _ -> false) |> int; 71 | "errors", count_status (function `Error _ -> true | _ -> false) |> int; 72 | "finished", count_status (function `Finished _ -> true | _ -> false) |> int; 73 | ]; 74 | in 75 | let subsection, json = 76 | match e with 77 | | `Ready -> 78 | "startup", 79 | json_event "start-with-jobs" [ 80 | "jobs", current_jobs (); 81 | ] 82 | | `Loop_begins (started_jobs, todo, batches) -> 83 | let ctodo = count todo in 84 | "loop", 85 | json_event "loop-begins" [ 86 | "batches", 87 | stringf "[%s]" 88 | (List.map batches ~f:(fun b -> sprintf "%d" (List.length b)) 89 | |> String.concat ~sep:", "); 90 | "todo", `Assoc [ 91 | "remove", ctodo (function `Remove _ -> true | _ -> false) |> int; 92 | "kill", ctodo (function `Kill _ -> true | _ -> false) |> int; 93 | "start", ctodo (function `Start _ -> true | _ -> false) |> int; 94 | "update", ctodo (function `Update _ -> true | _ -> false) |> int; 95 | ]; 96 | "jobs", current_jobs (); 97 | "started_jobs", int started_jobs; 98 | ] 99 | | `Loop_ends (sleep, errors) -> 100 | "loop", 101 | json_event "loop-ends" [ 102 | "sleep", `Float sleep; 103 | "errors", `List (List.map errors ~f:(fun x -> 104 | `String (Error.to_string x))); 105 | "jobs", current_jobs (); 106 | ] 107 | | `Changed_job_list action -> 108 | let action = 109 | match action with 110 | | `Add j -> 111 | `Assoc ["verb", `String "add"; "job", `String (Job.id j)] 112 | | `Remove j -> 113 | `Assoc ["verb", `String "remove"; "job", `String (Job.id j)] 114 | in 115 | "job-list", 116 | json_event "changed-job-list" [ 117 | "action", action; 118 | 
"jobs", current_jobs (); 119 | ] 120 | in 121 | Log.log t.log ~section:["server"; subsection] json 122 | 123 | let change_job_list t action = 124 | Lwt_mutex.with_lock t.job_list_mutex begin fun () -> 125 | begin match action with 126 | | `Add j -> t.jobs <- j :: t.jobs 127 | | `Remove j -> 128 | t.jobs <- List.filter t.jobs ~f:(fun jj -> Job.id jj <> Job.id j); 129 | end; 130 | Storage.Json.save_jsonable t.storage 131 | ~path:["server"; "jobs.json"] 132 | (`List (List.map t.jobs ~f:(fun j -> `String (Job.id j)))) 133 | >>= fun () -> 134 | log_event t (`Changed_job_list action) 135 | end 136 | 137 | let get_job_list t = 138 | let parse = 139 | let open Ppx_deriving_yojson_runtime in 140 | let open Ppx_deriving_yojson_runtime.Result in 141 | function 142 | | `List l -> 143 | Nonstd.List.fold ~init:(Ok []) l ~f:(fun prev j -> 144 | prev >>= fun l -> 145 | match j with 146 | | `String s -> Ok (s :: l) 147 | | other -> Error "expecting List of Strings") 148 | | other -> Error "expecting List (of Strings)" 149 | in 150 | begin 151 | Storage.Json.get_json t.storage ~path:["server"; "jobs.json"] ~parse 152 | >>< function 153 | | `Ok ids -> return ids 154 | | `Error (`Storage (`Get_json (_, `Missing_data))) -> return [] 155 | | `Error other -> fail other 156 | end 157 | >>= fun ids -> 158 | Deferred_list.while_sequential ids ~f:(fun id -> 159 | Job.get t.storage id) 160 | >>= fun jobs -> 161 | t.jobs <- jobs; 162 | return () 163 | 164 | let incoming_job t string = 165 | Storage.Json.parse_json_blob ~parse:Job.Specification.of_yojson string 166 | >>= fun spec -> 167 | begin match Job.Specification.kind spec, Cluster.kind t.cluster with 168 | | (`Kube, `GCloud_kubernetes) -> return () 169 | | (`Local_docker, `Local_docker) -> return () 170 | | (`Aws_batch, `Aws_batch_queue) -> return () 171 | | tuple -> (* we could run local-docker jobs with a kube cluster but that would 172 | mess with the maximum number of jobs to submit *) 173 | fail (`Invalid_job_submission (`Wrong_backend tuple)) 174 | end 175 | >>= fun () -> 176 | let job = Job.fresh spec in 177 | Job.save t.storage job 178 | >>= fun () -> 179 | change_job_list t (`Add job) 180 | >>= fun () -> 181 | Lwt_condition.broadcast t.kick_loop (); 182 | return (`String (Job.id job)) 183 | 184 | let batch_list ~max_items l = 185 | let res = ref [] in 186 | let rec go l = 187 | match List.split_n l max_items with 188 | | some, [] -> res := some :: !res 189 | | some, more -> 190 | res := some :: !res; 191 | go more 192 | in 193 | go l; 194 | List.rev !res 195 | 196 | let rec loop: 197 | ?and_sleep : float -> t -> (unit, _) Deferred_result.t 198 | = fun ?and_sleep t -> 199 | let and_sleep = 200 | Option.value and_sleep ~default:t.configuration.Configuration.min_sleep in 201 | let now () = Unix.gettimeofday () in 202 | let `Started_count started_jobs, todo = 203 | let currently_started = 204 | List.fold t.jobs ~init:0 ~f:(fun c j -> 205 | match Job.status j with 206 | | `Started _ -> c + 1 207 | | _ -> c) in 208 | let max_started = Cluster.max_started_jobs t.cluster in 209 | let reupdate_wait j = 210 | match Job.latest_error j with 211 | | None -> 30. (* Latest thing was not an error. *) 212 | | Some _ -> 213 | let errors = 214 | (Job.start_errors j |> List.length) + 215 | (Job.update_errors j |> List.length) in 216 | (float errors *. t.configuration.Configuration.backoff_factor) 217 | in 218 | let should_try_to_start j = 219 | match Job.latest_error j with 220 | | None -> true 221 | | Some t -> 222 | now () > t +. 
(reupdate_wait j) in 223 | List.fold t.jobs ~init:(`Started_count currently_started, []) 224 | ~f:(fun (`Started_count started, todo) j -> 225 | match Job.status j with 226 | (* Remove finished jobs: *) 227 | | `Error _ 228 | | `Finished _ -> 229 | (`Started_count started, `Remove j :: todo) 230 | (* Kill Jobs to kill: *) 231 | | other when List.mem ~set:t.jobs_to_kill (Job.id j) -> 232 | (`Started_count started, `Kill j :: todo) 233 | (* Do not submit too many jobs: *) 234 | | `Submitted when started >= max_started -> 235 | (`Started_count started, todo) 236 | (* Do not retry to start too often: *) 237 | | `Submitted when not (should_try_to_start j) -> 238 | (`Started_count started, todo) 239 | (* Start submitted jobs (< max_started & not backing off) *) 240 | | `Submitted -> 241 | (`Started_count (started + 1), `Start j :: todo) 242 | (* Do not check again on jobs checked too recently: *) 243 | | `Started time when time +. reupdate_wait j > now () -> 244 | (`Started_count started, todo) 245 | (* Check on jobs that are running: *) 246 | | `Started _ -> 247 | (`Started_count started, `Update j :: todo) 248 | ) 249 | in 250 | t.jobs_to_kill <- []; 251 | let todo_batches = 252 | batch_list ~max_items:t.configuration.Configuration.concurrent_steps todo 253 | in 254 | log_event t (`Loop_begins (started_jobs, todo, todo_batches)) 255 | >>= fun () -> 256 | Pvem_lwt_unix.Deferred_list.while_sequential todo_batches ~f:begin fun batch -> 257 | Pvem_lwt_unix.Deferred_list.for_concurrent batch ~f:begin function 258 | | `Remove j -> 259 | (* We call these functions once to give them a chance to save the output 260 | before Kubernetes forgets about the job: *) 261 | (Job.get_logs ~storage:t.storage ~log:t.log j >>< fun _ -> return ()) 262 | >>= fun () -> 263 | (Job.describe ~storage:t.storage ~log:t.log j >>< fun _ -> return ()) 264 | >>= fun () -> 265 | change_job_list t (`Remove j) 266 | | `Kill j -> 267 | begin 268 | Job.kill ~log:t.log j 269 | >>< function 270 | | `Ok () -> 271 | Job.set_status ~from_error:false j (`Finished (now (), `Killed)); 272 | return () 273 | | `Error e -> 274 | Job.set_status ~from_error:true j 275 | (`Error ("Killing failed: " ^ Error.to_string e)); 276 | return () 277 | end 278 | >>= fun () -> 279 | Job.save t.storage j 280 | | `Start j -> 281 | Job.start ~log:t.log j ~cluster:t.cluster 282 | >>< begin function 283 | | `Ok () -> 284 | Job.set_status ~from_error:false j @@ `Started (now ()); 285 | Job.save t.storage j 286 | >>= fun () -> 287 | return () 288 | | `Error e -> 289 | begin match Job.start_errors j with 290 | | l when List.length l <= t.configuration.Configuration.max_update_errors -> 291 | Job.set_start_errors ~time:(now ()) j (Error.to_string e :: l); 292 | Job.save t.storage j 293 | | more -> 294 | Job.set_status ~from_error:true j @@ 295 | `Error (sprintf 296 | "Starting failed %d times: [ %s ]" 297 | (t.configuration.Configuration.max_update_errors + 1) 298 | (List.dedup more |> String.concat ~sep:" -- ")); 299 | Job.save t.storage j 300 | end 301 | end 302 | | `Update j -> 303 | begin 304 | Job.get_update ~log:t.log j 305 | >>= fun stat -> 306 | begin match stat with 307 | | `Running -> 308 | Job.set_status ~from_error:false j @@ `Started (now ()); 309 | Job.save t.storage j 310 | >>= fun () -> 311 | return () 312 | | (`Failed | `Succeeded as phase) -> 313 | Job.set_status ~from_error:false j @@ `Finished (now (), phase); 314 | Job.save t.storage j 315 | >>= fun () -> 316 | return () 317 | end 318 | end >>< begin function 319 | | `Ok () -> return () 320 |
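(* Below: update failures are accumulated (up to [max_update_errors])
   before the job is declared dead, mirroring the policy for start
   failures in the [`Start] branch above. *)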
| `Error e -> 321 | begin match Job.update_errors j with 322 | | l when List.length l <= t.configuration.Configuration.max_update_errors -> 323 | Job.set_status ~from_error:true j @@ `Started (now ()); 324 | Job.set_update_errors j ~time:(now ()) (Error.to_string e :: l); 325 | Job.save t.storage j 326 | | more -> 327 | Job.set_status ~from_error:true j @@ 328 | `Error (sprintf 329 | "Updating failed %d times: [ %s ]" 330 | (t.configuration.Configuration.max_update_errors + 1) 331 | (List.dedup more |> String.concat ~sep:" -- ")); 332 | Job.save t.storage j 333 | end 334 | end 335 | end 336 | >>= fun ((_ : unit list), errors) -> 337 | return errors 338 | end 339 | >>= fun 340 | (* We make sure only really fatal errors “exit the loop:” *) 341 | (errors_per_batch : [ `Storage of Storage.Error.common 342 | | `Log of Log.Error.t ] list list) -> 343 | let errors = List.concat errors_per_batch in 344 | log_event t (`Loop_ends (and_sleep, errors)) 345 | >>= fun () -> 346 | begin match errors with 347 | | [] -> return () 348 | | _ :: _ -> 349 | dbg "%s → ERRORS IN THE LOOP!!!: %s" 350 | ODate.Unix.(now () |> Printer.to_iso) 351 | (List.map errors ~f:Error.to_string |> String.concat ~sep:"\n"); 352 | exit 5 353 | end 354 | >>= fun () -> 355 | Deferred_list.pick_and_cancel [ 356 | (Pvem_lwt_unix.System.sleep and_sleep >>< fun _ -> return false); 357 | Lwt.(Lwt_condition.wait t.kick_loop >>= fun () -> return (`Ok true)); 358 | ] 359 | >>= fun kicked -> 360 | let and_sleep = 361 | let still_some_submitted = 362 | List.exists t.jobs ~f:(fun j -> Job.status j = `Submitted) in 363 | match todo, kicked, still_some_submitted with 364 | | [], false, false -> 365 | min t.configuration.Configuration.max_sleep (and_sleep *. 2.) 366 | | _, _, _ -> t.configuration.Configuration.min_sleep in 367 | loop ~and_sleep t 368 | 369 | let initialization t = 370 | Cluster.ensure_living ~log:t.log t.cluster 371 | >>= fun () -> 372 | get_job_list t 373 | >>= fun () -> 374 | t.status <- `Ready; 375 | log_event t `Ready 376 | >>= fun () -> 377 | Lwt.async (fun () -> loop t); 378 | return () 379 | 380 | let get_job_state t ids = 381 | Deferred_list.while_sequential ids ~f:(fun id -> 382 | Job.get t.storage id 383 | >>= fun job -> 384 | return (`Assoc [ 385 | "id", `String id; 386 | "state", Job.to_yojson job; 387 | ])) 388 | >>= fun l -> 389 | return (`Json (`List l)) 390 | 391 | 392 | let get_job_logs t ids = 393 | Deferred_list.while_sequential ids ~f:(fun id -> 394 | Job.get t.storage id 395 | >>= fun job -> 396 | Job.get_logs ~storage:t.storage ~log:t.log job 397 | >>= fun job_query_result -> 398 | return (`Assoc [ 399 | "id", `String id; 400 | "output", Job_common.Query_result.to_yojson job_query_result; 401 | ]) 402 | ) 403 | >>= fun l -> 404 | return (`Json (`List l)) 405 | 406 | 407 | let get_job_description t ids = 408 | Deferred_list.while_sequential ids ~f:(fun id -> 409 | Job.get t.storage id 410 | >>= fun job -> 411 | Job.describe ~storage:t.storage ~log:t.log job 412 | >>= fun job_query_result -> 413 | return (`Assoc [ 414 | "id", `String id; 415 | "output", Job_common.Query_result.to_yojson job_query_result; 416 | ]) 417 | ) 418 | >>= fun l -> 419 | return (`Json (`List l)) 420 | 421 | 422 | let kill_jobs t ids = 423 | t.jobs_to_kill <- ids @ t.jobs_to_kill; 424 | Lwt_condition.broadcast t.kick_loop (); 425 | return `Done 426 | 427 | let get_jobs t = 428 | let jobs = List.map t.jobs (fun j -> 429 | (`Assoc [ 430 | "id", `String (Job.id j); 431 | "status", `String (Job.Status.show (Job.status j)); 432 | ])) in 
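(* For reference, the payload this builds for GET /jobs looks like
   (the id is hypothetical; the status strings come from the derived
   [Job.Status.show]):
     [ { "id": "job-20170601-162010-xyz", "status": "`Submitted" } ] *)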
433 | let json = (`List jobs) in 434 | return (`Json json) 435 | 436 | let respond_result r = 437 | let open Cohttp in 438 | let module Coserver = Cohttp_lwt_unix.Server in 439 | let open Lwt in 440 | r >>= begin function 441 | | `Ok `Done -> Coserver.respond_string ~status:`OK ~body:"Done" () 442 | | `Ok (`String body) -> Coserver.respond_string ~status:`OK ~body () 443 | | `Ok (`Json j) -> 444 | let body = Yojson.Safe.pretty_to_string ~std:true j in 445 | Coserver.respond_string ~status:`OK ~body () 446 | | `Error e -> 447 | Coserver.respond_string 448 | ~status:`Bad_request ~body:(Error.to_string e) () 449 | end 450 | 451 | let job_ids_of_uri uri = 452 | Uri.query uri 453 | |> List.concat_map ~f:(function | ("id", l) -> l | _ -> []) 454 | 455 | let empty_logs t = 456 | Log.empty t.log 457 | >>= fun () -> 458 | return `Done 459 | 460 | let start t = 461 | let condition = Lwt_condition.create () in 462 | let server_thread () = 463 | Deferred_result.wrap_deferred 464 | ~on_exn:(fun e -> `Start_server (`Exn e)) 465 | begin fun () -> 466 | let open Cohttp in 467 | let module Coserver = Cohttp_lwt_unix.Server in 468 | let open Lwt in 469 | let callback _conn req body = 470 | let uri = req |> Request.uri in 471 | match Uri.path uri with 472 | | "/status" -> 473 | let body = 474 | match t.status with 475 | | `Initializing -> "Initializing" 476 | | `Ready -> "Ready" 477 | in 478 | Coserver.respond_string ~status:`OK ~body () 479 | | "/kick" -> 480 | Lwt_condition.broadcast t.kick_loop (); 481 | respond_result (return (`Ok `Done)) 482 | | "/empty-logs" -> 483 | empty_logs t |> respond_result 484 | | "/jobs" -> 485 | get_jobs t |> respond_result 486 | | "/job/state" -> 487 | get_job_state t (job_ids_of_uri uri) |> respond_result 488 | | "/job/logs" -> 489 | get_job_logs t (job_ids_of_uri uri) |> respond_result 490 | | "/job/describe" -> 491 | get_job_description t (job_ids_of_uri uri) |> respond_result 492 | | "/job/kill" -> 493 | kill_jobs t (job_ids_of_uri uri) |> respond_result 494 | | "/job/submit" -> 495 | body |> Cohttp_lwt_body.to_string 496 | >>= fun body_string -> 497 | incoming_job t body_string 498 | |> respond_result 499 | | "/cluster/describe" -> 500 | Cluster.describe ~log:t.log t.cluster 501 | >>| (fun (a, b) -> `String (sprintf "%s\n%s" a b)) 502 | |> respond_result 503 | | other -> 504 | let meth = req |> Request.meth |> Code.string_of_method in 505 | let headers = req |> Request.headers |> Header.to_string in 506 | body |> Cohttp_lwt_body.to_string >|= (fun body -> 507 | (Printf.sprintf "Uri: %s\nMethod: %s\nHeaders: %s\nBody: %s" 508 | (Uri.to_string uri) meth headers body)) 509 | >>= (fun body -> Coserver.respond_string ~status:`OK ~body ()) 510 | in 511 | Coserver.create ~mode:(`TCP (`Port t.port)) (Coserver.make ~callback ()) 512 | >>= fun () -> 513 | Lwt_condition.signal condition (`Ok ()); 514 | return () 515 | end 516 | in 517 | Lwt.async server_thread; 518 | initialization t 519 | >>= fun () -> 520 | Lwt_condition.wait condition 521 | -------------------------------------------------------------------------------- /src/lib/server.mli: -------------------------------------------------------------------------------- 1 | open Internal_pervasives 2 | 3 | module Configuration : sig 4 | 5 | module Default : sig 6 | val min_sleep : float 7 | val max_sleep : float 8 | val max_update_errors : int 9 | val concurrent_steps : int 10 | val backoff_factor : float 11 | end 12 | 13 | type t = { 14 | min_sleep: float [@default Default.min_sleep]; 15 | max_sleep: float [@default 
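(* The [@@deriving make] below generates a constructor where every
   [@default]-annotated field is an optional argument; a sketch of typical
   use (the values are illustrative):
     Configuration.make ~min_sleep:2. ~concurrent_steps:10 () *)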
Default.max_sleep]; 16 | max_update_errors: int [@default Default.max_update_errors]; 17 | concurrent_steps: int [@default Default.concurrent_steps]; 18 | backoff_factor : float [@default Default.backoff_factor]; 19 | } [@@deriving make, yojson, show] 20 | 21 | val save : 22 | storage:Storage.t -> 23 | t -> 24 | (unit, [> `Storage of [> Storage.Error.common ] ]) 25 | Deferred_result.t 26 | val get : 27 | Storage.t -> 28 | (t, [> `Storage of [> Storage.Error.common] ]) Deferred_result.t 29 | end 30 | 31 | 32 | 33 | type t 34 | 35 | val create : 36 | port:int -> 37 | configuration: Configuration.t -> 38 | root:string -> 39 | cluster:Cluster.t -> 40 | storage:Storage.t -> 41 | log:Log.t -> 42 | t 43 | 44 | val start: t -> 45 | (unit, 46 | [> `Shell_command of Hyper_shell.Error.t 47 | | `Aws_batch_queue of [> `Check_valid ] * string * string 48 | | `Log of Log.Error.t 49 | | `Storage of [> Storage.Error.common ] ]) Deferred_result.t 50 | -------------------------------------------------------------------------------- /src/lib/storage.ml: -------------------------------------------------------------------------------- 1 | open Internal_pervasives 2 | 3 | type key = string list 4 | type value = string 5 | 6 | let key_of_path = String.concat ~sep:"/" 7 | 8 | type t = { 9 | parameters: string; 10 | mutable handle: Trakeva_of_uri.t option; 11 | store_mutex: Lwt_mutex.t; 12 | collection: string; 13 | } 14 | 15 | let make parameters = 16 | { 17 | parameters; handle = None; 18 | store_mutex = Lwt_mutex.create (); 19 | collection = "coclobas"; 20 | } 21 | 22 | module Error = struct 23 | type where = [ 24 | | `Update of key 25 | | `Read of key 26 | | `Parsing_json of string 27 | ] 28 | type common = [ 29 | | `Exn of where * exn 30 | | `Backend of where * string 31 | | `Of_json of where * string 32 | | `Get_json of where * [ `Missing_data ] 33 | ] 34 | let to_string e = 35 | let where w = 36 | match w with 37 | | `Update u -> sprintf "update: %s" (key_of_path u) 38 | | `Read r -> sprintf "read: %s" (key_of_path r) 39 | | `Parsing_json js -> 40 | sprintf "Parsing json: %s (%d B)" 41 | (String.sub js ~index:0 ~length:30 |> Option.value ~default:js) 42 | (String.length js) 43 | in 44 | sprintf "Storage error: %s" 45 | begin match e with 46 | | `Exn (wh, e) -> sprintf "%s: %s" (where wh) (Printexc.to_string e) 47 | | `Backend (wh, s) -> sprintf "%s: Trakeva: %s" (where wh) s 48 | | `Of_json (wh, s) -> sprintf "%s: From-Yojson: %s" (where wh) s 49 | | `Get_json (wh, `Missing_data) -> 50 | sprintf "Get-json: %s: missing data!" 
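(* Keys are string lists that [key_of_path] joins with "/": e.g. the
   (hypothetical) key ["jobs"; id; "spec"] is stored as "jobs/<id>/spec",
   and a failing read on it renders as
   "Storage error: read: jobs/<id>/spec: ...". *)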
(where wh) 51 | end 52 | 53 | let wrap_trakeva ~info m = 54 | m >>< function 55 | | `Ok o -> return o 56 | | `Error (`Database trakeva) -> 57 | fail (`Storage (`Backend (info, Trakeva.Error.to_string trakeva))) 58 | | `Error (`Not_done) -> 59 | fail (`Storage (`Backend (info, "Transaction: Not done."))) 60 | 61 | end 62 | let wrap ~info lwt = 63 | Deferred_result.wrap_deferred lwt ~on_exn:(fun e -> `Storage (`Exn (info, e))) 64 | 65 | let init t = 66 | Trakeva_of_uri.load t.parameters 67 | 68 | let get_store_no_mutex t = 69 | match t.handle with 70 | | Some h -> return h 71 | | None -> 72 | init t 73 | >>= fun store -> 74 | t.handle <- Some store; 75 | return store 76 | 77 | let on_store t ~f = 78 | Lwt_mutex.with_lock t.store_mutex begin fun () -> 79 | get_store_no_mutex t 80 | >>= fun s -> 81 | f s 82 | end 83 | 84 | let update t k v : (unit, [> `Storage of [> Error.common] ] ) Deferred_result.t = 85 | on_store t ~f:(fun s -> 86 | let action = 87 | Trakeva.Action.(set ~collection:t.collection ~key:(key_of_path k) v) in 88 | Trakeva_of_uri.act s ~action 89 | >>= begin function 90 | | `Done -> return () 91 | | `Not_done -> 92 | fail (`Not_done) 93 | end 94 | ) 95 | |> Error.wrap_trakeva ~info:(`Update k) 96 | 97 | let read t k = 98 | on_store t ~f:(fun s -> 99 | Trakeva_of_uri.get s ~collection:t.collection ~key:(key_of_path k)) 100 | |> Error.wrap_trakeva ~info:(`Read k) 101 | 102 | let empty t = 103 | return () 104 | 105 | 106 | module Json = struct 107 | let of_yojson_error ~info = 108 | let open Ppx_deriving_yojson_runtime.Result in 109 | function 110 | | Ok o -> return o 111 | | Error s -> fail (`Storage (`Of_json (info, s))) 112 | 113 | let save_jsonable st ~path yo = 114 | let json = yo |> Yojson.Safe.pretty_to_string ~std:true in 115 | update st path json 116 | 117 | let parse_json_blob ~parse json = 118 | let info = (`Parsing_json json) in 119 | wrap ~info (fun () -> Lwt.return (Yojson.Safe.from_string json)) 120 | >>= fun yo -> 121 | of_yojson_error ~info (parse yo) 122 | 123 | let get_json_opt st ~path ~parse = 124 | read st path 125 | >>= begin function 126 | | Some json -> parse_json_blob ~parse json >>| fun s -> Some s 127 | | None -> return None 128 | end 129 | 130 | let get_json st ~path ~parse = 131 | read st path 132 | >>= begin function 133 | | Some json -> parse_json_blob ~parse json 134 | | None -> 135 | fail (`Storage (`Get_json (`Read path, `Missing_data))) 136 | end 137 | end 138 | 139 | 140 | -------------------------------------------------------------------------------- /src/lib/storage.mli: -------------------------------------------------------------------------------- 1 | open Internal_pervasives 2 | 3 | type t 4 | type key = string list 5 | type value = string 6 | 7 | module Error : sig 8 | type where = [ 9 | | `Update of key 10 | | `Read of key 11 | | `Parsing_json of string 12 | ] 13 | type common = [ 14 | | `Exn of where * exn 15 | | `Backend of where * string 16 | | `Of_json of where * string 17 | | `Get_json of where * [ `Missing_data ] 18 | ] 19 | val to_string : [< common ] -> string 20 | end 21 | 22 | val make : string -> t 23 | 24 | val update : t -> key -> value -> 25 | (unit, 26 | [> `Storage of [> Error.common] ]) Deferred_result.t 27 | 28 | val read : t -> key -> 29 | (value option, 30 | [> `Storage of [> Error.common] ]) Deferred_result.t 31 | 32 | val empty: t -> 33 | (unit, 34 | [> `Storage of[> Error.common] ]) Deferred_result.t 35 | 36 | module Json : sig 37 | 38 | val save_jsonable : t -> path:key -> Yojson.Safe.json -> 39 | (unit, 40 | [> 
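(* A typical round-trip through this [Json] sub-module, sketched against the
   server's [Configuration.t] (the [storage] value and the path are
   assumptions):
     Json.save_jsonable storage ~path:["configuration"]
       (Configuration.to_yojson config)
     Json.get_json storage ~path:["configuration"]
       ~parse:Configuration.of_yojson *)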
`Storage of [> Error.common ] ]) 41 | Deferred_result.t 42 | 43 | val parse_json_blob : 44 | parse:(Yojson.Safe.json -> 45 | ('a, string) Ppx_deriving_yojson_runtime.Result.result) -> 46 | string -> 47 | ('a, [> `Storage of [> Error.common ] ]) Deferred_result.t 48 | 49 | val get_json_opt : t -> path:key -> 50 | parse:(Yojson.Safe.json -> 51 | ('a, string) Ppx_deriving_yojson_runtime.Result.result) -> 52 | ('a option, 53 | [> `Storage of [> Error.common]]) 54 | Deferred_result.t 55 | val get_json : t -> path:key -> 56 | parse:(Yojson.Safe.json -> 57 | ('a, string) Ppx_deriving_yojson_runtime.Result.result) -> 58 | ('a, 59 | [> `Storage of [> Error.common]]) 60 | Deferred_result.t 61 | end 62 | 63 | -------------------------------------------------------------------------------- /src/test/client_server.ml: -------------------------------------------------------------------------------- 1 | 2 | open Nonstd 3 | module String = Sosa.Native_string 4 | 5 | let test_out fmt = 6 | ksprintf (fun s -> printf ">>>>> %s\n%!" s) fmt 7 | 8 | let failf fmt = 9 | ksprintf (fun s -> Lwt.fail (Failure s)) fmt 10 | 11 | let command ~bin args : Lwt_process.command = 12 | (bin, Array.of_list (bin :: args)) 13 | 14 | let coclobas args : Lwt_process.command = 15 | command ~bin:"./coclobas.byte" args 16 | 17 | let make_url port path = 18 | sprintf "http://localhost:%d/%s" port path 19 | 20 | let curl ?post ~port path = 21 | command ~bin:"curl" ( 22 | [make_url port path] 23 | @ Option.value_map ~default:[] post ~f:(fun d -> 24 | ["--data-binary"; d]) 25 | ) 26 | 27 | let rec curl_status_until_ready ~port acc = 28 | let open Lwt in 29 | let curl_status = 30 | Lwt_process.open_process_in (curl ~port "status") 31 | ~stderr:`Dev_null 32 | in 33 | Lwt_io.read_lines curl_status#stdout |> Lwt_stream.to_list 34 | >>= fun curl_lines -> 35 | begin match curl_lines with 36 | | ["Ready"] -> 37 | test_out "curl_status_until_ready: %s → Ready" 38 | (String.concat ~sep:" → " acc); 39 | return () 40 | | ["Initializing"] -> 41 | Lwt_unix.sleep 1. 42 | >>= fun () -> 43 | curl_status_until_ready ~port ("Initializing" :: acc) 44 | | other -> 45 | failf "Curl /status: %s" (String.concat ~sep:"." 
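(* Note that [curl] above only builds an [Lwt_process.command]; for example
   [curl ~port:8999 "status"] evaluates to
     ("curl", [| "curl"; "http://localhost:8999/status" |])
   (the port value is illustrative). *)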
curl_lines); 46 | end 47 | 48 | module Coclojob = Coclobas.Job.Specification 49 | 50 | let submit_job ~port how job = 51 | let open Lwt in 52 | match how with 53 | | `Curl -> 54 | let post = Coclojob.to_yojson job |> Yojson.Safe.pretty_to_string ~std:true in 55 | let process = 56 | Lwt_process.open_process_in 57 | ~stderr:`Dev_null 58 | (curl ~port ~post "job/submit") 59 | in 60 | Lwt_io.read_lines process#stdout |> Lwt_stream.to_list 61 | >>= fun lines -> 62 | test_out "curl_submit_job: %s %s" 63 | (Coclojob.show job) (String.concat ~sep:", " lines); 64 | return (List.hd_exn lines) 65 | | `Client -> 66 | Coclobas.Client.(submit_job (make (make_url port "")) job 67 | >>= fun res -> 68 | match res with 69 | | `Ok id -> return id 70 | | `Error (`Client c) -> failf "Client error: %s" (Error.to_string c) 71 | ) 72 | 73 | let curl_submit_job job = submit_job `Curl job 74 | 75 | let get_status how ids ~port = 76 | let open Lwt in 77 | begin match how with 78 | | `Curl -> 79 | let process = 80 | Lwt_process.open_process_in ~stderr:`Dev_null 81 | (ksprintf (curl ~port) "job/state?%s" 82 | (List.map ids ~f:(sprintf "id=%s") |> String.concat ~sep:"&")) 83 | in 84 | Lwt_io.read_lines process#stdout |> Lwt_stream.to_list 85 | | `Client -> 86 | Coclobas.Client.( 87 | get_job_states 88 | (make (make_url port "")) 89 | ids 90 | >>= fun res -> 91 | match res with 92 | | `Ok stats -> 93 | return (List.map stats ~f:(fun (id, st) -> 94 | sprintf "%s: %s" id Coclobas.Job.(Status.show (status st)))) 95 | | `Error (`Client c) -> failf "Client error: %s" (Error.to_string c) 96 | ) 97 | end 98 | >>= fun lines -> 99 | test_out "get_statuses %s: %s" 100 | (String.concat ~sep:", " ids) 101 | (String.concat ~sep:"\n" lines); 102 | return () 103 | 104 | let curl_get_description ids ~port = 105 | let open Lwt in 106 | let process = 107 | Lwt_process.open_process_in ~stderr:`Dev_null 108 | (ksprintf (curl ~port) "job/describe?%s" 109 | (List.map ids ~f:(sprintf "id=%s") |> String.concat ~sep:"&")) 110 | in 111 | Lwt_io.read_lines process#stdout |> Lwt_stream.to_list 112 | >>= fun lines -> 113 | test_out "curl_descr %s: %s" 114 | (String.concat ~sep:", " ids) 115 | (String.concat ~sep:"\n" lines); 116 | return () 117 | 118 | let curl_get_logs ids ~port = 119 | let open Lwt in 120 | let process = 121 | Lwt_process.open_process_in ~stderr:`Dev_null 122 | (ksprintf (curl ~port) "job/logs?%s" 123 | (List.map ids ~f:(sprintf "id=%s") |> String.concat ~sep:"&")) 124 | in 125 | Lwt_io.read_lines process#stdout |> Lwt_stream.to_list 126 | >>= fun lines -> 127 | test_out "curl_logs %s: %s" 128 | (String.concat ~sep:", " ids) 129 | (String.concat ~sep:"\n" lines); 130 | return () 131 | 132 | let curl_kill ids ~port = 133 | let open Lwt in 134 | let process = 135 | Lwt_process.open_process_in ~stderr:`Dev_null 136 | (ksprintf (curl ~port) "job/kill?%s" 137 | (List.map ids ~f:(sprintf "id=%s") |> String.concat ~sep:"&")) 138 | in 139 | Lwt_io.read_lines process#stdout |> Lwt_stream.to_list 140 | >>= fun lines -> 141 | test_out "curl_kill %s: %s" 142 | (String.concat ~sep:", " ids) 143 | (String.concat ~sep:"\n" lines); 144 | return () 145 | 146 | 147 | let make_test_job ~kind cmd = 148 | let image = "hammerlab/keredofi:epidisco-dev" in 149 | let open Coclobas in 150 | match kind with 151 | | `Aws_batch_queue -> 152 | Aws_batch_job.Specification.make ~image cmd |> Job.Specification.aws_batch 153 | | `GCloud_kubernetes -> 154 | Kube_job.Specification.make ~image cmd |> Job.Specification.kubernetes 155 | | `Local_docker -> 156 | 
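(* All three branches wrap the same [~image] and command; only the
   specification constructor changes, so the test suite can drive any
   backend through one code path. *)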
Local_docker_job.Specification.make ~image cmd |> Job.Specification.local_docker 157 | 158 | let job_with_nfs () = 159 | begin try 160 | let mount, witness = 161 | Sys.getenv "VALID_NFS_MOUNT" 162 | |> String.split ~on:(`Character ',') 163 | |> function 164 | | host :: path :: witness :: point :: [] -> 165 | Coclobas.Kube_job.Specification.Nfs_mount.make 166 | ~host ~path ~point (), 167 | sprintf "%s/%s" point witness 168 | | _ -> failwith "can't parse" 169 | in 170 | Some ( 171 | Coclojob.kubernetes 172 | (Coclobas.Kube_job.Specification.make 173 | ~image:"ubuntu" 174 | ~volume_mounts:[`Nfs mount] 175 | ["ls"; "-la"; witness]) 176 | ) 177 | with _ -> None 178 | end 179 | 180 | type test_config = { 181 | root: string; 182 | port: int [@default 22822]; 183 | } [@@deriving cmdliner] 184 | 185 | let () = 186 | let main {root; port} = 187 | Lwt_main.run Lwt.( 188 | let server_process = 189 | Lwt_process.open_process_none 190 | (coclobas ["start-server"; "--root"; root; 191 | "--port"; Int.to_string port]) 192 | in 193 | test_out "Server started"; 194 | Lwt_unix.sleep 1. 195 | >>= fun () -> 196 | Lwt.pick [ 197 | curl_status_until_ready ~port []; 198 | begin 199 | Lwt_unix.sleep 12. 200 | >>= fun () -> 201 | Lwt.fail_with "curl_status_until_ready timedout" 202 | end; 203 | ] 204 | >>= fun () -> 205 | begin 206 | Pvem_lwt_unix.Deferred_result.( 207 | Coclobas.Command_line.get_cluster ~root 208 | >>| Coclobas.Cluster.kind 209 | ) 210 | >>= function 211 | | `Ok k -> return k 212 | | `Error e -> Coclobas.Error.to_string e |> Lwt.fail_with 213 | end 214 | >>= fun kind -> 215 | curl_submit_job ~port (make_test_job ~kind ["sleep"; "400"]) 216 | >>= fun sleep_42 -> 217 | submit_job `Client ~port (make_test_job ~kind [ 218 | "sh"; "-c"; "whoami ; \ 219 | du -sh /usr ; \ 220 | curl http://169.254.169.254/latest/meta-data/ \ 221 | "]) 222 | >>= fun du_sh_usr -> 223 | get_status ~port `Curl [sleep_42; du_sh_usr] 224 | >>= fun () -> 225 | Lwt_unix.sleep 5. >>= fun () -> 226 | get_status ~port `Client [sleep_42; du_sh_usr] 227 | >>= fun () -> 228 | curl_kill ~port [sleep_42] 229 | >>= fun () -> 230 | Lwt_unix.sleep 5. >>= fun () -> 231 | curl_get_description ~port [sleep_42; du_sh_usr] 232 | >>= fun () -> 233 | Option.value_map ~default:(return ()) (job_with_nfs ()) 234 | ~f:begin fun job -> 235 | curl_submit_job job ~port 236 | >>= fun id -> 237 | get_status `Curl [id] ~port 238 | >>= fun () -> 239 | Lwt_unix.sleep 5. 240 | >>= fun () -> 241 | get_status `Client [id] ~port 242 | end 243 | >>= fun () -> 244 | Lwt_unix.sleep 50. 245 | >>= fun () -> 246 | curl_get_description ~port [sleep_42; du_sh_usr] 247 | >>= fun () -> 248 | curl_get_logs ~port [sleep_42; du_sh_usr] 249 | >>= fun () -> 250 | test_out "Killing server"; 251 | server_process#kill Sys.sigint; 252 | server_process#close 253 | >>= fun _ -> 254 | return () 255 | ) 256 | in 257 | match Cmdliner.Term.(eval ( 258 | (pure main $ test_config_cmdliner_term ()), 259 | info "coclobas-client-server-test" 260 | ~doc:"A test that starts a server and talks to it.")) 261 | with 262 | | `Error _ -> exit 1 263 | | `Ok () 264 | | `Version 265 | | `Help -> exit 0 266 | 267 | -------------------------------------------------------------------------------- /src/test/cocloketrew.ml: -------------------------------------------------------------------------------- 1 | 2 | let `Never_returns = 3 | Printf.printf "Plugin: %s enabled\n%!" 
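(* [Ketrew.Command_line.run_main] never returns normally; matching its
   [`Never_returns] value at the top level makes that explicit. *)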
Coclobas_ketrew_backend.Plugin.name; 4 | Ketrew.Command_line.run_main () 5 | -------------------------------------------------------------------------------- /src/test/ketrew_env_config.ml: -------------------------------------------------------------------------------- 1 | 2 | 3 | let () = 4 | try Topdirs.dir_directory (Sys.getenv "OCAML_TOPLEVEL_PATH") 5 | with Not_found -> ();; 6 | #use "topfind" 7 | #thread 8 | #require "ketrew" 9 | open Nonstd 10 | open Ketrew.Configuration 11 | let debug_level = 2 12 | 13 | let engine = 14 | let database_parameters = 15 | "postgresql://127.0.0.1/?user=postgres&password=kpass" 16 | (* :"/tmp/ketrew/database" *) 17 | in 18 | engine ~database_parameters () 19 | 20 | let env_exn s = 21 | try Sys.getenv s with _ -> ksprintf failwith "Missing environment variable: %S" s 22 | 23 | let port = 24 | env_exn "PORT" |> Int.of_string 25 | |> Option.value_exn ~msg:"$PORT is not an integer" 26 | 27 | let token = (env_exn "AUTH_TOKEN") 28 | let server = 29 | server ~engine 30 | ~authorized_tokens:[ 31 | authorized_token ~name:"From-env" token; 32 | ] 33 | ~return_error_messages:true 34 | ~log_path:"/tmp/ketrew/logs/" 35 | ~command_pipe:"/tmp/ketrew/command.pipe" 36 | (`Tcp port) 37 | 38 | let client = 39 | client ~token (sprintf "http://127.0.0.1:%d" port) 40 | 41 | let () = 42 | output [ 43 | profile "server" (create ~debug_level (server)); 44 | profile "client" (create ~debug_level (client)); 45 | ] 46 | -------------------------------------------------------------------------------- /src/test/workflow_test.ml: -------------------------------------------------------------------------------- 1 | 2 | open Nonstd 3 | 4 | type params = { 5 | port: int [@env "PORT"]; 6 | test_kind: [ `Local_docker | `Kubernetes | `Aws_batch ] 7 | [@enum [ "local-docker", `Local_docker; 8 | "kubernetes", `Kubernetes; 9 | "aws-batch", `Aws_batch ]]; 10 | additional_test: string list; 11 | } [@@deriving cmdliner] 12 | 13 | let main {port; test_kind; additional_test} = 14 | let base_url = sprintf "http://127.0.0.1:%d" port in 15 | let tags more = "coclobas" :: "test" :: more in 16 | let open Ketrew.EDSL in 17 | let of_command expect_tag cmd = 18 | workflow_node without_product 19 | ~name:(sprintf "Coclobas test should %s" expect_tag) 20 | ~tags:(tags ["should-" ^ expect_tag]) 21 | ~make:( 22 | match test_kind with 23 | | `Aws_batch -> 24 | Coclobas_ketrew_backend.Plugin.create 25 | ~base_url 26 | (Coclobas.Aws_batch_job.Specification.make ~image:"ocaml/opam" cmd 27 | |> Coclobas.Job.Specification.aws_batch) 28 | | `Kubernetes -> 29 | Coclobas_ketrew_backend.Plugin.create 30 | ~base_url 31 | (Coclobas.Kube_job.Specification.make ~image:"ubuntu" cmd 32 | |> Coclobas.Job.Specification.kubernetes) 33 | | `Local_docker -> 34 | Coclobas_ketrew_backend.Plugin.create 35 | ~base_url 36 | (Coclobas.Local_docker_job.Specification.make ~image:"ubuntu" cmd 37 | |> Coclobas.Job.Specification.local_docker) 38 | ) 39 | in 40 | let of_program ?image tag p = 41 | let prog_string = Ketrew_pure.Program.to_single_shell_command p in 42 | workflow_node without_product 43 | ~name:(sprintf "Coclotest Program.t (%d B), %s on %s" 44 | (String.length prog_string) 45 | tag 46 | Option.(value ~default:"default" image)) 47 | ~tags:(tags ["should-" ^ tag; "program"]) 48 | ~make:( 49 | match test_kind with 50 | | `Aws_batch -> 51 | Coclobas_ketrew_backend.Plugin.aws_batch_program 52 | ~base_url 53 | ~image:(Option.value ~default:"ocaml/opam" image) p 54 | | `Kubernetes -> 55 | 
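(* As with [of_command] above, one plugin helper per backend; each takes
   ~base_url, an ~image (with a per-backend default), and the Ketrew
   [Program.t] to run. *)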
Coclobas_ketrew_backend.Plugin.kubernetes_program 56 | ~base_url 57 | ~image:(Option.value ~default:"ubuntu" image) p 58 | | `Local_docker -> 59 | Coclobas_ketrew_backend.Plugin.local_docker_program 60 | ~cpus:1.5 61 | ~memory:(`MB 10) 62 | ~base_url 63 | ~image:(Option.value ~default:"ubuntu" image) p 64 | ) 65 | in 66 | let wf = 67 | let node1 = of_command "succeed" ["ls"; "-la"] in 68 | let node2 = of_command "fail" ["exit"; "1"] in 69 | let node3 = 70 | workflow_node without_product 71 | ~name:"Coclobas test uses secret" 72 | ~tags:(tags ["succeeds"; "secret"]) 73 | ~make:( 74 | Coclobas_ketrew_backend.Plugin.create 75 | ~base_url 76 | (Coclobas.Job.Specification.kubernetes 77 | Coclobas.Kube_job.Specification.( 78 | let path = "/ketrewkube/hello-world" in 79 | let cool_file = 80 | File_contents_mount.make ~path "Hello world!" in 81 | make ~image:"ubuntu" 82 | ~volume_mounts:[`Constant cool_file] 83 | ["bash"; "-c"; 84 | sprintf "ls -la %s ;\ 85 | cat %s " 86 | (Filename.dirname path) path] 87 | )) 88 | ) 89 | in 90 | let node4 = of_program "succeed" Program.( 91 | chain [ 92 | shf "whoami"; 93 | shf "hostname"; 94 | shf "echo \"ketrew playground: $KETREW_PLAYGROUND\""; 95 | shf "sleep 60"; 96 | ] 97 | ) in 98 | let node4big = of_program "succeed" Program.( 99 | chain ( 100 | List.init 100 ~f:(fun i -> shf "echo 'This is the command %d'" i) 101 | ) 102 | ) in 103 | let node5 = 104 | workflow_node without_product 105 | ~name:"Coclobas test local-docker" 106 | ~tags:(tags ["succeeds"; "local-docker"]) 107 | ~make:( 108 | Coclobas_ketrew_backend.Plugin.create 109 | ~base_url 110 | (Coclobas.Job.Specification.local_docker 111 | Coclobas.Local_docker_job.Specification.( 112 | make ~image:"ubuntu" 113 | ~volume_mounts:[`Local ("/usr/bin", "/hostusrbin")] 114 | ["bash"; "-c"; sprintf "echo sleepin ; sleep 120 ; ls -la /hostusrbin"] 115 | )) 116 | ) 117 | in 118 | let node6 = 119 | workflow_node without_product 120 | ~name:"Coclobas local-docker wrong image" 121 | ~tags:(tags ["fails"; "local-docker"]) 122 | ~make:( 123 | Coclobas_ketrew_backend.Plugin.create 124 | ~base_url 125 | (Coclobas.Job.Specification.local_docker 126 | Coclobas.Local_docker_job.Specification.( 127 | make ~image:"ubuntuijdeidejiije/djedidjede" 128 | ~volume_mounts:[`Local ("/usr/bin", "/hostusrbin")] 129 | ["bash"; "-c"; sprintf "ls -la /hostusrbin"] 130 | )) 131 | ) 132 | in 133 | let node7 = 134 | workflow_node without_product 135 | ~name:"Coclobas local-docker with program" 136 | ~tags:(tags ["succeeds"; "local-docker"]) 137 | ~make:( 138 | Coclobas_ketrew_backend.Plugin.local_docker_program 139 | ~base_url ~image:"ubuntu" 140 | ~volume_mounts:[`Local ("/usr/bin", "/hostusrbin")] 141 | Program.( 142 | exec ["find"; "/hostusrbin"] 143 | ) 144 | ) 145 | in 146 | let additional = 147 | match additional_test with 148 | | [] -> [] 149 | | image :: cmds -> 150 | [ 151 | depends_on (of_program ~image "additional" 152 | Program.(chain (List.map ~f:sh cmds))) 153 | ] 154 | in 155 | let kubes = [ 156 | depends_on node1; 157 | depends_on node2; 158 | depends_on node3; 159 | depends_on node4; 160 | depends_on node4big; 161 | ] @ additional in 162 | let locals = [ 163 | depends_on node1; 164 | depends_on node2; 165 | depends_on node4; 166 | depends_on node4big; 167 | depends_on node5; 168 | depends_on node6; 169 | depends_on node7; 170 | ] @ additional in 171 | let aws = [ 172 | depends_on node1; 173 | depends_on node2; 174 | depends_on node4; 175 | depends_on node4big; 176 | ] @ additional in 177 | let edges = 178 | match 
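(* The Kubernetes run additionally exercises node3 (the secret-file mount),
   the local-docker run adds node5–node7, and the AWS run keeps only the
   backend-agnostic nodes. *)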
test_kind with 179 | | `Kubernetes -> kubes 180 | | `Local_docker -> locals 181 | | `Aws_batch -> aws in 182 | workflow_node without_product 183 | ~tags:(tags ["fails"; "toplevel"]) 184 | ~name:"Coclobas test workflow" 185 | ~edges 186 | in 187 | Ketrew.Client.submit_workflow wf 188 | 189 | let () = 190 | match 191 | Cmdliner.Term.(eval 192 | (pure main $ params_cmdliner_term (), info "test-workflow")) 193 | with 194 | | `Error _ -> exit 1 195 | | `Ok () 196 | | `Version 197 | | `Help -> exit 0 -------------------------------------------------------------------------------- /tools/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # We want `ocaml`, `opam`, and the `biokepi` user: 2 | FROM hammerlab/biokepi-run 3 | 4 | # `opam` is the user with `sudo` powers, and the local `opam-repository` 5 | USER opam 6 | ENV HOME /home/opam 7 | WORKDIR /home/opam 8 | 9 | ENV CLOUDSDK_CORE_DISABLE_PROMPTS true 10 | RUN bash -c 'curl https://sdk.cloud.google.com | bash' 11 | ENV PATH "/home/opam/google-cloud-sdk/bin/:${PATH}" 12 | RUN gcloud components install kubectl 13 | 14 | RUN sudo apt-get install -y python-pip python-dev build-essential 15 | RUN sudo pip install --upgrade google-api-python-client 16 | RUN sudo wget https://raw.githubusercontent.com/cioc/gcloudnfs/master/gcloudnfs -O/usr/bin/gcloudnfs 17 | RUN sudo chmod a+rx /usr/bin/gcloudnfs 18 | 19 | RUN sudo apt-get install -y zlib1g-dev screen nfs-common graphviz 20 | RUN opam install --yes tlstunnel 21 | RUN opam pin --yes add solvuu-build https://github.com/solvuu/solvuu-build.git 22 | RUN opam pin --yes add coclobas https://github.com/hammerlab/coclobas.git 23 | 24 | COPY please.sh /usr/bin/ 25 | RUN sudo chmod 777 /usr/bin/please.sh 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /tools/docker/README.md: -------------------------------------------------------------------------------- 1 | Docker Container To Play With Coclobas 2 | ====================================== 3 | 4 | 5 | Usage 6 | ----- 7 | 8 | Get the container ready: 9 | 10 | sudo docker pull hammerlab/coclobas 11 | sudo mkdir -p /tmp/coclo 12 | sudo chmod 777 /tmp/coclo 13 | sudo docker run -it -p 443:443 -v /tmp/coclo:/coclo --privileged hammerlab/coclobas bash 14 | 15 | - `--privileged` is for NFS mounting 16 | - `-p 443:443` is to pass the port 443 to the container 17 | 18 | Put your `configuration.env` in `/tmp/coclo` (or `/coclo` if you edit it from 19 | the container), and then if all goes well you can mount & start everything with: 20 | 21 | cd /coclo/ 22 | please.sh ./configuration.env start_all 23 | 24 | 25 | 26 | Warnings 27 | -------- 28 | 29 | The environment variable `CLOUDSDK_CORE_DISABLE_PROMPTS` is set to `true`; if 30 | you want to get back the usual `gcloud` interactiveness you may want to: 31 | 32 | unset CLOUDSDK_CORE_DISABLE_PROMPTS 33 | -------------------------------------------------------------------------------- /tools/docker/biokepi_machine.ml: -------------------------------------------------------------------------------- 1 | 2 | #use "topfind";; 3 | #thread 4 | #require "coclobas.ketrew_backend,biokepi";; 5 | 6 | open Nonstd 7 | module String = Sosa.Native_string 8 | let (//) = Filename.concat 9 | 10 | let env_exn s = 11 | try Sys.getenv s with _ -> 12 | ksprintf failwith "Missing environment variable %S" s 13 | 14 | let work_dir = 15 | env_exn "BIOKEPI_WORK_DIR" 16 | 17 | let install_tools_path = 18 | try env_exn "INSTALL_TOOLS_PATH" 19 | with 
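(* Configuration pattern used below: mandatory variables go through
   [env_exn] and fail loudly; optional ones fall back to defaults derived
   from [work_dir]. *)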
_ -> work_dir // "toolkit" 20 | 21 | let pyensembl_cache_dir = 22 | try env_exn "PYENSEMBL_CACHE_DIR" 23 | with _ -> work_dir // "pyensembl-cache" 24 | 25 | let reference_genomes_path = 26 | try env_exn "REFERENCE_GENOMES_PATH" 27 | with _ -> work_dir // "reference-genome" 28 | 29 | let allow_daemonize = 30 | try env_exn "ALLOW_DAEMONIZE" = "true" 31 | with _ -> false 32 | 33 | let image = 34 | try env_exn "DOCKER_IMAGE" 35 | with _ -> "hammerlab/biokepi-run" 36 | 37 | let env_exn_tool_loc s tool = 38 | try (`Wget (Sys.getenv s)) with _ -> 39 | `Fail (sprintf "No location provided for %s" tool) 40 | 41 | let gatk_jar_location () = env_exn_tool_loc "GATK_JAR_URL" "GATK" 42 | let mutect_jar_location () = env_exn_tool_loc "MUTECT_JAR_URL" "MuTect" 43 | let netmhc_tool_locations () = Biokepi.Setup.Netmhc.({ 44 | netmhc=env_exn_tool_loc "NETMHC_TARBALL_URL" "NetMHC"; 45 | netmhcpan=env_exn_tool_loc "NETMHCPAN_TARBALL_URL" "NetMHCpan"; 46 | pickpocket=env_exn_tool_loc "PICKPOCKET_TARBALL_URL" "PickPocket"; 47 | netmhccons=env_exn_tool_loc "NETMHCCONS_TARBALL_URL" "NetMHCcons"; 48 | }) 49 | 50 | let volume_mounts = 51 | env_exn "NFS_MOUNTS" 52 | |> String.split ~on:(`Character ':') 53 | |> List.map ~f:(fun csv -> 54 | String.split ~on:(`Character ',') csv 55 | |> begin function 56 | | host :: path :: witness :: point :: [] -> 57 | `Nfs ( 58 | Coclobas.Kube_job.Specification.Nfs_mount.make 59 | ~host ~path ~point ()) 60 | | other -> 61 | ksprintf failwith "Wrong format for NFS_MOUNTS: %S" csv 62 | end) 63 | 64 | let name = "Coclomachine" 65 | 66 | let biokepi_machine = 67 | let host = Ketrew.EDSL.Host.parse "/tmp/KT-coclomachine/" in 68 | let max_processors = 7 in 69 | let run_program ?name ?(requirements = []) p = 70 | let open Ketrew.EDSL in 71 | let how = 72 | if 73 | (List.mem ~set:requirements `Quick_run 74 | || List.mem ~set:requirements `Internet_access) 75 | && allow_daemonize 76 | then `On_server_node 77 | else `Submit_to_coclobas 78 | in 79 | let with_umask prog = Program.(sh "umask 000" && sh "whoami" && sh "groups" && prog) in 80 | match how with 81 | | `On_server_node -> 82 | daemonize ~host ~using:`Python_daemon (with_umask p) 83 | | `Submit_to_coclobas -> 84 | Coclobas_ketrew_backend.Plugin.run_program 85 | ~base_url:"http://127.0.0.1:8082" 86 | ~image 87 | ~volume_mounts 88 | Program.( 89 | (* sh "sudo mkdir -m 777 -p /cloco-kube/playground" && *) 90 | sh "echo User" && sh "whoami" && 91 | sh "echo Host" && sh "hostname" && 92 | sh "echo Machine" && sh "uname -a" && 93 | p |> with_umask) 94 | in 95 | let open Biokepi.Setup.Download_reference_genomes in 96 | let toolkit = 97 | Biokepi.Setup.Tool_providers.default_toolkit () 98 | ~host 99 | ~install_tools_path 100 | ~run_program 101 | ~gatk_jar_location 102 | ~mutect_jar_location 103 | ~netmhc_tool_locations in 104 | Biokepi.Machine.create name 105 | ~pyensembl_cache_dir 106 | ~max_processors 107 | ~get_reference_genome:(fun name -> 108 | Biokepi.Setup.Download_reference_genomes.get_reference_genome name 109 | ~toolkit 110 | ~host ~run_program 111 | ~destination_path:reference_genomes_path) 112 | ~host 113 | ~toolkit 114 | ~run_program 115 | ~work_dir:(work_dir // "work") 116 | -------------------------------------------------------------------------------- /tools/docker/configuration.env: -------------------------------------------------------------------------------- 1 | 2 | ## Template for Configuring Ketrew with Coclobas scripts 3 | 4 | ## Set the name of the cluster you want to manage: 5 | # export CLUSTER_NAME=kocluster 6 | # 
(please change it: use a name that implies who you are / something recognizable) 7 | 8 | ## Set the gcloud zone if not already set 9 | if [[ -z "$GCLOUD_ZONE" ]] ; then 10 | export GCLOUD_ZONE="us-east1-c" 11 | fi 12 | # see also 13 | # gcloud config set compute/zone $GCLOUD_ZONE 14 | 15 | ## Choose an authentication token for the Ketrew server: 16 | # 17 | export TOKEN=deijdsleijde9e83989d0300 18 | 19 | 20 | ## Number of compute nodes in the deployed cluster: 21 | # 22 | export CLUSTER_MAX_NODES=15 23 | 24 | ## Description of the NFS services that we want the containers to mount 25 | ## 26 | ## A `:`-separated list of `,`-separated 4-tuples 27 | ## 28 | ## Each tuple is: <host>,<path>,<witness>,<mount-point> 29 | ## 30 | ## - <host> is the instance name 31 | ## - <path> is the path on the server that we want to mount 32 | ## - <witness> is a relative path to a file that should exist on the mount (used for verification) 33 | ## - <mount-point> is the path where we mount the NFS service (on all cluster nodes) 34 | 35 | export NFS_MOUNTS=datasets-nfs-vm,/datasets-nfs-storage/,.strato-witness.txt,/nfsdatasets:test-nfs-server-vm,/test-storage,Hello.md,/nfswork 36 | 37 | ## Variables used in the `biokepi_machine.ml` script: 38 | 39 | ## Biokepi configuration requires a few directories shared across the cluster: 40 | ## `$BIOKEPI_WORK_DIR` is mandatory; 41 | # export BIOKEPI_WORK_DIR=/nfs-work/biokepi/ 42 | 43 | ## `INSTALL_TOOLS_PATH` is optional (default is `$BIOKEPI_WORK_DIR/toolkit`): 44 | # export INSTALL_TOOLS_PATH=/nfs-constants/biokepi-software/ 45 | 46 | ## `PYENSEMBL_CACHE_DIR` is optional (default is `$BIOKEPI_WORK_DIR/pyensembl-cache`): 47 | # export PYENSEMBL_CACHE_DIR=/nfs-constants/biokepi-pyensemble-cache/ 48 | 49 | ## `REFERENCE_GENOMES_PATH` is optional (default is `$BIOKEPI_WORK_DIR/reference-genome`) 50 | # export REFERENCE_GENOMES_PATH=/nfs-constants/biokepi-ref-genomes/ 51 | 52 | ## `ALLOW_DAEMONIZE` is optional (default: false) 53 | ## if `true` some nodes (such as downloads or moving data around) will run with 54 | ## `daemonize` backend (i.e. on the server/docker container). 55 | # export ALLOW_DAEMONIZE=true 56 | 57 | ## `DOCKER_IMAGE` is optional (default: hammerlab/biokepi-run): 58 | ## The docker image to use for the Kubernetes jobs 59 | # export DOCKER_IMAGE=something/this:that 60 | 61 | ## Usual Biokepi variables used to download Broad/NetMHC software: 62 | # export GATK_JAR_URL="http://example.com/GATK.jar" 63 | # export MUTECT_JAR_URL="http://example.com/Mutect.jar" 64 | # export NETMHC_TARBALL_URL="http://example.com/netMHC-3.4a.Linux.tar.gz" 65 | # export NETMHCPAN_TARBALL_URL="http://example.com/netMHCpan-2.8a.Linux.tar.gz" 66 | # export PICKPOCKET_TARBALL_URL="http://example.com/pickpocket-1.1a.Linux.tar.gz" 67 | # export NETMHCCONS_TARBALL_URL="http://example.com/netMHCcons-1.1a.Linux.tar.gz" 68 | -------------------------------------------------------------------------------- /tools/docker/please.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | usage () { 6 | echo "Usage: bash please.sh <configuration-file> <command>" 7 | echo "" 8 | echo "where <command> may be:" 9 | echo "" 10 | echo "- start_all: mount_all + start screen session with everything" 11 | echo "- mount_all: mount all the NFS mounts" 12 | echo "" 13 | } 14 | 15 | if [ -f "$1" ]; then 16 | export PATH=$PATH:. 17 | export config_file=$1 18 | . 
$1 19 | shift 20 | else 21 | echo "Argument \$1 should be a configuration file" 22 | usage 23 | exit 2 24 | fi 25 | case $1 in 26 | ""|help|"-h"|"--help" ) 27 | usage 28 | exit 0;; 29 | * ) echo "Let's do this …" ;; 30 | esac 31 | 32 | if [ "$CLUSTER_NAME" = "" ] || [ "$CLUSTER_NAME" = "kocluster" ]; then 33 | echo "Error: \$CLUSTER_NAME is not set or has an unacceptable value: $CLUSTER_NAME" 34 | exit 3 35 | fi 36 | if [ "$GCLOUD_ZONE" = "" ] ; then 37 | echo "Error: \$GCLOUD_ZONE is not set" 38 | exit 3 39 | fi 40 | if [ "$TOKEN" = "" ] ; then 41 | echo "Error: \$TOKEN is not set" 42 | exit 3 43 | fi 44 | if [ "$CLUSTER_MAX_NODES" = "" ] ; then 45 | echo "Error: \$CLUSTER_MAX_NODES is not set" 46 | exit 3 47 | fi 48 | if [ "$CLUSTER_MACHINE_TYPE" = "" ] ; then 49 | CLUSTER_MACHINE_TYPE="n1-highmem-8" 50 | fi 51 | if [ "$NFS_MOUNTS" = "" ] ; then 52 | echo "Warning: \$NFS_MOUNTS is not set, your cluster is going to be useless" 53 | fi 54 | 55 | 56 | mount_nfs () { 57 | local host=$1 58 | local storage=$2 59 | local witness=$3 60 | local mount_point=$4 61 | if [ "$1" = "" ] || [ "$2" = "" ] || [ "$3" = "" ] || [ "$4" = "" ] ; then 62 | echo "mount_nfs: invalid arguments" 63 | return 2 64 | fi 65 | if [ -f $mount_point/$witness ] ; then 66 | echo "mount_nfs: $host:$storage already mounted, cf. $mount_point/$witness" 67 | return 68 | fi 69 | sudo mkdir -p $mount_point 70 | sudo mount -t nfs $host:$storage $mount_point 71 | #sudo chmod -R 777 $mount_point 72 | cat $mount_point/$witness 73 | } 74 | mount_all () { 75 | for mo in `echo "$NFS_MOUNTS" | sed 's/:/\n/g'` ; do 76 | arguments=$(echo $mo | sed 's/,/ /g') 77 | echo "--> Mounting: $arguments" 78 | mount_nfs $arguments 79 | done 80 | } 81 | 82 | # This is not used any more; kept around for a couple of commits … 83 | make_biokepi_user(){ 84 | sudo sh -c "adduser --uid 20042 --disabled-password --gecos '' biokepi && \ 85 | passwd -l biokepi && \ 86 | chown -R biokepi:biokepi /home/biokepi" 87 | } 88 | 89 | start_coclobas () { 90 | root=_cocloroot 91 | ccb=coclobas 92 | $ccb config --root $root \ 93 | --cluster-name $CLUSTER_NAME \ 94 | --cluster-zone $GCLOUD_ZONE \ 95 | --max-nodes $CLUSTER_MAX_NODES \ 96 | --machine-type $CLUSTER_MACHINE_TYPE 97 | export COCLOBAS_DEBUG_SECTIONS="/server/loop" 98 | $ccb start-server --root $root --port 8082 99 | } 100 | 101 | tls_config () { 102 | if [ -f _fake_tls/privkey-nopass.pem ] ; then 103 | echo "TLS cert/key already configured" 104 | else 105 | ketrew init --config _fake_tls --self-signed 106 | fi 107 | } 108 | 109 | start_ketrew () { 110 | sudo chmod -R 777 . 
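# For reference, mount_all above turns each `:`-separated tuple of
# $NFS_MOUNTS into positional arguments for mount_nfs; with the
# illustrative value
#   NFS_MOUNTS=nfs-vm,/storage,witness.txt,/mnt/data
# it runs:
#   mount_nfs nfs-vm /storage witness.txt /mnt/data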
111 | sudo chmod -R 777 /tmp/ketrew 112 | ketrew init --config _ketrew_config --port 8080 --debug 1 --with-tok $TOKEN 113 | ketrew_bin=`which coclobas-ketrew` 114 | ketrew_config=_ketrew_config/config.json 115 | ocaml _ketrew_config/configuration.ml > $ketrew_config 116 | sudo chmod -R 777 _ketrew_config 117 | sudo su biokepi -c "KETREW_CONFIG=$ketrew_config $ketrew_bin start -P server" 118 | } 119 | 120 | start_tlstunnel () { 121 | tls_config 122 | tt=`which tlstunnel` 123 | sudo $tt --cert _fake_tls/certificate.pem \ 124 | --key _fake_tls/privkey-nopass.pem \ 125 | --backend 127.0.0.1:8080 --frontend :443 126 | } 127 | 128 | access_rights_on_console () { 129 | sudo chmod 777 /dev/console 130 | } 131 | 132 | start_all () { 133 | mount_all 134 | access_rights_on_console 135 | if [ -f ~/.screenrc ]; then 136 | cat ~/.screenrc > screenrc 137 | else 138 | echo '# Generated screenrc 139 | hardstatus alwayslastline "%c %-w [ %n %t ] %+w" 140 | ' > screenrc 141 | fi 142 | cat >> screenrc <