├── .gitignore ├── .merlin ├── README.md ├── broadcast_queue.ml ├── config.ml ├── stack_ext.ml ├── swim.ml └── unikernel.ml /.gitignore: -------------------------------------------------------------------------------- 1 | _build 2 | log 3 | *.native 4 | *.xe 5 | *.xl 6 | *.in 7 | *.ctf 8 | mir-swim 9 | swim_libvirt.xml 10 | key_gen.ml 11 | main.ml 12 | Makefile 13 | -------------------------------------------------------------------------------- /.merlin: -------------------------------------------------------------------------------- 1 | PKG lwt 2 | PKG mirage 3 | PKG mirage-types 4 | PKG bin_prot 5 | PKG res 6 | PKG core 7 | PKG cstruct 8 | EXT lwt 9 | B _build 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mirage-swim 2 | 3 | This repo is the beginnings of an implementation of the [SWIM protocol](http://www.cs.cornell.edu/~asdas/research/dsn02-SWIM.pdf) for [Mirage](https://mirage.io/). The goal is to provide a scalable membership protocol for clusters of unikernels. 4 | 5 | The intention is that unikernels in a cluster run the the Swim task alongside their actual task. The application task can then query Swim about the state of the cluster, e.g. to issue a request to another node. 6 | 7 | # Usage 8 | 9 | So far, this has only been tested on OSX running only the Swim task (no application task). Run the following to start a node locally: 10 | 11 | ```bash 12 | env NET=direct DHCP=true mirage configure --unix 13 | make 14 | sudo ./mir-swim 15 | ``` 16 | 17 | (sudo is required to access vmnet) 18 | 19 | You can then run the following in another shell to join the cluster: 20 | 21 | ```bash 22 | sudo ./mir-swim --join_ip=[the ip assigned to the previous node] 23 | ``` 24 | 25 | You should see debug information in the console. 26 | 27 | # Todo 28 | 29 | - Testing and verification 30 | - [SWIM protocol improvements](https://github.com/hashicorp/memberlist#changes-from-swim) 31 | - Register metadata about nodes 32 | - Interface to application task 33 | -------------------------------------------------------------------------------- /broadcast_queue.ml: -------------------------------------------------------------------------------- 1 | open Core_kernel.Std 2 | 3 | (* Broadcast_queue.Make keeps track of changes that need to be broadcast to the 4 | cluster. *) 5 | module Make(E: sig 6 | type t 7 | type id 8 | 9 | val size : t -> int 10 | val skip : t -> id -> bool 11 | val invalidates : t -> t -> bool 12 | end) = struct 13 | 14 | type t = (int * E.t) list ref 15 | 16 | let create () = 17 | ref [] 18 | 19 | let select_broadcasts t transmit_limit addr = 20 | let select memo elem = 21 | let transmit, elems, bytes_left = memo in 22 | let transmit_count, broadcast = elem in 23 | let size = E.size broadcast in 24 | if transmit_count > transmit_limit then 25 | (transmit, elems, bytes_left) 26 | else if size > bytes_left || E.skip broadcast addr then 27 | (transmit, elem::elems, bytes_left) 28 | else 29 | (broadcast::transmit, (transmit_count+1, broadcast)::elems, bytes_left - size) 30 | in 31 | let transmit, t', _ = List.fold_left !t ~f:select ~init:([], [], 65507) in 32 | t := List.sort ~cmp:(fun e e' -> Int.compare (fst e) (fst e')) t'; 33 | transmit 34 | 35 | let enqueue_broadcast t broadcast = 36 | if not (List.exists !t ~f:(fun (_, broadcast') -> broadcast = broadcast')) then begin 37 | let q = List.filter !t ~f:(fun (_, broadcast') -> not (E.invalidates broadcast broadcast')) in 38 | t := ((0, broadcast)::q) 39 | end 40 | end 41 | -------------------------------------------------------------------------------- /config.ml: -------------------------------------------------------------------------------- 1 | open Mirage 2 | 3 | let join_ip = 4 | let doc = Key.Arg.info ~doc:"IP of other node to join." ["join_ip"] in 5 | Key.(create "join_ip" Arg.(opt string "" doc)) 6 | 7 | let port = 8 | let doc = Key.Arg.info ~doc:"Port for serving requests." ["port"] in 9 | Key.(create "port" Arg.(opt int 53 doc)) 10 | 11 | let tracing = mprof_trace ~size:1000000 () 12 | 13 | let handler = 14 | foreign 15 | ~keys:[ 16 | Key.abstract join_ip; 17 | Key.abstract port 18 | ] 19 | "Unikernel.Main" 20 | (console @-> stackv4 @-> time @-> random @-> job) 21 | 22 | let net = 23 | try match Sys.getenv "NET" with 24 | | "direct" -> `Direct 25 | | "socket" -> `Socket 26 | | _ -> `Direct 27 | with Not_found -> `Direct 28 | 29 | let dhcp = 30 | try match Sys.getenv "DHCP" with 31 | | "" -> false 32 | | _ -> true 33 | with Not_found -> false 34 | 35 | let stack = 36 | match net, dhcp with 37 | | `Direct, true -> direct_stackv4_with_dhcp default_console tap0 38 | | `Direct, false -> direct_stackv4_with_default_ipv4 default_console tap0 39 | | `Socket, _ -> socket_stackv4 default_console [Ipaddr.V4.any] 40 | 41 | let () = 42 | add_to_opam_packages ["res"; "core_kernel"; "bin_prot"; "sexplib"]; 43 | add_to_ocamlfind_libraries ["res"; "core_kernel"; "bin_prot"; "bin_prot.syntax"; "sexplib"; "sexplib.syntax"]; 44 | register "swim" ~tracing [ 45 | handler $ default_console $ stack $ default_time $ default_random; 46 | ] 47 | 48 | -------------------------------------------------------------------------------- /stack_ext.ml: -------------------------------------------------------------------------------- 1 | open V1 2 | open Core_kernel.Std 3 | 4 | (* Make Ipaddr.V4.t compatible with bin_io and sexp *) 5 | module Make(S : STACKV4) = struct 6 | include S 7 | 8 | module Ipv4Binable = Bin_prot.Utils.Make_binable(struct 9 | module Binable = String 10 | type t = Ipaddr.V4.t 11 | 12 | let to_binable = Ipaddr.V4.to_bytes 13 | let of_binable = Ipaddr.V4.of_bytes_exn 14 | end) 15 | 16 | let bin_size_ipv4addr = Ipv4Binable.bin_size_t 17 | let bin_write_ipv4addr = Ipv4Binable.bin_write_t 18 | let bin_read_ipv4addr = Ipv4Binable.bin_read_t 19 | 20 | let ipv4addr_of_sexp t = 21 | t 22 | |> Sexp.to_string 23 | |> Ipaddr.V4.of_string_exn 24 | 25 | let sexp_of_ipv4addr t = 26 | t 27 | |> Ipaddr.V4.to_string 28 | |> Sexp.of_string 29 | end 30 | -------------------------------------------------------------------------------- /swim.ml: -------------------------------------------------------------------------------- 1 | open Lwt 2 | open V1 3 | open V1_LWT 4 | open Core_kernel.Std 5 | 6 | module Make (C : CONSOLE) (S : STACKV4) (T : TIME) (R : RANDOM) = struct 7 | module U = S.UDPV4 8 | module IPV4 = S.IPV4 9 | module S = Stack_ext.Make(S) 10 | 11 | type state = Dead | Alive with bin_io, sexp 12 | 13 | type addr = { 14 | ip : S.ipv4addr; 15 | port : int; 16 | } with bin_io, sexp 17 | 18 | type node = { 19 | addr : addr; 20 | mutable state : state; 21 | } with bin_io, sexp 22 | 23 | type payload = 24 | | Ping 25 | | Ack 26 | | PingReq of addr 27 | with bin_io, sexp 28 | 29 | type message = { 30 | seq_no : int; 31 | payload : payload; 32 | nodes : node list; 33 | } with bin_io, sexp 34 | 35 | (* - Ping timeout should be average or 99th percentile round-trip time 36 | * - Protocol period should be at least three times the round-trip estimate 37 | *) 38 | type config = { 39 | port : int; 40 | protocol_period : int; 41 | round_trip_time : int; 42 | indirect_ping_count : int; 43 | } 44 | 45 | module BQ = Broadcast_queue.Make(struct 46 | type t = node 47 | type id = addr 48 | 49 | let invalidates node node' = 50 | node.addr = node'.addr && node.state <> node'.state 51 | 52 | let skip node addr = 53 | node.addr = addr 54 | 55 | let size = bin_size_node 56 | end) 57 | 58 | type t = { 59 | console : C.t; 60 | net : S.t; 61 | config : config; 62 | transmit_queue : BQ.t; 63 | acks : (int, unit Lwt_condition.t) Hashtbl.Poly.t; 64 | mutable nodes : node list; 65 | mutable seq_no : int; 66 | } 67 | 68 | let create s c ~config = 69 | { console = c; 70 | net = s; 71 | config; 72 | nodes = []; 73 | transmit_queue = BQ.create (); 74 | acks = Hashtbl.Poly.create (); 75 | seq_no = 0; 76 | } 77 | 78 | let debug t s = 79 | C.log t.console s 80 | 81 | let debug_msg t fmt addr msg = 82 | let msg_s = (Sexp.to_string_hum (sexp_of_message msg)) in 83 | let sender_s = Sexp.to_string_hum (S.sexp_of_ipv4addr addr) in 84 | debug t (Printf.sprintf fmt sender_s msg_s) 85 | 86 | let debug_nodes t = 87 | debug t "Nodes:"; 88 | debug t (t.nodes |> <:sexp_of> |> Sexp.to_string_hum) 89 | 90 | let next_seq_no t = 91 | let seq_no = t.seq_no in 92 | t.seq_no <- t.seq_no + 1; 93 | seq_no 94 | 95 | let node_count t = 96 | List.length t.nodes 97 | 98 | let transmit_limit t = 99 | if node_count t = 0 then 100 | 0 101 | else 102 | node_count t 103 | |> Float.of_int 104 | |> log 105 | |> Float.round_up 106 | |> Float.to_int 107 | 108 | let send t ~addr ~payload ~seq_no = 109 | let transmit_limit = transmit_limit t in 110 | let nodes = BQ.select_broadcasts t.transmit_queue transmit_limit addr in 111 | let msg = { seq_no=seq_no; payload; nodes; } in 112 | debug_msg t "< Msg to %s: %s" addr.ip msg; 113 | let size = bin_size_message msg in 114 | let buf = Cstruct.create size in 115 | ignore(bin_write_message ~pos:0 (Cstruct.to_bigarray buf) msg); 116 | let udp = S.udpv4 t.net in 117 | U.write ~source_port:t.config.port ~dest_ip:addr.ip ~dest_port:addr.port udp buf 118 | 119 | let send_ping t ~addr ~seq_no = 120 | send t ~addr ~payload:Ping ~seq_no 121 | 122 | let send_ack t ~addr ~seq_no = 123 | send t ~addr ~payload:Ack ~seq_no 124 | 125 | let send_ping_request t ~addr ~target ~seq_no = 126 | send t ~addr ~payload:(PingReq target) ~seq_no 127 | 128 | let rec wait_ack t seq_no = 129 | let cond = Hashtbl.find_or_add t.acks seq_no ~default:Lwt_condition.create in 130 | Lwt_condition.wait cond 131 | 132 | let wait_ack_timeout t seq_no timeout = 133 | pick [ 134 | (T.sleep (Float.of_int timeout) >> return `Timeout); 135 | (wait_ack t seq_no >> return `Ok) 136 | ] 137 | 138 | let get_node_by_addr t addr = 139 | List.find ~f:(fun n -> n.addr = addr) t.nodes 140 | 141 | let update_node t node = 142 | match (get_node_by_addr t node.addr, node.state) with 143 | | (Some node, Dead) -> 144 | node.state <- Dead; 145 | BQ.enqueue_broadcast t.transmit_queue node 146 | | (None, Alive) -> 147 | t.nodes <- node::t.nodes; 148 | BQ.enqueue_broadcast t.transmit_queue node 149 | | (Some _, Alive) 150 | | (None, Dead) -> () 151 | 152 | let sample_nodes t k exclude = 153 | t.nodes 154 | |> List.filter ~f:(fun n -> n <> exclude) 155 | |> List.permute 156 | |> fun l -> List.take l k 157 | 158 | let probe_node t node = 159 | let seq_no = next_seq_no t in 160 | ignore_result (send_ping t ~addr:node.addr ~seq_no); 161 | match_lwt (wait_ack_timeout t seq_no t.config.round_trip_time) with 162 | | `Ok -> return () 163 | | `Timeout -> 164 | debug t "ACK timeout"; 165 | let helpers = sample_nodes t t.config.indirect_ping_count node in 166 | List.iter helpers (fun helper -> 167 | send_ping_request t ~addr:helper.addr ~target:node.addr ~seq_no 168 | |> ignore_result 169 | ); 170 | let wait_time = t.config.protocol_period - t.config.round_trip_time in 171 | match_lwt wait_ack_timeout t seq_no wait_time with 172 | | `Ok -> return () 173 | | `Timeout -> 174 | debug t "Indirect ping timeout"; 175 | let node = { state = Dead; addr = node.addr } in 176 | update_node t node; 177 | BQ.enqueue_broadcast t.transmit_queue node; 178 | return () 179 | 180 | let rec failure_detection t = 181 | debug_nodes t; 182 | t.nodes <- List.filter t.nodes ~f:(fun node -> node.state = Alive); 183 | t.nodes <- List.permute t.nodes; 184 | (if node_count t = 0 then 185 | T.sleep (Float.of_int (t.config.protocol_period)) 186 | else 187 | Lwt_list.iter_s (fun node -> 188 | if node.state = Dead then 189 | return () 190 | else 191 | join [ 192 | probe_node t node; 193 | T.sleep (Float.of_int (t.config.protocol_period)) 194 | ] 195 | ) t.nodes 196 | ) >> 197 | failure_detection t 198 | 199 | let join t addr = 200 | let seq_no = next_seq_no t in 201 | send_ping t ~addr ~seq_no 202 | 203 | let handle_payload t src_addr msg = 204 | update_node t {state=Alive; addr=src_addr}; 205 | match msg.payload with 206 | | Ping -> 207 | send_ack t ~addr:src_addr ~seq_no:msg.seq_no 208 | | PingReq addr -> 209 | let seq_no = next_seq_no t in 210 | send_ping t ~addr ~seq_no >> 211 | (match_lwt wait_ack_timeout t seq_no t.config.protocol_period with 212 | | `Timeout -> return () 213 | | `Ok -> send_ack t ~addr:src_addr ~seq_no:msg.seq_no) 214 | | Ack -> 215 | match Hashtbl.find t.acks msg.seq_no with 216 | | Some cond -> 217 | Lwt_condition.broadcast cond (); 218 | return () 219 | | None -> 220 | (* Unexpected ACK -- ignore *) 221 | return () 222 | 223 | let callback t ~src ~dst ~src_port buf = 224 | let message = bin_read_message (Cstruct.to_bigarray buf) (ref 0) in 225 | debug_msg t "> Msg from %s: %s" src message; 226 | List.iter message.nodes (update_node t); 227 | let addr = { ip=src; port=src_port } in 228 | handle_payload t addr message 229 | 230 | let listen t = 231 | async (fun () -> failure_detection t); 232 | S.listen_udpv4 t.net ~port:t.config.port (callback t) 233 | end 234 | -------------------------------------------------------------------------------- /unikernel.ml: -------------------------------------------------------------------------------- 1 | open Lwt 2 | open V1 3 | open V1_LWT 4 | open Printf 5 | open Core_kernel.Std 6 | 7 | module Main (C:CONSOLE) (S:STACKV4) (T:TIME) (R:RANDOM) = struct 8 | 9 | module SwimImpl = Swim.Make(C)(S)(T)(R) 10 | 11 | let start c s t r = 12 | let join_ip = Key_gen.join_ip () in 13 | let port = Key_gen.port () in 14 | let swim = SwimImpl.create s c { port = port; protocol_period = 5; round_trip_time = 1; indirect_ping_count = 3 } in 15 | SwimImpl.listen swim; 16 | match join_ip with 17 | | "" -> 18 | C.log c "Starting new cluster"; 19 | S.listen s 20 | | ip -> 21 | C.log c (Printf.sprintf "Attempting to join node: %s" ip); 22 | SwimImpl.join swim { ip = (Ipaddr.V4.of_string_exn ip); port = port } >> 23 | S.listen s 24 | end 25 | --------------------------------------------------------------------------------