├── .gitignore
├── .merlin
├── .ocp-indent
├── AUTHORS
├── LICENSE
├── Makefile
├── README.md
├── dune-project
├── oraft-lwt-extprot-io.opam
├── oraft-lwt-tls.opam
├── oraft-lwt.opam
├── oraft-rsm.opam
├── oraft.opam
├── src
│   ├── dune
│   ├── oraft.ml
│   ├── oraft.mli
│   ├── oraft_lwt.ml
│   ├── oraft_lwt.mli
│   ├── oraft_lwt_conn_wrapper.ml
│   ├── oraft_lwt_conn_wrapper.mli
│   ├── oraft_lwt_extprot_io.ml
│   ├── oraft_lwt_extprot_io.mli
│   ├── oraft_lwt_s.ml
│   ├── oraft_lwt_tls.ml
│   ├── oraft_lwt_tls.mli
│   ├── oraft_proto.proto
│   ├── oraft_proto_rsm.proto
│   ├── oraft_proto_types.proto
│   ├── oraft_rsm.ml
│   ├── oraft_rsm.mli
│   └── oraft_rsm_s.ml
└── test
    ├── dict.ml
    ├── dune
    └── test_DES.ml
/.gitignore: -------------------------------------------------------------------------------- 1 | _build 2 | *.install 3 | **/*.merlin -------------------------------------------------------------------------------- /.merlin: -------------------------------------------------------------------------------- 1 | PKG batteries 2 | PKG cryptokit 3 | PKG lwt.unix 4 | PKG extprot 5 | -------------------------------------------------------------------------------- /.ocp-indent: -------------------------------------------------------------------------------- 1 | # -*- conf -*- 2 | # This is an example configuration file for ocp-indent 3 | # 4 | # Copy to the root of your project with name ".ocp-indent", customise, and 5 | # transparently get consistent indentation on all your ocaml source files. 6 | 7 | # Starting the configuration file with a preset ensures you won't fall back to 8 | # definitions from "~/.ocp/ocp-indent.conf". 9 | # These are `normal`, `apprentice` and `JaneStreet` and set different defaults. 10 | normal 11 | 12 | # 13 | # INDENTATION VALUES 14 | # 15 | 16 | # Number of spaces used in all base cases, for example: 17 | # let foo = 18 | # ^^bar 19 | base = 2 20 | 21 | # Indent for type definitions: 22 | # type t = 23 | # ^^int 24 | type = 2 25 | 26 | # Indent after `let in` (unless followed by another `let`): 27 | # let foo = () in 28 | # ^^bar 29 | in = 2 30 | 31 | # Indent after `match/try with` or `function`: 32 | # match foo with 33 | # ^^| _ -> bar 34 | with = 2 35 | 36 | # Indent for clauses inside a pattern-match (after the arrow): 37 | # match foo with 38 | # | _ -> 39 | # ^^^^bar 40 | # the default is 2, which aligns the pattern and the expression 41 | match_clause = 4 # this is non-default 42 | 43 | # When nesting expressions on the same line, their indentation is in 44 | # some cases stacked, so that it remains correct if you close them one 45 | # at a line. This may lead to large indents in complex code though, so 46 | # this parameter can be used to set a maximum value. Note that it only 47 | # affects indentation after function arrows and opening parens at end 48 | # of line. 49 | # 50 | # for example (left: `none`; right: `4`) 51 | # let f = g (h (i (fun x -> # let f = g (h (i (fun x -> 52 | # x) # x) 53 | # ) # ) 54 | # ) # ) 55 | max_indent = 4 56 | 57 | 58 | # 59 | # INDENTATION TOGGLES 60 | # 61 | 62 | # Whether the `with` parameter should be applied even when in a sub-block. 63 | # Can be `always`, `never` or `auto`. 64 | # if `always`, there are no exceptions 65 | # if `auto`, the `with` parameter is superseded when seen fit (most of the time, 66 | # but not after `begin match` for example) 67 | # if `never`, `with` is only applied if the match block starts a line.
68 | # 69 | # For example, the following is not indented if set to `always`: 70 | # let f = function 71 | # ^^| Foo -> bar 72 | strict_with = never 73 | 74 | # Controls indentation after the `else` keyword. `always` indents after the 75 | # `else` keyword normally, like after `then`. 76 | # If set to `never', the `else` keyword won't indent when followed by a newline. 77 | # `auto` indents after `else` unless in a few "unclosable" cases (`let in`, 78 | # `match`...). 79 | # 80 | # For example, with `strict_else=never`: 81 | # if cond then 82 | # foo 83 | # else 84 | # bar; 85 | # baz 86 | # `never` is discouraged if you may encounter code like this example, 87 | # because it hides the scoping error (`baz` is always executed) 88 | strict_else = always 89 | 90 | # Ocp-indent will normally try to preserve your in-comment indentation, as long 91 | # as it respects the left-margin or starts with `(*\n`. Setting this to `true` 92 | # forces alignment within comments. 93 | strict_comments = false 94 | 95 | # Toggles preference of column-alignment over line indentation for most 96 | # of the common operators and after mid-line opening parentheses. 97 | # 98 | # for example (left: `false'; right: `true') 99 | # let f x = x # let f x = x 100 | # + y # + y 101 | align_ops = true 102 | 103 | # Function parameters are normally indented one level from the line containing 104 | # the function. This option can be used to have them align relative to the 105 | # column of the function body instead. 106 | # if set to `always`, always align below the function 107 | # if `auto`, only do that when seen fit (mainly, after arrows) 108 | # if `never`, no alignment whatsoever 109 | # 110 | # for example (left: `never`; right: `always or `auto) 111 | # match foo with # match foo with 112 | # | _ -> some_fun # | _ -> some_fun 113 | # ^^parameter # ^^parameter 114 | align_params = always 115 | 116 | 117 | # 118 | # SYNTAX EXTENSIONS 119 | # 120 | 121 | # You can also add syntax extensions (as per the --syntax command-line option): 122 | syntax = lwt 123 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Mauricio Fernandez 2 | Vincent Bernardoff 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | "MIT License" 2 | 3 | Copyright (c) 2014-2016 Mauricio Fernandez , 4 | Vincent Bernardoff 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | dune build @install @runtest 3 | 4 | clean: 5 | dune clean 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Implementation of Raft consensus algorithm 3 | 4 | oraft consists of: 5 | 6 | * a core Raft (purely functional) state machine featuring leader election + 7 | log replication, log compaction via snapshotting, cluster membership change 8 | support and efficient linearizable read-only operations. This state machine 9 | is abstracted over the specifics of peer communication (I/O, message 10 | serialization), concurrency and timeouts. (`Oraft` module) 11 | 12 | * a specialization of the above state machine using the Lwt library for 13 | concurrency (`Oraft_lwt` module) 14 | 15 | * a replicated state machine built atop `Oraft_lwt` (`Oraft_rsm` module) 16 | 17 | * a sample distributed dictionary client/server (`dict.ml`) built atop `Oraft_rsm` 18 | 19 | ## Status 20 | 21 | The core state machine has been tested using a discrete event simulator 22 | that simulates node failures, network failures, message loss, random delays, 23 | cluster changes (node deployment and decommissioning)... 24 | 25 | ## Performance 26 | 27 | The sample distributed dictionary has been clocked at rates exceeding 70000 28 | ops/s on a 3-node cluster. There is potential for optimization, both in the 29 | core state machine (by reducing copying and hence GC pressure) and the 30 | communication layer (faster message serialization, command batching). 31 | 32 | ## References 33 | 34 | In Search of an Understandable Consensus Algorithm. Diego Ongaro and John 35 | Ousterhout. Stanford University. (Draft of October 7, 2013).
36 | https://ramcloud.stanford.edu/wiki/download/attachments/11370504/raft.pdf 37 | -------------------------------------------------------------------------------- /dune-project: -------------------------------------------------------------------------------- 1 | (lang dune 1.0) 2 | -------------------------------------------------------------------------------- /oraft-lwt-extprot-io.opam: -------------------------------------------------------------------------------- 1 | opam-version: "2.0" 2 | name: "oraft-lwt-extprot-io" 3 | version: "0.1" 4 | authors: [ 5 | "Mauricio Fernandez " 6 | "Vincent Bernardoff " 7 | ] 8 | maintainer: "Vincent Bernardoff " 9 | homepage: "https://github.com/mfp/oraft" 10 | bug-reports: "https://github.com/mfp/oraft/issues" 11 | dev-repo: "git+https://github.com/mfp/oraft" 12 | doc: "https://mfp.github.io/oraft/doc" 13 | build: [ "dune" "build" "-j" jobs "-p" name "@install" ] 14 | depends: [ 15 | "dune" {build & >= "1.3.0"} 16 | "lwt_ppx" {build & >= "1.2.1"} 17 | "lwt" {>= "4.1.0"} 18 | "extprot" {>= "1.5.0"} 19 | "logs" {>= "0.6.2"} 20 | 21 | "oraft-lwt" {= "0.1"} 22 | ] 23 | synopsis: "Implementation of IO module for `oraft-lwt`" 24 | description: """ 25 | oraft consists of: 26 | 27 | * a core Raft (purely functional) state machine featuring leader election + 28 | log replication, log compaction via snapshotting, cluster membership change 29 | support and efficient linearizable read-only operations. This state machine 30 | is abstracted over the specifics of peer communication (I/O, message 31 | serialization), concurrency and timeouts. (`Oraft` module) 32 | 33 | * a specialization of the above state machine using the Lwt library for 34 | concurrency (`Oraft_lwt` module) 35 | 36 | * a replicated state machine built atop `Oraft_lwt` (`Oraft_rsm` module) 37 | 38 | * a sample distributed dictionary client/server (`dict.ml`) built atop `Oraft_rsm` 39 | 40 | ## Status 41 | 42 | The core state machine has been tested using a discrete event simulator 43 | that simulates node failures, network failures, message loss, random delays, 44 | cluster changes (node deployment and decommissioning)... 45 | 46 | ## Performance 47 | 48 | The sample distributed dictionary has been clocked at rates exceeding 70000 49 | ops/s on a 3-node cluster. There is potential for optimization, both in the 50 | core state machine (by decreasing GC pressure via lesser copying) and the 51 | communication layer (faster message serialization, command batching). 52 | 53 | ## References 54 | 55 | In Search of an Understandable Consensus Algorithm. Diego Ongaro and John 56 | Ousterhout. Stanford University. (Draft of October 7, 2013). 
57 | https://ramcloud.stanford.edu/wiki/download/attachments/11370504/raft.pdf""" 58 | -------------------------------------------------------------------------------- /oraft-lwt-tls.opam: -------------------------------------------------------------------------------- 1 | opam-version: "2.0" 2 | name: "oraft-lwt-tls" 3 | version: "0.1" 4 | authors: [ 5 | "Mauricio Fernandez " 6 | "Vincent Bernardoff " 7 | ] 8 | maintainer: "Vincent Bernardoff " 9 | homepage: "https://github.com/mfp/oraft" 10 | bug-reports: "https://github.com/mfp/oraft/issues" 11 | dev-repo: "git+https://github.com/mfp/oraft" 12 | doc: "https://mfp.github.io/oraft/doc" 13 | build: [ "dune" "build" "-j" jobs "-p" name "@install" ] 14 | depends: [ 15 | "dune" {build & >= "1.3.0"} 16 | "lwt_ppx" {build & >= "1.2.1"} 17 | "tls" {>= "0.9.2"} 18 | 19 | "oraft-lwt" {= "0.1"} 20 | ] 21 | synopsis: "TLS connection wrappers for `oraft-lwt`" 22 | description: """ 23 | oraft consists of: 24 | 25 | * a core Raft (purely functional) state machine featuring leader election + 26 | log replication, log compaction via snapshotting, cluster membership change 27 | support and efficient linearizable read-only operations. This state machine 28 | is abstracted over the specifics of peer communication (I/O, message 29 | serialization), concurrency and timeouts. (`Oraft` module) 30 | 31 | * a specialization of the above state machine using the Lwt library for 32 | concurrency (`Oraft_lwt` module) 33 | 34 | * a replicated state machine built atop `Oraft_lwt` (`Oraft_rsm` module) 35 | 36 | * a sample distributed dictionary client/server (`dict.ml`) built atop `Oraft_rsm` 37 | 38 | ## Status 39 | 40 | The core state machine has been tested using a discrete event simulator 41 | that simulates node failures, network failures, message loss, random delays, 42 | cluster changes (node deployment and decommissioning)... 43 | 44 | ## Performance 45 | 46 | The sample distributed dictionary has been clocked at rates exceeding 70000 47 | ops/s on a 3-node cluster. There is potential for optimization, both in the 48 | core state machine (by decreasing GC pressure via lesser copying) and the 49 | communication layer (faster message serialization, command batching). 50 | 51 | ## References 52 | 53 | In Search of an Understandable Consensus Algorithm. Diego Ongaro and John 54 | Ousterhout. Stanford University. (Draft of October 7, 2013). 
55 | https://ramcloud.stanford.edu/wiki/download/attachments/11370504/raft.pdf""" 56 | -------------------------------------------------------------------------------- /oraft-lwt.opam: -------------------------------------------------------------------------------- 1 | opam-version: "2.0" 2 | name: "oraft-lwt" 3 | version: "0.1" 4 | authors: [ 5 | "Mauricio Fernandez " 6 | "Vincent Bernardoff " 7 | ] 8 | maintainer: "Vincent Bernardoff " 9 | homepage: "https://github.com/mfp/oraft" 10 | bug-reports: "https://github.com/mfp/oraft/issues" 11 | dev-repo: "git+https://github.com/mfp/oraft" 12 | doc: "https://mfp.github.io/oraft/doc" 13 | build: [ "dune" "build" "-j" jobs "-p" name "@install" ] 14 | depends: [ 15 | "dune" {build & >= "1.3.0"} 16 | "lwt_ppx" {build & >= "1.2.1"} 17 | "lwt" {>= "4.1.0"} 18 | "logs" {>= "0.6.2"} 19 | 20 | "oraft" {= "0.1"} 21 | ] 22 | synopsis: "Lwt specialization of `oraft`" 23 | description: """ 24 | oraft consists of: 25 | 26 | * a core Raft (purely functional) state machine featuring leader election + 27 | log replication, log compaction via snapshotting, cluster membership change 28 | support and efficient linearizable read-only operations. This state machine 29 | is abstracted over the specifics of peer communication (I/O, message 30 | serialization), concurrency and timeouts. (`Oraft` module) 31 | 32 | * a specialization of the above state machine using the Lwt library for 33 | concurrency (`Oraft_lwt` module) 34 | 35 | * a replicated state machine built atop `Oraft_lwt` (`Oraft_rsm` module) 36 | 37 | * a sample distributed dictionary client/server (`dict.ml`) built atop `Oraft_rsm` 38 | 39 | ## Status 40 | 41 | The core state machine has been tested using a discrete event simulator 42 | that simulates node failures, network failures, message loss, random delays, 43 | cluster changes (node deployment and decommissioning)... 44 | 45 | ## Performance 46 | 47 | The sample distributed dictionary has been clocked at rates exceeding 70000 48 | ops/s on a 3-node cluster. There is potential for optimization, both in the 49 | core state machine (by decreasing GC pressure via lesser copying) and the 50 | communication layer (faster message serialization, command batching). 51 | 52 | ## References 53 | 54 | In Search of an Understandable Consensus Algorithm. Diego Ongaro and John 55 | Ousterhout. Stanford University. (Draft of October 7, 2013). 
56 | https://ramcloud.stanford.edu/wiki/download/attachments/11370504/raft.pdf""" 57 | -------------------------------------------------------------------------------- /oraft-rsm.opam: -------------------------------------------------------------------------------- 1 | opam-version: "2.0" 2 | name: "oraft-rsm" 3 | version: "0.1" 4 | authors: [ 5 | "Mauricio Fernandez " 6 | "Vincent Bernardoff " 7 | ] 8 | maintainer: "Vincent Bernardoff " 9 | homepage: "https://github.com/mfp/oraft" 10 | bug-reports: "https://github.com/mfp/oraft/issues" 11 | dev-repo: "git+https://github.com/mfp/oraft" 12 | doc: "https://mfp.github.io/oraft/doc" 13 | build: [ "dune" "build" "-j" jobs "-p" name "@install" ] 14 | depends: [ 15 | "dune" {build & >= "1.3.0"} 16 | "lwt_ppx" {build & >= "1.2.1"} 17 | 18 | "oraft-lwt" {= "0.1"} 19 | "oraft-lwt-tls" {= "0.1"} 20 | "oraft-lwt-extprot-io" {= "0.1"} 21 | ] 22 | synopsis: "Replicated State Machine built atop `oraft-lwt`" 23 | description: """ 24 | oraft consists of: 25 | 26 | * a core Raft (purely functional) state machine featuring leader election + 27 | log replication, log compaction via snapshotting, cluster membership change 28 | support and efficient linearizable read-only operations. This state machine 29 | is abstracted over the specifics of peer communication (I/O, message 30 | serialization), concurrency and timeouts. (`Oraft` module) 31 | 32 | * a specialization of the above state machine using the Lwt library for 33 | concurrency (`Oraft_lwt` module) 34 | 35 | * a replicated state machine built atop `Oraft_lwt` (`Oraft_rsm` module) 36 | 37 | * a sample distributed dictionary client/server (`dict.ml`) built atop 38 | `Oraft_rsm` 39 | 40 | ## Status 41 | 42 | The core state machine has been tested using a discrete event simulator 43 | that simulates node failures, network failures, message loss, random delays, 44 | cluster changes (node deployment and decommissioning)... 45 | 46 | ## Performance 47 | 48 | The sample distributed dictionary has been clocked at rates exceeding 70000 49 | ops/s on a 3-node cluster. There is potential for optimization, both in the 50 | core state machine (by decreasing GC pressure via lesser copying) and the 51 | communication layer (faster message serialization, command batching). 52 | 53 | ## References 54 | 55 | In Search of an Understandable Consensus Algorithm. Diego Ongaro and John 56 | Ousterhout. Stanford University. (Draft of October 7, 2013). 
57 | https://ramcloud.stanford.edu/wiki/download/attachments/11370504/raft.pdf""" 58 | -------------------------------------------------------------------------------- /oraft.opam: -------------------------------------------------------------------------------- 1 | opam-version: "2.0" 2 | name: "oraft" 3 | version: "0.1" 4 | authors: [ 5 | "Mauricio Fernandez " 6 | "Vincent Bernardoff " 7 | ] 8 | maintainer: "Vincent Bernardoff " 9 | homepage: "https://github.com/mfp/oraft" 10 | bug-reports: "https://github.com/mfp/oraft/issues" 11 | dev-repo: "git+https://github.com/mfp/oraft" 12 | doc: "https://mfp.github.io/oraft/doc" 13 | build: [ "dune" "build" "-j" jobs "-p" name "@install" ] 14 | depends: [ 15 | "dune" {build & >= "1.3.0"} 16 | "batteries" {>= "2.8.0"} 17 | ] 18 | synopsis: "Implementation of Raft consensus algorithm" 19 | description: """ 20 | oraft consists of: 21 | 22 | * a core Raft (purely functional) state machine featuring leader election + 23 | log replication, log compaction via snapshotting, cluster membership change 24 | support and efficient linearizable read-only operations. This state machine 25 | is abstracted over the specifics of peer communication (I/O, message 26 | serialization), concurrency and timeouts. (`Oraft` module) 27 | 28 | * a specialization of the above state machine using the Lwt library for 29 | concurrency (`Oraft_lwt` module) 30 | 31 | * a replicated state machine built atop `Oraft_lwt` (`Oraft_rsm` module) 32 | 33 | * a sample distributed dictionary client/server (`dict.ml`) built atop `Oraft_rsm` 34 | 35 | ## Status 36 | 37 | The core state machine has been tested using a discrete event simulator 38 | that simulates node failures, network failures, message loss, random delays, 39 | cluster changes (node deployment and decommissioning)... 40 | 41 | ## Performance 42 | 43 | The sample distributed dictionary has been clocked at rates exceeding 70000 44 | ops/s on a 3-node cluster. There is potential for optimization, both in the 45 | core state machine (by decreasing GC pressure via lesser copying) and the 46 | communication layer (faster message serialization, command batching). 47 | 48 | ## References 49 | 50 | In Search of an Understandable Consensus Algorithm. Diego Ongaro and John 51 | Ousterhout. Stanford University. (Draft of October 7, 2013). 
52 | https://ramcloud.stanford.edu/wiki/download/attachments/11370504/raft.pdf""" 53 | -------------------------------------------------------------------------------- /src/dune: -------------------------------------------------------------------------------- 1 | (rule 2 | (targets oraft_proto_types.ml) 3 | (deps oraft_proto_types.proto) 4 | (action (run extprotc %{deps}))) 5 | 6 | (rule 7 | (targets oraft_proto.ml) 8 | (deps oraft_proto_types.proto 9 | oraft_proto.proto) 10 | (action (run extprotc %{deps}))) 11 | 12 | (rule 13 | (targets oraft_proto_rsm.ml) 14 | (deps oraft_proto_types.proto 15 | oraft_proto_rsm.proto) 16 | (action (run extprotc %{deps}))) 17 | 18 | (library 19 | (name oraft) 20 | (public_name oraft) 21 | (libraries batteries) 22 | (modules oraft) 23 | (flags -w -30) 24 | (synopsis "")) 25 | 26 | (library 27 | (name oraft_lwt) 28 | (public_name oraft-lwt) 29 | (preprocess (pps lwt_ppx)) 30 | (libraries lwt.unix logs logs.lwt oraft) 31 | (wrapped false) 32 | (modules oraft_lwt_s 33 | oraft_lwt 34 | oraft_lwt_conn_wrapper) 35 | (flags -w -30) 36 | (synopsis "")) 37 | 38 | (library 39 | (name oraft_lwt_extprot_io) 40 | (public_name oraft-lwt-extprot-io) 41 | (preprocess (pps lwt_ppx)) 42 | (libraries extprot oraft-lwt) 43 | (wrapped false) 44 | (modules oraft_proto_types oraft_proto oraft_lwt_extprot_io) 45 | (flags -w -30) 46 | (synopsis "")) 47 | 48 | (library 49 | (name oraft_lwt_tls) 50 | (public_name oraft-lwt-tls) 51 | (preprocess (pps lwt_ppx)) 52 | (libraries tls.lwt oraft-lwt) 53 | (modules oraft_lwt_tls) 54 | (synopsis "")) 55 | 56 | (library 57 | (name oraft_rsm) 58 | (public_name oraft-rsm) 59 | (preprocess (pps lwt_ppx)) 60 | (libraries extprot logs oraft-lwt-extprot-io) 61 | (wrapped false) 62 | (modules oraft_proto_rsm oraft_rsm_s oraft_rsm) 63 | (flags -w -32) 64 | (synopsis "")) 65 | -------------------------------------------------------------------------------- /src/oraft.ml: -------------------------------------------------------------------------------- 1 | module Map = BatMap 2 | module List = BatList 3 | module Option = BatOption 4 | module Array = BatArray 5 | 6 | module Kernel = 7 | struct 8 | type status = Leader | Follower | Candidate 9 | type term = Int64.t 10 | type index = Int64.t 11 | type rep_id = string 12 | type client_id = string 13 | type req_id = client_id * Int64.t 14 | type address = string 15 | 16 | type config = 17 | Simple_config of simple_config * passive_peers 18 | | Joint_config of simple_config * simple_config * passive_peers 19 | and simple_config = (rep_id * address) list 20 | and passive_peers = (rep_id * address) list 21 | 22 | module REPID = struct type t = rep_id let compare = String.compare end 23 | module IM = Map.Make(Int64) 24 | module RM = Map.Make(REPID) 25 | module RS = Set.Make(REPID) 26 | 27 | module CONFIG : 28 | sig 29 | type t 30 | 31 | val make : node_id:rep_id -> index:index -> config -> t 32 | val status : t -> [`Normal | `Joint] 33 | val target : t -> (simple_config * passive_peers) option 34 | 35 | (** Return whether we are the only active node in the latest config. *) 36 | val is_alone : t -> bool 37 | 38 | (** Returns all peers (including passive). *) 39 | val peers : t -> (rep_id * address) list 40 | 41 | val address : rep_id -> t -> address option 42 | 43 | (** Returns whether the node is included in the last committed 44 | * configuration. *) 45 | val mem_committed : rep_id -> t -> [`Active | `Passive | `Not_included] 46 | 47 | (** Returns whether the node is an active member of the configuration. 
*) 48 | val mem_active : rep_id -> t -> bool 49 | 50 | val update : (index * config) list -> t -> t 51 | val join : index -> ?passive:passive_peers -> simple_config -> 52 | t -> (t * config) option 53 | val drop : at_or_after:index -> t -> t 54 | 55 | val has_quorum : rep_id list -> t -> bool 56 | val quorum_min : (rep_id -> Int64.t) -> t -> Int64.t 57 | 58 | val last_commit : t -> config 59 | val current : t -> config 60 | 61 | (* returns the new config and the simple_config we must transition to (if 62 | * any) when we have just committed a joint config *) 63 | val commit : index -> t -> t * (simple_config * passive_peers) option 64 | end = 65 | struct 66 | module M = Map.Make(String) 67 | 68 | type t = 69 | { id : rep_id; 70 | committed : (index * config * addr_map * addr_map); 71 | latest : (index * config * addr_map * addr_map); 72 | q : (index * config * addr_map * addr_map) list; 73 | } 74 | 75 | and addr_map = address M.t 76 | 77 | let addr_map_of_config = 78 | List.fold_left (fun s (id, addr) -> M.add id addr s) M.empty 79 | 80 | let active_of_config = function 81 | Simple_config (c, _) -> addr_map_of_config c 82 | | Joint_config (c1, c2, _) -> addr_map_of_config (c1 @ c2) 83 | 84 | let all_nodes_of_config = function 85 | Simple_config (c, p) -> addr_map_of_config (c @ p) 86 | | Joint_config (c1, c2, p) -> addr_map_of_config (p @ c1 @ c2) 87 | 88 | let make ~node_id:id ~index config = 89 | let active = active_of_config config in 90 | let all = all_nodes_of_config config in 91 | { id; 92 | committed = (index, config, active, all); 93 | latest = (index, config, active, all); 94 | q = []; 95 | } 96 | 97 | let quorum c = List.length c / 2 + 1 98 | 99 | let target t = match t.latest with 100 | | (_, Simple_config _, _, _) -> None 101 | | (_, Joint_config (_, c2, p), _, _) -> Some (c2, p) 102 | 103 | let has_quorum votes t = 104 | let aux_quorum c = 105 | List.fold_left (fun s x -> if List.mem_assoc x c then s + 1 else s) 0 votes >= 106 | quorum c 107 | in 108 | match t.latest with 109 | | _, Simple_config (c, _), _, _ -> aux_quorum c 110 | | _, Joint_config (c1, c2, _), _, _ -> aux_quorum c1 && aux_quorum c2 111 | 112 | let quorum_min_simple get ?only c = 113 | let vs = List.map (fun (id, _) -> get id) (Option.default c only) |> 114 | List.sort Int64.compare |> List.rev 115 | in 116 | try List.nth vs (quorum c - 1) with _ -> 0L 117 | 118 | let set_diff l1 l2 = 119 | List.filter (fun x -> not (List.mem x l2)) l1 120 | 121 | let quorum_min get t = match t.latest with 122 | _, Simple_config (c, _), _, _ -> quorum_min_simple get c 123 | | _, Joint_config (c1, c2, _), _, _ -> 124 | (* If new nodes are added, we require a quorum (len (c1) / 2 + 1) 125 | * amongst the nodes that are not removed when going from c1 to c2, 126 | * i.e. 127 | * set(c1) - (set(c1) - set(c2)) 128 | * 129 | * This is required to handle cases like: 130 | * 131 | * n0 n1 n2 -> n1 n2 n3 132 | * transitional config: Joint_config ([n0 n1 n2], [n1 n2 n3]) 133 | * 134 | * We need a consensus involving n1 + n2. 135 | * n0 + n1 would not do, since the moment we transition to 136 | * [n1 n2 n3] if n1 were to fail [n2 n3] would still have a quorum 137 | * and could lose the committed config entry that completed the 138 | * transition (as well as other committed operations). 
139 | * 140 | * As a special case, transitions like (i.e., only 1 active node 141 | * in prior config): 142 | * 143 | * n0 -> n1 144 | * 145 | * require the following quora: n0 n1 146 | * 147 | * 148 | * OTOH, if we're merely removing nodes, we just want a "normal" 149 | * consensus in both the previous and next configurations: 150 | * 151 | * n0 n1 -> n1 152 | * Joint_config ([n0 n1], [n1]) required quora n0+n1 n1 153 | * 154 | * n0 -> n1 155 | * Joint_config ([n0], [n1]) required quora n0 n1 156 | * *) 157 | if List.length c1 > 1 && 158 | List.exists (fun (id, _) -> not (List.mem_assoc id c1)) c2 then 159 | let only = set_diff c1 (set_diff c1 c2) in 160 | min (quorum_min_simple get ~only c1) (quorum_min_simple get c2) 161 | else 162 | min (quorum_min_simple get c1) (quorum_min_simple get c2) 163 | 164 | let join index ?passive:p c2 t = match t.latest with 165 | | (idx, Simple_config (c1, passive), _, _) when index > idx -> 166 | let p = Option.default passive p in 167 | let conf = Joint_config (c1, c2, p) in 168 | let active = active_of_config conf in 169 | let all = all_nodes_of_config conf in 170 | let latest = (index, conf, active, all) in 171 | let q = latest :: t.q in 172 | let t = { id = t.id; committed = t.committed; latest; q; } in 173 | Some (t, conf) 174 | | _ -> None 175 | 176 | let update l t = 177 | let idx, _, _, _ = t.latest in 178 | let l = List.sort (fun (i1, _) (i2, _) -> - Int64.compare i1 i2) l |> 179 | List.take_while (fun (idx', _) -> idx' > idx) |> 180 | List.map 181 | (fun (idx, c) -> 182 | (idx, c, active_of_config c, all_nodes_of_config c)) in 183 | let q = l @ t.q in 184 | let latest = match q with 185 | | [] -> t.committed 186 | | x :: _ -> x 187 | in 188 | { id = t.id; committed = t.committed; latest; q; } 189 | 190 | let status t = match t.latest with 191 | | (_, Simple_config _, _, _) -> `Normal 192 | | (_, Joint_config _, _, _) -> `Joint 193 | 194 | let drop ~at_or_after:n t = 195 | let q = List.drop_while (fun (idx, _, _, _) -> idx >= n) t.q in 196 | let latest = match q with 197 | | [] -> t.committed 198 | | x :: _ -> x 199 | in 200 | { id = t.id; committed = t.committed; latest; q; } 201 | 202 | let commit index t = 203 | try 204 | let committed = List.find (fun (n, _, _, _) -> n <= index) t.q in 205 | let q = List.take_while (fun (n, _, _, _) -> n > index) t.q in 206 | let target = match q, committed with 207 | | [], (_, Joint_config (_, c2, passive), _, _) -> 208 | Some (c2, passive) 209 | | _ -> None in 210 | let t = { t with committed; q; } in 211 | (t, target) 212 | with Not_found -> (t, None) 213 | 214 | let last_commit { committed = (_, c, _, _); _ } = c 215 | 216 | let current { latest = (_, c, _, _); _ } = c 217 | 218 | let peers { id; latest = (_, _, _, all); _ } = 219 | M.remove id all |> M.bindings 220 | 221 | let is_alone { id; latest = (_, _, active, _); _ } = 222 | M.mem id active && (M.remove id active |> M.is_empty) 223 | 224 | let address id { latest = (_, _, _, all); } = 225 | M.Exceptionless.find id all 226 | 227 | let mem_committed id { committed = (_, _, active, all); _ } = 228 | if M.mem id active then `Active 229 | else if M.mem id all then `Passive 230 | else `Not_included 231 | 232 | let mem_active id { latest = (_, _, active, _); _ } = 233 | M.mem id active 234 | end 235 | 236 | module LOG : sig 237 | type 'a t 238 | 239 | val empty : prev_log_index:index -> prev_log_term:term -> 'a t 240 | val to_list : 'a t -> (index * 'a * term) list 241 | val of_list : prev_log_index:index -> prev_log_term:term -> 242 | (index * 'a * term) 
list -> 'a t 243 | val append : term:term -> 'a -> 'a t -> 'a t 244 | val last_index : 'a t -> (term * index) 245 | 246 | (** @return new log and index of first conflicting entry (if any) *) 247 | val append_many : (index * ('a * term)) list -> 'a t -> 'a t * index option 248 | val get_range : from_inclusive:index -> to_inclusive:index -> 'a t -> 249 | (index * ('a * term)) list 250 | val get_term : index -> 'a t -> term option 251 | 252 | val trim_prefix : prev_log_index:index -> prev_log_term:term -> 'a t -> 'a t 253 | 254 | val prev_log_index : 'a t -> index 255 | val prev_log_term : 'a t -> term 256 | end = 257 | struct 258 | type 'a t = 259 | { 260 | prev_log_index : index; 261 | prev_log_term : term; 262 | last_index : index; 263 | last_term : term; 264 | entries : ('a * term) IM.t; 265 | } 266 | 267 | let empty ~prev_log_index ~prev_log_term = 268 | { prev_log_index; prev_log_term; 269 | last_index = prev_log_index; 270 | last_term = prev_log_term; 271 | entries = IM.empty; 272 | } 273 | 274 | let trim_prefix ~prev_log_index ~prev_log_term t = 275 | let _, _, entries = IM.split prev_log_index t.entries in 276 | { t with entries; prev_log_index; prev_log_term; } 277 | 278 | let prev_log_index t = t.prev_log_index 279 | let prev_log_term t = t.prev_log_term 280 | 281 | let to_list t = 282 | IM.bindings t.entries |> List.map (fun (i, (x, t)) -> (i, x, t)) 283 | 284 | let of_list ~prev_log_index ~prev_log_term = function 285 | [] -> empty ~prev_log_index ~prev_log_term 286 | | l -> 287 | let entries = 288 | List.fold_left 289 | (fun m (idx, x, term) -> IM.add idx (x, term) m) 290 | IM.empty l in 291 | let (prev_log_index, (_, prev_log_term)) = IM.min_binding entries in 292 | let (last_index, (_, last_term)) = IM.max_binding entries in 293 | { prev_log_index; prev_log_term; last_index; last_term; entries; } 294 | 295 | let append ~term:last_term x t = 296 | let last_index = Int64.succ t.last_index in 297 | let entries = IM.add last_index (x, last_term) t.entries in 298 | { t with last_index; last_term; entries; } 299 | 300 | let last_index t = (t.last_term, t.last_index) 301 | 302 | let get idx t = 303 | try 304 | Some (IM.find idx t.entries) 305 | with Not_found -> None 306 | 307 | let append_many l t = match l with 308 | [] -> (t, None) 309 | | l -> 310 | let nonconflicting, conflict_idx = 311 | try 312 | let idx, term = 313 | List.find 314 | (fun (idx, (_, term)) -> 315 | match get idx t with 316 | Some (_, term') when term <> term' -> true 317 | | _ -> false) 318 | l in 319 | let entries, _, _ = IM.split idx t.entries in 320 | (entries, Some idx) 321 | with Not_found -> 322 | (t.entries, None) 323 | in 324 | let entries = 325 | List.fold_left 326 | (fun m (idx, (x, term)) -> IM.add idx (x, term) m) 327 | nonconflicting l in 328 | let last_index, last_term = 329 | try 330 | let idx, (_, term) = IM.max_binding entries in 331 | (idx, term) 332 | with _ -> t.prev_log_index, t.prev_log_term 333 | in 334 | ({ t with last_index; last_term; entries; }, conflict_idx) 335 | 336 | let get_range ~from_inclusive ~to_inclusive t = 337 | if from_inclusive = t.last_index && 338 | to_inclusive >= t.last_index 339 | then 340 | [ from_inclusive, IM.find from_inclusive t.entries ] 341 | else 342 | let _, _, post = IM.split (Int64.pred from_inclusive) t.entries in 343 | let pre, _, _ = if to_inclusive = Int64.max_int then (post, None, post) 344 | else IM.split (Int64.succ to_inclusive) post 345 | in 346 | IM.bindings pre 347 | 348 | let get_term idx t = 349 | if t.last_index = idx then 350 | Some 
t.last_term 351 | else 352 | try 353 | Some (snd (IM.find idx t.entries)) 354 | with Not_found -> 355 | if idx = t.prev_log_index then Some t.prev_log_term else None 356 | end 357 | 358 | type 'a state = 359 | { 360 | (* persistent *) 361 | current_term : term; 362 | voted_for : rep_id option; 363 | log : 'a entry LOG.t; 364 | id : rep_id; 365 | config : CONFIG.t; 366 | 367 | (* volatile *) 368 | state : status; 369 | commit_index : index; 370 | last_applied : index; 371 | 372 | leader_id : rep_id option; 373 | 374 | (* volatile on leaders *) 375 | next_index : index RM.t; 376 | match_index : index RM.t; 377 | pongs : index RM.t; 378 | ping_index : Int64.t; 379 | acked_ro_op : Int64.t; 380 | 381 | snapshot_transfers : RS.t; 382 | 383 | votes : RS.t; 384 | } 385 | 386 | and 'a entry = Nop | Op of 'a | Config of config 387 | 388 | type 'a message = 389 | Request_vote of request_vote 390 | | Vote_result of vote_result 391 | | Append_entries of 'a append_entries 392 | | Append_result of append_result 393 | | Ping of ping 394 | | Pong of ping 395 | 396 | and request_vote = 397 | { 398 | term : term; 399 | candidate_id : rep_id; 400 | last_log_index : index; 401 | last_log_term : term; 402 | } 403 | 404 | and vote_result = 405 | { 406 | term : term; 407 | vote_granted : bool; 408 | } 409 | 410 | and 'a append_entries = 411 | { 412 | term : term; 413 | leader_id : rep_id; 414 | prev_log_index : index; 415 | prev_log_term : term; 416 | entries : (index * ('a entry * term)) list; 417 | leader_commit : index; 418 | } 419 | 420 | and append_result = 421 | { 422 | term : term; 423 | result : actual_append_result; 424 | } 425 | 426 | and actual_append_result = 427 | Append_success of index (* last log entry included in msg we respond to *) 428 | | Append_failure of index 429 | (* Index of log entry preceding those in message we respond to or 430 | * the index following the last entry we do have. 431 | * *) 432 | 433 | and ping = { term : term; n : Int64.t; } 434 | 435 | type 'a action = 436 | Apply of (index * 'a * term) list 437 | | Become_candidate 438 | | Become_follower of rep_id option 439 | | Become_leader 440 | | Changed_config 441 | | Exec_readonly of Int64.t 442 | | Redirect of rep_id option * 'a 443 | | Reset_election_timeout 444 | | Reset_heartbeat 445 | | Send of rep_id * address * 'a message 446 | | Send_snapshot of rep_id * address * index * config 447 | | Stop 448 | end 449 | 450 | include Kernel 451 | 452 | let quorum_of_config config = Array.length config / 2 + 1 453 | let quorum_of_peers peers = (1 + Array.length peers) / 2 + 1 454 | 455 | let vote_result s vote_granted = 456 | Vote_result { term = s.current_term; vote_granted; } 457 | 458 | let append_result s result = Append_result { term = s.current_term; result } 459 | 460 | let append_ok ~last_log_index s = 461 | append_result s (Append_success last_log_index) 462 | 463 | let append_fail ~prev_log_index s = 464 | append_result s (Append_failure prev_log_index) 465 | 466 | let update_commit_index s = 467 | (* Find last N such that log[N].term = current term AND 468 | * a majority of peers has got match_index[peer] >= N. *) 469 | (* We require majorities in both the old and the new configutation during 470 | * the joint consensus. 
*) 471 | 472 | let get_match_index id = 473 | if id = s.id then LOG.last_index s.log |> snd 474 | else try RM.find id s.match_index with Not_found -> 0L in 475 | 476 | let i = CONFIG.quorum_min get_match_index s.config in 477 | 478 | (* i is the largest N such that at least half of the peers have 479 | * match_Index[peer] >= N; we have to enforce the other restriction: 480 | * log[N].term = current *) 481 | let commit_index' = 482 | match LOG.get_term i s.log with 483 | | Some term when term = s.current_term -> i 484 | | _ -> s.commit_index in 485 | 486 | (* increate monotonically *) 487 | let commit_index = max s.commit_index commit_index' in 488 | if commit_index = s.commit_index then s 489 | else { s with commit_index } 490 | 491 | let step_down term s = 492 | { s with current_term = term; 493 | voted_for = None; 494 | leader_id = None; 495 | state = Follower; 496 | } 497 | 498 | let try_commit s = 499 | let prev = s.last_applied in 500 | if prev = s.commit_index then 501 | (s, []) 502 | else 503 | let s = { s with last_applied = s.commit_index } in 504 | let entries = LOG.get_range 505 | ~from_inclusive:(Int64.succ prev) 506 | ~to_inclusive:s.commit_index 507 | s.log in 508 | let ops = List.filter_map 509 | (function 510 | (index, (Op x, term)) -> Some (index, x, term) 511 | | (_, ((Nop | Config _), _)) -> None) 512 | entries in 513 | 514 | (* We check whether the node was included in the configuration before 515 | * the commit. We make it stop only if it was, and it is no longer in 516 | * the newly committed one *) 517 | let was_included = CONFIG.mem_committed s.id s.config <> `Not_included in 518 | let config, wanted_config = CONFIG.commit s.commit_index s.config in 519 | let changed_config = List.exists 520 | (function (_, (Config _, _)) -> true | _ -> false) 521 | entries in 522 | (* if we're the leader and wanted_config = Some conf, add a new log 523 | * entry for the wanted configuration [conf]; it will be replicated on 524 | * the next heartbeat *) 525 | let s = match wanted_config, s.state with 526 | | Some (c, passive), Leader -> 527 | let conf = Simple_config (c, passive) in 528 | let log = LOG.append ~term:s.current_term 529 | (Config conf) s.log in 530 | let index = LOG.last_index log |> snd in 531 | let config = CONFIG.update [index, conf] config in 532 | { s with log; config } 533 | | _ -> { s with config } in 534 | let actions = match ops with [] -> [] | l -> [Apply ops] in 535 | (* "In Raft the leader steps down immediately after committing a 536 | * configuration entry that does not include itself." 
*) 537 | let s, actions = match s.state, CONFIG.mem_committed s.id s.config with 538 | | (Leader | Candidate), `Passive -> 539 | let actions = actions @ [Become_follower None] in 540 | (step_down s.current_term s, actions) 541 | | _, `Not_included when was_included -> 542 | let actions = actions @ [Become_follower None; Stop] in 543 | (step_down s.current_term s, actions) 544 | | _, _ -> (s, actions) in 545 | let actions = if changed_config then Changed_config :: actions 546 | else actions 547 | in 548 | (s, actions) 549 | 550 | let heartbeat s = 551 | let prev_log_term, prev_log_index = LOG.last_index s.log in 552 | Append_entries { term = s.current_term; leader_id = s.id; 553 | prev_log_term; prev_log_index; entries = []; 554 | leader_commit = s.commit_index } 555 | 556 | type 'a send_entries = 557 | Snapshot | Send_entries of 'a | Snapshot_in_progress 558 | 559 | let send_entries s from = 560 | match LOG.get_term (Int64.pred from) s.log with 561 | None -> None 562 | | Some prev_log_term -> 563 | Some 564 | (Append_entries 565 | { prev_log_term; 566 | term = s.current_term; 567 | leader_id = s.id; 568 | prev_log_index = Int64.pred from; 569 | entries = LOG.get_range 570 | ~from_inclusive:from 571 | ~to_inclusive:Int64.max_int 572 | s.log; 573 | leader_commit = s.commit_index; 574 | }) 575 | 576 | let send_entries_or_snapshot s peer from = 577 | if RS.mem peer s.snapshot_transfers then 578 | Snapshot_in_progress 579 | else 580 | match send_entries s from with 581 | None -> Snapshot 582 | | Some x -> Send_entries x 583 | 584 | let broadcast s msg = 585 | CONFIG.peers s.config |> List.map (fun (id, addr) -> Send (id, addr, msg)) 586 | 587 | let receive_msg s peer = function 588 | (* Reject vote requests/results and Append_entries (which should not be 589 | * received anyway since passive members cannot become leaders) from passive 590 | * members *) 591 | | Append_entries _ | Ping _ | Pong _ 592 | | Request_vote _ | Vote_result _ when not (CONFIG.mem_active peer s.config) -> 593 | (s, []) 594 | 595 | (* " If a server receives a request with a stale term number, it rejects the 596 | * request." *) 597 | | Request_vote { term; _ } | Ping { term; _ } when term < s.current_term -> 598 | begin match CONFIG.address peer s.config with 599 | None -> (s, []) 600 | | Some addr -> 601 | (s, [Send (peer, addr, vote_result s false)]) 602 | end 603 | 604 | | Pong { term; _ } when term <> s.current_term -> (s, []) 605 | 606 | (* "Current terms are exchanged whenever servers communicate; if one 607 | * server’s current term is smaller than the other, then it updates its 608 | * current term to the larger value. If a candidate or leader discovers that 609 | * its term is out of date, it immediately reverts to follower state." 610 | * *) 611 | | Vote_result { term; _ } 612 | | Append_result { term; _ } 613 | | Ping { term; _ } when term > s.current_term -> 614 | (step_down term s, [Become_follower None]) 615 | 616 | | Ping { term; n; } (* term = current_term *) -> begin 617 | match s.leader_id, CONFIG.address peer s.config with 618 | Some leader, Some addr when leader = peer -> 619 | (s, [Send (peer, addr, Pong { term; n; })]) 620 | | _ -> 621 | (* We ignore Ping messages not coming from the leader. 622 | * They should not happen, since followers/candidates do not send 623 | * Pings. 
*) 624 | (s, []) 625 | end 626 | 627 | | Pong { term; n; } -> 628 | let pongs = RM.modify_def 0L peer (max n) s.pongs in 629 | 630 | let get_last_pong id = 631 | if id = s.id then s.ping_index 632 | else Option.default 0L (RM.Exceptionless.find id pongs) in 633 | 634 | let acked = CONFIG.quorum_min get_last_pong s.config in 635 | let actions = if acked > s.acked_ro_op then [ Exec_readonly acked ] 636 | else [] in 637 | let s = { s with pongs; acked_ro_op = max s.acked_ro_op acked; } in 638 | (s, actions) 639 | 640 | | Request_vote { term; candidate_id; last_log_index; last_log_term; } 641 | when term > s.current_term -> 642 | let s = step_down term s in 643 | 644 | (* "If votedFor is null or candidateId, and candidate's log is at 645 | * least as up-to-date as receiver’s log, grant vote" 646 | * 647 | * [voted_for] is None since it was reset above when we updated 648 | * [current_term] 649 | * 650 | * "Raft determines which of two logs is more up-to-date 651 | * by comparing the index and term of the last entries in the 652 | * logs. If the logs have last entries with different terms, then 653 | * the log with the later term is more up-to-date. If the logs 654 | * end with the same term, then whichever log is longer is 655 | * more up-to-date." 656 | * *) 657 | begin match CONFIG.address peer s.config with 658 | None -> (s, [ Become_follower None ]) 659 | | Some addr -> 660 | if (last_log_term, last_log_index) < LOG.last_index s.log then 661 | (s, [Become_follower None; Send (peer, addr, vote_result s false)]) 662 | else 663 | let s = { s with voted_for = Some candidate_id } in 664 | (s, [Become_follower (Some candidate_id); 665 | Send (peer, addr, vote_result s true)]) 666 | end 667 | 668 | | Request_vote { term; candidate_id; last_log_index; last_log_term; } -> begin 669 | (* case term = current_term *) 670 | match s.state, s.voted_for, CONFIG.address peer s.config with 671 | _, Some candidate, Some addr when candidate <> candidate_id -> 672 | (s, [Send (peer, addr, vote_result s false)]) 673 | | (Candidate | Leader), _, Some addr -> 674 | (s, [Send (peer, addr, vote_result s false)]) 675 | | Follower, _, Some addr 676 | (* voted for None or Some candidate equal to candidate_id *) -> 677 | 678 | (* "If votedFor is null or candidateId, and candidate's log is at 679 | * least as up-to-date as receiver’s log, grant vote" 680 | * 681 | * "Raft determines which of two logs is more up-to-date 682 | * by comparing the index and term of the last entries in the 683 | * logs. If the logs have last entries with different terms, then 684 | * the log with the later term is more up-to-date. If the logs 685 | * end with the same term, then whichever log is longer is 686 | * more up-to-date." 687 | * *) 688 | if (last_log_term, last_log_index) < LOG.last_index s.log then 689 | (s, [Send (peer, addr, vote_result s false)]) 690 | else 691 | let s = { s with voted_for = Some candidate_id } in 692 | (s, [Send (peer, addr, vote_result s true)]) 693 | | _, _, None -> (s, []) 694 | end 695 | 696 | (* " If a server receives a request with a stale term number, it rejects the 697 | * request." 
*) 698 | | Append_entries { term; prev_log_index; _ } when term < s.current_term -> 699 | begin match CONFIG.address peer s.config with 700 | None -> (s, []) 701 | | Some addr -> (s, [Send (peer, addr, append_fail ~prev_log_index s)]) 702 | end 703 | | Append_entries 704 | { term; prev_log_index; prev_log_term; entries; leader_commit; } -> begin 705 | (* "Current terms are exchanged whenever servers communicate; if one 706 | * server’s current term is smaller than the other, then it updates 707 | * its current term to the larger value. If a candidate or leader 708 | * discovers that its term is out of date, it immediately reverts to 709 | * follower state." *) 710 | let s, actions = 711 | if term > s.current_term || s.state = Candidate then 712 | let s = { s with current_term = term; 713 | state = Follower; 714 | leader_id = Some peer; 715 | (* set voted_for so that no other candidates are 716 | * accepted during the new term *) 717 | voted_for = Some peer; 718 | } 719 | in 720 | (s, [Become_follower (Some peer)]) 721 | else (* term = s.current_term && s.state <> Candidate *) 722 | (s, [Reset_election_timeout]) in 723 | 724 | (* "Reply false if log doesn’t contain an entry at prevLogIndex 725 | * whose term matches prevLogTerm" *) 726 | (* In the presence of snapshots, prev_log_index might refer to a log 727 | * entry that was removed (subsumed by the snapshot), in which case 728 | * we must use as prev_log_index the index of the latest entry in 729 | * the snapshot (equal to the prev_log_index of the log), and as 730 | * prev_log_term the term of the corresponding entry in the 731 | * Append_entries message. *) 732 | let prev_log_index, prev_log_term, entries = 733 | if prev_log_index >= LOG.prev_log_index s.log then 734 | (prev_log_index, prev_log_term, entries) 735 | else 736 | try 737 | let prev_idx = LOG.prev_log_index s.log in 738 | let prev_term = List.assoc prev_idx entries |> snd in 739 | let entries = List.drop_while 740 | (fun (idx', _) -> idx' <= prev_idx) entries 741 | in 742 | (prev_idx, prev_term, entries) 743 | with _ -> (prev_log_index, prev_log_term, entries) 744 | in 745 | match 746 | LOG.get_term prev_log_index s.log, CONFIG.address peer s.config 747 | with 748 | None, Some addr -> 749 | (* we don't have the entry at prev_log_index; we use the 750 | * successor of the last entry we do have as the 751 | * prev_log_index in the failure msg, thus allowing to rewind 752 | * next_index in the leader quickly *) 753 | let prev_log_index = LOG.last_index s.log |> snd |> Int64.succ in 754 | (s, Send (peer, addr, append_fail ~prev_log_index s) :: actions) 755 | | Some term', Some addr when prev_log_term <> term' -> 756 | (s, Send (peer, addr, append_fail ~prev_log_index s) :: actions) 757 | | _, Some addr -> 758 | let log, c_idx = LOG.append_many entries s.log in 759 | let config = match c_idx with 760 | | None -> s.config 761 | | Some idx -> 762 | CONFIG.drop ~at_or_after:idx s.config in 763 | (* "a server always uses the latest configuration in its log, 764 | * regardless of whether the entry is committed" *) 765 | let new_configs = List.filter_map 766 | (function 767 | (idx, (Config c, _)) -> Some (idx, c) 768 | | _ -> None) 769 | entries in 770 | let config = CONFIG.update new_configs config in 771 | let last_index = snd (LOG.last_index log) in 772 | let commit_index = if leader_commit > s.commit_index then 773 | min leader_commit last_index 774 | else s.commit_index in 775 | let reply = append_ok ~last_log_index:last_index s in 776 | let s = { s with commit_index; log; 
config; 777 | leader_id = Some peer; } in 778 | let s, commits = try_commit s in 779 | let actions = List.concat 780 | [ [Send (peer, addr, reply)]; 781 | commits; 782 | actions ] 783 | in 784 | (s, actions) 785 | | _, None -> (s, []) 786 | end 787 | 788 | | Vote_result { term; _ } when term < s.current_term -> 789 | (s, []) 790 | | Vote_result { term; vote_granted; } when s.state <> Candidate -> 791 | (s, []) 792 | | Vote_result { term; vote_granted; } -> 793 | if not vote_granted then 794 | (s, []) 795 | else 796 | (* "If votes received from majority of servers: become leader" *) 797 | let votes = RS.add peer s.votes in 798 | let s = { s with votes } in 799 | 800 | if not (CONFIG.has_quorum (RS.elements votes) s.config) then 801 | (s, []) 802 | else 803 | (* become leader! *) 804 | 805 | (* So as to have the leader know which entries are committed 806 | * (one of the 2 requirements to support read-only operations in 807 | * the leader, the other being making sure it's still the leader 808 | * with a hearbeat exchange), we have it commit a blank 809 | * no-op/config request entry at the start of its term, which also 810 | * serves as the initial heartbeat. If we're in a joint consensus, 811 | * we send a config request for the target configuration instead 812 | * of a Nop. *) 813 | let entry = match CONFIG.target s.config with 814 | | None -> Nop 815 | | Some (c, passive) -> Config (Simple_config (c, passive)) in 816 | let log = LOG.append ~term:s.current_term entry s.log in 817 | 818 | (* With a regular (empty) hearbeat, this applies: 819 | * "When a leader first comes to power, it initializes all 820 | * nextIndex values to the index just after the last one in its 821 | * log" 822 | * However, in this case we want to send the Nop too, so no Int64.succ. 
823 | * *) 824 | let next_idx = LOG.last_index log |> snd (* |> Int64.succ *) in 825 | let next_index = List.fold_left 826 | (fun m (peer, _) -> RM.add peer next_idx m) 827 | RM.empty (CONFIG.peers s.config) in 828 | let match_index = List.fold_left 829 | (fun m (peer, _) -> RM.add peer 0L m) 830 | RM.empty (CONFIG.peers s.config) in 831 | let s = { s with log; next_index; match_index; 832 | state = Leader; 833 | leader_id = Some s.id; 834 | pongs = RM.empty; 835 | ping_index = 1L; 836 | acked_ro_op = 0L; 837 | snapshot_transfers = RS.empty; 838 | } in 839 | (* This heartbeat is replaced by the broadcast of the no-op 840 | * explained above: 841 | * 842 | (* "Upon election: send initial empty AppendEntries RPCs 843 | * (heartbeat) to each server; repeat during idle periods to 844 | * prevent election timeouts" *) 845 | let sends = broadcast s (heartbeat s) in 846 | *) 847 | let msg = send_entries s next_idx in 848 | let sends = CONFIG.peers s.config |> 849 | List.filter_map 850 | (fun (peer, addr) -> 851 | Option.map (fun m -> Send (peer, addr, m)) msg) 852 | in 853 | (s, (Become_leader :: sends)) 854 | 855 | | Append_result { term; _ } when term < s.current_term || s.state <> Leader -> 856 | (s, []) 857 | | Append_result { result = Append_success last_log_index; _ } -> 858 | let next_index = RM.modify_def 859 | 0L peer 860 | (fun idx -> max idx (Int64.succ last_log_index)) 861 | s.next_index in 862 | let match_index = RM.modify_def 863 | last_log_index peer 864 | (max last_log_index) s.match_index in 865 | let s = update_commit_index { s with next_index; match_index } in 866 | let s, actions = try_commit s in 867 | (s, (Reset_election_timeout :: actions)) 868 | | Append_result { result = Append_failure prev_log_index; _ } -> 869 | (* "After a rejection, the leader decrements nextIndex and retries 870 | * the AppendEntries RPC. Eventually nextIndex will reach a point 871 | * where the leader and follower logs match." 
*) 872 | let next_index = RM.modify_def 873 | prev_log_index peer 874 | (fun idx -> min idx prev_log_index) 875 | s.next_index in 876 | let s = { s with next_index } in 877 | let idx = RM.find peer next_index in 878 | match send_entries_or_snapshot s peer idx, CONFIG.address peer s.config with 879 | | Snapshot, Some addr -> 880 | (* Must send snapshot *) 881 | let config = CONFIG.last_commit s.config in 882 | let transfers = RS.add peer s.snapshot_transfers in 883 | let s = { s with snapshot_transfers = transfers } in 884 | (s, [Reset_election_timeout; Send_snapshot (peer, addr, idx, config)]) 885 | | Snapshot_in_progress, _ -> (s, []) 886 | | Send_entries msg, Some addr -> 887 | (s, [Reset_election_timeout; Send (peer, addr, msg)]) 888 | | _ , None -> (s, []) 889 | 890 | let election_timeout s = match s.state with 891 | (* passive nodes do not trigger elections *) 892 | | _ when not (CONFIG.mem_active s.id s.config) -> (s, [Reset_election_timeout]) 893 | 894 | (* if we're the active only node in the cluster, win the elections right 895 | * away *) 896 | | Follower | Candidate when CONFIG.is_alone s.config -> 897 | let s = { s with current_term = Int64.succ s.current_term; 898 | state = Leader; 899 | votes = RS.singleton s.id; 900 | leader_id = Some s.id; 901 | voted_for = Some s.id; 902 | pongs = RM.empty; 903 | ping_index = 1L; 904 | acked_ro_op = 0L; 905 | snapshot_transfers = RS.empty; 906 | } 907 | in (s, [Become_leader]) 908 | 909 | | Leader when CONFIG.is_alone s.config -> 910 | (s, [Reset_election_timeout]) 911 | 912 | (* we have the leader step down if it does not get any append_result 913 | * within an election timeout *) 914 | | Leader 915 | 916 | (* "If election timeout elapses without receiving AppendEntries RPC from 917 | * current leader or granting vote to candidate: convert to candidate" *) 918 | | Follower 919 | (* "If election timeout elapses: start new election" *) 920 | | Candidate -> 921 | (* "On conversion to candidate, start election: 922 | * * Increment currentTerm 923 | * * Vote for self 924 | * * Reset election timeout 925 | * * Send RequestVote RPCs to all other servers" 926 | * *) 927 | let s = { s with current_term = Int64.succ s.current_term; 928 | state = Candidate; 929 | votes = RS.singleton s.id; 930 | leader_id = None; 931 | voted_for = Some s.id; 932 | } in 933 | let term_, idx_ = LOG.last_index s.log in 934 | let msg = Request_vote 935 | { 936 | term = s.current_term; 937 | candidate_id = s.id; 938 | last_log_index = idx_; 939 | last_log_term = term_; 940 | } in 941 | let sends = broadcast s msg in 942 | (s, (Become_candidate :: sends)) 943 | 944 | let heartbeat_timeout s = match s.state with 945 | Follower | Candidate -> (s, [Reset_heartbeat]) 946 | | Leader when CONFIG.is_alone s.config -> 947 | let s, actions = update_commit_index s |> try_commit in 948 | (s, Reset_heartbeat :: actions) 949 | | Leader -> 950 | let s, sends = 951 | CONFIG.peers s.config |> 952 | List.fold_left 953 | (fun (s, sends) (peer, addr) -> 954 | let idx = try RM.find peer s.next_index 955 | with Not_found -> LOG.last_index s.log |> snd 956 | in 957 | match send_entries_or_snapshot s peer idx with 958 | Snapshot_in_progress -> (s, sends) 959 | | Send_entries msg -> (s, Send (peer, addr, msg) :: sends) 960 | | Snapshot -> 961 | (* must send snapshot if cannot send log *) 962 | let transfers = RS.add peer s.snapshot_transfers in 963 | let s = { s with snapshot_transfers = transfers } in 964 | let config = CONFIG.last_commit s.config in 965 | (s, Send_snapshot (peer, addr, idx, 
config) :: sends)) 966 | (s, []) 967 | in 968 | (s, (Reset_heartbeat :: sends)) 969 | 970 | let client_command x s = match s.state with 971 | Follower | Candidate -> (s, [Redirect (s.leader_id, x)]) 972 | | Leader when CONFIG.is_alone s.config -> 973 | let log = LOG.append ~term:s.current_term (Op x) s.log in 974 | let s = update_commit_index { s with log; } in 975 | try_commit s 976 | | Leader -> 977 | let log = LOG.append ~term:s.current_term (Op x) s.log in 978 | let s, actions = 979 | CONFIG.peers s.config |> 980 | (* We update next_index to be past the last entry we send to each 981 | * peer. This way, we don't send entries repeatedly to a peer when we 982 | * get several commands before the peer has ACKed them. *) 983 | List.fold_left 984 | (fun (s, next_index, actions) (peer, addr) -> 985 | let next = LOG.last_index s.log |> snd |> Int64.succ in 986 | let idx = try RM.find peer s.next_index with Not_found -> next in 987 | let next_index = RM.add peer next s.next_index in 988 | match send_entries_or_snapshot s peer idx with 989 | Snapshot -> 990 | (* must send snapshot if cannot send log *) 991 | let transfers = RS.add peer s.snapshot_transfers in 992 | let s = { s with snapshot_transfers = transfers } in 993 | let config = CONFIG.last_commit s.config in 994 | let actions = Send_snapshot (peer, addr, idx, config) :: 995 | actions 996 | in (s, next_index, actions) 997 | | Snapshot_in_progress -> (s, next_index, actions) 998 | | Send_entries msg -> 999 | (* we don't update s with the new next_index directly 1000 | * here so as to avoid making 1 copy per peer --- we'll 1001 | * update at once after the fold *) 1002 | (s, next_index, Send (peer, addr, msg) :: actions)) 1003 | ({ s with log }, s.next_index, []) |> 1004 | (fun (s, next_index, actions) -> ({ s with next_index }, actions)) in 1005 | let actions = match actions with 1006 | | [] -> [] 1007 | | l -> Reset_heartbeat :: actions 1008 | in 1009 | (s, actions) 1010 | 1011 | let install_snapshot ~last_term ~last_index ~config s = match s.state with 1012 | Leader | Candidate -> (s, false) 1013 | | Follower -> 1014 | let config = CONFIG.make ~node_id:s.id ~index:last_index config in 1015 | let log = 1016 | match LOG.get_term last_index s.log with 1017 | (* "If the follower has an entry that matches the snapshot’s last 1018 | * included index and term, then there is no conflict: it removes only 1019 | * the prefix of its log that the snapshot replaces. Otherwise, the 1020 | * follower removes its entire log; it is all superseded by the 1021 | * snapshot." 
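* (Below, this is decided by LOG.get_term last_index: when it equals last_term only the prefix covered by the snapshot is trimmed, otherwise the whole log is replaced by an empty one anchored at the snapshot's last index and term.)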
1022 | * *) 1023 | Some t when t = last_term -> 1024 | LOG.trim_prefix ~prev_log_index:last_index ~prev_log_term:last_term s.log 1025 | | _ -> LOG.empty ~prev_log_index:last_index ~prev_log_term:last_term in 1026 | 1027 | let s = { s with log; config; 1028 | last_applied = last_index; 1029 | commit_index = last_index; 1030 | } 1031 | in 1032 | (s, true) 1033 | 1034 | let snapshot_sent peer ~last_index s = match s.state with 1035 | Follower | Candidate -> (s, []) 1036 | | Leader -> 1037 | let transfers = RS.remove peer s.snapshot_transfers in 1038 | let next_index = RM.modify_def 1039 | 0L peer 1040 | (fun idx -> max idx (Int64.succ last_index)) 1041 | s.next_index in 1042 | let s = { s with snapshot_transfers = transfers; next_index } in 1043 | match 1044 | send_entries s (RM.find peer s.next_index), CONFIG.address peer s.config 1045 | with 1046 | None, _ | _, None -> (s, []) 1047 | | Some msg, Some addr -> (s, [Send (peer, addr, msg)]) 1048 | 1049 | let snapshot_send_failed peer s = 1050 | let s = { s with snapshot_transfers = RS.remove peer s.snapshot_transfers } in 1051 | (s, []) 1052 | 1053 | let compact_log last_index s = match s.state with 1054 | | Follower | Candidate -> s 1055 | | Leader -> 1056 | if not (RS.is_empty s.snapshot_transfers) then s 1057 | else 1058 | match LOG.get_term last_index s.log with 1059 | None -> s 1060 | | Some last_term -> 1061 | let log = LOG.trim_prefix 1062 | ~prev_log_index:last_index 1063 | ~prev_log_term:last_term s.log 1064 | in 1065 | { s with log } 1066 | 1067 | let readonly_operation s = match s.state with 1068 | | Follower | Candidate -> (s, None) 1069 | | Leader -> 1070 | let n = s.ping_index in 1071 | let ping = Ping { term = s.current_term; n; } in 1072 | let s = { s with ping_index = Int64.succ s.ping_index; } in 1073 | (s, Some (n, broadcast s ping)) 1074 | 1075 | module Types = Kernel 1076 | 1077 | module Core = 1078 | struct 1079 | include Types 1080 | 1081 | let make ~id ~current_term ~voted_for ~log ~config () = 1082 | let log = LOG.of_list ~prev_log_index:0L ~prev_log_term:current_term log in 1083 | let config = CONFIG.make ~node_id:id ~index:0L config in 1084 | { 1085 | current_term; voted_for; log; id; config; 1086 | state = Follower; 1087 | commit_index = 0L; 1088 | last_applied = 0L; 1089 | leader_id = None; 1090 | next_index = RM.empty; 1091 | match_index = RM.empty; 1092 | pongs = RM.empty; 1093 | ping_index = 1L; 1094 | acked_ro_op = 0L; 1095 | votes = RS.empty; 1096 | snapshot_transfers = RS.empty; 1097 | } 1098 | 1099 | let is_single_node_cluster t = CONFIG.is_alone t.config 1100 | 1101 | let leader_id (s : _ state) = s.leader_id 1102 | 1103 | let id s = s.id 1104 | let status s = s.state 1105 | let config s = CONFIG.current s.config 1106 | let committed_config s = CONFIG.last_commit s.config 1107 | let last_index s = snd (LOG.last_index s.log) 1108 | let last_term s = fst (LOG.last_index s.log) 1109 | let peers s = CONFIG.peers s.config 1110 | 1111 | let receive_msg = receive_msg 1112 | let election_timeout = election_timeout 1113 | let heartbeat_timeout = heartbeat_timeout 1114 | let client_command = client_command 1115 | let install_snapshot = install_snapshot 1116 | let snapshot_sent = snapshot_sent 1117 | let compact_log = compact_log 1118 | 1119 | let readonly_operation = readonly_operation 1120 | let snapshot_send_failed = snapshot_send_failed 1121 | 1122 | module Config = 1123 | struct 1124 | let config_eq c1 c2 = 1125 | List.sort String.compare c1 = List.sort String.compare c2 1126 | 1127 | type 'a result = 1128 | [ 
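(* Outcome of the configuration change helpers below; `Start_change carries the updated leader state, with the new joint configuration already appended to its log. *)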
1129 | | `Already_changed 1130 | | `Cannot_change 1131 | | `Change_in_process 1132 | | `Redirect of (rep_id * address) option 1133 | | `Start_change of 'a state 1134 | | `Unsafe_change of simple_config * passive_peers 1135 | ] 1136 | 1137 | let config_change_aux s f = match s.state with 1138 | Follower -> 1139 | begin match 1140 | Option.map (fun id -> (id, CONFIG.address id s.config)) 1141 | s.leader_id 1142 | with 1143 | Some (id, Some address) -> `Redirect (Some (id, address)) 1144 | | _ -> `Redirect None 1145 | end 1146 | | Candidate -> `Redirect None 1147 | | Leader -> 1148 | match CONFIG.current s.config with 1149 | Joint_config _ -> `Change_in_process 1150 | | Simple_config (c, p) -> 1151 | match f c p with 1152 | #result as x -> x 1153 | | `Perform_change (c, passive) -> 1154 | match 1155 | CONFIG.join 1156 | (LOG.last_index s.log |> snd |> Int64.succ) 1157 | ~passive c s.config 1158 | with 1159 | None -> `Change_in_process 1160 | | Some (config, target) -> 1161 | let log = LOG.append ~term:s.current_term 1162 | (Config target) s.log in 1163 | let s = { s with config; log } in 1164 | `Start_change s 1165 | 1166 | let add_failover id addr s = 1167 | config_change_aux s 1168 | (fun c p -> 1169 | if List.mem_assoc id c || List.mem_assoc id p then `Already_changed 1170 | else `Perform_change (c, (id, addr) :: p)) 1171 | 1172 | let remove_failover id s = 1173 | config_change_aux s 1174 | (fun c p -> 1175 | if not (List.mem_assoc id p) then `Already_changed 1176 | else `Perform_change (c, List.remove_assoc id p)) 1177 | 1178 | let safe_assoc = List.Exceptionless.assoc 1179 | 1180 | let decommission id s = 1181 | config_change_aux s 1182 | (fun c p -> 1183 | let len = List.length c in 1184 | let quorum = len / 2 + 1 in 1185 | if List.mem_assoc id c && len - 1 < quorum then `Unsafe_change (c, p) 1186 | else if not (List.mem_assoc id c) && not (List.mem_assoc id p) then 1187 | `Already_changed 1188 | else `Perform_change (List.remove_assoc id c, List.remove_assoc id p)) 1189 | 1190 | let demote id s = 1191 | config_change_aux s 1192 | (fun c p -> 1193 | let len = List.length c in 1194 | let quorum = len / 2 + 1 in 1195 | if List.mem_assoc id c && len - 1 < quorum then `Unsafe_change (c, p) 1196 | else match safe_assoc id c with 1197 | None -> `Cannot_change 1198 | | Some addr -> 1199 | `Perform_change (List.remove_assoc id c, (id, addr) :: p)) 1200 | 1201 | let promote id s = 1202 | config_change_aux s 1203 | (fun c p -> 1204 | if List.mem_assoc id c then `Already_changed 1205 | else match safe_assoc id p with 1206 | None -> `Cannot_change 1207 | | Some addr -> 1208 | `Perform_change ((id, addr) :: c, List.remove_assoc id p)) 1209 | 1210 | let replace ~replacee ~failover s = 1211 | config_change_aux s 1212 | (fun c p -> 1213 | let len = List.length c in 1214 | let quorum = len / 2 + 1 in 1215 | match safe_assoc replacee c, safe_assoc failover p with 1216 | Some _, Some _ when len > 1 && len - 1 < quorum -> 1217 | `Unsafe_change (c, p) 1218 | | Some _, Some addr2 -> 1219 | `Perform_change 1220 | ((failover, addr2) :: List.remove_assoc replacee c, 1221 | List.remove_assoc failover p) 1222 | | None, Some addr -> 1223 | `Perform_change ((failover, addr) :: c, 1224 | List.remove_assoc failover p) 1225 | | Some _, None -> 1226 | if len - 1 < quorum then `Unsafe_change (c, p) 1227 | else `Cannot_change 1228 | | None, None -> `Cannot_change) 1229 | end 1230 | end 1231 | -------------------------------------------------------------------------------- /src/oraft.mli: 
-------------------------------------------------------------------------------- 1 | (** Implementation of the RAFT consensus algorithm. 2 | * 3 | * Refer to 4 | * "In Search of an Understandable Consensus Algorithm", Diego Ongaro and John 5 | * Ousterhout, Stanford University. (Draft of October 7, 2013). 6 | * [https://ramcloud.stanford.edu/wiki/download/attachments/11370504/raft.pdf] 7 | * *) 8 | 9 | module Types : 10 | sig 11 | type status = Leader | Follower | Candidate 12 | type term = Int64.t 13 | type index = Int64.t 14 | type rep_id = string 15 | type client_id = string 16 | type req_id = client_id * Int64.t 17 | type address = string 18 | 19 | type config = 20 | Simple_config of simple_config * passive_peers 21 | | Joint_config of simple_config * simple_config * passive_peers 22 | and simple_config = (rep_id * address) list 23 | and passive_peers = (rep_id * address) list 24 | 25 | type 'a message = 26 | Request_vote of request_vote 27 | | Vote_result of vote_result 28 | | Append_entries of 'a append_entries 29 | | Append_result of append_result 30 | | Ping of ping 31 | | Pong of ping 32 | 33 | and request_vote = { 34 | term : term; 35 | candidate_id : rep_id; 36 | last_log_index : index; 37 | last_log_term : term; 38 | } 39 | 40 | and vote_result = { 41 | term : term; 42 | vote_granted : bool; 43 | } 44 | 45 | and 'a append_entries = { 46 | term : term; 47 | leader_id : rep_id; 48 | prev_log_index : index; 49 | prev_log_term : term; 50 | entries : (index * ('a entry * term)) list; 51 | leader_commit : index; 52 | } 53 | 54 | and 'a entry = Nop | Op of 'a | Config of config 55 | 56 | and append_result = 57 | { 58 | term : term; 59 | result : actual_append_result; 60 | } 61 | 62 | and actual_append_result = 63 | Append_success of index (* last log entry included in msg we respond to *) 64 | | Append_failure of index (* index of log entry preceding those in 65 | message we respond to *) 66 | 67 | and ping = { term : term; n : Int64.t; } 68 | 69 | type 'a action = 70 | Apply of (index * 'a * term) list 71 | | Become_candidate 72 | | Become_follower of rep_id option 73 | | Become_leader 74 | | Changed_config 75 | | Exec_readonly of Int64.t 76 | | Redirect of rep_id option * 'a 77 | | Reset_election_timeout 78 | | Reset_heartbeat 79 | | Send of rep_id * address * 'a message 80 | | Send_snapshot of rep_id * address * index * config 81 | | Stop 82 | end 83 | 84 | module Core : 85 | sig 86 | open Types 87 | 88 | type 'a state 89 | 90 | val make : 91 | id:rep_id -> current_term:term -> voted_for:rep_id option -> 92 | log:(index * 'a entry * term) list -> 93 | config:config -> unit -> 'a state 94 | 95 | val is_single_node_cluster : 'a state -> bool 96 | 97 | val leader_id : 'a state -> rep_id option 98 | val id : 'a state -> rep_id 99 | val status : 'a state -> status 100 | val config : 'a state -> config 101 | val committed_config : 'a state -> config 102 | 103 | val last_index : 'a state -> index 104 | val last_term : 'a state -> term 105 | val peers : 'a state -> (rep_id * address) list 106 | 107 | val receive_msg : 108 | 'a state -> rep_id -> 'a message -> 'a state * 'a action list 109 | 110 | val election_timeout : 'a state -> 'a state * 'a action list 111 | val heartbeat_timeout : 'a state -> 'a state * 'a action list 112 | val client_command : 'a -> 'a state -> 'a state * 'a action list 113 | 114 | (** @return [(state, None)] if the node is not the leader, 115 | * [(state, Some (id, actions))] otherwise, where [id] identifies the 116 | * requested read-only operation, which can be
executed once an 117 | * [Exec_readonly m] action with [m >= id] is returned within the same term 118 | * (i.e., with no intermediate [Become_candidate], [Become_follower] or 119 | * [Become_leader]). *) 120 | val readonly_operation : 121 | 'a state -> 'a state * (Int64.t * 'a action list) option 122 | 123 | val snapshot_sent : 124 | rep_id -> last_index:index -> 'a state -> ('a state * 'a action list) 125 | 126 | val snapshot_send_failed : rep_id -> 'a state -> ('a state * 'a action list) 127 | 128 | val install_snapshot : 129 | last_term:term -> last_index:index -> config:config -> 'a state -> 130 | 'a state * bool 131 | 132 | val compact_log : index -> 'a state -> 'a state 133 | 134 | module Config : 135 | sig 136 | type 'a result = 137 | [ 138 | | `Already_changed 139 | | `Cannot_change 140 | | `Change_in_process 141 | | `Redirect of (rep_id * address) option 142 | | `Start_change of 'a state 143 | | `Unsafe_change of simple_config * passive_peers 144 | ] 145 | 146 | val add_failover : rep_id -> address -> 'a state -> 'a result 147 | val remove_failover : rep_id -> 'a state -> 'a result 148 | val decommission : rep_id -> 'a state -> 'a result 149 | val demote : rep_id -> 'a state -> 'a result 150 | val promote : rep_id -> 'a state -> 'a result 151 | val replace : replacee:rep_id -> failover:rep_id -> 'a state -> 'a result 152 | end 153 | end 154 | -------------------------------------------------------------------------------- /src/oraft_lwt.ml: -------------------------------------------------------------------------------- 1 | open Lwt.Infix 2 | open Oraft 3 | open Oraft.Types 4 | open Oraft_lwt_s 5 | 6 | let s_of_simple_config string_of_address l = 7 | List.map 8 | (fun (id, addr) -> Printf.sprintf "%S:%S" id (string_of_address addr)) l |> 9 | String.concat "; " 10 | 11 | let string_of_config string_of_address c = 12 | match c with 13 | Simple_config (c, passive) -> 14 | Printf.sprintf "Simple ([%s], [%s])" 15 | (s_of_simple_config string_of_address c) 16 | (s_of_simple_config string_of_address passive) 17 | | Joint_config (c1, c2, passive) -> 18 | Printf.sprintf "Joint ([%s], [%s], [%s])" 19 | (s_of_simple_config string_of_address c1) 20 | (s_of_simple_config string_of_address c2) 21 | (s_of_simple_config string_of_address passive) 22 | 23 | module Map = BatMap 24 | module List = BatList 25 | module Option = BatOption 26 | module Queue = BatQueue 27 | 28 | let src = Logs.Src.create "oraft_lwt" 29 | 30 | let pp_exn ppf exn = 31 | Format.pp_print_string ppf (Printexc.to_string exn) 32 | 33 | let pp_saddr ppf = function 34 | | Unix.ADDR_INET (a, p) -> 35 | Format.fprintf ppf "%s/%d" (Unix.string_of_inet_addr a) p 36 | | Unix.ADDR_UNIX s -> 37 | Format.fprintf ppf "unix://%s" s 38 | 39 | module REPID = struct type t = rep_id let compare = String.compare end 40 | module RM = Map.Make(REPID) 41 | module RS = Set.Make(REPID) 42 | 43 | let retry_delay = 0.05 44 | 45 | module Make_server(IO : LWTIO) = 46 | struct 47 | module S = Set.Make(String) 48 | module CMDM = Map.Make(struct 49 | type t = req_id 50 | let compare = compare 51 | end) 52 | 53 | exception Stop_node 54 | 55 | type op = IO.op 56 | type connection = IO.connection 57 | type conn_manager = IO.conn_manager 58 | 59 | type 'a execution = 60 | | Sync of 'a Lwt.t 61 | | Async of 'a Lwt.t 62 | 63 | type 'a apply = 'a server -> op -> ('a, exn) result execution 64 | 65 | and 'a server = 66 | { 67 | execute : 'a apply; 68 | conn_manager : IO.conn_manager; 69 | election_period : float; 70 | heartbeat_period : float; 71 | mutable 
next_req_id : Int64.t; 72 | mutable conns : IO.connection RM.t; 73 | mutable connecting : RS.t; 74 | mutable state : (req_id * IO.op) Core.state; 75 | mutable running : bool; 76 | msg_stream : (rep_id * (req_id * IO.op) message) Lwt_stream.t; 77 | push_msg : rep_id * (req_id * IO.op) message -> unit; 78 | mutable get_msg : th_res Lwt.t; 79 | mutable election_timeout : th_res Lwt.t; 80 | mutable heartbeat : th_res Lwt.t; 81 | mutable abort : th_res Lwt.t * th_res Lwt.u; 82 | mutable get_cmd : th_res Lwt.t; 83 | mutable get_ro_op : th_res Lwt.t; 84 | push_cmd : (req_id * IO.op) -> unit; 85 | cmd_stream : (req_id * IO.op) Lwt_stream.t; 86 | push_ro_op : ro_op_res Lwt.u -> unit; 87 | ro_op_stream : ro_op_res Lwt.u Lwt_stream.t; 88 | pending_ro_ops : (Int64.t * ro_op_res Lwt.u) Queue.t; 89 | mutable pending_cmds : ('a cmd_res Lwt.t * 'a cmd_res Lwt.u) CMDM.t; 90 | leader_signal : unit Lwt_condition.t; 91 | sent_snapshots : (rep_id * index) Lwt_stream.t; 92 | mutable sent_snapshots_th : th_res Lwt.t; 93 | snapshot_sent : ((rep_id * index) -> unit); 94 | failed_snapshots : rep_id Lwt_stream.t; 95 | mutable failed_snapshot_th : th_res Lwt.t; 96 | snapshot_failed : rep_id -> unit; 97 | mutable config_change : config_change; 98 | 99 | apply_stream : (req_id * IO.op) Lwt_stream.t; 100 | push_apply : (req_id * IO.op) -> unit; 101 | } 102 | 103 | and config_change = 104 | | No_change 105 | | New_failover of change_result Lwt.u * rep_id * address 106 | | Remove_failover of change_result Lwt.u * rep_id 107 | | Decommission of change_result Lwt.u * rep_id 108 | | Promote of change_result Lwt.u * rep_id 109 | | Demote of change_result Lwt.u * rep_id 110 | | Replace of change_result Lwt.u * rep_id * rep_id 111 | 112 | and change_result = OK | Retry 113 | 114 | and th_res = 115 | Message of rep_id * (req_id * IO.op) message 116 | | Client_command of req_id * IO.op 117 | | Abort 118 | | Election_timeout 119 | | Heartbeat_timeout 120 | | Snapshots_sent of (rep_id * index) list 121 | | Snapshot_send_failed of rep_id 122 | | Readonly_op of ro_op_res Lwt.u 123 | 124 | and 'a cmd_res = 125 | Redirect of rep_id option 126 | | Executed of ('a, exn) result 127 | 128 | and ro_op_res = OK | Retry 129 | 130 | type cmd_error = 131 | Exn of exn 132 | | Redirect of rep_id * address 133 | | Retry 134 | 135 | type 'a cmd_result = ('a, cmd_error) result 136 | type ro_op_result = (unit, cmd_error) result 137 | 138 | let get_sent_snapshots t = 139 | match%lwt Lwt_stream.get t.sent_snapshots with 140 | None -> fst (Lwt.wait ()) 141 | | Some (peer, last_index) -> 142 | let l = Lwt_stream.get_available t.sent_snapshots in 143 | Lwt_stream.njunk (List.length l) t.sent_snapshots>>= fun () -> 144 | Lwt.return (Snapshots_sent ((peer, last_index) :: l)) 145 | 146 | let get_failed_snapshot t = 147 | match%lwt Lwt_stream.get t.failed_snapshots with 148 | None -> fst (Lwt.wait ()) 149 | | Some rep_id -> Lwt.return (Snapshot_send_failed rep_id) 150 | 151 | let get_msg t = 152 | match%lwt Lwt_stream.get t.msg_stream with 153 | | None -> fst (Lwt.wait ()) 154 | | Some (rep_id, msg) -> Lwt.return (Message (rep_id, msg)) 155 | 156 | let get_cmd t = 157 | match%lwt Lwt_stream.get t.cmd_stream with 158 | | None -> fst (Lwt.wait ()) 159 | | Some (req_id, op) -> Lwt.return (Client_command (req_id, op)) 160 | 161 | let get_ro_op t = 162 | match%lwt Lwt_stream.get t.ro_op_stream with 163 | | None -> fst (Lwt.wait ()) 164 | | Some x -> Lwt.return (Readonly_op x) 165 | 166 | let sleep_randomized period = 167 | Lwt_unix.sleep (period *. 0.75 +. 
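(* i.e. a delay drawn uniformly from the interval 0.75 * period .. 1.25 * period, so replicas do not all time out in lockstep *)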
Random.float (period *. 0.5)) 168 | 169 | let make 170 | execute 171 | ?(election_period = 0.5) 172 | ?(heartbeat_period = election_period /. 2.) state conn_manager = 173 | let msg_stream, p = Lwt_stream.create () in 174 | let push_msg x = p (Some x) in 175 | let cmd_stream, p = Lwt_stream.create () in 176 | let push_cmd x = p (Some x) in 177 | let ro_op_stream, p = Lwt_stream.create () in 178 | let push_ro_op x = p (Some x) in 179 | let election_timeout = match Core.status state with 180 | | Follower | Candidate -> 181 | sleep_randomized election_period>>= fun () -> 182 | Lwt.return Election_timeout 183 | | Leader -> fst (Lwt.wait ()) in 184 | let heartbeat = match Core.status state with 185 | | Follower | Candidate -> fst (Lwt.wait ()) 186 | | Leader -> 187 | Lwt_unix.sleep heartbeat_period>>= fun () -> 188 | Lwt.return Heartbeat_timeout in 189 | let sent_snapshots, p = Lwt_stream.create () in 190 | let snapshot_sent x = p (Some x) in 191 | let sent_snapshots_th = fst (Lwt.wait ()) in 192 | 193 | let failed_snapshots, p = Lwt_stream.create () in 194 | let snapshot_failed x = p (Some x) in 195 | let failed_snapshot_th = fst (Lwt.wait ()) in 196 | 197 | let apply_stream, p = Lwt_stream.create () in 198 | let push_apply x = p (Some x) in 199 | 200 | let t = 201 | { 202 | execute; 203 | conn_manager; 204 | heartbeat_period; 205 | election_period; 206 | state; 207 | election_timeout; 208 | heartbeat; 209 | sent_snapshots; 210 | snapshot_sent; 211 | sent_snapshots_th; 212 | failed_snapshots; 213 | snapshot_failed; 214 | failed_snapshot_th; 215 | msg_stream; 216 | push_msg; 217 | cmd_stream; 218 | push_cmd; 219 | ro_op_stream; 220 | push_ro_op; 221 | apply_stream; 222 | push_apply; 223 | next_req_id = 42L; 224 | conns = RM.empty; 225 | connecting = RS.empty; 226 | running = true; 227 | abort = Lwt.task (); 228 | get_msg = fst (Lwt.wait ()); 229 | get_cmd = fst (Lwt.wait ()); 230 | get_ro_op = fst (Lwt.wait ()); 231 | pending_ro_ops= Queue.create (); 232 | pending_cmds = CMDM.empty; 233 | leader_signal = Lwt_condition.create (); 234 | config_change = No_change; 235 | } 236 | in 237 | t.sent_snapshots_th <- get_sent_snapshots t; 238 | t.failed_snapshot_th <- get_failed_snapshot t; 239 | t.get_msg <- get_msg t; 240 | t.get_cmd <- get_cmd t; 241 | t.get_ro_op <- get_ro_op t; 242 | 243 | let rec apply_loop () = 244 | match%lwt Lwt_stream.get t.apply_stream with 245 | None -> Lwt.return_unit 246 | | Some (req_id, op) -> 247 | let return_result resp = 248 | try%lwt 249 | let (_, u), pending = CMDM.extract req_id t.pending_cmds in 250 | t.pending_cmds <- pending; 251 | Lwt.wakeup_later u (Executed resp); 252 | Lwt.return_unit 253 | with _ -> Lwt.return_unit 254 | in 255 | match t.execute t op with 256 | | exception exn -> 257 | return_result (Error exn) >>= fun () -> 258 | apply_loop () 259 | | Sync resp -> 260 | (try%lwt resp with exn -> Lwt.return_error exn) >>= 261 | return_result>>= fun () -> 262 | apply_loop () 263 | | Async resp -> 264 | ignore begin 265 | (try%lwt resp with exn -> Lwt.return_error exn) >>= 266 | return_result 267 | end; 268 | apply_loop () 269 | in 270 | Lwt.async begin fun () -> 271 | try%lwt apply_loop () with exn -> 272 | Logs_lwt.err ~src begin fun m -> 273 | m "Error in Oraft_lwt apply loop: %a" pp_exn exn 274 | end 275 | end; 276 | t 277 | 278 | let config t = Core.config t.state 279 | 280 | let abort t = 281 | if not t.running then 282 | Lwt.return_unit 283 | else begin 284 | t.running <- false; 285 | begin try (Lwt.wakeup (snd t.abort) Abort) with _ -> () end; 286 | 
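(* the wakeup above makes the main run loop observe Abort and stop; now close every open peer connection in parallel *)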
RM.bindings t.conns |> List.map snd |> Lwt_list.iter_p IO.abort 287 | end 288 | 289 | let connect_and_get_msgs t (peer, addr) = 290 | let rec make_thread = function 291 | 0 -> Lwt_unix.sleep 5. >>= fun () ->make_thread 5 292 | | n -> 293 | if RM.mem peer t.conns || RS.mem peer t.connecting || 294 | not (List.mem_assoc peer (Core.peers t.state)) then 295 | Lwt.return_unit 296 | else begin 297 | t.connecting <- RS.add peer t.connecting; 298 | match%lwt IO.connect t.conn_manager peer addr with 299 | | None -> 300 | t.connecting <- RS.remove peer t.connecting; 301 | Lwt_unix.sleep 0.1 >>= fun () ->make_thread (n - 1) 302 | | Some conn -> 303 | t.connecting <- RS.remove peer t.connecting; 304 | t.conns <- RM.add peer conn t.conns; 305 | let rec loop_receive () = 306 | match%lwt IO.receive conn with 307 | None -> 308 | let%lwt () = Lwt_unix.sleep 0.1 in 309 | t.conns <- RM.remove peer t.conns; 310 | make_thread 5 311 | | Some msg -> 312 | t.push_msg (peer, msg); 313 | loop_receive () 314 | in 315 | loop_receive () 316 | end 317 | in 318 | make_thread 5 319 | 320 | let rec clear_pending_ro_ops t = 321 | match Queue.Exceptionless.take t.pending_ro_ops with 322 | None -> () 323 | | Some (_, u) -> (try Lwt.wakeup_later u Retry with _ -> ()); 324 | clear_pending_ro_ops t 325 | 326 | let abort_ongoing_config_change t = 327 | match t.config_change with 328 | No_change -> () 329 | | New_failover (u, _, _) 330 | | Remove_failover (u, _) 331 | | Decommission (u, _) 332 | | Promote (u, _) 333 | | Demote (u, _) 334 | | Replace (u, _, _) -> 335 | try Lwt.wakeup_later u Retry with _ -> () 336 | 337 | let notify_config_result t task result = 338 | t.config_change <- No_change; 339 | try Lwt.wakeup_later task result with _ -> () 340 | 341 | let notify_ok_if_mem t u rep_id l = 342 | notify_config_result t u (if List.mem_assoc rep_id l then OK else Retry) 343 | 344 | let notify_ok_if_not_mem t u rep_id l = 345 | notify_config_result t u (if List.mem_assoc rep_id l then (Retry : change_result) else OK) 346 | 347 | let check_config_change_completion t = 348 | match Core.committed_config t.state with 349 | Joint_config _ -> (* wait for the final Simple_config *) () 350 | | Simple_config (active, passive) -> 351 | match t.config_change with 352 | | No_change -> () 353 | | New_failover (u, rep_id, addr) -> 354 | if List.mem_assoc rep_id active || List.mem_assoc rep_id passive then 355 | notify_config_result t u OK 356 | else 357 | notify_config_result t u Retry 358 | | Remove_failover (u, rep_id) -> notify_ok_if_not_mem t u rep_id passive 359 | | Decommission (u, rep_id) -> 360 | if List.mem_assoc rep_id active || List.mem_assoc rep_id passive then 361 | notify_config_result t u Retry 362 | else 363 | notify_config_result t u OK 364 | | Promote (u, rep_id) -> notify_ok_if_mem t u rep_id active 365 | | Demote (u, rep_id) -> notify_ok_if_not_mem t u rep_id active 366 | | Replace (u, replacee, failover) -> 367 | if not (List.mem_assoc failover active) then 368 | notify_config_result t u Retry 369 | else 370 | notify_config_result t u OK 371 | 372 | let rec exec_action t : _ action -> unit Lwt.t = function 373 | | Reset_election_timeout -> 374 | t.election_timeout <- (sleep_randomized t.election_period>>= fun () -> 375 | Lwt.return Election_timeout); 376 | Lwt.return_unit 377 | | Reset_heartbeat -> 378 | t.heartbeat <- (Lwt_unix.sleep t.heartbeat_period>>= fun () -> 379 | Lwt.return Heartbeat_timeout); 380 | Lwt.return_unit 381 | | Become_candidate 382 | | Become_follower None as ev -> 383 | clear_pending_ro_ops t; 384 | 
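(* stepping down: pending read-only waiters were just told to retry; next, abort any in-flight config change and disable the heartbeat until this node becomes leader again *)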
abort_ongoing_config_change t; 385 | t.heartbeat <- fst (Lwt.wait ()); 386 | Logs_lwt.info ~src begin fun m -> 387 | m "Becoming %s" 388 | (match ev with 389 | | Become_candidate -> "candidate" 390 | | _ -> "follower (unknown leader)") 391 | end >>= fun () -> 392 | exec_action t Reset_election_timeout 393 | | Become_follower (Some id) -> 394 | clear_pending_ro_ops t; 395 | abort_ongoing_config_change t; 396 | Lwt_condition.broadcast t.leader_signal (); 397 | t.heartbeat <- fst (Lwt.wait ()); 398 | Logs_lwt.info ~src 399 | (fun m -> m "Becoming follower leader:%S" id) >>= fun () -> 400 | exec_action t Reset_election_timeout 401 | | Become_leader -> 402 | clear_pending_ro_ops t; 403 | abort_ongoing_config_change t; 404 | Lwt_condition.broadcast t.leader_signal (); 405 | Logs_lwt.info ~src (fun m -> m "Becoming leader") >>= fun () -> 406 | exec_action t Reset_election_timeout>>= fun () -> 407 | exec_action t Reset_heartbeat 408 | | Changed_config -> 409 | check_config_change_completion t; 410 | Lwt.return_unit 411 | | Apply l -> 412 | List.iter (fun (index, (req_id, op), term) -> t.push_apply (req_id, op)) l; 413 | Lwt.return_unit 414 | | Redirect (rep_id, (req_id, _)) -> begin 415 | try%lwt 416 | let (_, u), pending_cmds = CMDM.extract req_id t.pending_cmds in 417 | t.pending_cmds <- pending_cmds; 418 | Lwt.wakeup_later u (Redirect rep_id); 419 | Lwt.return_unit 420 | with _ -> Lwt.return_unit 421 | end 422 | | Send (rep_id, addr, msg) -> 423 | (* we allow this to run in parallel with the rest of the RAFT algorithm. 424 | * It's OK to reorder sends. *) 425 | (* TODO: limit the number of msgs in the outbound queue. 426 | * Drop after the nth? *) 427 | ignore begin try%lwt 428 | let c = RM.find rep_id t.conns in 429 | if IO.is_saturated c then Lwt.return_unit 430 | else IO.send c msg 431 | with _ -> 432 | (* cannot send -- partition *) 433 | Lwt.return_unit 434 | end; 435 | Lwt.return_unit 436 | | Send_snapshot (rep_id, addr, idx, config) -> 437 | ignore begin 438 | match%lwt IO.prepare_snapshot (RM.find rep_id t.conns) idx config with 439 | | None -> Lwt.return_unit 440 | | Some transfer -> 441 | try%lwt 442 | match%lwt IO.send_snapshot transfer with 443 | true -> t.snapshot_sent (rep_id, idx); 444 | Lwt.return_unit 445 | | false -> failwith "error" 446 | with _ -> 447 | t.snapshot_failed rep_id; 448 | Lwt.return_unit 449 | end; 450 | Lwt.return_unit 451 | | Stop -> Lwt.fail Stop_node 452 | | Exec_readonly n -> 453 | (* can execute all RO ops whose ID is <= n *) 454 | let rec notify_ok () = 455 | match Queue.Exceptionless.peek t.pending_ro_ops with 456 | None -> Lwt.return_unit 457 | | Some (m, _) when m > n -> Lwt.return_unit 458 | | Some (_, u) -> 459 | ignore (Queue.Exceptionless.take t.pending_ro_ops); 460 | Lwt.wakeup_later u OK; 461 | notify_ok () 462 | in 463 | notify_ok () 464 | 465 | let exec_actions t l = Lwt_list.iter_s (exec_action t) l 466 | 467 | let rec run t = 468 | if not t.running then Lwt.return_unit 469 | else begin 470 | (* Launch new connections as needed. 471 | * connect_and_get_msgs will ignore peers for which a connection already 472 | * exists or is being established.
*) 473 | ignore (List.map (connect_and_get_msgs t) (Core.peers t.state)); 474 | match%lwt 475 | Lwt.choose 476 | [ t.election_timeout; 477 | fst t.abort; 478 | t.get_msg; 479 | t.get_cmd; 480 | t.get_ro_op; 481 | t.heartbeat; 482 | t.sent_snapshots_th; 483 | ] 484 | with 485 | | Abort -> t.running <- false; Lwt.return_unit 486 | | Readonly_op u -> begin 487 | t.get_ro_op <- get_ro_op t; 488 | match Core.readonly_operation t.state with 489 | (s, None) -> Lwt.wakeup_later u Retry; 490 | Lwt.return_unit 491 | | (s, Some (id, actions)) -> 492 | Queue.push (id, u) t.pending_ro_ops; 493 | t.state <- s; 494 | exec_actions t actions>>= fun () -> 495 | run t 496 | end 497 | | Client_command (req_id, op) -> 498 | let state, actions = Core.client_command (req_id, op) t.state in 499 | t.get_cmd <- get_cmd t; 500 | t.state <- state; 501 | exec_actions t actions>>= fun () -> 502 | run t 503 | | Message (rep_id, msg) -> 504 | let state, actions = Core.receive_msg t.state rep_id msg in 505 | t.get_msg <- get_msg t; 506 | t.state <- state; 507 | exec_actions t actions>>= fun () -> 508 | run t 509 | | Election_timeout -> 510 | let state, actions = Core.election_timeout t.state in 511 | t.state <- state; 512 | exec_actions t actions>>= fun () -> 513 | run t 514 | | Heartbeat_timeout -> 515 | let state, actions = Core.heartbeat_timeout t.state in 516 | t.state <- state; 517 | exec_actions t actions>>= fun () -> 518 | run t 519 | | Snapshots_sent data -> 520 | let state, actions = 521 | List.fold_left 522 | (fun (s, actions) (peer, last_index) -> 523 | let s, actions' = Core.snapshot_sent peer ~last_index s in 524 | (s, actions' @ actions)) 525 | (t.state, []) 526 | data 527 | in 528 | t.sent_snapshots_th <- get_sent_snapshots t; 529 | t.state <- state; 530 | exec_actions t actions>>= fun () -> 531 | run t 532 | | Snapshot_send_failed rep_id -> 533 | let state, actions = Core.snapshot_send_failed rep_id t.state in 534 | t.failed_snapshot_th <- get_failed_snapshot t; 535 | t.state <- state; 536 | exec_actions t actions>>= fun () -> 537 | run t 538 | end 539 | 540 | let run t = 541 | try%lwt 542 | run t 543 | with Stop_node -> t.running <- false; Lwt.return_unit 544 | 545 | let gen_req_id t = 546 | let id = t.next_req_id in 547 | t.next_req_id <- Int64.succ id; 548 | (Core.id t.state, id) 549 | 550 | let rec exec_aux t f = 551 | match Core.status t.state, Core.leader_id t.state with 552 | | Follower, Some leader_id -> begin 553 | match List.Exceptionless.assoc leader_id (Core.peers t.state) with 554 | Some address -> Lwt.return_error (Redirect (leader_id, address)) 555 | | None -> 556 | (* redirect to a random server, hoping it knows better *) 557 | try%lwt 558 | let leader_id, address = 559 | Core.peers t.state |> 560 | Array.of_list |> 561 | (fun x -> if x = [||] then failwith "empty"; x) |> 562 | (fun a -> a.(Random.int (Array.length a))) 563 | in 564 | Lwt.return_error (Redirect (leader_id, address)) 565 | with _ -> 566 | Lwt_unix.sleep retry_delay >>= fun () -> 567 | Lwt.return_error Retry 568 | end 569 | | Candidate, _ | Follower, _ -> 570 | (* await leader, retry *) 571 | Lwt_condition.wait t.leader_signal >>= fun () -> 572 | exec_aux t f 573 | | Leader, _ -> 574 | f t 575 | 576 | let rec execute t cmd = 577 | exec_aux t begin fun t -> 578 | let req_id = gen_req_id t in 579 | let th, _ as task = Lwt.task () in 580 | t.pending_cmds <- CMDM.add req_id task t.pending_cmds; 581 | t.push_cmd (req_id, cmd); 582 | match%lwt th with 583 | Executed (Ok res) -> Lwt.return_ok res 584 | | Executed (Error exn) -> 
Lwt.return_error (Exn exn) 585 | | Redirect _ -> execute t cmd 586 | end 587 | 588 | let rec readonly_operation t = 589 | if Core.is_single_node_cluster t.state then 590 | Lwt.return_ok () 591 | else 592 | exec_aux t 593 | (fun t -> 594 | let th, u = Lwt.task () in 595 | t.push_ro_op u; 596 | match%lwt th with 597 | OK -> Lwt.return_ok () 598 | | Retry -> readonly_operation t) 599 | 600 | let compact_log t index = 601 | t.state <- Core.compact_log index t.state 602 | 603 | module Config = struct 604 | type error = 605 | | Redirect of rep_id * address 606 | | Retry 607 | | Cannot_change 608 | | Unsafe_change of simple_config * passive_peers 609 | 610 | let get t = Core.config t.state 611 | 612 | let rec perform_change t perform mk_change = 613 | match t.config_change with 614 | New_failover _ | Remove_failover _ | Decommission _ 615 | | Promote _ | Demote _ | Replace _ -> 616 | Lwt_unix.sleep retry_delay>>= fun () -> 617 | perform_change t perform mk_change 618 | | No_change -> 619 | match perform t.state with 620 | `Already_changed -> Lwt.return_ok () 621 | | `Cannot_change -> Lwt.return_error (Cannot_change) 622 | | `Unsafe_change (c, p) -> Lwt.return_error (Unsafe_change (c, p)) 623 | | `Redirect (Some (rep_id, addr)) -> Lwt.return_error (Redirect (rep_id, addr)) 624 | | `Redirect None -> Lwt.return_error Retry 625 | | `Change_in_process -> 626 | Lwt_unix.sleep retry_delay>>= fun () -> 627 | perform_change t perform mk_change 628 | | `Start_change state -> 629 | t.state <- state; 630 | let th, u = Lwt.task () in 631 | t.config_change <- mk_change u; 632 | match%lwt th with 633 | OK -> Lwt.return_ok () 634 | | Retry -> 635 | Lwt_unix.sleep retry_delay>>= fun () -> 636 | perform_change t perform mk_change 637 | 638 | let rec add_failover t rep_id addr = 639 | perform_change t 640 | (Core.Config.add_failover rep_id addr) 641 | (fun u -> New_failover (u, rep_id, addr)) 642 | 643 | let remove_failover t rep_id = 644 | perform_change t 645 | (Core.Config.remove_failover rep_id) 646 | (fun u -> Remove_failover (u, rep_id)) 647 | 648 | let decommission t rep_id = 649 | perform_change t 650 | (Core.Config.decommission rep_id) 651 | (fun u -> Decommission (u, rep_id)) 652 | 653 | let promote t rep_id = 654 | perform_change t 655 | (Core.Config.promote rep_id) 656 | (fun u -> Promote (u, rep_id)) 657 | 658 | let demote t rep_id = 659 | perform_change t 660 | (Core.Config.demote rep_id) 661 | (fun u -> Demote (u, rep_id)) 662 | 663 | let replace t ~replacee ~failover = 664 | perform_change t 665 | (Core.Config.replace ~replacee ~failover) 666 | (fun u -> Replace (u, replacee, failover)) 667 | end 668 | end 669 | -------------------------------------------------------------------------------- /src/oraft_lwt.mli: -------------------------------------------------------------------------------- 1 | open Oraft_lwt_s 2 | open Oraft_lwt_conn_wrapper 3 | 4 | val string_of_config : 5 | (Oraft.Types.address -> string) -> Oraft.Types.config -> string 6 | 7 | val pp_exn : Format.formatter -> exn -> unit 8 | val pp_saddr : Format.formatter -> Unix.sockaddr -> unit 9 | 10 | module Make_server (IO : LWTIO) : SERVER_GENERIC 11 | with type op = IO.op 12 | and type connection = IO.connection 13 | and type conn_manager = IO.conn_manager 14 | (** Specialization of [oraft] on top of Lwt IO. A ready-to-use 15 | implementation of a module of signature [LWTIO] can be found in 16 | [Oraft_lwt_extprot_io].
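A minimal usage sketch (assuming some [IO : LWTIO] implementation and an application-specific [interpret] function; the identifiers below are illustrative, not part of this API):
{[
  module Server = Oraft_lwt.Make_server(IO)

  (* execute commands synchronously through the hypothetical [interpret] *)
  let apply _server op = Server.Sync (Lwt.return_ok (interpret op))

  let serve ~id ~peers conn_manager =
    let state =
      Oraft.Core.make ~id ~current_term:0L ~voted_for:None ~log:[]
        ~config:(Oraft.Types.Simple_config (peers, [])) ()
    in
    Server.run (Server.make apply state conn_manager)
]}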
*) 17 | -------------------------------------------------------------------------------- /src/oraft_lwt_conn_wrapper.ml: -------------------------------------------------------------------------------- 1 | type 'a conn_wrapper = { 2 | wrap_incoming_conn : 3 | Lwt_unix.file_descr -> 4 | (Lwt_io.input_channel * Lwt_io.output_channel) Lwt.t; 5 | wrap_outgoing_conn : 6 | Lwt_unix.file_descr -> 7 | (Lwt_io.input_channel * Lwt_io.output_channel) Lwt.t; 8 | } 9 | 10 | type simple_wrapper = 11 | Lwt_unix.file_descr -> (Lwt_io.input_channel * Lwt_io.output_channel) Lwt.t 12 | 13 | let make_client_conn_wrapper f = { 14 | wrap_incoming_conn = 15 | (fun fd -> Lwt.fail_with "Incoming conn wrapper invoked in client"); 16 | wrap_outgoing_conn = f; 17 | } 18 | 19 | let make_server_conn_wrapper ~incoming ~outgoing = 20 | { wrap_incoming_conn = incoming; wrap_outgoing_conn = outgoing } 21 | 22 | let trivial_wrap_outgoing_conn ?buffer_size fd = 23 | let close = 24 | lazy begin 25 | (try%lwt 26 | Lwt_unix.shutdown fd Unix.SHUTDOWN_ALL; 27 | Lwt.return_unit 28 | with Unix.Unix_error(Unix.ENOTCONN, _, _) -> 29 | (* This may happen if the server closed the connection before us *) 30 | Lwt.return_unit) 31 | [%finally 32 | Lwt_unix.close fd] 33 | end in 34 | let buf1 = BatOption.map Lwt_bytes.create buffer_size in 35 | let buf2 = BatOption.map Lwt_bytes.create buffer_size in 36 | try%lwt 37 | (try Lwt_unix.set_close_on_exec fd with Invalid_argument _ -> ()); 38 | Lwt.return (Lwt_io.make ?buffer:buf1 39 | ~close:(fun _ -> Lazy.force close) 40 | ~mode:Lwt_io.input (Lwt_bytes.read fd), 41 | Lwt_io.make ?buffer:buf2 42 | ~close:(fun _ -> Lazy.force close) 43 | ~mode:Lwt_io.output (Lwt_bytes.write fd)) 44 | with exn -> 45 | let%lwt () = Lwt_unix.close fd in 46 | Lwt.fail exn 47 | 48 | let trivial_wrap_incoming_conn ?buffer_size fd = 49 | let buf1 = BatOption.map Lwt_bytes.create buffer_size in 50 | let buf2 = BatOption.map Lwt_bytes.create buffer_size in 51 | Lwt.return 52 | (Lwt_io.of_fd ?buffer:buf1 ~mode:Lwt_io.input fd, 53 | Lwt_io.of_fd ?buffer:buf2 ~mode:Lwt_io.output fd) 54 | 55 | let trivial_conn_wrapper ?buffer_size () = 56 | { wrap_incoming_conn = trivial_wrap_incoming_conn ?buffer_size; 57 | wrap_outgoing_conn = trivial_wrap_outgoing_conn ?buffer_size; 58 | } 59 | 60 | let wrap_outgoing_conn w fd = w.wrap_outgoing_conn fd 61 | let wrap_incoming_conn w fd = w.wrap_incoming_conn fd 62 | -------------------------------------------------------------------------------- /src/oraft_lwt_conn_wrapper.mli: -------------------------------------------------------------------------------- 1 | type -'a conn_wrapper 2 | 3 | type simple_wrapper = 4 | Lwt_unix.file_descr -> (Lwt_io.input_channel * Lwt_io.output_channel) Lwt.t 5 | 6 | val make_client_conn_wrapper : 7 | simple_wrapper -> [`Outgoing] conn_wrapper 8 | 9 | val make_server_conn_wrapper : 10 | incoming:simple_wrapper -> 11 | outgoing:simple_wrapper -> 12 | [`Incoming | `Outgoing] conn_wrapper 13 | 14 | val wrap_outgoing_conn : 15 | [> `Outgoing] conn_wrapper -> Lwt_unix.file_descr -> 16 | (Lwt_io.input_channel * Lwt_io.output_channel) Lwt.t 17 | 18 | val wrap_incoming_conn : 19 | [> `Incoming] conn_wrapper -> Lwt_unix.file_descr -> 20 | (Lwt_io.input_channel * Lwt_io.output_channel) Lwt.t 21 | 22 | val trivial_conn_wrapper : 23 | ?buffer_size:int -> unit -> [< `Incoming | `Outgoing] conn_wrapper 24 | -------------------------------------------------------------------------------- /src/oraft_lwt_extprot_io.ml: 
-------------------------------------------------------------------------------- 1 | open Lwt.Infix 2 | 3 | open Oraft.Types 4 | open Oraft_lwt_s 5 | open Oraft_lwt_conn_wrapper 6 | 7 | module Map = BatMap 8 | 9 | module Make(C : SERVER_CONF) = struct 10 | type op = C.op 11 | 12 | module M = Map.Make(String) 13 | module MB = Extprot.Msg_buffer 14 | 15 | let src = Logs.Src.create "oraft_lwt.simple_io" 16 | 17 | type conn_manager = 18 | { 19 | id : string; 20 | sock : Lwt_unix.file_descr; 21 | mutable conns : connection M.t; 22 | conn_signal : unit Lwt_condition.t; 23 | conn_wrapper : [`Incoming | `Outgoing] conn_wrapper; 24 | } 25 | 26 | and connection = 27 | { 28 | id : rep_id; 29 | mgr : conn_manager; 30 | ich : Lwt_io.input_channel; 31 | och : Lwt_io.output_channel; 32 | mutable closed : bool; 33 | mutable in_buf : Bytes.t; 34 | out_buf : MB.t; 35 | mutable noutgoing : int; 36 | } 37 | 38 | let make_conn_manager ?(conn_wrapper = trivial_conn_wrapper ()) ~id addr = 39 | let sock = Lwt_unix.(socket (Unix.domain_of_sockaddr addr) Unix.SOCK_STREAM 0) in 40 | Lwt_unix.setsockopt sock Unix.SO_REUSEADDR true; 41 | Lwt_unix.bind sock addr >>= fun () -> 42 | Lwt_unix.listen sock 256; 43 | 44 | let rec accept_loop t = 45 | let%lwt (fd, addr) = Lwt_unix.accept sock in 46 | Lwt.async begin fun () -> begin try%lwt 47 | (* the following are not supported for ADDR_UNIX sockets, so catch 48 | * possible exceptions *) 49 | (try Lwt_unix.setsockopt fd Unix.TCP_NODELAY true with _ -> ()); 50 | (try Lwt_unix.setsockopt fd Unix.SO_KEEPALIVE true with _ -> ()); 51 | let%lwt ich, och = wrap_incoming_conn conn_wrapper fd in 52 | let%lwt id = Lwt_io.read_line ich in 53 | let c = { id; mgr = t; ich; och; closed = false; 54 | in_buf = Bytes.empty; out_buf = MB.create (); 55 | noutgoing = 0; 56 | } 57 | in 58 | t.conns <- M.add id c t.conns; 59 | Lwt_condition.broadcast t.conn_signal (); 60 | Logs_lwt.info ~src (fun m -> m "Incoming connection from peer %S" id) 61 | with _ -> 62 | Lwt_unix.shutdown fd Unix.SHUTDOWN_ALL; 63 | Lwt_unix.close fd 64 | end 65 | end; 66 | accept_loop t 67 | in 68 | let conn_signal = Lwt_condition.create () in 69 | let t = { id; sock; conn_signal; conns = M.empty; conn_wrapper; } in 70 | ignore begin 71 | try%lwt 72 | Logs_lwt.info ~src 73 | (fun m -> m "Running node server at %a" Oraft_lwt.pp_saddr addr) >>= fun () -> 74 | accept_loop t 75 | with 76 | | Exit -> Lwt.return_unit 77 | | exn -> Logs_lwt.err ~src begin fun m -> 78 | m "Error in connection manager accept loop: %a" Oraft_lwt.pp_exn exn 79 | end 80 | end; 81 | Lwt.return t 82 | 83 | let connect t dst_id addr = 84 | match M.Exceptionless.find dst_id t.conns with 85 | Some _ as x -> Lwt.return x 86 | | None when dst_id < t.id -> (* wait for other end to connect *) 87 | let rec await_conn () = 88 | match M.Exceptionless.find dst_id t.conns with 89 | Some _ as x -> Lwt.return x 90 | | None -> Lwt_condition.wait t.conn_signal>>= fun () -> 91 | await_conn () 92 | in 93 | await_conn () 94 | | None -> (* we must connect ourselves *) 95 | try%lwt 96 | Logs_lwt.info ~src begin fun m -> 97 | m "Connecting to %S" (C.string_of_address addr) 98 | end >>= fun () -> 99 | let saddr = C.node_sockaddr addr in 100 | let fd = Lwt_unix.socket (Unix.domain_of_sockaddr saddr) 101 | Unix.SOCK_STREAM 0 in 102 | let%lwt () = Lwt_unix.connect fd saddr in 103 | let%lwt ich, och = wrap_outgoing_conn t.conn_wrapper fd in 104 | try%lwt 105 | (try Lwt_unix.setsockopt fd Unix.TCP_NODELAY true with _ -> ()); 106 | (try Lwt_unix.setsockopt fd Unix.SO_KEEPALIVE 
true with _ -> ()); 107 | Lwt_io.write_line och t.id >>= fun () -> 108 | Lwt_io.flush och >>= fun () -> 109 | Lwt.return (Some { id = dst_id; mgr = t; ich; och; closed = false; 110 | in_buf = Bytes.empty; out_buf = MB.create (); 111 | noutgoing = 0; }) 112 | with exn -> 113 | Logs_lwt.err ~src begin fun m -> 114 | m "Failed to write (%s)" (Printexc.to_string exn) 115 | end >>= fun () -> 116 | Lwt_unix.close fd >>= fun () -> 117 | Lwt.fail exn 118 | with exn -> 119 | Logs_lwt.err ~src begin fun m -> 120 | m "Failed to connect (%s)" (Printexc.to_string exn) 121 | end >>= fun () -> 122 | Lwt.return_none 123 | 124 | let is_saturated conn = conn.noutgoing > 10 125 | 126 | open Oraft_proto 127 | open Raft_message 128 | open Oraft 129 | 130 | let wrap_msg : _ Oraft.Types.message -> Raft_message.raft_message = function 131 | Request_vote { term; candidate_id; last_log_index; last_log_term; } -> 132 | Request_vote { Request_vote.term; candidate_id; 133 | last_log_index; last_log_term; } 134 | | Vote_result { term; vote_granted; } -> 135 | Vote_result { Vote_result.term; vote_granted; } 136 | | Ping { term; n } -> Ping { Ping_msg.term; n; } 137 | | Pong { term; n } -> Pong { Ping_msg.term; n; } 138 | | Append_result { term; result; } -> 139 | Append_result { Append_result.term; result } 140 | | Append_entries { term; leader_id; prev_log_index; prev_log_term; 141 | entries; leader_commit; } -> 142 | let map_entry = function 143 | (index, (Nop, term)) -> (index, Entry.Nop, term) 144 | | (index, (Config c, term)) -> (index, Entry.Config c, term) 145 | | (index, (Op (req_id, x), term)) -> 146 | (index, Entry.Op (req_id, C.string_of_op x), term) in 147 | 148 | let entries = List.map map_entry entries in 149 | Append_entries { Append_entries.term; leader_id; prev_log_index; 150 | prev_log_term; entries; leader_commit; } 151 | 152 | let unwrap_msg : Raft_message.raft_message -> _ Oraft.Types.message = function 153 | | Request_vote { Request_vote.term; candidate_id; last_log_index; 154 | last_log_term } -> 155 | Request_vote { term; candidate_id; last_log_index; last_log_term } 156 | | Vote_result { Vote_result.term; vote_granted; } -> 157 | Vote_result { term; vote_granted; } 158 | | Ping { Ping_msg.term; n } -> Ping { term; n; } 159 | | Pong { Ping_msg.term; n } -> Pong { term; n; } 160 | | Append_result { Append_result.term; result; } -> 161 | Append_result { term; result } 162 | | Append_entries { Append_entries.term; leader_id; 163 | prev_log_index; prev_log_term; 164 | entries; leader_commit; } -> 165 | let map_entry = function 166 | | (index, Entry.Nop, term) -> (index, (Nop, term)) 167 | | (index, Entry.Config c, term) -> (index, (Config c, term)) 168 | | (index, Entry.Op (req_id, x), term) -> 169 | let op = C.op_of_string x in 170 | (index, (Op (req_id, op), term)) 171 | in 172 | Append_entries 173 | { term; leader_id; prev_log_index; prev_log_term; 174 | entries = List.map map_entry entries; 175 | leader_commit; 176 | } 177 | 178 | let abort c = 179 | if c.closed then Lwt.return_unit 180 | else begin 181 | c.mgr.conns <- M.remove c.id c.mgr.conns; 182 | c.closed <- true; 183 | Lwt_io.abort c.och 184 | end 185 | 186 | let send c msg = 187 | if c.closed then Lwt.return_unit 188 | else begin 189 | let wrapped = wrap_msg msg in 190 | Logs_lwt.debug ~src begin fun m -> 191 | m "Sending@ %s" (Extprot.Pretty_print.pp pp_raft_message wrapped) 192 | end >>= fun () -> 193 | (try%lwt 194 | c.noutgoing <- c.noutgoing + 1; 195 | Lwt_io.atomic 196 | (fun och -> 197 | MB.clear c.out_buf; 198 | Raft_message.write 
c.out_buf wrapped; 199 | Lwt_io.LE.write_int och (MB.length c.out_buf)>>= fun () -> 200 | Lwt_io.write_from_exactly 201 | och (MB.unsafe_contents c.out_buf) 0 (MB.length c.out_buf)>>= fun () -> 202 | Lwt_io.flush och) 203 | c.och 204 | with exn -> 205 | let%lwt () = Logs_lwt.info ~src begin fun m -> 206 | m "Error on send to %s, closing connection@ %s" 207 | c.id (Printexc.to_string exn) 208 | end 209 | in 210 | abort c) 211 | [%finally 212 | c.noutgoing <- c.noutgoing - 1; 213 | Lwt.return_unit] 214 | end 215 | 216 | let receive c = 217 | if c.closed then 218 | Lwt.return None 219 | else 220 | try%lwt 221 | Lwt_io.atomic 222 | (fun ich -> 223 | let%lwt len = Lwt_io.LE.read_int ich in 224 | if Bytes.length c.in_buf < len 225 | then c.in_buf <- Bytes.create len; 226 | let%lwt () = Lwt_io.read_into_exactly ich c.in_buf 0 len in 227 | let msg = Extprot.Conv.deserialize Raft_message.read 228 | (Bytes.unsafe_to_string c.in_buf) in 229 | Logs_lwt.debug ~src begin fun m -> 230 | m "Received@ %s" 231 | (Extprot.Pretty_print.pp pp_raft_message msg) 232 | end >>= fun () -> 233 | Lwt.return (Some (unwrap_msg msg))) 234 | c.ich 235 | with exn -> 236 | Logs_lwt.info ~src begin fun m -> 237 | m "Error on receive from %S, closing connection. %a" c.id Oraft_lwt.pp_exn exn 238 | end >>= fun () -> 239 | abort c >>= fun () -> 240 | Lwt.return None 241 | 242 | type snapshot_transfer = unit 243 | 244 | let prepare_snapshot conn index config = Lwt.return None 245 | let send_snapshot () = Lwt.return false 246 | end 247 | -------------------------------------------------------------------------------- /src/oraft_lwt_extprot_io.mli: -------------------------------------------------------------------------------- 1 | open Oraft_lwt_s 2 | 3 | module Make (C : SERVER_CONF) : LWTIO with type op = C.op 4 | (** Ready-to-use implementation of LWTIO interface using `extprot` for 5 | message serialization. 
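Messages are framed on the wire as a 4-byte little-endian length followed by the extprot-encoded [raft_message]. A minimal sketch of a [SERVER_CONF] argument (assuming string operations and "host:port" addresses; the names are illustrative):
{[
  module Conf = struct
    type op = string
    let string_of_op x = x
    let op_of_string x = x
    let string_of_address a = a
    (* parse an illustrative "host:port" address into a sockaddr *)
    let node_sockaddr addr =
      match String.split_on_char ':' addr with
      | [host; port] ->
          Unix.ADDR_INET (Unix.inet_addr_of_string host, int_of_string port)
      | _ -> invalid_arg "node_sockaddr"
  end

  module IO = Oraft_lwt_extprot_io.Make(Conf)
]}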
*) 6 | -------------------------------------------------------------------------------- /src/oraft_lwt_s.ml: -------------------------------------------------------------------------------- 1 | module type LWTIO_TYPES = sig 2 | type op 3 | type connection 4 | type conn_manager 5 | end 6 | 7 | module type LWTIO = sig 8 | open Oraft.Types 9 | 10 | include LWTIO_TYPES 11 | 12 | val make_conn_manager : 13 | ?conn_wrapper:[ `Incoming | `Outgoing ] Oraft_lwt_conn_wrapper.conn_wrapper -> 14 | id:address -> Unix.sockaddr -> conn_manager Lwt.t 15 | 16 | val connect : conn_manager -> rep_id -> address -> connection option Lwt.t 17 | val send : connection -> (req_id * op) message -> unit Lwt.t 18 | val receive : connection -> (req_id * op) message option Lwt.t 19 | val abort : connection -> unit Lwt.t 20 | 21 | val is_saturated : connection -> bool 22 | 23 | type snapshot_transfer 24 | 25 | val prepare_snapshot : 26 | connection -> index -> config -> snapshot_transfer option Lwt.t 27 | 28 | val send_snapshot : snapshot_transfer -> bool Lwt.t 29 | end 30 | 31 | module type SERVER_GENERIC = sig 32 | open Oraft.Types 33 | 34 | include LWTIO_TYPES 35 | 36 | type 'a server 37 | 38 | type cmd_error = 39 | | Exn of exn 40 | | Redirect of rep_id * address 41 | | Retry 42 | 43 | type 'a cmd_result = ('a, cmd_error) result 44 | type ro_op_result = (unit, cmd_error) result 45 | 46 | type 'a execution = 47 | | Sync of 'a Lwt.t 48 | | Async of 'a Lwt.t 49 | 50 | type 'a apply = 'a server -> op -> ('a, exn) result execution 51 | 52 | val make : 53 | 'a apply -> ?election_period:float -> ?heartbeat_period:float -> 54 | (req_id * op) Oraft.Core.state -> conn_manager -> 'a server 55 | 56 | val config : _ server -> config 57 | val run : _ server -> unit Lwt.t 58 | val abort : _ server -> unit Lwt.t 59 | val execute : 'a server -> op -> 'a cmd_result Lwt.t 60 | val readonly_operation : _ server -> ro_op_result Lwt.t 61 | 62 | val compact_log : _ server -> index -> unit 63 | 64 | module Config : sig 65 | type error = 66 | | Redirect of rep_id * address 67 | | Retry 68 | | Cannot_change 69 | | Unsafe_change of simple_config * passive_peers 70 | 71 | val get : _ server -> config 72 | val add_failover : _ server -> rep_id -> address -> (unit, error) result Lwt.t 73 | val remove_failover : _ server -> rep_id -> (unit, error) result Lwt.t 74 | val decommission : _ server -> rep_id -> (unit, error) result Lwt.t 75 | val demote : _ server -> rep_id -> (unit, error) result Lwt.t 76 | val promote : _ server -> rep_id -> (unit, error) result Lwt.t 77 | val replace : _ server -> replacee:rep_id -> failover:rep_id -> (unit, error) result Lwt.t 78 | end 79 | end 80 | 81 | module type OP = sig 82 | type op 83 | 84 | val string_of_op : op -> string 85 | val op_of_string : string -> op 86 | end 87 | 88 | module type SERVER_CONF = sig 89 | open Oraft.Types 90 | include OP 91 | val node_sockaddr : address -> Unix.sockaddr 92 | val string_of_address : address -> string 93 | end 94 | -------------------------------------------------------------------------------- /src/oraft_lwt_tls.ml: -------------------------------------------------------------------------------- 1 | open Lwt.Infix 2 | 3 | let wrap_incoming_conn client_config fd = 4 | try%lwt 5 | (try Lwt_unix.set_close_on_exec fd with Invalid_argument _ -> ()); 6 | Tls_lwt.(Unix.client_of_fd ~host:"" client_config fd >|= of_t) 7 | with exn -> 8 | Lwt_unix.close fd >>= fun () -> 9 | Lwt.fail exn 10 | 11 | let wrap_outgoing_conn server_config fd = 12 | Tls_lwt.(Unix.server_of_fd 
server_config fd >|= of_t) 13 | 14 | let make_client_wrapper ~client_config = 15 | Oraft_lwt_conn_wrapper.make_client_conn_wrapper (wrap_incoming_conn client_config) 16 | 17 | let make_server_wrapper ~client_config ~server_config = 18 | Oraft_lwt_conn_wrapper.make_server_conn_wrapper 19 | ~outgoing:(wrap_incoming_conn client_config) 20 | ~incoming:(wrap_outgoing_conn server_config) 21 | -------------------------------------------------------------------------------- /src/oraft_lwt_tls.mli: -------------------------------------------------------------------------------- 1 | val make_client_wrapper : 2 | client_config:Tls.Config.client -> 3 | [`Outgoing] Oraft_lwt_conn_wrapper.conn_wrapper 4 | 5 | val make_server_wrapper : 6 | client_config:Tls.Config.client -> 7 | server_config:Tls.Config.server -> 8 | [`Incoming | `Outgoing] Oraft_lwt_conn_wrapper.conn_wrapper 9 | -------------------------------------------------------------------------------- /src/oraft_proto.proto: -------------------------------------------------------------------------------- 1 | include "oraft_proto_types.proto" 2 | 3 | type actual_append_result = 4 | Append_success index | Append_failure index 5 | options "ocaml.type_equals" = "Oraft.Types.actual_append_result" 6 | 7 | type ping_msg = { term : term; n : long } 8 | 9 | type entry = Nop | Op (req_id * string) | Config config 10 | 11 | message raft_message = 12 | Request_vote { term : term; candidate_id : rep_id; 13 | last_log_index : index; last_log_term : term } 14 | | Vote_result { term : term; vote_granted : bool } 15 | | Append_entries { term : term; leader_id : rep_id; 16 | prev_log_index : index; prev_log_term : term; 17 | entries : [ (index * entry * term) ]; 18 | leader_commit : index } 19 | | Append_result { term : term; result : actual_append_result } 20 | | Ping ping_msg 21 | | Pong ping_msg 22 | 23 | (* vim: set ft=omlet: *) 24 | -------------------------------------------------------------------------------- /src/oraft_proto_rsm.proto: -------------------------------------------------------------------------------- 1 | include "oraft_proto_types.proto" 2 | 3 | type msg_id = long 4 | 5 | type config_change = 6 | Add_failover rep_id address 7 | | Remove_failover rep_id 8 | | Decommission rep_id 9 | | Demote rep_id 10 | | Promote rep_id 11 | | Replace rep_id rep_id 12 | 13 | type client_op = 14 | Connect string 15 | | Execute string 16 | | Execute_RO string 17 | | Change_config config_change 18 | | Get_config 19 | 20 | message client_msg = { id : msg_id; op : client_op } 21 | 22 | type opt 'a = None | Some 'a 23 | options "ocaml.type_equals" = "option" 24 | 25 | type response = 26 | OK string 27 | | Redirect rep_id address 28 | | Retry 29 | | Cannot_change 30 | | Unsafe_change simple_config passive_peers 31 | | Error string 32 | | Config config 33 | 34 | message server_msg = { id : msg_id; response : response } 35 | 36 | (* vim: set ft=omlet: *) -------------------------------------------------------------------------------- /src/oraft_proto_types.proto: -------------------------------------------------------------------------------- 1 | type rep_id = string 2 | type client_id = string 3 | type address = string 4 | 5 | type simple_config = [(rep_id * address)] 6 | type passive_peers = simple_config 7 | 8 | type term = long 9 | type index = long 10 | 11 | type req_id = (client_id * long) 12 | 13 | type config = 14 | Simple_config simple_config passive_peers 15 | | Joint_config simple_config simple_config passive_peers 16 | options "ocaml.type_equals" = 
"Oraft.Types.config" 17 | 18 | (* vim: set ft=omlet: *) 19 | -------------------------------------------------------------------------------- /src/oraft_rsm.ml: -------------------------------------------------------------------------------- 1 | open Lwt.Infix 2 | 3 | open Oraft.Types 4 | open Oraft_rsm_s 5 | 6 | module Map = BatMap 7 | module Hashtbl = BatHashtbl 8 | 9 | module MB = Extprot.Msg_buffer 10 | 11 | type 'a conn = 12 | { 13 | addr : 'a; 14 | ich : Lwt_io.input_channel; 15 | och : Lwt_io.output_channel; 16 | in_buf : Bytes.t ref; 17 | out_buf : MB.t; 18 | } 19 | 20 | let send_msg write conn msg = 21 | (* Since the buffers are private to the conn AND Lwt_io.atomic prevents 22 | * concurrent IO operations, it's safe to reuse the buffer for a given 23 | * channel across send_msg calls. *) 24 | Lwt_io.atomic 25 | (fun och -> 26 | let buf = conn.out_buf in 27 | MB.clear buf; 28 | write buf msg; 29 | Lwt_io.LE.write_int och (MB.length buf) >>= fun () -> 30 | Lwt_io.write_from_exactly 31 | och (MB.unsafe_contents buf) 0 (MB.length buf) >>= fun () -> 32 | Lwt_io.flush och) 33 | conn.och 34 | 35 | let read_msg read conn = 36 | (* same as send_msg applies here regarding the buffers *) 37 | Lwt_io.atomic 38 | (fun ich -> 39 | let%lwt len = Lwt_io.LE.read_int ich in 40 | let buf = conn.in_buf in 41 | if Bytes.length !buf < len then buf := Bytes.create len; 42 | Lwt_io.read_into_exactly ich !buf 0 len >>= fun () -> 43 | Lwt.return (Extprot.Conv.deserialize read (Bytes.unsafe_to_string !buf))) 44 | conn.ich 45 | 46 | module Make_client(C : CONF) = struct 47 | module M = Map.Make(String) 48 | 49 | open Oraft_proto_rsm 50 | open Client_msg 51 | open Client_op 52 | open Server_msg 53 | open Response 54 | 55 | exception Not_connected 56 | exception Bad_response 57 | 58 | module H = Hashtbl.Make(struct 59 | type t = Int64.t 60 | let hash = Hashtbl.hash 61 | let equal i1 i2 = Int64.compare i1 i2 = 0 62 | end) 63 | 64 | type t = 65 | { 66 | id : string; 67 | mutable dst : address conn option; 68 | mutable conns : address conn M.t; 69 | mutable req_id : Int64.t; 70 | pending_reqs : response Lwt.u H.t; 71 | conn_wrapper : [`Outgoing] Oraft_lwt_conn_wrapper.conn_wrapper; 72 | } 73 | 74 | and address = string 75 | 76 | let src = Logs.Src.create "oraft_rsm.client" 77 | 78 | let trivial_wrapper () = 79 | (Oraft_lwt_conn_wrapper.trivial_conn_wrapper () :> 80 | [`Outgoing] Oraft_lwt_conn_wrapper.conn_wrapper) 81 | 82 | let make ?conn_wrapper ~id () = 83 | { id; dst = None; conns = M.empty; req_id = 0L; 84 | pending_reqs = H.create 13; 85 | conn_wrapper = Option.map_default 86 | (fun w -> (w :> [`Outgoing] Oraft_lwt_conn_wrapper.conn_wrapper)) 87 | (trivial_wrapper ()) 88 | conn_wrapper; 89 | } 90 | 91 | let gen_id t = 92 | t.req_id <- Int64.succ t.req_id; 93 | t.req_id 94 | 95 | let send_msg conn msg = send_msg Client_msg.write conn msg 96 | let read_msg conn = read_msg Server_msg.read conn 97 | 98 | let connect t peer_id addr = 99 | let do_connect () = 100 | let saddr = C.app_sockaddr addr in 101 | let fd = Lwt_unix.socket (Unix.domain_of_sockaddr saddr) Unix.SOCK_STREAM 0 in 102 | let%lwt () = Lwt_unix.connect fd saddr in 103 | let%lwt ich, och = Oraft_lwt_conn_wrapper.wrap_outgoing_conn t.conn_wrapper fd in 104 | let out_buf = MB.create () in 105 | let in_buf = ref Bytes.empty in 106 | let conn = { addr; ich; och; in_buf; out_buf } in 107 | (try Lwt_unix.setsockopt fd Unix.TCP_NODELAY true with _ -> ()); 108 | (try Lwt_unix.setsockopt fd Unix.SO_KEEPALIVE true with _ -> ()); 109 | try%lwt 110 | send_msg 
conn { id = 0L; op = (Connect t.id) } >>= fun () -> 111 | match%lwt read_msg conn with 112 | | { response = OK id; _ } -> 113 | t.conns <- M.add id conn t.conns; 114 | t.dst <- Some conn; 115 | ignore begin 116 | let rec loop_recv () = 117 | let%lwt msg = read_msg conn in 118 | Logs_lwt.debug ~src begin fun m -> 119 | m "Received from server@ %s" 120 | (Extprot.Pretty_print.pp Server_msg.pp msg) 121 | end >>= fun () -> 122 | match H.Exceptionless.find t.pending_reqs msg.id with 123 | None -> loop_recv () 124 | | Some u -> 125 | Lwt.wakeup_later u msg.response; 126 | loop_recv () 127 | in 128 | (loop_recv ()) 129 | [%finally 130 | t.conns <- M.remove peer_id t.conns; 131 | Lwt_io.abort och] 132 | end; 133 | Lwt.return_unit 134 | | _ -> failwith "conn refused" 135 | with exn -> 136 | Logs_lwt.err ~src begin fun m -> 137 | m "Error while connecting (%s)" (Printexc.to_string exn) 138 | end >>= fun () -> 139 | t.conns <- M.remove peer_id t.conns; 140 | Lwt_io.abort och 141 | in 142 | match M.Exceptionless.find peer_id t.conns with 143 | Some conn when conn.addr = addr -> 144 | t.dst <- Some conn; 145 | Lwt.return_unit 146 | | Some conn (* when addr <> address *) -> 147 | t.conns <- M.remove peer_id t.conns; 148 | Lwt_io.abort conn.och >>= fun () -> do_connect () 149 | | None -> do_connect () 150 | 151 | let send_and_await_response t op f = 152 | match t.dst with 153 | None -> Lwt.fail Not_connected 154 | | Some c -> 155 | let th, u = Lwt.task () in 156 | let id = gen_id t in 157 | let msg = { id; op; } in 158 | H.add t.pending_reqs id u; 159 | Logs_lwt.debug ~src begin fun m -> 160 | m "Sending to server@ %s" 161 | (Extprot.Pretty_print.pp Client_msg.pp msg) 162 | end >>= fun () -> 163 | send_msg c msg >>= fun () -> 164 | let%lwt x = th in 165 | f c.addr x 166 | 167 | let rec do_execute t op = 168 | send_and_await_response t op 169 | (fun dst resp -> match resp with 170 | OK s -> Lwt.return_ok s 171 | | Error s -> Lwt.return_error s 172 | | Redirect (peer_id, address) when peer_id <> dst -> 173 | connect t peer_id address >>= fun () -> 174 | do_execute t op 175 | | Redirect _ | Retry -> 176 | Lwt_unix.sleep 0.050 >>= fun () -> 177 | do_execute t op 178 | | Cannot_change | Unsafe_change _ | Config _ -> 179 | Lwt.fail Bad_response) 180 | 181 | let execute t op = 182 | do_execute t (Execute (C.string_of_op op)) 183 | 184 | let execute_ro t op = 185 | do_execute t (Execute_RO (C.string_of_op op)) 186 | 187 | let rec get_config t = 188 | send_and_await_response t Get_config 189 | (fun dst resp -> match resp with 190 | Config c -> Lwt.return_ok c 191 | | Error x -> Lwt.return_error x 192 | | Redirect (peer_id, address) when peer_id <> dst -> 193 | connect t peer_id address >>= fun () -> 194 | get_config t 195 | | Redirect _ | Retry -> 196 | Lwt_unix.sleep 0.050 >>= fun () -> 197 | get_config t 198 | | OK _ | Cannot_change | Unsafe_change _ -> 199 | Lwt.fail Bad_response) 200 | 201 | type change_config_error = 202 | | Cannot_change 203 | | Error of string 204 | | Unsafe_change of simple_config * passive_peers 205 | 206 | let rec change_config t op = 207 | send_and_await_response t (Change_config op) 208 | (fun dst resp -> match resp with 209 | OK _ -> Lwt.return_ok () 210 | | Error x -> Lwt.return_error (Error x) 211 | | Redirect (peer_id, address) when peer_id <> dst -> 212 | connect t peer_id address >>= fun () -> 213 | change_config t op 214 | | Redirect _ | Retry -> 215 | Lwt_unix.sleep 0.050 >>= fun () -> 216 | change_config t op 217 | | Cannot_change -> Lwt.return_error Cannot_change 218 | | 
Unsafe_change (c, p) -> Lwt.return_error (Unsafe_change (c, p)) 219 | | Config _ -> Lwt.fail Bad_response) 220 | 221 | let connect t ~addr = connect t "" addr 222 | end 223 | 224 | module Make_server(C : CONF) = struct 225 | module IO = Oraft_lwt_extprot_io.Make(C) 226 | module SS = Oraft_lwt.Make_server(IO) 227 | module SSC = SS.Config 228 | module CC = Make_client(C) 229 | 230 | module Core = SS 231 | 232 | open Oraft_proto_rsm 233 | open Client_msg 234 | open Client_op 235 | open Server_msg 236 | open Response 237 | open Config_change 238 | 239 | type 'a apply = 240 | 'a Core.server -> C.op -> ('a, exn) result Core.execution 241 | 242 | type 'a t = 243 | { 244 | id : rep_id; 245 | addr : string; 246 | c : CC.t option; 247 | node_sockaddr : Unix.sockaddr; 248 | app_sockaddr : Unix.sockaddr; 249 | serv : 'a SS.server; 250 | exec : 'a apply; 251 | conn_wrapper : [`Incoming | `Outgoing] Oraft_lwt_conn_wrapper.conn_wrapper; 252 | } 253 | 254 | let src = Logs.Src.create "oraft_rsm.server" 255 | 256 | let raise_if_error = function 257 | | Ok x -> Lwt.return x 258 | | Error s -> Lwt.fail_with s 259 | 260 | let check_config_err : 261 | (unit, CC.change_config_error) result -> unit Lwt.t = function 262 | | Ok () -> Lwt.return_unit 263 | | Error (Error s) -> Lwt.fail_with s 264 | | Error Cannot_change -> Lwt.fail_with "Cannot perform config change" 265 | | Error (Unsafe_change _) -> Lwt.fail_with "Unsafe config change" 266 | 267 | let make exec addr 268 | ?(conn_wrapper = Oraft_lwt_conn_wrapper.trivial_conn_wrapper ()) 269 | ?join ?election_period ?heartbeat_period id = 270 | begin match join with 271 | | None -> 272 | Lwt.return (Simple_config ([id, addr], []), None) 273 | | Some peer_addr -> 274 | let c = CC.make ~conn_wrapper ~id () in 275 | Logs_lwt.info ~src begin fun m -> 276 | m "Connecting to %S" (peer_addr |> C.string_of_address) 277 | end >>= fun () -> 278 | CC.connect c ~addr:peer_addr >>= fun () -> 279 | CC.get_config c >>= raise_if_error >>= fun config -> 280 | Logs_lwt.info ~src begin fun m -> 281 | m "Got initial configuration %s" 282 | (Oraft_lwt.string_of_config C.string_of_address config) 283 | end >>= fun () -> 284 | Lwt.return (config, Some c) 285 | end >>= fun (config, c) -> 286 | let state = Oraft.Core.make 287 | ~id ~current_term:0L ~voted_for:None 288 | ~log:[] ~config () in 289 | let node_sockaddr = C.node_sockaddr addr in 290 | let app_sockaddr = C.app_sockaddr addr in 291 | let%lwt conn_mgr = IO.make_conn_manager ~id node_sockaddr in 292 | let serv = SS.make exec ?election_period ?heartbeat_period 293 | state conn_mgr in 294 | Lwt.return { id; addr; c ; node_sockaddr; 295 | app_sockaddr; serv; exec; conn_wrapper; } 296 | 297 | let send_msg conn msg = send_msg Server_msg.write conn msg 298 | let read_msg conn = read_msg Client_msg.read conn 299 | 300 | let map_op_result : string Core.cmd_result -> response = function 301 | | Error (Exn exn) -> Error (Printexc.to_string exn) 302 | | Error (Redirect (peer_id, addr)) -> Redirect (peer_id, addr) 303 | | Error Retry -> Retry 304 | | Ok response -> OK response 305 | 306 | let map_apply = function 307 | | Result.Error exn -> Error (Printexc.to_string exn) 308 | | Ok _ -> OK "" 309 | 310 | let perform_change t op = 311 | let map = function 312 | | Ok () -> OK "" 313 | | Error SSC.Cannot_change -> Cannot_change 314 | | Error Unsafe_change (c, p) -> Unsafe_change (c, p) 315 | | Error Redirect (peer_id, addr) -> Redirect (peer_id, addr) 316 | | Error Retry -> Retry 317 | in 318 | try%lwt 319 | let%lwt ret = 320 | match op with 321 | 
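(* Each wire-level Config_change constructor is mapped one-to-one onto the
   corresponding SS.Config operation on the local server; the [map] function
   above translates the result back into a wire response. *)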
Add_failover (peer_id, addr) -> SSC.add_failover t.serv peer_id addr 322 | | Remove_failover peer_id -> SSC.remove_failover t.serv peer_id 323 | | Decommission peer_id -> SSC.decommission t.serv peer_id 324 | | Demote peer_id -> SSC.demote t.serv peer_id 325 | | Promote peer_id -> SSC.promote t.serv peer_id 326 | | Replace (replacee, failover) -> SSC.replace t.serv ~replacee ~failover 327 | in 328 | Lwt.return (map ret) 329 | with exn -> 330 | Logs_lwt.debug ~src begin fun m -> 331 | m "Error while changing cluster configuration@ %s: %s" 332 | (Extprot.Pretty_print.pp pp_config_change op) 333 | (Printexc.to_string exn) 334 | end >>= fun () -> 335 | Lwt.return (Error (Printexc.to_string exn)) 336 | 337 | let process_message t client_id conn = function 338 | { id; op = Connect _ } -> 339 | send_msg conn { id; response = Error "Unexpected request" } 340 | | { id; op = Get_config } -> 341 | let config = SS.Config.get t.serv in 342 | send_msg conn { id; response = Config config } 343 | | { id; op = Change_config x } -> 344 | Logs_lwt.info ~src begin fun m -> 345 | m "Config change requested:@ %s" 346 | (Extprot.Pretty_print.pp pp_config_change x) 347 | end >>= fun () -> 348 | let%lwt response = perform_change t x in 349 | Logs_lwt.info ~src begin fun m -> 350 | m "Config change result:@ %s" 351 | (Extprot.Pretty_print.pp pp_response response) 352 | end >>= fun () -> 353 | Logs_lwt.info ~src begin fun m -> 354 | m "New config:@ %s" 355 | (Oraft_lwt.string_of_config C.string_of_address (SS.config t.serv)) 356 | end >>= fun () -> 357 | send_msg conn { id; response } 358 | | { id; op = Execute_RO op; } -> begin 359 | match%lwt SS.readonly_operation t.serv with 360 | | Error (Redirect _) | Error Retry | Error _ as x -> 361 | let response = map_op_result x in 362 | send_msg conn { id; response; } 363 | | Ok () -> 364 | match t.exec t.serv (C.op_of_string op) with 365 | | Sync resp -> 366 | let%lwt resp = resp in 367 | send_msg conn { id; response = map_apply resp } 368 | | Async resp -> 369 | ignore begin try%lwt 370 | let%lwt resp = try%lwt resp 371 | with exn -> Lwt.return_error exn 372 | in 373 | send_msg conn { id; response = map_apply resp } 374 | with exn -> 375 | Logs_lwt.debug ~src 376 | (fun m -> m "Caught exn: %a" Oraft_lwt.pp_exn exn) 377 | end; 378 | Lwt.return_unit 379 | end 380 | | { id; op = Execute op; } -> 381 | let%lwt response = SS.execute t.serv (C.op_of_string op) >|= map_op_result in 382 | send_msg conn { id; response } 383 | 384 | let rec request_loop t client_id conn = 385 | let%lwt msg = read_msg conn in 386 | ignore begin 387 | try%lwt 388 | process_message t client_id conn msg 389 | with exn -> 390 | Logs_lwt.debug ~src begin fun m -> 391 | m "Error while processing message@ %s: %s" 392 | (Extprot.Pretty_print.pp Client_msg.pp msg) 393 | (Printexc.to_string exn) 394 | end >>= fun () -> 395 | send_msg conn { id = msg.id; response = Error (Printexc.to_string exn) } 396 | end; 397 | request_loop t client_id conn 398 | 399 | let is_in_config t config = 400 | let all = match config with 401 | | Simple_config (a, p) -> a @ p 402 | | Joint_config (a1, a2, p) -> a1 @ a2 @ p 403 | in 404 | List.mem_assoc t.id all 405 | 406 | let is_active t config = 407 | let active = match config with 408 | | Simple_config (a, _) -> a 409 | | Joint_config (a1, a2, _) -> a1 @ a2 410 | in 411 | List.mem_assoc t.id active 412 | 413 | let add_as_failover_if_needed t c config = 414 | if is_in_config t config then 415 | Lwt.return_unit 416 | else begin 417 | Logs_lwt.info ~src begin fun m -> 418 | m 
"Adding failover id:%S addr:%S" 419 | t.id (C.string_of_address t.addr) 420 | end >>= fun () -> 421 | CC.change_config c (Add_failover (t.id, t.addr)) >>= check_config_err 422 | end 423 | 424 | let promote_if_needed t c config = 425 | if is_active t config then 426 | Lwt.return_unit 427 | else begin 428 | Logs_lwt.info ~src (fun m -> m "Promoting failover id:%S" t.id) >>= fun () -> 429 | CC.change_config c (Promote t.id) >>= check_config_err 430 | end 431 | 432 | let join_cluster t c = 433 | (* We only try to add as failover/promote if actually needed. 434 | * Otherwise, we could get blocked in situations were the node is 435 | * rejoining the cluster (and thus already active in its configuration) 436 | * and the remaining nodes do not have the quorum to perform a 437 | * configuration change (even if it'd eventually be a NOP). *) 438 | let%lwt config = CC.get_config c >>= raise_if_error in 439 | add_as_failover_if_needed t c config>>= fun () -> 440 | promote_if_needed t c config>>= fun () -> 441 | let%lwt config = CC.get_config c >>= raise_if_error in 442 | Logs_lwt.info ~src begin fun m -> 443 | m "Final config: %s" 444 | (Oraft_lwt.string_of_config C.string_of_address config) 445 | end 446 | 447 | let handle_conn t fd addr = 448 | (* the following are not supported for ADDR_UNIX sockets, so catch *) 449 | (* possible exceptions *) 450 | (try Lwt_unix.setsockopt fd Unix.TCP_NODELAY true with _ -> ()); 451 | (try Lwt_unix.setsockopt fd Unix.SO_KEEPALIVE true with _ -> ()); 452 | (try%lwt 453 | let%lwt ich, och = Oraft_lwt_conn_wrapper.wrap_incoming_conn t.conn_wrapper fd in 454 | let conn = { addr; ich; och; in_buf = ref Bytes.empty; out_buf = MB.create () } in 455 | (match%lwt read_msg conn with 456 | | { id; op = Connect client_id; _ } -> 457 | Logs_lwt.info ~src begin fun m -> 458 | m "Incoming client connection from %S" client_id 459 | end >>= fun () -> 460 | send_msg conn { id; response = OK "" }>>= fun () -> 461 | request_loop t client_id conn 462 | | { id; _ } -> 463 | send_msg conn { id; response = Error "Bad request" }) 464 | [%finally 465 | try%lwt Lwt_io.close ich with _ -> Lwt.return_unit] 466 | with 467 | | End_of_file 468 | | Unix.Unix_error (Unix.ECONNRESET, _, _) -> Lwt.return_unit 469 | | exn -> 470 | Logs_lwt.err ~src 471 | (fun m -> m "Error in dispatch: %a" Oraft_lwt.pp_exn exn)) 472 | [%finally 473 | try%lwt Lwt_unix.close fd with _ -> Lwt.return_unit] 474 | 475 | let run t = 476 | let sock = Lwt_unix.(socket (Unix.domain_of_sockaddr t.app_sockaddr) 477 | Unix.SOCK_STREAM 0) 478 | in 479 | Lwt_unix.setsockopt sock Unix.SO_REUSEADDR true; 480 | Lwt_unix.bind sock t.app_sockaddr >>= fun () -> 481 | Lwt_unix.listen sock 256; 482 | 483 | let rec accept_loop t = 484 | let%lwt (fd_addrs,_) = Lwt_unix.accept_n sock 50 in 485 | List.iter 486 | (fun (fd, addr) -> Lwt.async (fun () -> handle_conn t fd addr)) 487 | fd_addrs; 488 | accept_loop t 489 | in 490 | ignore begin try%lwt 491 | Logs_lwt.info ~src begin fun m -> 492 | m "Running app server at %a" Oraft_lwt.pp_saddr t.app_sockaddr 493 | end >>= fun () -> 494 | SS.run t.serv 495 | with exn -> 496 | Logs_lwt.err ~src begin fun m -> 497 | m "Error in Oraft_lwt server run(): %a" Oraft_lwt.pp_exn exn 498 | end 499 | end; 500 | (try%lwt 501 | match t.c with 502 | | None -> accept_loop t 503 | | Some c -> join_cluster t c >>= fun () -> accept_loop t 504 | with exn -> 505 | Logs_lwt.err ~src (fun m -> m "Exn raised: %a" Oraft_lwt.pp_exn exn)) 506 | [%finally 507 | (* FIXME: t.c client shutdown *) 508 | try%lwt Lwt_unix.close sock 
with _ -> Lwt.return_unit] 509 | end 510 | -------------------------------------------------------------------------------- /src/oraft_rsm.mli: -------------------------------------------------------------------------------- 1 | open Oraft_rsm_s 2 | 3 | module Make_client (C : CONF) : CLIENT with type op := C.op 4 | module Make_server (C: CONF) : SERVER with type op := C.op 5 | -------------------------------------------------------------------------------- /src/oraft_rsm_s.ml: -------------------------------------------------------------------------------- 1 | open Oraft.Types 2 | 3 | type config_change = 4 | Oraft_proto_rsm.Config_change.config_change = 5 | Add_failover of rep_id * address 6 | | Remove_failover of rep_id 7 | | Decommission of rep_id 8 | | Demote of rep_id 9 | | Promote of rep_id 10 | | Replace of rep_id * rep_id 11 | 12 | module type CONF = sig 13 | include Oraft_lwt_s.SERVER_CONF 14 | val app_sockaddr : address -> Unix.sockaddr 15 | end 16 | 17 | module type CLIENT = sig 18 | exception Not_connected 19 | exception Bad_response 20 | 21 | type op 22 | type t 23 | 24 | val make : 25 | ?conn_wrapper:[> `Outgoing] Oraft_lwt_conn_wrapper.conn_wrapper -> id:string -> unit -> t 26 | 27 | val connect : t -> addr:address -> unit Lwt.t 28 | 29 | val execute : t -> op -> (string, string) result Lwt.t 30 | val execute_ro : t -> op -> (string, string) result Lwt.t 31 | val get_config : t -> (config, string) result Lwt.t 32 | 33 | type change_config_error = 34 | | Cannot_change 35 | | Error of string 36 | | Unsafe_change of simple_config * passive_peers 37 | 38 | val change_config : 39 | t -> config_change -> (unit, change_config_error) result Lwt.t 40 | end 41 | 42 | module type SERVER = sig 43 | type op 44 | type 'a t 45 | 46 | module Core : Oraft_lwt_s.SERVER_GENERIC with type op = op 47 | 48 | type 'a apply = 'a Core.server -> op -> ('a, exn) result Core.execution 49 | 50 | val make : 51 | 'a apply -> address -> 52 | ?conn_wrapper:[`Outgoing | `Incoming] Oraft_lwt_conn_wrapper.conn_wrapper -> 53 | ?join:address -> 54 | ?election_period:float -> 55 | ?heartbeat_period:float -> rep_id -> 'a t Lwt.t 56 | 57 | val run : string t -> unit Lwt.t 58 | end 59 | -------------------------------------------------------------------------------- /test/dict.ml: -------------------------------------------------------------------------------- 1 | 2 | (* Trivial distributed key-value service. 3 | * 4 | * Usage: 5 | * 6 | * (1) Launch of 1st node (will be master with quorum = 1 at first): 7 | * 8 | * ./dict master n1a,n1b 9 | * (uses UNIX domain sockets n1a for Raft communication, 10 | * n1b as address for app server -- use ip1:port1,ip2:port2 11 | * to listen at ip1:port1 for Raft, ip1:port2 for app) 12 | * 13 | * 14 | * (2) Launch extra nodes. They will join the cluster and the quorum will be 15 | * updated. 16 | * 17 | * ./dict master n2a,n2b --join n1a,n1b 18 | * 19 | * ./dict master n3a,n3b --join n1a,n1b 20 | * 21 | * ... 
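 *      (a joining node is first added as a passive failover and then
 *       promoted, as done by join_cluster in src/oraft_rsm.ml, so the
 *       quorum only changes once the new node becomes active)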
22 | * 23 | * (3) perform client reqs 24 | * 25 | * ./dict get n1a,n1b foo # retrieve value assoc'ed 26 | * # to key "foo", block until 27 | * # available 28 | * 29 | * ./dict set n1a,n1b foo bar # associate 'bar' to 'foo' 30 | * 31 | * *) 32 | 33 | open Cmdliner 34 | open Lwt.Infix 35 | 36 | module String = BatString 37 | module Hashtbl = BatHashtbl 38 | module Option = BatOption 39 | 40 | type op = 41 | Get of string 42 | | Wait of string 43 | | Set of string * string 44 | 45 | module Conf = struct 46 | type nonrec op = op 47 | 48 | let string_of_op = function 49 | Get v -> "?" ^ v 50 | | Wait v -> "<" ^ v 51 | | Set (k, v) -> "!" ^ k ^ "=" ^ v 52 | 53 | let op_of_string s = 54 | if s = "" then failwith "bad op" 55 | else 56 | match s.[0] with 57 | '?' -> Get (String.slice ~first:1 s) 58 | | '<' -> Wait (String.slice ~first:1 s) 59 | | '!' -> 60 | let k, v = String.slice ~first:1 s |> String.split ~by:"=" in 61 | Set (k, v) 62 | | _ -> failwith "bad op" 63 | 64 | type addr = { 65 | node: Unix.sockaddr ; 66 | app: Unix.sockaddr ; 67 | } 68 | 69 | let fn_of_addr { node ; app } = 70 | match node, app with 71 | | Unix.ADDR_UNIX a, Unix.ADDR_UNIX b -> 72 | [a; b] 73 | | Unix.ADDR_UNIX a, _ -> [a] 74 | | _, Unix.ADDR_UNIX b -> [b] 75 | | _ -> [] 76 | 77 | let sockaddr s = 78 | try 79 | let host, port = String.split ~by:":" s in 80 | Unix.ADDR_INET (Unix.inet_addr_of_string host, int_of_string port) 81 | with Not_found -> 82 | Unix.ADDR_UNIX s 83 | 84 | let parse_sockaddr s = 85 | match String.split ~by:"," s with 86 | | exception _ -> None 87 | | a, b -> Some { 88 | node = sockaddr a ; 89 | app = sockaddr b ; 90 | } 91 | 92 | let node_sockaddr s = String.split ~by:"," s |> fst |> sockaddr 93 | let app_sockaddr s = String.split ~by:"," s |> snd |> sockaddr 94 | 95 | let string_of_address s = s 96 | end 97 | 98 | module Server = Oraft_rsm.Make_server(Conf) 99 | module Client = Oraft_rsm.Make_client(Conf) 100 | 101 | let make_tls_wrapper tls = 102 | Option.map 103 | (fun (client_config, server_config) -> 104 | Oraft_lwt_tls.make_server_wrapper 105 | ~client_config ~server_config) 106 | tls 107 | 108 | let run_server ?tls ~addr ?join ~id () = 109 | let h = Hashtbl.create 13 in 110 | let cond = Lwt_condition.create () in 111 | 112 | let exec _ op = match op with 113 | Get s -> Server.Core.Sync begin 114 | match Hashtbl.find_option h s with 115 | | None -> Lwt.return_error Not_found 116 | | Some v -> Lwt.return_ok v 117 | end 118 | | Wait k -> 119 | Async begin 120 | let rec attempt () = 121 | match Hashtbl.Exceptionless.find h k with 122 | Some v -> Lwt.return_ok v 123 | | None -> 124 | Lwt_condition.wait cond >>= fun () -> 125 | attempt () 126 | in 127 | attempt () 128 | end 129 | | Set (k, v) -> 130 | if v = "" then 131 | Hashtbl.remove h k 132 | else begin 133 | Hashtbl.add h k v; 134 | Lwt_condition.broadcast cond (); 135 | end; 136 | Sync (Lwt.return_ok "") 137 | in 138 | 139 | let%lwt server = 140 | Server.make ?conn_wrapper:(make_tls_wrapper tls) exec addr ?join id in 141 | Server.run server 142 | 143 | let client_op ?tls ~addr op = 144 | let c = Client.make 145 | ?conn_wrapper:(make_tls_wrapper tls) 146 | ~id:(string_of_int (Unix.getpid ())) () in 147 | let exec = match op with 148 | | Get _ | Wait _ -> Client.execute_ro 149 | | Set _ -> Client.execute 150 | in 151 | Client.connect c ~addr >>= fun () -> 152 | match%lwt exec c op with 153 | | Ok s -> Printf.printf "+OK %s\n" s; Lwt.return_unit 154 | | Error s -> Printf.printf "-ERR %s\n" s; Lwt.return_unit 155 | 156 | let ro_benchmark ?tls 
?(iterations = 10_000) ~addr () = 157 | let c = Client.make 158 | ?conn_wrapper:(make_tls_wrapper tls) 159 | ~id:(string_of_int (Unix.getpid ())) () 160 | in 161 | Client.connect c ~addr >>= fun () -> 162 | Client.execute c (Set ("bm", "0")) >>= fun _ -> 163 | let t0 = Unix.gettimeofday () in 164 | for%lwt i = 1 to iterations do 165 | let%lwt _ = Client.execute_ro c (Get "bm") in 166 | Lwt.return_unit 167 | done >>= fun () -> 168 | let dt = Unix.gettimeofday () -. t0 in 169 | Printf.printf "%.0f RO ops/s\n" (float iterations /. dt); 170 | Lwt.return_unit 171 | 172 | let wr_benchmark ?tls ?(iterations = 10_000) ~addr () = 173 | let c = Client.make 174 | ?conn_wrapper:(make_tls_wrapper tls) 175 | ~id:(string_of_int (Unix.getpid ())) () 176 | in 177 | Client.connect c ~addr >>= fun () -> 178 | let t0 = Unix.gettimeofday () in 179 | for%lwt i = 1 to iterations do 180 | let%lwt _ = Client.execute c (Set ("bm", "")) in 181 | Lwt.return_unit 182 | done >>= fun () -> 183 | let dt = Unix.gettimeofday () -. t0 in 184 | Printf.printf "%.0f WR ops/s\n" (float iterations /. dt); 185 | Lwt.return_unit 186 | 187 | let tls_create cert priv_key = 188 | X509_lwt.private_of_pems ~cert ~priv_key >|= fun cert -> 189 | (* FIXME: authenticator *) 190 | Tls.Config.(client ~authenticator:X509.Authenticator.null (), 191 | server ~certificates:(`Single cert) ()) 192 | 193 | let lwt_reporter () = 194 | let buf_fmt ~like = 195 | let b = Buffer.create 512 in 196 | Fmt.with_buffer ~like b, 197 | fun () -> let m = Buffer.contents b in Buffer.reset b; m 198 | in 199 | let app, app_flush = buf_fmt ~like:Fmt.stdout in 200 | let dst, dst_flush = buf_fmt ~like:Fmt.stderr in 201 | let reporter = Logs_fmt.reporter ~app ~dst () in 202 | let report src level ~over k msgf = 203 | let k () = 204 | let write () = match level with 205 | | Logs.App -> Lwt_io.write Lwt_io.stdout (app_flush ()) 206 | | _ -> Lwt_io.write Lwt_io.stderr (dst_flush ()) 207 | in 208 | let unblock () = over (); Lwt.return_unit in 209 | Lwt.finalize write unblock |> Lwt.ignore_result; 210 | k () 211 | in 212 | reporter.Logs.report src level ~over:(fun () -> ()) k msgf; 213 | in 214 | { Logs.report = report } 215 | 216 | let setup_log style_renderer level = 217 | Fmt_tty.setup_std_outputs ?style_renderer (); 218 | Logs.set_level level; 219 | Logs.set_reporter (lwt_reporter ()); 220 | () 221 | 222 | let setup_log = 223 | Term.(const setup_log $ Fmt_cli.style_renderer () $ Logs_cli.level ()) 224 | 225 | let tls = 226 | Arg.(value & opt (t2 string string) ("", "") & info ["tls"] ~doc:"Use TLS") 227 | 228 | let tls_create (cert, priv_key) = 229 | match cert, priv_key with 230 | | "", "" -> Lwt.return_none 231 | | _ -> tls_create cert priv_key >>= Lwt.return_some 232 | 233 | let master tls addr join () = 234 | List.iter begin fun s -> 235 | Sys.set_signal s begin Sys.Signal_handle begin fun _ -> 236 | Option.may (fun a -> 237 | List.iter Sys.remove (Conf.fn_of_addr a)) 238 | (Conf.parse_sockaddr addr) ; 239 | exit 0 240 | end 241 | end 242 | end Sys.[sigint ; sigterm]; 243 | tls_create tls >>= fun tls -> 244 | run_server ?tls ~addr ?join ~id:addr () 245 | 246 | let master_cmd = 247 | let listen = 248 | let doc = "Where to listen." in 249 | Arg.(required 250 | & pos 0 (some string) None 251 | & info [] ~docv:"ADDRESS" ~doc) in 252 | let join = 253 | let doc = "Other servers to join." 
in 254 | Arg.(value & opt (some string) None & 255 | info ["join"] ~doc ~docv:"ADDRESS") in 256 | let doc = "Launch a server" in 257 | Term.(const master $ tls $ listen $ join $ setup_log), 258 | Term.info ~doc "master" 259 | 260 | let get_key tls addr k () = 261 | tls_create tls >>= fun tls -> 262 | client_op ?tls ~addr (Wait k) 263 | 264 | let set_key tls addr k v () = 265 | tls_create tls >>= fun tls -> 266 | client_op ?tls ~addr (Set (k, v)) 267 | 268 | let connect_arg = 269 | let doc = "Where to connect." in 270 | Arg.(required 271 | & pos 0 (some string) None 272 | & info [] ~docv:"ADDRESS" ~doc) 273 | 274 | let get_cmd = 275 | let doc = "Get a key" in 276 | let k = 277 | Arg.(required 278 | & pos 1 (some string) None 279 | & info [] ~docv:"STRING" ~doc) in 280 | Term.(const get_key $ tls $ connect_arg $ k $ setup_log), 281 | Term.info ~doc "get" 282 | 283 | let set_cmd = 284 | let doc = "Set a key" in 285 | let k = 286 | Arg.(required 287 | & pos 1 (some string) None 288 | & info [] ~docv:"STRING" ~doc) in 289 | let v = 290 | Arg.(required 291 | & pos 2 (some string) None 292 | & info [] ~docv:"STRING" ~doc) in 293 | Term.(const set_key $ tls $ connect_arg $ k $ v $ setup_log), 294 | Term.info ~doc "set" 295 | 296 | let ro_bm tls addr iterations () = 297 | tls_create tls >>= fun tls -> 298 | ro_benchmark ?tls ~iterations ~addr () 299 | 300 | let ro_bm_cmd = 301 | let doc = "Run the RO benchmark" in 302 | let nb_iters = 303 | Arg.(required 304 | & pos 1 (some int) None 305 | & info [] ~docv:"ITERATIONS" ~doc) in 306 | Term.(const ro_bm $ tls $ connect_arg $ nb_iters $ setup_log), 307 | Term.info ~doc "ro_bench" 308 | 309 | let rw_bm tls addr iterations = 310 | tls_create tls >>= fun tls -> 311 | wr_benchmark ?tls ~iterations ~addr () 312 | 313 | let rw_bm_cmd = 314 | let doc = "Run the RW benchmark" in 315 | let nb_iters = 316 | Arg.(required 317 | & pos 1 (some int) None 318 | & info [] ~docv:"ITERATIONS" ~doc) in 319 | Term.(const ro_bm $ tls $ connect_arg $ nb_iters $ setup_log), 320 | Term.info ~doc "rw_bench" 321 | 322 | let lwt_run v = 323 | Lwt.async_exception_hook := begin fun exn -> 324 | Logs.err (fun m -> m "%a" Oraft_lwt.pp_exn exn) ; 325 | end ; 326 | Lwt_main.run v 327 | 328 | let cmds = 329 | List.map begin fun (term, info) -> 330 | Term.((const lwt_run) $ term), info 331 | end [ 332 | master_cmd ; 333 | get_cmd ; 334 | set_cmd ; 335 | ro_bm_cmd ; 336 | rw_bm_cmd ; 337 | ] 338 | 339 | let default_cmd = 340 | let doc = "Dict: Trivial distributed key-value service." 
in 341 | Term.(ret (const (`Help (`Pager, None)))), 342 | Term.info ~doc "dict" 343 | 344 | let () = match Term.eval_choice default_cmd cmds with 345 | | `Error _ -> exit 1 346 | | #Term.result -> exit 0 347 | -------------------------------------------------------------------------------- /test/dune: -------------------------------------------------------------------------------- 1 | (executable 2 | (name dict) 3 | (public_name oraft-dict) 4 | (preprocess (pps lwt_ppx)) 5 | (modules Dict) 6 | (package oraft-rsm) 7 | (libraries cmdliner fmt.tty fmt.cli logs.fmt logs.cli logs.lwt x509 oraft-lwt-tls oraft-rsm)) 8 | 9 | (executable 10 | (name test_DES) 11 | (public_name oraft-test-des) 12 | (modules Test_DES) 13 | (package oraft) 14 | (libraries oraft)) 15 | -------------------------------------------------------------------------------- /test/test_DES.ml: -------------------------------------------------------------------------------- 1 | open Printf 2 | 3 | module List = BatList 4 | module Map = BatMap 5 | module Int64 = BatInt64 6 | module Option = BatOption 7 | module Array = BatArray 8 | module RND = Random.State 9 | module C = Oraft.Core 10 | 11 | module CLOCK = 12 | struct 13 | include BatInt64 14 | type delta = t 15 | end 16 | 17 | module type EVENT_QUEUE = 18 | sig 19 | open Oraft.Types 20 | type 'a t 21 | 22 | val create : unit -> 'a t 23 | val schedule : 'a t -> CLOCK.delta -> rep_id -> 'a -> CLOCK.t 24 | val next : 'a t -> (CLOCK.t * rep_id * 'a) option 25 | val is_empty : 'a t -> bool 26 | end 27 | 28 | module DES : 29 | sig 30 | open Oraft.Types 31 | 32 | module Event_queue : EVENT_QUEUE 33 | 34 | type ('op, 'snapshot) event = 35 | | Election_timeout 36 | | Heartbeat_timeout 37 | | Command of 'op 38 | | Message of rep_id * 'op message 39 | | Func of (CLOCK.t -> unit) 40 | | Install_snapshot of rep_id * 'snapshot * term * index * config 41 | | Snapshot_sent of rep_id * index 42 | | Snapshot_send_failed of rep_id 43 | 44 | type ('op, 'app_state, 'snapshot) t 45 | type ('op, 'app_state) node 46 | 47 | val make : 48 | ?rng:RND.t -> 49 | ?ev_queue:('op, 'snapshot) event Event_queue.t -> 50 | num_nodes:int -> 51 | election_period:CLOCK.t -> 52 | heartbeat_period:CLOCK.t -> 53 | rtt:CLOCK.t -> 54 | (unit -> 'app_state) -> ('op, 'app_state, 'snapshot) t 55 | 56 | val random_node_id : (_, _, _) t -> rep_id 57 | val live_nodes : ('op, 'app_state, _) t -> ('op, 'app_state) node list 58 | 59 | val node_id : (_, _) node -> rep_id 60 | val app_state : (_, 'app_state) node -> 'app_state 61 | val set_app_state : (_, 'app_state) node -> 'app_state -> unit 62 | val leader_id : (_, _) node -> rep_id option 63 | 64 | val simulate : 65 | ?verbose:bool -> 66 | ?string_of_cmd:('op -> string) -> 67 | msg_loss_rate:float -> 68 | on_apply:(time:Int64.t -> ('op, 'app_state) node -> 69 | (index * 'op * term) list -> 70 | [`Snapshot of index * 'app_state | 71 | `State of 'app_state]) -> 72 | take_snapshot:(('op, 'app_state) node -> index * 'snapshot * term) -> 73 | install_snapshot:(('op, 'app_state) node -> 'snapshot -> unit) -> 74 | ('op, 'app_state, 'snapshot) t -> int 75 | end = 76 | struct 77 | open Oraft.Types 78 | 79 | module Event_queue = 80 | struct 81 | type rep_id = string 82 | module type PACK = 83 | sig 84 | type elm 85 | module M : BatHeap.H with type elem = elm 86 | val h : M.t ref 87 | val t : CLOCK.t ref 88 | end 89 | 90 | type 'a m = (module PACK with type elm = 'a) 91 | type 'a t = (Int64.t * rep_id * 'a) m 92 | 93 | let create (type a) () : a t = 94 | let module P = 95 | struct 96 | type 
elm = Int64.t * rep_id * a 97 | module M = BatHeap.Make(struct 98 | type t = elm 99 | let compare (c1, _, _) (c2, _, _) = 100 | Int64.compare c1 c2 101 | end) 102 | let h = ref M.empty 103 | let t = ref 0L 104 | end 105 | in 106 | (module P) 107 | 108 | let schedule (type a) ((module P) : a t) dt node (ev : a) = 109 | let t = CLOCK.(!P.t + dt) in 110 | P.h := P.M.add (t, node, ev) !P.h; 111 | t 112 | 113 | let is_empty (type a) ((module P) : a t) = P.M.size !P.h = 0 114 | 115 | let next (type a) ((module P) : a t) = 116 | try 117 | let (t, _, _) as x = P.M.find_min !P.h in 118 | P.h := P.M.del_min !P.h; 119 | P.t := t; 120 | Some x 121 | with Invalid_argument _ -> None 122 | end 123 | 124 | module NODES = 125 | struct 126 | module M = Map.Make(String) 127 | type 'a t = { mutable ids : rep_id array; mutable m : 'a M.t } 128 | 129 | let create () = { ids = [||]; m = M.empty } 130 | 131 | let find t k = 132 | try Some (M.find k t.m) with Not_found -> None 133 | 134 | let iter f t = M.iter f t.m 135 | 136 | let random t rng = M.find t.ids.(RND.int rng (Array.length t.ids)) t.m 137 | 138 | let add t k v = 139 | if not (M.mem k t.m) then begin 140 | t.ids <- Array.append [|k|] t.ids; 141 | t.m <- M.add k v t.m 142 | end 143 | 144 | let remove t k = 145 | t.ids <- Array.filter ((<>) k) t.ids; 146 | t.m <- M.remove k t.m 147 | end 148 | 149 | type ('op, 'snapshot) event = 150 | | Election_timeout 151 | | Heartbeat_timeout 152 | | Command of 'op 153 | | Message of rep_id * 'op message 154 | | Func of (CLOCK.t -> unit) 155 | | Install_snapshot of rep_id * 'snapshot * term * index * config 156 | | Snapshot_sent of rep_id * index 157 | | Snapshot_send_failed of rep_id 158 | 159 | type ('op, 'app_state) node = 160 | { 161 | id : rep_id; 162 | addr : address; 163 | mutable state : 'op C.state; 164 | mutable next_heartbeat : CLOCK.t option; 165 | mutable next_election : CLOCK.t option; 166 | mutable app_state : 'app_state; 167 | mutable stopped : bool; 168 | } 169 | 170 | type ('op, 'app_state, 'snapshot) t = 171 | { 172 | rng : RND.t; 173 | ev_queue : ('op, 'snapshot) event Event_queue.t; 174 | nodes : ('op, 'app_state) node NODES.t; 175 | election_period : CLOCK.t; 176 | heartbeat_period : CLOCK.t; 177 | rtt : CLOCK.t; 178 | make_node : unit -> ('op, 'app_state) node; 179 | } 180 | 181 | let mk_node mk_app_state config (id, addr) = 182 | let state = C.make 183 | ~id ~current_term:0L ~voted_for:None 184 | ~log:[] ~config () in 185 | let app_state = mk_app_state () in 186 | { id; addr; state; app_state; 187 | stopped = false; next_heartbeat = None; next_election = None; 188 | } 189 | 190 | let node_id n = n.id 191 | let leader_id n = C.leader_id n.state 192 | let app_state n = n.app_state 193 | let set_app_state n x = n.app_state <- x 194 | 195 | let get_leader (type op) (type app_state) t = 196 | let module M = struct exception N of (op, app_state) node end in 197 | try 198 | NODES.iter 199 | (fun _ node -> if C.status node.state = Leader then raise (M.N node)) 200 | t.nodes; 201 | (* we pick an arbitrary config *) 202 | NODES.random t.nodes t.rng 203 | with M.N n -> n 204 | 205 | let get_leader_conf t = 206 | C.committed_config (get_leader t).state 207 | 208 | let make 209 | ?(rng = RND.make_self_init ()) 210 | ?(ev_queue = Event_queue.create ()) 211 | ~num_nodes 212 | ~election_period ~heartbeat_period ~rtt 213 | mk_app_state = 214 | let active = List.init num_nodes 215 | (fun n -> let s = sprintf "n%03d" n in (s, s)) in 216 | let passive = [] in 217 | let config = Simple_config (active, passive) in 
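(* All initial nodes start as active voting members; the passive (failover)
   set begins empty and is only populated later by CONFIG_MANAGER via
   C.Config.add_failover. *)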
218 | 219 | let mk_nodes l = List.map (mk_node mk_app_state config) l |> Array.of_list in 220 | 221 | let next_node_id = 222 | let n = ref (num_nodes - 1) in 223 | (fun () -> incr n; sprintf "n%03d" !n) in 224 | 225 | let active = mk_nodes active in 226 | let passive = mk_nodes passive in 227 | 228 | let nodes = NODES.create () in 229 | let () = Array.(iter (fun n -> NODES.add nodes n.id n) 230 | (append active passive)) in 231 | 232 | let rec make_node () = 233 | let id = next_node_id () in 234 | let addr = "proto://" ^ id in 235 | let config = get_leader_conf t in 236 | mk_node mk_app_state config (id, addr) 237 | 238 | and t = 239 | { rng; ev_queue; election_period; heartbeat_period; rtt; 240 | nodes; make_node; 241 | } 242 | in 243 | t 244 | 245 | let random_node_id t = (NODES.random t.nodes t.rng).id 246 | 247 | let live_nodes t = 248 | let l = ref [] in 249 | NODES.iter (fun _ n -> if not n.stopped then l := n :: !l) t.nodes; 250 | !l 251 | 252 | let s_of_simple_config l = 253 | List.map (fun (id, addr) -> sprintf "%S:%S" id addr) l |> String.concat "; " 254 | 255 | let string_of_config c = 256 | match c with 257 | Simple_config (c, passive) -> 258 | sprintf "Simple ([%s], [%s])" 259 | (s_of_simple_config c) (s_of_simple_config passive) 260 | | Joint_config (c1, c2, passive) -> 261 | sprintf "Joint ([%s], [%s], [%s])" 262 | (s_of_simple_config c1) (s_of_simple_config c2) 263 | (s_of_simple_config passive) 264 | 265 | let string_of_msg string_of_cmd = function 266 | Request_vote { term; candidate_id; last_log_term; last_log_index; _ } -> 267 | sprintf "Request_vote %S last_term:%Ld last_index:%Ld @ %Ld" 268 | candidate_id last_log_term last_log_index term 269 | | Vote_result { term; vote_granted } -> 270 | sprintf "Vote_result %b @ %Ld" vote_granted term 271 | | Append_entries { term; prev_log_index; prev_log_term; entries; _ } -> 272 | let string_of_entry = function 273 | Nop -> "Nop" 274 | | Op cmd -> "Op " ^ Option.default (fun _ -> "") string_of_cmd cmd 275 | | Config c -> "Config [" ^ string_of_config c ^ "]" in 276 | 277 | let payload_desc = 278 | entries |> 279 | List.map 280 | (fun (index, (entry, term)) -> 281 | sprintf "(%Ld, %s, %Ld)" index 282 | (string_of_entry entry) term) |> 283 | String.concat ", " 284 | in 285 | sprintf "Append_entries (%Ld, %Ld, [%s]) @ %Ld" 286 | prev_log_index prev_log_term payload_desc term 287 | | Append_result { term; result = Append_success last_log_index } -> 288 | sprintf "Append_result success %Ld @ %Ld" last_log_index term 289 | | Append_result { term; result = Append_failure prev_log_index } -> 290 | sprintf "Append_result failure %Ld @ %Ld" prev_log_index term 291 | | Pong { term; n } -> sprintf "Pong %Ld @ %Ld" n term 292 | | Ping { term; n } -> sprintf "Ping %Ld @ %Ld" n term 293 | 294 | let describe_event string_of_cmd = function 295 | Election_timeout -> "Election_timeout" 296 | | Heartbeat_timeout -> "Heartbeat_timeout" 297 | | Command cmd -> sprintf "Command %s" 298 | (Option.default (fun _ -> "") string_of_cmd cmd) 299 | | Message (rep_id, msg) -> 300 | sprintf "Message (%S, %s)" rep_id (string_of_msg string_of_cmd msg) 301 | | Func _ -> "Func _" 302 | | Install_snapshot (src, _, term, index, _config) -> 303 | sprintf "Install_snapshot (%S, _, %Ld, %Ld, _)" src term index 304 | | Snapshot_sent (dst, idx) -> sprintf "Snapshot_sent %S last_index:%Ld" dst idx 305 | | Snapshot_send_failed dst -> sprintf "Snapshot_send_failed %S" dst 306 | 307 | let schedule_election t node = 308 | let dt = CLOCK.(t.election_period - t.election_period / 
4L + 309 | of_int (RND.int t.rng (to_int t.election_period lsr 2))) in 310 | let t1 = Event_queue.schedule t.ev_queue dt node.id Election_timeout in 311 | node.next_election <- Some t1 312 | 313 | let schedule_heartbeat t node = 314 | let t1 = Event_queue.schedule 315 | t.ev_queue t.heartbeat_period node.id Heartbeat_timeout 316 | in 317 | node.next_heartbeat <- Some t1 318 | 319 | let unschedule_election _t node = 320 | node.next_election <- None 321 | 322 | let unschedule_heartbeat _t node = 323 | node.next_heartbeat <- None 324 | 325 | let send_cmd t ?(dt=200L) node_id cmd = 326 | ignore (Event_queue.schedule t.ev_queue dt node_id (Command cmd)) 327 | 328 | let must_account time node = function 329 | Election_timeout -> begin match node.next_election with 330 | Some t when t = time -> true 331 | | _ -> false 332 | end 333 | | Heartbeat_timeout -> begin match node.next_heartbeat with 334 | Some t when t = time -> true 335 | | _ -> false 336 | end 337 | | Func _ -> false 338 | | Command _ | Message _ | Install_snapshot _ 339 | | Snapshot_sent _ | Snapshot_send_failed _ -> true 340 | 341 | module CONFIG_MANAGER : 342 | sig 343 | type config_manager 344 | 345 | val make : 346 | ?verbose:bool -> period:int -> 347 | ('op, 'app_state, 'snapshot) t -> config_manager 348 | 349 | val tick : config_manager -> int -> unit 350 | end = 351 | struct 352 | type config_manager = 353 | { verbose : bool; mutable ticks : int; 354 | period : int; des : des; 355 | } 356 | 357 | and des = DES : (_, _, _) t -> des 358 | 359 | let make ?(verbose=false) ~period des = 360 | { verbose; ticks = 0; period; des = DES des; } 361 | 362 | let add_or_promote_failover t des = 363 | 364 | match get_leader_conf des with 365 | | Simple_config (_, []) -> (* add failover *) 366 | begin 367 | let newnode = des.make_node () in 368 | let leader = get_leader des in 369 | let addr = "proto://" ^ newnode.id in 370 | 371 | match C.Config.add_failover newnode.id addr leader.state with 372 | `Already_changed | `Change_in_process 373 | | `Redirect _ | `Unsafe_change _ | `Cannot_change -> () 374 | | `Start_change state -> 375 | if t.verbose then 376 | printf 377 | "!! Adding passive node %S, transitioning to %s\n" 378 | newnode.id 379 | (string_of_config (C.config state)); 380 | leader.state <- state; 381 | NODES.add des.nodes newnode.id newnode 382 | end 383 | 384 | | Joint_config _ | Simple_config ([], _) -> () 385 | 386 | | Simple_config ((_ :: _ as active), (_ :: _ as passive)) -> 387 | let replacee = List.sort compare active |> List.hd |> fst in 388 | let failover = List.sort compare passive |> List.hd |> fst in 389 | 390 | let leader = get_leader des in 391 | 392 | match C.Config.replace ~replacee ~failover leader.state with 393 | `Already_changed | `Change_in_process 394 | | `Redirect _ | `Unsafe_change _ | `Cannot_change -> () 395 | 396 | | `Start_change state -> 397 | if t.verbose then 398 | printf "!! 
Replacing node %S with %S\n" replacee failover; 399 | leader.state <- state 400 | 401 | let tick ({ des = DES des; _ } as t) n = 402 | t.ticks <- t.ticks + n; 403 | (* printf "TICK %d\n" n; *) 404 | if t.ticks > t.period then begin 405 | t.ticks <- 0; 406 | add_or_promote_failover t des 407 | end 408 | end 409 | 410 | module FAILURE_SIMULATOR : 411 | sig 412 | type t 413 | 414 | val make : ?verbose:bool -> msg_loss_rate:float -> period:int -> RND.t -> t 415 | val tick : t -> rep_id list -> int -> unit 416 | val is_msg_lost : t -> src:rep_id -> dst:rep_id -> bool 417 | end = 418 | struct 419 | type t = 420 | { 421 | period : int; 422 | mutable ticks_to_transition : int; 423 | mutable state : [`Down of rep_id | `Up ]; 424 | msg_loss_rate : float; 425 | rng : RND.t; 426 | verbose : bool; 427 | } 428 | 429 | let make ?(verbose=false) ~msg_loss_rate ~period rng = 430 | { verbose; period; ticks_to_transition = period; msg_loss_rate; rng; state = `Up } 431 | 432 | let tick t node_ids n = 433 | t.ticks_to_transition <- t.ticks_to_transition - n; 434 | if t.ticks_to_transition <= 0 then begin 435 | t.ticks_to_transition <- t.period; 436 | match t.state with 437 | `Down id -> 438 | if t.verbose then printf "### Node %S BACK\n" id; 439 | t.state <- `Up 440 | | `Up -> 441 | let node_ids = Array.of_list node_ids in 442 | let id = node_ids.(RND.int t.rng (Array.length node_ids)) in 443 | if t.verbose then printf "### Node %S PARTITIONED\n" id; 444 | t.state <- `Down id 445 | end 446 | 447 | let is_msg_lost t ~src ~dst = match t.state with 448 | | `Down peer when peer = dst || peer = src -> true 449 | | _ -> RND.float t.rng 1.0 < t.msg_loss_rate 450 | end 451 | 452 | let simulate 453 | ?(verbose = false) ?string_of_cmd ~msg_loss_rate 454 | ~on_apply ~take_snapshot ~install_snapshot t = 455 | 456 | let send_cmd ?(dst = random_node_id t) ?dt cmd = 457 | send_cmd ?dt t dst cmd in 458 | 459 | let fail_sim = FAILURE_SIMULATOR.make 460 | ~verbose ~msg_loss_rate ~period:1000 t.rng in 461 | let configmgr = CONFIG_MANAGER.make ~verbose t ~period:5000 in 462 | 463 | let react_to_event time node ev = 464 | (* Tick at least once per event so that fallen nodes eventually come 465 | * back even if the cluster is making no progress (i.e. in absence of 466 | * commits by other nodes). Avoids getting stuck when commits are 467 | * blocked because there's no safe quorum (only nodes that will not be 468 | * decommissioned) during a configuration change. 
*) 469 | FAILURE_SIMULATOR.tick fail_sim 470 | (node.id :: (C.peers node.state |> List.map fst)) 1; 471 | CONFIG_MANAGER.tick configmgr 1; 472 | let considered = must_account time node ev in 473 | let () = 474 | if considered && verbose then 475 | printf "%Ld @ %s -> %s\n" time node.id (describe_event string_of_cmd ev) in 476 | 477 | let s, actions = match ev with 478 | Election_timeout -> begin 479 | match node.next_election with 480 | Some t when t = time -> C.election_timeout node.state 481 | | _ -> (node.state, []) 482 | end 483 | | Heartbeat_timeout -> begin 484 | match node.next_heartbeat with 485 | | Some t when t = time -> C.heartbeat_timeout node.state 486 | | _ -> (node.state, []) 487 | end 488 | | Command c -> C.client_command c node.state 489 | | Message (peer, msg) -> C.receive_msg node.state peer msg 490 | | Func f -> 491 | f time; 492 | (node.state, []) 493 | | Install_snapshot (src, snapshot, last_term, last_index, config) -> 494 | let s, accepted = C.install_snapshot 495 | ~last_term ~last_index ~config node.state 496 | in 497 | ignore (Event_queue.schedule t.ev_queue 40L src 498 | (Snapshot_sent (node.id, last_index))); 499 | if accepted then install_snapshot node snapshot; 500 | (s, []) 501 | | Snapshot_sent (peer, last_index) -> 502 | C.snapshot_sent peer ~last_index node.state 503 | | Snapshot_send_failed peer -> 504 | C.snapshot_send_failed peer node.state 505 | in 506 | 507 | let rec exec_action = function 508 | Apply cmds -> 509 | if verbose then 510 | printf " Apply %d cmds [%s]\n" 511 | (List.length cmds) 512 | (List.map 513 | (fun (idx, cmd, term) -> 514 | sprintf "(%Ld, %s, %Ld)" 515 | idx 516 | (Option.default (fun _ -> "") string_of_cmd cmd) 517 | term) 518 | cmds |> 519 | String.concat ", "); 520 | (* simulate current leader being cached by client *) 521 | begin match on_apply ~time node cmds with 522 | `Snapshot (last_index, app_state) -> 523 | node.app_state <- app_state; 524 | node.state <- C.compact_log last_index node.state 525 | | `State app_state -> 526 | node.app_state <- app_state 527 | end 528 | | Become_candidate -> 529 | if verbose then printf " Become_candidate\n"; 530 | unschedule_heartbeat t node; 531 | exec_action Reset_election_timeout 532 | | Become_follower None -> 533 | if verbose then printf " Become_follower\n"; 534 | unschedule_heartbeat t node; 535 | exec_action Reset_election_timeout 536 | | Become_follower (Some leader) -> 537 | if verbose then printf " Become_follower %S\n" leader; 538 | unschedule_heartbeat t node; 539 | exec_action Reset_election_timeout 540 | | Become_leader -> 541 | if verbose then printf " Become_leader\n"; 542 | unschedule_election t node; 543 | schedule_election t node; 544 | schedule_heartbeat t node 545 | | Changed_config -> 546 | if verbose then 547 | printf " Changed config to %s (committed %s)\n" 548 | (C.config node.state |> string_of_config) 549 | (C.committed_config node.state |> string_of_config); 550 | 551 | (* If a Simple_config has been committed, remove nodes no longer 552 | * active from node set. We also check in the most recent 553 | * configuration because the committed config could be severely 554 | * out of date if the node had received a snapshot. 555 | * *) 556 | begin match C.config node.state, C.committed_config node.state with 557 | Joint_config _, _ | _, Joint_config _ -> () 558 | | Simple_config (c1, p1), Simple_config (c2, p2) -> 559 | (* Stop nodes removed from configuration. 
Needed when the 560 | * node removed was not the leader and thus never gets the 561 | * Stop action, since it is no longer in the configuration 562 | * by the time the leader sends the Append_entries message 563 | * that would let it commit the configuration change. *) 564 | let module S = Set.Make(String) in 565 | let all = List.concat [c1; p1; c2; p2] |> List.map fst |> 566 | List.fold_left (fun s x -> S.add x s) S.empty in 567 | 568 | let to_be_removed = ref [] in 569 | 570 | let collect_removee id n = 571 | if not (S.mem id all) then begin 572 | n.stopped <- true; 573 | to_be_removed := id :: !to_be_removed 574 | end in 575 | 576 | let () = NODES.iter collect_removee t.nodes in 577 | List.iter (NODES.remove t.nodes) !to_be_removed 578 | end 579 | | Exec_readonly n -> 580 | if verbose then printf " Exec_readonly %Ld\n" n 581 | | Redirect (Some leader, cmd) -> 582 | if verbose then printf " Redirect %s\n" leader; 583 | send_cmd ~dst:leader cmd 584 | | Redirect (None, cmd) -> 585 | if verbose then printf " Redirect\n"; 586 | (* Send to a random server. *) 587 | (* We use a large redirection delay to speed up the simulation: 588 | * otherwise, most of the time is spent simulating redirections, 589 | * since there's no time to elect a new leader before the next 590 | * attempt. This makes sense in practice too, because when you 591 | * know there's no leader, you don't want to retry until you're 592 | * confident there's one --- and it's assumed the election period 593 | * is picked suitably to the network performance. *) 594 | send_cmd ~dt:CLOCK.(t.election_period / 2L) cmd 595 | | Reset_election_timeout -> 596 | if verbose then printf " Reset_election_timeout\n"; 597 | unschedule_election t node; 598 | schedule_election t node 599 | | Reset_heartbeat -> 600 | if verbose then printf " Reset_heartbeat\n"; 601 | unschedule_heartbeat t node; 602 | schedule_heartbeat t node 603 | | Send (rep_id, _addr, msg) -> 604 | let dropped = FAILURE_SIMULATOR.is_msg_lost fail_sim 605 | ~src:node.id ~dst:rep_id 606 | in 607 | if verbose then 608 | printf " Send to %S <- %s%s\n" rep_id 609 | (string_of_msg string_of_cmd msg) 610 | (if dropped then " DROPPED" else ""); 611 | if not dropped then begin 612 | let dt = Int64.(t.rtt - t.rtt / 4L + 613 | of_int (RND.int t.rng (to_int t.rtt lsr 1))) 614 | in 615 | ignore (Event_queue.schedule t.ev_queue dt rep_id 616 | (Message (node.id, msg))) 617 | end 618 | | Send_snapshot (dst, _addr, idx, config) -> 619 | if verbose then 620 | printf " Send_snapshot (%S, %Ld)\n" dst idx; 621 | let dropped = FAILURE_SIMULATOR.is_msg_lost fail_sim 622 | ~src:node.id ~dst 623 | in 624 | if not dropped then begin 625 | let last_index, snapshot, last_term = take_snapshot node in 626 | let dt = Int64.(t.rtt - t.rtt / 4L + 627 | of_int (RND.int t.rng (to_int t.rtt lsr 1))) 628 | in 629 | ignore begin 630 | Event_queue.schedule t.ev_queue dt dst 631 | (Install_snapshot 632 | (node.id, snapshot, last_term, last_index, config)) 633 | end 634 | end else begin 635 | ignore (Event_queue.schedule t.ev_queue 636 | CLOCK.(10L * t.rtt) node.id (Snapshot_send_failed dst)) 637 | end 638 | | Stop -> 639 | if verbose then printf " Stop\n"; 640 | (* block events on this node *) 641 | node.stopped <- true; 642 | () 643 | 644 | in 645 | node.state <- s; 646 | List.iter exec_action actions; 647 | considered 648 | in 649 | 650 | let steps = ref 0 in 651 | (* schedule initial election timeouts *) 652 | NODES.iter (fun _ n -> schedule_election t n) t.nodes; 653 | 654 | try 655 | let rec loop () = 656 | 
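(* Main simulation loop: dequeue events in timestamp order. Func events run
   even for stopped or removed nodes; any other event addressed to a stopped
   node is dropped; only events that must_account flags as relevant are
   counted as simulation steps. *)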
match Event_queue.next t.ev_queue with 657 | None -> !steps 658 | | Some (time, rep_id, ev) -> 659 | match NODES.find t.nodes rep_id, ev with 660 | None, Func f -> 661 | f time; 662 | loop () 663 | | None, _ -> loop () 664 | | Some n, _ -> 665 | let blocked = match ev with Func _ -> false | _ -> n.stopped in 666 | if not blocked && react_to_event time n ev then incr steps; 667 | loop () 668 | in loop () 669 | with Exit -> !steps 670 | end 671 | 672 | module FQueue : 673 | sig 674 | type 'a t 675 | 676 | val empty : 'a t 677 | val push : 'a -> 'a t -> 'a t 678 | val length : 'a t -> int 679 | val to_list : 'a t -> 'a list 680 | end = 681 | struct 682 | type 'a t = int * 'a list 683 | 684 | let empty = (0, []) 685 | let push x (n, l) = (n + 1, x :: l) 686 | let length (n, _) = n 687 | let to_list (_, l) = List.rev l 688 | end 689 | 690 | let run 691 | ?(seed = 2) 692 | ~num_nodes ?(num_cmds = 100_000) 693 | ~election_period ~heartbeat_period ~rtt ~msg_loss_rate 694 | ?(verbose=false) () = 695 | let module S = Set.Make(String) in 696 | 697 | let completed = ref S.empty in 698 | let init_cmd = 1 in 699 | let last_sent = ref init_cmd in 700 | let ev_queue = DES.Event_queue.create () in 701 | 702 | let batch_size = 20 in 703 | let retry_period = CLOCK.(4L * election_period) in 704 | 705 | let applied = BatBitSet.create (2 * num_cmds) (* work around BatBitSet bug *) in 706 | 707 | let rng = Random.State.make [| seed |] in 708 | 709 | let des = DES.make ~ev_queue ~rng ~num_nodes 710 | ~election_period ~heartbeat_period ~rtt 711 | (fun () -> (0L, FQueue.empty, 0L)) in 712 | 713 | let rec schedule dt node cmd = 714 | let _ = DES.Event_queue.schedule ev_queue dt node (DES.Command cmd) in 715 | (* after the retry_period, check if the cmd has been executed 716 | * and reschedule if needed *) 717 | 718 | let f _ = 719 | if not (BatBitSet.mem applied cmd) then 720 | schedule election_period (DES.random_node_id des) cmd in 721 | 722 | let _ = DES.Event_queue.schedule ev_queue 723 | CLOCK.(dt + retry_period) 724 | (DES.random_node_id des) (DES.Func f) 725 | in () in 726 | 727 | let check_if_finished node len = 728 | if len >= num_cmds && List.mem node (DES.live_nodes des) then begin 729 | completed := S.add (DES.node_id node) !completed; 730 | if S.cardinal !completed >= num_nodes then 731 | raise Exit 732 | end in 733 | 734 | let apply_one ~time node acc (index, cmd, term) = 735 | if cmd mod (if verbose then 1 else 10_000) = 0 then 736 | printf "XXXXXXXXXXXXX apply %S cmd:%d index:%Ld term:%Ld @ %Ld\n%!" 737 | (DES.node_id node) cmd index term time; 738 | let q = match acc with | `Snapshot (_, (_, q, _)) | `State (_, q, _) -> q in 739 | let id = DES.node_id node in 740 | let q = FQueue.push cmd q in 741 | let len = FQueue.length q in 742 | BatBitSet.set applied cmd; 743 | if cmd >= !last_sent then begin 744 | (* We schedule the next few commands being sent to the current leader 745 | * (simulating the client caching the current leader). 
*) 746 | let dt = CLOCK.(heartbeat_period - 10L) in 747 | let dst = Option.default id (DES.leader_id node) in 748 | for i = 1 to batch_size do 749 | incr last_sent; 750 | let cmd = !last_sent in 751 | schedule CLOCK.(of_int i * dt) dst cmd 752 | done 753 | end; 754 | check_if_finished node len; 755 | if cmd mod 10 = 0 then begin 756 | if verbose then 757 | printf "XXXXX snapshot %S %Ld (last cmds: %s)\n" id index 758 | (FQueue.to_list q |> List.rev |> List.take 5 |> List.rev |> 759 | List.map string_of_int |> String.concat ", "); 760 | `Snapshot (index, (index, q, term)) 761 | end else 762 | `State (index, q, term) 763 | in 764 | 765 | let on_apply ~time node cmds = 766 | List.fold_left (apply_one ~time node) (`State (DES.app_state node)) cmds in 767 | 768 | let take_snapshot node = 769 | if verbose then printf "TAKE SNAPSHOT at %S\n" (DES.node_id node); 770 | let (index, _, term) as snapshot = DES.app_state node in 771 | (index, snapshot, term) in 772 | 773 | let install_snapshot node ((last_index, q, last_term) as snapshot) = 774 | if verbose then 775 | printf "INSTALL SNAPSHOT at %S last_index:%Ld last_term:%Ld\n" 776 | (DES.node_id node) last_index last_term; 777 | DES.set_app_state node snapshot; 778 | check_if_finished node (FQueue.length q) 779 | in 780 | 781 | (* schedule init cmd delivery *) 782 | let () = schedule 1000L (DES.random_node_id des) init_cmd in 783 | let t0 = Unix.gettimeofday () in 784 | let steps = DES.simulate 785 | ~verbose ~string_of_cmd:string_of_int 786 | ~msg_loss_rate 787 | ~on_apply ~take_snapshot ~install_snapshot 788 | des in 789 | let dt = Unix.gettimeofday () -. t0 in 790 | (* We filter out nodes with 0 entries, i.e. those that were being introduced 791 | * into the cluster at the end of the simulation. *) 792 | let nodes = DES.live_nodes des |> 793 | List.filter 794 | (fun node -> 795 | let _, q, _ = DES.app_state node in 796 | FQueue.length q <> 0) in 797 | let ncmds = nodes |> 798 | List.map 799 | (fun node -> 800 | let _, q, _ = DES.app_state node in 801 | printf "%S: len %d\n" (DES.node_id node) (FQueue.length q); 802 | FQueue.length q) |> 803 | List.fold_left min max_int in 804 | let logs = nodes |> 805 | List.map 806 | (fun node -> 807 | let _, q, _ = DES.app_state node in 808 | FQueue.to_list q |> List.take ncmds) in 809 | let ok, _ = List.fold_left 810 | (fun (ok, l) l' -> match l with 811 | None -> (ok, Some l') 812 | | Some l -> (ok && l = l', Some l')) 813 | (true, None) 814 | logs 815 | in 816 | printf "%d commands\n" ncmds; 817 | printf "Simulated %d steps (%4.2f steps/cmd, %.0f steps/s, %.0f cmds/s).\n" 818 | steps (float steps /. float ncmds) 819 | (float steps /. dt) (float ncmds /. 
dt); 820 | if ok then 821 | print_endline "OK" 822 | else begin 823 | print_endline "FAILURE: replicated logs differ"; 824 | List.iteri 825 | (fun n l -> 826 | let all_eq, _ = List.fold_left 827 | (fun (b, prev) x -> match prev with 828 | None -> (true, Some x) 829 | | Some x' -> (b && x = x', Some x)) 830 | (true, None) l in 831 | let desc = if all_eq then "" else " DIFF" in 832 | let s = List.map string_of_int l |> String.concat " " in 833 | printf "%8d %s%s\n" n s desc) 834 | (List.transpose logs); 835 | exit 1; 836 | end 837 | 838 | let params = 839 | [| 840 | 800L, 200L, 50L, 0.01; 841 | 4000L, 200L, 50L, 0.20; 842 | 8000L, 200L, 50L, 0.50; 843 | 40000L, 200L, 50L, 0.60; 844 | (* 40000L, 200L, 50L, 0.70; *) 845 | |] 846 | 847 | let random rng a = a.(RND.int rng (Array.length a)) 848 | 849 | let () = 850 | for i = 1 to 100 do 851 | let rng = RND.make [| i |] in 852 | let num_nodes = 1 + RND.int rng 5 in 853 | let election_period, heartbeat_period, rtt, msg_loss_rate = random rng params in 854 | print_endline (String.make 78 '='); 855 | printf "seed:%d num_nodes:%d election:%Ld heartbeat:%Ld rtt:%Ld loss:%.4f\n%!" 856 | i num_nodes election_period heartbeat_period rtt msg_loss_rate; 857 | run ~seed:i ~verbose:false 858 | ~num_nodes ~election_period ~heartbeat_period ~rtt ~msg_loss_rate 859 | (); 860 | print_endline ""; 861 | done 862 | --------------------------------------------------------------------------------
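A minimal sketch of wiring an application on top of Oraft_rsm, modeled on test/dict.ml above. The Echo operation, the UNIX-socket addresses and the "node,app" address encoding are illustrative assumptions rather than anything defined in the repository; the module and function signatures follow src/oraft_rsm_s.ml and src/oraft_lwt_s.ml.

(* Illustrative sketch only: a trivial replicated echo service built on
 * Oraft_rsm, following the conventions of test/dict.ml.  The Echo op and
 * the example addresses are made up. *)
open Lwt.Infix

type op = Echo of string

module Conf = struct
  type nonrec op = op

  let string_of_op (Echo s) = s
  let op_of_string s = Echo s

  (* Addresses are "node,app" pairs of UNIX socket paths, as in dict.ml. *)
  let node_sockaddr s = Unix.ADDR_UNIX (BatString.split ~by:"," s |> fst)
  let app_sockaddr  s = Unix.ADDR_UNIX (BatString.split ~by:"," s |> snd)
  let string_of_address s = s
end

module Server = Oraft_rsm.Make_server(Conf)
module Client = Oraft_rsm.Make_client(Conf)

(* The apply function runs on every replica once an entry is committed. *)
let exec _server (Echo s) = Server.Core.Sync (Lwt.return_ok s)

(* First node of a cluster: quorum of one until other nodes join it;
 * further nodes would pass ~join:<existing address> to Server.make. *)
let run_node addr = Server.make exec addr addr >>= Server.run

(* Client side: connect to the app address and execute one command. *)
let echo addr s =
  let c = Client.make ~id:(string_of_int (Unix.getpid ())) () in
  Client.connect c ~addr >>= fun () ->
  Client.execute c (Echo s)

let () = Lwt_main.run (run_node "n1a,n1b")

Compared with test/dict.ml, this sketch drops the cmdliner front end and the TLS wrappers and keeps only the Conf module, the apply function and the client call sequence.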