├── .gitignore ├── META ├── Makefile ├── Makefile.opamlib ├── README.md ├── _tags ├── doc ├── introduction.md └── replication_performance.md ├── opam ├── src ├── raft.mllib ├── raft.odocl ├── raft_helper.ml ├── raft_helper.mli ├── raft_log.ml ├── raft_log.mli ├── raft_protocol.ml ├── raft_protocol.mli ├── raft_types.ml └── raft_types.mli └── tests └── test.ml /.gitignore: -------------------------------------------------------------------------------- 1 | *.cm* 2 | *.tsk 3 | *.a 4 | *.o 5 | 6 | *.sw* 7 | -------------------------------------------------------------------------------- /META: -------------------------------------------------------------------------------- 1 | version = "0.1.0" 2 | description = "Raft Protocol Logical Library" 3 | archive(byte) = "raft.cma" 4 | archive(native) = "raft.cmxa" 5 | exists_if = "raft.cma" 6 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | LIB_NAME=raft 2 | 3 | LIB_FILES+=raft_helper 4 | LIB_FILES+=raft_protocol 5 | LIB_FILES+=raft_types 6 | LIB_FILES+=raft_log 7 | 8 | LIB_DEPS= 9 | 10 | test: 11 | $(OCB) test.native 12 | export OCAMLRUNPARAM="b" && ./test.native 13 | 14 | doc-gen: 15 | $(OCB) src/$(LIB_NAME).docdir/index.html 16 | 17 | ## Generic library makefile ## 18 | include Makefile.opamlib 19 | 20 | -------------------------------------------------------------------------------- /Makefile.opamlib: -------------------------------------------------------------------------------- 1 | ifneq ($(LIB_DEPS),) 2 | LIB_DEPS:=-pkgs $(LIB_DEPS) 3 | endif 4 | 5 | OCB_INC = -I src -I tests 6 | OCB_FLAGS = -use-ocamlfind $(LIB_DEPS) 7 | OCB = ocamlbuild $(OCB_FLAGS) $(OCB_INC) 8 | 9 | .PHONY: lib.native lib.byte lib.install lib.uninstall clean 10 | 11 | lib.native: 12 | $(OCB) $(LIB_NAME).cmxa 13 | $(OCB) $(LIB_NAME).cmxs 14 | 15 | lib.byte: 16 | $(OCB) $(LIB_NAME).cma 17 | 18 | LIB_BUILD =_build/src/ 19 | LIB_INSTALL = META 20 | LIB_INSTALL +=$(patsubst %,$(LIB_BUILD)/%.mli,$(LIB_FILES)) 21 | LIB_INSTALL +=$(patsubst %,$(LIB_BUILD)/%.cmi,$(LIB_FILES)) 22 | LIB_INSTALL +=$(patsubst %,$(LIB_BUILD)/%.annot,$(LIB_FILES)) 23 | LIB_INSTALL +=$(patsubst %,$(LIB_BUILD)/%.cmo,$(LIB_FILES)) 24 | LIB_INSTALL +=$(LIB_BUILD)/$(LIB_NAME).cma 25 | 26 | LIB_INSTALL +=-optional 27 | LIB_INSTALL +=$(patsubst %,$(LIB_BUILD)/%.cmx,$(LIB_FILES)) 28 | LIB_INSTALL +=$(patsubst %,$(LIB_BUILD)/%.cmt,$(LIB_FILES)) 29 | LIB_INSTALL +=$(LIB_BUILD)/$(LIB_NAME).cmxa 30 | LIB_INSTALL +=$(LIB_BUILD)/$(LIB_NAME).cmxs 31 | LIB_INSTALL +=$(LIB_BUILD)/$(LIB_NAME).a 32 | 33 | lib.install: 34 | ocamlfind install $(LIB_NAME) $(LIB_INSTALL) 35 | 36 | lib.uninstall: 37 | ocamlfind remove $(LIB_NAME) 38 | 39 | clean: 40 | $(OCB) -clean 41 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Raft Protocol ## 2 | 3 | > This repo implements the [Raft consensus protocol](https://raft.github.io/) 4 | > in a functional way using the OCaml language. 5 | 6 | The library solely implements the pure logical part of the protocol. It is 7 | **agnostic** of the transport mechanism, server identification and 8 | payload (ie log data) format. 
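For instance, handling an incoming vote request boils down to calling a pure function: the library computes the new state and the response message, while the application stays in charge of the clock and of actually sending bytes on the wire. The sketch below is purely illustrative (the `send` callback and the surrounding driver are not part of this library); `Raft_protocol.handle_request_vote_request` is the function defined in [src/raft_protocol.ml](src/raft_protocol.ml).

```OCaml
(* Illustrative host-application code: the application owns the
 * transport ([send]) and the clock ([now]); the library performs
 * no I/O of its own. *)
let on_request_vote_request ~send ~now state request =
  let state, response =
    Raft_protocol.handle_request_vote_request state request now
  in
  send response;
  state
```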
9 | 10 | ## Project structure ## 11 | 12 | #### [Types](src/raft.proto) #### 13 | 14 | All the types involved in the RAFT protocoal are defined in the 15 | [protobuf](https://developers.google.com/protocol-buffers/) format (see [raft.proto](src/raft.proto)). 16 | [ocaml-protoc](https://github.com/mransan/ocaml-protoc/) compiler is then used to 17 | generate [the equivalent types](src/raft_pb.mli) in OCaml as well as serialization functions. 18 | 19 | 20 | #### [Helper Functions](src/raft_helper.mli) #### 21 | 22 | The module [Raft_helper](src/raft_helper.mli) provides convenience routines to manipulate the 23 | generated [types](src/raft_pb.mli). 24 | 25 | 26 | #### [Protocol Logic](src/raft_logic.mli) #### 27 | 28 | The protocol logic is implemented in the [Raft_logic](src/raft_logic.mli) module. The protcol implementation 29 | is divided into 2 subsections for each of the 2 request/response of the protocol: 30 | * Request Vote : For leader election 31 | * Append log entry: For state replication 32 | 33 | Each subsection implements the following logic: 34 | 35 | **Make Request** 36 | 37 | > This function creates a request value given the current state of the server and optionally which 38 | > server the request is sent to. 39 | 40 | **Handle Request** 41 | 42 | > This function computes the effect of the received request to the server state. The implementaiton being 43 | > functional, a new state is returned. 44 | 45 | **Handle Response** 46 | 47 | > This function computes the effect of a received response to the server. Along with a new state, a 48 | > **Follow Up Action** is returned to let the application perform the expected actions. 49 | 50 | 51 | #### Example 52 | 53 | The example below demonstrates how to use the library to implement an Append Entry request/response 54 | communication between a leader (server 0) and a follower (server 1). In this example the 55 | leader has a log which needs to be replicated on the follower. 56 | 57 | ```OCaml 58 | let () = 59 | (* Example *) 60 | 61 | (* Create a 3 server configuration. 62 | *) 63 | let configuration = Raft_pb.( { 64 | nb_of_server = 3; 65 | election_timeout = 0.1; 66 | }) in 67 | 68 | (* Create a leader state by simulating a (rigged) election 69 | *) 70 | let leader_0 = 71 | Raft_helper.Follower.create ~configuration ~id:0 () 72 | |> Raft_helper.Candidate.become ~now:0.0 73 | |> Raft_helper.Leader.become 74 | |> Raft_helper.Leader.add_log (Bytes.of_string "Foo") 75 | in 76 | 77 | (* Create a follower 78 | *) 79 | let follower_1 = 80 | Raft_helper.Follower.create ~configuration ~id:1 () 81 | in 82 | 83 | (* First create an 'Append Entries' request from the 84 | leader to server 1. 85 | *) 86 | match Raft_logic.Append_entries.make leader_0 1 with 87 | | Some request -> ( 88 | 89 | (* 'Update' server 1 (ie follower) by applying the request. This returns 90 | the response to send to the leader. 91 | *) 92 | let follower_1, response = Raft_logic.Append_entries.handle_request follower_1 request in 93 | 94 | (* 'Update' server 0 (ie leader) by applying the response. This returns 95 | the new state as well as a follow up action to take. 
96 | *) 97 | let leader_0, action = Raft_logic.Append_entries.handle_response leader_0 response in 98 | 99 | (* Check that the follower has successfully replicated the leader's single 100 | log 101 | *) 102 | match follower_1.log with 103 | | {data; _ } :: [] -> 104 | if "Foo" = Bytes.to_string data 105 | then print_endline "Log successfully replicated in follower" 106 | else print_endline "Log replication was corrupted" 107 | | _ -> print_endline "Log replication failure" 108 | ) 109 | | None -> () 110 | ``` 111 | 112 | ## Notes on implementing Log size limit ## 113 | 114 | **Problem with current implementation** 115 | 116 | > The current implementation keeps on adding `log entry`s to an in-memory 117 | > log data structure and never deletes them. 118 | > Eventually the process will run out of memory. 119 | 120 | The idea would be to keep only the most recent `log entry`s in memory; keep in 121 | mind that all `log entry`s are permanently recorded on disk. 122 | 123 | **Problem with Limiting the log size** 124 | 125 | In normal mode of operation, followers replicate the leader log 126 | quickly and are never too far behind. Problems arise when one of the servers is 127 | taken offline for a long time. In this case the follower has a lot of 128 | `log entry`s to catch up on. This creates 2 major issues: 129 | 130 | * The leader does a lot of work to make the follower catch up. The leader 131 | keeps sending Append Entries requests one after the other and this usually 132 | slows down the leader process, which also needs to keep appending new 133 | `log entry`s. 134 | 135 | * If we limit the log size as proposed above, then it is likely that if the 136 | follower has been taken offline for an extended period of time the logs 137 | it needs to replicate next are no longer stored in the in-memory log data 138 | structure of the leader. 139 | 140 | **Solution overview** 141 | 142 | First, the leader can easily detect when the follower is too far behind 143 | (see notes on backlog situation detection). The next Append Entries request 144 | it sends can indicate to the follower that no logs can be replicated 145 | until the follower has caught up. Additionally, through heartbeat Append Entries 146 | requests the leader can also indicate the minimum log index to be replicated 147 | on the follower for it to resume the RAFT replication. 148 | 149 | In order not to disrupt the leader, a separate process called `backlog` can 150 | run on each of the RAFT servers. Its sole purpose is streaming logs back to the 151 | follower which is lagging behind. This multi-process approach offers several 152 | advantages: 153 | 154 | * The backlog server protocol could be optimized for sending large quantities 155 | of logs. For instance compression could be used and TCP is likely more 156 | suited as well. 157 | 158 | * The backlog process won't affect the raft server (leader); it will use 159 | limited memory, and can run on a separate CPU. Furthermore it will only 160 | read past `log entry`s from the permanent storage, which are immutable. 161 | Databases like RocksDB allow a separate process to open the DB in read-only 162 | mode, and our design could fully leverage this feature. 163 | 164 | * The follower which is behind could use the backlog server of another follower; 165 | in general the leader process is the one which consumes the most resources, 166 | so it could be better to use a follower. 167 | 168 | **Solution Details** 169 | 170 | * 2 new fields in configuration: max_log_size\_\{upper_bound|lower_bound}.
When 171 | adding new log entries will make the log size go over upper bound, then 172 | truncate the log to lower bound prior to adding the new log entries. 173 | 174 | * Implement the enforcement of the upper and lower bound value in 175 | [Raft_log](src/raft_log.mli) module 176 | 177 | * Leader detecs the backlog situation when the follower sends back an 178 | Append Entries Response with `prev log index` which is smaller than the 179 | earliest `log entry` in the in-memory data structure. 180 | 181 | * Append Entries Request needs to be modified to notify the follower that it is 182 | in a backlog situation. 183 | 184 | * Follower state needs to keep track of the backlog requirement (ie the 185 | earlies `log entry` to be replicated. 186 | 187 | * `Raft_logic.result` should inform the app of the backlog requirement. This 188 | will then leveraged by the application to contact a backlog server for 189 | streaming back the missing logs. 190 | 191 | * Add `Raft_logic.handle_add_backlog` for the application to fill back the 192 | in memory log data structure of the follower 193 | 194 | * Make sure to read the lower bound entries backward from the permanent storage 195 | when starting a server 196 | -------------------------------------------------------------------------------- /_tags: -------------------------------------------------------------------------------- 1 | true: bin_annot 2 | true: annot 3 | true: safe_string 4 | true: keep_locs 5 | true: warn(A-4-42-41) 6 | true: debug 7 | true: no_alias_deps 8 | -------------------------------------------------------------------------------- /doc/introduction.md: -------------------------------------------------------------------------------- 1 | # A Functional Implementation of the RAFT protocol 2 | 3 | ## Introduction 4 | 5 | > In this serie of blog posts we will implement the consensus protocol RAFT in 6 | > a purely functional style. 7 | 8 | **Consensus algorithms** are interesting protocols which are particularly relevant with modern distributed architetures. The RAFT protocol proposes a relatively simple to understand approach and offers a great introduction to the consensus problem. 9 | 10 | The benefit of **Functional Programming** have been discussed (and argued over) numerous time, the focus here is rather to provide a concrete implementation of a protocol with a functional approach. 11 | 12 | **OCaml** language is elegant, well proven and really fast. No OCaml knowledge is required for these blog posts and if you are interested in learning this language through a concrete application then stay tuned. We'll only cover a very small fraction of the language and its ecosystem and hopefuly it will make you want to lean more. 13 | 14 | Here are the main technology we'll be using: 15 | 16 | * [Google Protobuf](https://developers.google.com/protocol-buffers): Language agnostic message speficications. We'll use that format for both the protocol messages and state specifications. 17 | * [OCaml-protoc](https://github.com/mransan/ocaml-protoc): OCaml compiler for protobuf messages. 18 | * [OCaml](http://ocaml.org/) core language 19 | * [Lwt](http://ocsigen.org/lwt/):OCaml library for concurrent programming (with futures and promises) 20 | * Unix UDP for the transport protocol 21 | 22 | ## Consensus Protocols 23 | 24 | The goal of a consensus protocols is to ensure that participating servers will eventually be consistent even if certain failure happened. 
Such protocol can differ with regards to the state they manage as well as the type of failure they are resilient to. 25 | 26 | **RAFT** protocol ensures the consistent execution of a `state machine` and is resilient to `fail-stop` failures. `fail-stop` failures are essentially server crashes or a server not receiving messages. **RAFT** protocol does not support Byzantine failures which is when a server is acting maliciously. 27 | 28 | In the next section we will look into details about what is a state machine with a concrete and simple example implemented in OCaml. 29 | 30 | #### State Machine 31 | 32 | A `state machine` is composed of a state and a series of command to be executed on the state. Each command has a payload and executing the command will modify the state. 33 | 34 | For instance if we want to represent a (limited) algebra state machine we could have the following: 35 | ```OCaml 36 | (** Named variable *) 37 | type var = { 38 | name : string; 39 | value : float; 40 | } 41 | 42 | (** State of the state machine *) 43 | type state = var list 44 | 45 | (** Command of the state machine *) 46 | type cmd = 47 | | Store of var 48 | | Add_and_assign of string * string * string 49 | (* (lhs , rhs , new_variable_name) *) 50 | ``` 51 | 52 | Let's stop here for a second and look at our first OCaml code. The code above demonstrates the use of the 3 most widely used OCaml types 53 | 54 | * **Records** (`type var = {...}`): Similar to a C `struct` 55 | * **List** (`type state = var list`): List is a builtin type in OCaml and represents a singly linked list. It's an immutable data structure. 56 | * **Variant** (`type cmd = |... |... `): Type to represent a choice. `cmd` can either be `Store` or `Add_and_assign`. Each choice is called a constructor. A constructor can have zero or many arguments. 57 | 58 | `Store x` adds the given variable `x` to the state. 59 | 60 | `Add_and_assign ("x", "y", "z")` sums the values associated with `"x"` and `"y"` and stores the result in a variable with name `"z"`. 61 | 62 | Here is an example of a sequence of commands: 63 | 64 | ```OCaml 65 | let cmds = [ 66 | Store {name = "x"; value = 1.}; 67 | Store {name = "y"; value = 4.}; 68 | Add_and_assign ("x", "y", "z"); 69 | ] 70 | ``` 71 | After exectution of the above commands on an initial empty list state, we would then expect the resulting state: 72 | ```OCaml 73 | let expected_state = [ 74 | {name = "z"; value = 5.}; 75 | {name = "y"; value = 4.}; 76 | {name = "x"; value = 1.}; 77 | ] 78 | ``` 79 | As far as OCaml is concerned we've just learned how to create values of the types we previously defined. 80 | 81 | Let's now write our first function in OCaml which will perform the execution of the state machine command: 82 | 83 | ```OCaml 84 | let execute cmd state = 85 | match cmd with 86 | | Store v -> v::state 87 | | Add_and_assign (xname, yname, zname) -> 88 | let xvar = List.find (fun v -> v.name = xname) state in 89 | let yvar = List.find (fun v -> v.name = yname) state in 90 | {name = zname; value = xvar.value +. yvar.value} :: state 91 | ``` 92 | 93 | Let's look into more details at a few constructs that the OCaml language offers: 94 | 95 | **`match cmd with | Store ..-> ... | ...`** 96 | 97 | This `match with` expression performs a proof by case logic. The OCaml compiler has special support to detect missing cases which helps a lot for finding bugs early. This construct is called **pattern matching** and is heavily used in OCaml. 
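To see this check in action, here is a small, deliberately incomplete function over the `cmd` type defined above (`is_store` is just an illustrative helper, not part of the state machine):

```OCaml
(* Deliberately incomplete: the [Add_and_assign] case is not handled. *)
let is_store cmd =
  match cmd with
  | Store _ -> true
```

The compiler accepts it but immediately emits warning 8 (non-exhaustive pattern matching), together with an example of a value that is not matched, here `Add_and_assign (_, _, _)`.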
98 | 99 | **`v::state`** 100 | 101 | The expression `v::state` is simply the builtin syntax for to append a value (`v`) at the head of the list (`state`). 102 | 103 | **`fun x -> ...`** 104 | 105 | OCaml is a functional language; you can create anonymous function using `(fun x -> ...)` expression. 106 | 107 | **`yvar.value`** 108 | 109 | Record fields access using the classic `.` (dot) syntax. (`yvar.value`). 110 | 111 | > Notice the lack of type annotation in the above code! In fact the OCaml compiler infer all the types and guarantees type safety. The syntax is minimal without sacrifying program correctness. 112 | 113 | ## Reaching consensus for a State machine 114 | 115 | Because a state machine has a deterministic execution, in order to get a replicated state on all the servers, the consensus protocol must solely ensure the correct ordered replication of the commands. 116 | 117 | The **RAFT** protocol is agnostic of the type of command; in fact for our implementation we will define the command as a byte sequence. Each command is wrapped by the RAFT protocol into a `log entry` data structure which uniquely index and stricly order each command. 118 | 119 | This `log entry` is a fundamental part of the RAFT protocol and used throughout messages and state specification. Let's therefore introduce our first Protobuf message: 120 | 121 | ```JavaScript 122 | message LogEntry { 123 | required int32 index = 1; 124 | required int32 term = 2; 125 | required bytes data = 3; 126 | } 127 | ``` 128 | 129 | The [term] field can be ignored for now as it will be explained later. The [index] field is a unique and strickly increasing value for each command. [data] is a placeholder field the application state machine commands. 130 | -------------------------------------------------------------------------------- /doc/replication_performance.md: -------------------------------------------------------------------------------- 1 | ### Functional log replication in RAFT 2 | 3 | The RAFT protocol is a consensus protocol for a log data structure. Each server 4 | in a RAFT cluster will eventually have the same log data structure. 5 | 6 | The RAFT protocol is based on the leader election mechanism; the initial phase of the protocol consists 7 | in deciding which server will be the leader. From then on, the leader server is responsible to append 8 | new log entries to its internal log as well as replicating the new entries in all the follower servers. 9 | 10 | In this post we will look at a practical performance problem related to implementing log replication 11 | both efficiently using solely functional data structures. 12 | 13 | **Log Entry data structure** 14 | 15 | ```OCaml 16 | type log_entry = { 17 | index : int; 18 | term : int; 19 | data : bytes; 20 | } 21 | ``` 22 | 23 | The `index` is monotically increasing (by 1) and starting at 1. 24 | The `term` corresponds to the election term that this `log entry` was created at. 25 | 26 | It's important to notice that both index and term are needed to uniquely identify a log 27 | entry. In fact, in certain circumstances where a previous leader crashed without fully 28 | replicating a log entry, it is possible that 2 servers will have log entries with 29 | the same index but different terms. (RAFT protocol ensure that only one is eventually persisted). 30 | 31 | **Log data structure** 32 | 33 | The log is the collection of `log_entry`s: 34 | 35 | ```OCaml 36 | {index = 10;term = 2}::{index = 9; term=1}:: .... 
::{index = 1; term=1}::[] 37 | ``` 38 | 39 | The log ordering is from latest to earliest log. 40 | 41 | **Replication protocol** 42 | 43 | During log replication the leader must replicate its `log` to all the followers and the RAFT protocol 44 | defines the following (simplified) request to be sent from the leader to each follower. 45 | 46 | ```OCaml 47 | type request = { 48 | leader_id : int; 49 | prev_log_index : int; 50 | prev_log_term : int; 51 | rev_log_entries : log_entry list; 52 | } 53 | ``` 54 | 55 | Besides its `id` and the `log_entry`s to be replicated, the leader also sends 56 | the `index` and `term` of the last log it believes was replicated on the follower. This extra 57 | information helps both parties to synchronize. 58 | 59 | The (simplified) response replied by the follower to its leader is: 60 | 61 | ```OCaml 62 | type response = 63 | | Success of {last_log_index : int} 64 | | Failure of {last_log_index : int; last_log_term : int} 65 | ``` 66 | 67 | In case of successful replication the follower server sends the index of the log it replicated (in general this would 68 | be the latest log sent in the request). In case of failure the follower additionally sends the corresponding term. 69 | 70 | A failure happens in the follower when the `index`/`term` information sent by the leader does not match 71 | the follower state. There are 2 main reasons: 72 | a) The entry is simply not in the follower log. The follower is lagging behind and 73 | needs earlier log entries than the ones sent in that request. 74 | b) The entry has a different term. This could happen during a leader crash. That log index was 75 | replicated on the follower, however it was never committed. 76 | 77 | **Normal operation** 78 | 79 | Let's look at what the server is doing when computing the replication request to be 80 | sent to a follower. We assume here that the leader is keeping track of the `prev_log_index` 81 | for each of the followers. 82 | 83 | ```OCaml 84 | let collect (prev_log_index:int) (leader_id:int) (leader_log:log_entry list) = 85 | 86 | let rec aux rev_log_entries = function 87 | | [] -> ( 88 | (* This can happen when no logs have previously been 89 | * replicated on the follower. 90 | * By convention the first log index is 1 and therefore 91 | * the expected `prev_log_index` should be 0. 92 | *) 93 | assert(prev_log_index = 0); 94 | {leader_id; prev_log_index = 0; prev_log_term = 0; rev_log_entries} 95 | ) 96 | 97 | | {index; term; _}::_ when index = prev_log_index -> 98 | (* The prev_log_index is successfully found 99 | * in the leader log. 100 | *) 101 | {leader_id; prev_log_index; prev_log_term = term; rev_log_entries} 102 | 103 | | log_entry::tl -> 104 | (* More logs need to be accumulated. 105 | *) 106 | aux (log_entry::rev_log_entries) tl 107 | in 108 | aux [] leader_log 109 | ``` 110 | 111 | The code above iterates from latest to earliest `log entry`s until the `prev_log_index` is found. Log entries 112 | are accumulated in a list in reverse order (hence the name `rev_log_entries`). 113 | 114 | In normal mode of operation, the follower is not lagging too much behind the leader; consequently only a 115 | small number of iterations should be required to compute the request. 116 | The `list` data structure used to store the logs is particularly efficient: adding a new log entry at the head of the 117 | `list` is constant time and computing the replication request is linear in the number of logs to replicate. 118 | 119 | **Corner cases (1)** 120 | 121 | From time to time servers will go offline.
This can happen in case of a machine failure or if a machine needs to 122 | go through maintenance. No matter the reason, when the follower rejoins the replication process it will be lagging 123 | far behind. 124 | 125 | The first problem is that in practice replication requests have a `max` number of log entries. It's impractical to send 126 | gigantic messages, and the RAFT protocol accommodates this nicely by letting the leader send a limited number of log entries. 127 | 128 | Here is the code to get the subset of logs for the replication request: 129 | 130 | ```OCaml 131 | let rec keep_first_n l = function 132 | | 0 -> [] 133 | | n -> 134 | begin match l with 135 | | hd::tl -> hd::(keep_first_n tl (n - 1)) 136 | | [] -> [] 137 | end 138 | 139 | let collect_with_max max prev_log_index leader_id leader_log = 140 | let request = collect prev_log_index leader_id leader_log in 141 | {request with 142 | rev_log_entries = keep_first_n request.rev_log_entries max} 143 | ``` 144 | 145 | Let's define: 146 | * `n` the number of log entries to be replicated 147 | * `m` the max number of log entries that can be sent in a single replication request 148 | 149 | Since each request walks the whole leader log down to `prev_log_index`, the total cost of the catch-up is therefore 150 | > n + (n - m) + (n - 2\*m) + ... + (n - (n/m)\*m) 151 | > ≈ n^2 / (2\*m) 152 | 153 | which is quadratic in `n`. Note that during normal operation `m` does not play a role since it is likely `> n`, 154 | which makes the complexity `O(n)` as previously mentioned. 155 | 156 | The impact of such slow performance is quite large. Not only is the replication for one of the 157 | followers slow, but the entire leader also gets slower. 158 | 159 | The RAFT protocol relies on heartbeat messages from the leader to 160 | the followers to notify them that it is still active; if the replication request 161 | takes too long then followers will not receive a heartbeat and they 162 | will start a new leader election.
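This quadratic behaviour is what motivates replacing the plain list by a data structure indexed by log index. This is the route taken by this repository: in [src/raft_log.ml](src/raft_log.ml) the recent entries are kept in a standard OCaml `Map` keyed by index, and `Map.split` is used to jump directly to `prev_log_index`. The sketch below only illustrates the idea; `Log_map`, `Enough` and `collect_from_map` are illustrative names, not the library's API (the library uses its own `IntMap` and a `Done` exception to stop early in the same way).

```OCaml
module Log_map = Map.Make(struct
  type t = int
  let compare (x:int) (y:int) = compare x y
end)

exception Enough of log_entry list

(* The log is keyed by index: finding [prev_log_index] no longer
 * requires scanning every entry appended since the follower went
 * offline. *)
let collect_from_map ~prev_log_index ~max (log:log_entry Log_map.t) =
  (* [Map.split] is logarithmic: [after] contains exactly the entries
   * with an index strictly greater than [prev_log_index]. *)
  let _before, _at, after = Log_map.split prev_log_index log in
  (* Accumulate at most [max] entries; the exception stops the fold
   * early, similar to what raft_log.ml does with its [Done] exception. *)
  let rev_entries =
    try
      Log_map.fold (fun _ e (acc, n) ->
        if n >= max then raise (Enough acc) else (e :: acc, n + 1)
      ) after ([], 0) |> fst
    with Enough acc -> acc
  in
  (* [Map.fold] visits keys in increasing order, so reversing the
   * accumulator yields the entries in chronological order. *)
  List.rev rev_entries
```

With this representation each request costs `O(log n + m)` instead of `O(n)`, so the full catch-up drops from roughly `n^2 / (2\*m)` to roughly `n + (n/m) \* log n` operations.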
163 | -------------------------------------------------------------------------------- /opam: -------------------------------------------------------------------------------- 1 | opam-version: "1.2" 2 | name: "raft" 3 | version: "0.1.0" 4 | maintainer: "Maxime Ransan " 5 | authors:[ 6 | "Maxime Ransan " 7 | ] 8 | homepage: "https://github.com/mransan/raft" 9 | bug-reports:"https://github.com/mransan/raft/issues" 10 | dev-repo:"https://github.com/mransan/raft.git" 11 | license: "MIT" 12 | build: [ 13 | [make "lib.byte"] 14 | [make "lib.native"] { ocaml-native } 15 | ] 16 | install: [ 17 | [make "lib.install" ] 18 | ] 19 | remove: [make "lib.uninstall"] 20 | depends: [ 21 | "ocamlfind" {build} 22 | "ocamlbuild" {build} 23 | ] 24 | -------------------------------------------------------------------------------- /src/raft.mllib: -------------------------------------------------------------------------------- 1 | Raft_helper 2 | Raft_protocol 3 | Raft_types 4 | Raft_log 5 | -------------------------------------------------------------------------------- /src/raft.odocl: -------------------------------------------------------------------------------- 1 | Raft_protocol 2 | Raft_types 3 | Raft_log 4 | Raft_helper 5 | -------------------------------------------------------------------------------- /src/raft_helper.ml: -------------------------------------------------------------------------------- 1 | module Types = Raft_types 2 | module Log = Raft_log 3 | 4 | module Configuration = struct 5 | 6 | let is_majority {Types.nb_of_server; _} nb = 7 | nb > (nb_of_server / 2) 8 | 9 | let election_timeout configuration = 10 | let { 11 | Types.election_timeout = t; 12 | election_timeout_range = r; _ } = configuration in 13 | t +. (Random.float r -. (r /. 2.)) 14 | 15 | end 16 | 17 | module Follower = struct 18 | 19 | let make ?log ?commit_index ?current_term ~configuration ~now ~server_id () = 20 | 21 | let current_term = match current_term with 22 | | None -> 0 23 | | Some current_term -> current_term 24 | in 25 | 26 | let log = match log with 27 | | None -> Log.empty configuration.Types.max_log_size 28 | | Some log -> log 29 | in 30 | 31 | let commit_index = match commit_index with 32 | | None -> 0 33 | | Some commit_index -> commit_index 34 | in 35 | 36 | let timeout = Configuration.election_timeout configuration in 37 | { 38 | Types.current_term; 39 | server_id; 40 | log; 41 | commit_index; 42 | role = Types.(Follower { 43 | voted_for = None; 44 | current_leader = None; 45 | election_deadline = now +. timeout 46 | }); 47 | configuration; 48 | } 49 | 50 | let become ?current_leader ~now ~term state = 51 | let election_deadline = 52 | now +. 
Configuration.election_timeout state.Types.configuration 53 | in 54 | 55 | let role = match state.Types.role with 56 | | Types.Follower follower_state -> 57 | let voted_for = 58 | if term = state.Types.current_term 59 | then follower_state.Types.voted_for 60 | else None 61 | in 62 | Types.Follower { 63 | Types.current_leader; 64 | Types.election_deadline; 65 | Types.voted_for; 66 | } 67 | | Types.Candidate _ when state.Types.current_term = term -> 68 | Types.Follower { 69 | Types.voted_for = Some state.Types.server_id; 70 | Types.current_leader; 71 | Types.election_deadline; 72 | } 73 | | _ -> Types.Follower { 74 | Types.voted_for = None; 75 | Types.current_leader; 76 | Types.election_deadline; 77 | } 78 | in 79 | { state with Types.current_term = term; role } 80 | 81 | end 82 | 83 | module Candidate = struct 84 | 85 | let become ~now state = 86 | let timeout = Configuration.election_timeout state.Types.configuration in 87 | let role = Types.Candidate { 88 | Types.vote_count = 1; 89 | Types.election_deadline = now +. timeout; 90 | } in 91 | {state with 92 | Types.role; 93 | Types.current_term = state.Types.current_term + 1; 94 | } 95 | 96 | let increment_vote_count ({Types.vote_count; _ } as candidate_state) = 97 | {candidate_state with Types.vote_count = vote_count + 1} 98 | 99 | end 100 | 101 | module Leader = struct 102 | 103 | let become ~now state = 104 | 105 | let last_log_index = Log.last_log_index state.Types.log in 106 | 107 | let { 108 | Types.nb_of_server; 109 | hearbeat_timeout; _} = state.Types.configuration in 110 | 111 | let rec aux followers = function 112 | | (-1) -> followers 113 | | i -> 114 | if i = state.Types.server_id 115 | then 116 | aux followers (i -1) 117 | else 118 | let next_index = last_log_index + 1 in 119 | let match_index = 0 in 120 | 121 | let follower:Types.follower_info = { 122 | Types.follower_id = i; 123 | next_index; 124 | match_index; 125 | outstanding_request = false; 126 | heartbeat_deadline = now +. hearbeat_timeout; 127 | (* 128 | * Here the expectation is that after becoming a leader 129 | * the client application will send a message to all the 130 | * receivers and therefore the heartbeat_deadline is set 131 | * to [now + timeout] rather than [now]. *) 132 | } in 133 | aux (follower::followers) (i - 1) 134 | in 135 | let followers = aux [] (nb_of_server - 1) in 136 | 137 | {state with Types.role = Types.(Leader followers)} 138 | 139 | (* Reusable function to update the index of a particular 140 | * receiver id. *) 141 | let update_follower ~follower_id ~f leader_state = 142 | 143 | List.map (fun follower -> 144 | if follower.Types.follower_id = follower_id 145 | then (f follower) 146 | else follower 147 | ) leader_state 148 | 149 | let update_follower_last_log_index ~follower_id ~index followers = 150 | 151 | let followers = update_follower ~follower_id ~f:(fun follower -> 152 | let {Types.next_index; match_index; _} = follower in 153 | let upd_next_index = index + 1 in 154 | let upd_match_index = index in 155 | if upd_match_index > match_index && upd_next_index > next_index 156 | then 157 | {follower with Types.next_index = index + 1; match_index = index} 158 | else 159 | (* It is possible to receive out of order responses from the other 160 | * raft servers. 161 | * 162 | * In such a case we don't want to decrement the next index 163 | * of the server since the server is expected to never remove 164 | * previously saved log entries. 
*) 165 | follower 166 | ) followers 167 | in 168 | 169 | (* Calculate the number of server which also have replicated that 170 | log entry *) 171 | let nb_of_replications = List.fold_left (fun n {Types.match_index; _ } -> 172 | if match_index >= index 173 | then n + 1 174 | else n 175 | ) 0 followers in 176 | 177 | (followers, nb_of_replications) 178 | 179 | let decrement_next_index 180 | ~follower_last_log_index ~follower_id state followers = 181 | let latest_log_index = Log.last_log_index state.Types.log in 182 | 183 | assert(follower_last_log_index <= latest_log_index); 184 | (* This is an invariant. The server cannot have replicated more logs 185 | * than the Leader. 186 | * 187 | * However due to message re-ordering it is possible to receive a 188 | * [LogFailure] with the receiver_last_log_index equal to 189 | * the latest_log_index. 190 | * 191 | * Consider the following scenario 192 | * - [Leader] Append_entry prev_index = x rev_log_entries [x+1] 193 | * - [Server] receives the request and correctly replicate x + 1 194 | * - !! RESPONSE IS LOST !! 195 | * - [Leader] Append_entry prev_index = x rev_log_entries [x+1] 196 | * - [Server] return a failure since it has replicated x + 1 and cannot 197 | * remove that log entry since it is coming from the current term 198 | * leader. *) 199 | update_follower ~follower_id ~f:(fun index -> 200 | {index with 201 | Types.next_index = follower_last_log_index + 1; 202 | Types.match_index = follower_last_log_index} 203 | ) followers 204 | 205 | let record_response_received ~follower_id followers = 206 | 207 | update_follower 208 | ~follower_id 209 | ~f:(fun index -> 210 | {index with Types.outstanding_request = false;} 211 | ) 212 | followers 213 | 214 | let min_heartbeat_timout ~now followers = 215 | 216 | let min_heartbeat_deadline = List.fold_left (fun min_deadline f -> 217 | let {Types.heartbeat_deadline; _ } = f in 218 | if heartbeat_deadline < min_deadline 219 | then heartbeat_deadline 220 | else min_deadline 221 | ) max_float followers 222 | in 223 | min_heartbeat_deadline -. now 224 | 225 | end (* Leader *) 226 | 227 | module Timeout_event = struct 228 | 229 | let existing_election_wait election_deadline now = 230 | Types.({ 231 | timeout = election_deadline -. 
now; 232 | timeout_type = New_leader_election; 233 | }) 234 | 235 | let make_heartbeat_wait timeout = 236 | Types.({ 237 | timeout = timeout; 238 | timeout_type = Heartbeat; 239 | }) 240 | 241 | let next ~now state = 242 | match state.Types.role with 243 | | Types.Follower {Types.election_deadline; _} -> 244 | existing_election_wait election_deadline now 245 | | Types.Leader leader_state -> 246 | make_heartbeat_wait (Leader.min_heartbeat_timout ~now leader_state) 247 | | Types.Candidate {Types.election_deadline; _ } -> 248 | existing_election_wait election_deadline now 249 | 250 | end (* Timeout_event *) 251 | 252 | module Diff = struct 253 | 254 | let leader_change before after = 255 | let open Types in 256 | 257 | let { role = brole; _ } = before in 258 | let { role = arole; _ } = after in 259 | 260 | match brole, arole with 261 | | Follower _ , Leader _ 262 | | Leader _ , Candidate _ -> 263 | (* Impossible transition which would violate the rules of the 264 | * RAFT protocol *) 265 | assert(false) 266 | 267 | | Candidate _ , Leader _ -> 268 | (* Case of the server becoming a leader *) 269 | Some (New_leader after.server_id) 270 | 271 | | Follower {current_leader = Some bleader; _ }, 272 | Follower {current_leader = Some aleader; _ } when bleader = aleader -> 273 | None 274 | (* No leader change, following the same leader *) 275 | 276 | | _, Follower{current_leader = Some aleader;_} -> 277 | Some (New_leader aleader) 278 | (* There is a new leader *) 279 | 280 | | Follower {current_leader = Some _; _}, Candidate _ 281 | | Follower {current_leader = Some _; _}, Follower {current_leader = None; _} 282 | | Leader _ , Follower {current_leader = None; _} -> 283 | Some No_leader 284 | 285 | | Leader _ , Leader _ 286 | | Candidate _ , Candidate _ 287 | | Candidate _ , Follower {current_leader = None;_} 288 | | Follower {current_leader = None; _} , Follower {current_leader = None;_} 289 | | Follower {current_leader = None; _} , Candidate _ -> 290 | None 291 | 292 | let committed_logs before after = 293 | let open Types in 294 | 295 | let { commit_index = bcommit_index; _ } = before in 296 | let { commit_index = acommit_index; _ } = after in 297 | 298 | if acommit_index > bcommit_index 299 | then 300 | let recent_entries = after.log.Log.recent_entries in 301 | let _, prev_commit, sub = Log.IntMap.split bcommit_index recent_entries in 302 | begin match prev_commit with 303 | | None -> assert(bcommit_index = 0) 304 | | Some _ -> () 305 | end; 306 | let sub, last_commit, _ = Log.IntMap.split acommit_index sub in 307 | let sub = match last_commit with 308 | | None -> assert(false) 309 | | Some ({Log.index; _ } as log_entry) -> 310 | Log.IntMap.add index log_entry sub 311 | in 312 | let committed_entries = List.map snd (Log.IntMap.bindings sub) in 313 | committed_entries 314 | else 315 | (* Should we assert false if after < before, ie it's a violation 316 | of the RAFT protocol *) 317 | [] 318 | 319 | end (* Diff *) 320 | -------------------------------------------------------------------------------- /src/raft_helper.mli: -------------------------------------------------------------------------------- 1 | (** Helper functions for manipulating Raft types *) 2 | 3 | module Configuration : sig 4 | 5 | val is_majority : Raft_types.configuration -> int -> bool 6 | (** [is_majority configuration nb] returns true if [nb] is a majority *) 7 | 8 | end (* Configuration *) 9 | 10 | module Follower : sig 11 | 12 | val make : 13 | ?log:Raft_log.t -> 14 | ?commit_index:int -> 15 | ?current_term:int -> 16 | 
configuration:Raft_types.configuration -> 17 | now:float -> 18 | server_id:int -> 19 | unit -> 20 | Raft_types.state 21 | (** [create ~configuration ~now ~server_id ()] creates an initial 22 | follower state.*) 23 | 24 | val become : 25 | ?current_leader:int -> 26 | now:float -> 27 | term:int -> 28 | Raft_types.state -> 29 | Raft_types.state 30 | (** [become ~current_leader state term] return the new follower state. 31 | 32 | {ul 33 | {li [voted_for] is [None]} 34 | {li [current_leader] is taken from the function argument} 35 | {li [current_term] is taken from the function argument} 36 | } *) 37 | 38 | end (* Follower *) 39 | 40 | module Candidate : sig 41 | 42 | val become : 43 | now:float -> 44 | Raft_types.state -> 45 | Raft_types.state 46 | (** [become state now] returns the new state with Candidate role. 47 | [current_term] is incremented and [vote_count] initialized to 1. 48 | (ie we assume the candidate votes for itself. 49 | 50 | The [election_timeout] is reset to a random number between the boundaries 51 | of the configuration. *) 52 | 53 | val increment_vote_count : 54 | Raft_types.candidate_state -> 55 | Raft_types.candidate_state 56 | (** [increment_vote_count state] increments the candidate vote count 57 | by 1. 58 | 59 | This function is called upon receiving a successful response to a vote 60 | request to one of the servers. *) 61 | 62 | end (* Candidate *) 63 | 64 | module Leader : sig 65 | 66 | val become : 67 | now:float -> 68 | Raft_types.state -> 69 | Raft_types.state 70 | (** [become state] returns the new state with a Leader role. 71 | 72 | While only candidate with a majority are allowed by the protocol to 73 | become a leader, this function does not perform any checks but simply 74 | initialize the role of Leader. 75 | 76 | The calling application is responsible to ensure that it is correct 77 | to become a leader. *) 78 | 79 | 80 | val update_follower_last_log_index : 81 | follower_id:int -> 82 | index:int -> 83 | Raft_types.leader_state -> 84 | (Raft_types.leader_state * int) 85 | (** [update_receiver_last_log_index leader_state receiver_id last_log_index] 86 | updates the leader state with the [last_log_index] information received 87 | from a server. (Both [next_index] and [match_index] are updated. 88 | 89 | The function returns [(state, nb_of_replication)]. The 90 | [nb_of_replication] is useful for the application to determine how many 91 | servers have replicated the log and therefore determine if it can 92 | be considered commited. *) 93 | 94 | val record_response_received : 95 | follower_id:int -> 96 | Raft_types.leader_state -> 97 | Raft_types.leader_state 98 | (** [record_response_received ~server_id leader_state] keeps track of the 99 | fact that there are no more outstanding request for [server_id]. 100 | *) 101 | 102 | val decrement_next_index : 103 | follower_last_log_index:int -> 104 | follower_id:int -> 105 | Raft_types.state -> 106 | Raft_types.leader_state -> 107 | Raft_types.leader_state 108 | 109 | val min_heartbeat_timout : 110 | now:float -> 111 | Raft_types.leader_state -> 112 | float 113 | (** [min_heartbeat_timout ~now ~leader_state] returns when the next timeout 114 | event should occured based on the last request sent to the 115 | followers *) 116 | 117 | end (* Leader *) 118 | 119 | module Timeout_event : sig 120 | 121 | val next : now:float -> Raft_types.state -> Raft_types.timeout_event 122 | (** [next ~now state] returns the next timeout event which should happened 123 | unless another RAFT event happened first. 
*) 124 | 125 | end (* Timeout_event *) 126 | 127 | module Diff : sig 128 | val leader_change : 129 | Raft_types.state -> 130 | Raft_types.state -> 131 | Raft_types.leader_change option 132 | (** [notifications before after] computes the notification between 2 states 133 | *) 134 | 135 | val committed_logs : 136 | Raft_types.state -> 137 | Raft_types.state -> 138 | Raft_log.log_entry list 139 | (** [committed_logs before after] returns the newly committed log entries 140 | between [before] and [after] state *) 141 | 142 | end (* Diff *) 143 | -------------------------------------------------------------------------------- /src/raft_log.ml: -------------------------------------------------------------------------------- 1 | type log_entry = { 2 | index : int; 3 | term : int; 4 | data : bytes; 5 | id : string; 6 | } 7 | 8 | let pp_log_entry fmt {index; term; id; _} = 9 | Format.fprintf fmt "{index: %i; term: %i, id: %s}" 10 | index term id 11 | 12 | module IntMap = Map.Make(struct 13 | type t = int 14 | let compare (x:int) (y:int) = Pervasives.compare x y 15 | end) 16 | 17 | type size = 18 | | Number of int 19 | | Bytes of int * int 20 | 21 | let add size sum log_entry = 22 | match size with 23 | | Number _ -> (sum + 1) 24 | | Bytes (_, overhead) -> 25 | let {data; id; _} = log_entry in 26 | sum + overhead + 16 (* 2x64bits for index/term *) 27 | + (String.length id) + (Bytes.length data) 28 | 29 | let has_reach_max size sum = 30 | match size with 31 | | Number n -> sum >= n 32 | | Bytes (b,_) -> sum >= b 33 | 34 | type max_log_size = { 35 | upper_bound : int; 36 | lower_bound : int; 37 | } 38 | 39 | type t = { 40 | recent_entries : log_entry IntMap.t; 41 | max_log_size : max_log_size; 42 | } 43 | 44 | type log_diff = { 45 | added_logs : log_entry list; 46 | deleted_logs : log_entry list; 47 | } 48 | 49 | let empty_diff = { added_logs = []; deleted_logs = [] } 50 | 51 | let empty max_log_size = { 52 | recent_entries = IntMap.empty; 53 | max_log_size; 54 | } 55 | 56 | let last_log_index_and_term {recent_entries; _ } = 57 | match IntMap.max_binding recent_entries with 58 | | (_ , {index;term; _}) -> (index, term) 59 | | exception Not_found -> (0, 0) 60 | 61 | let last_log_index {recent_entries; _} = 62 | match IntMap.max_binding recent_entries with 63 | | (_ , {index; _}) -> index 64 | | exception Not_found -> 0 65 | 66 | exception Done of (log_entry list * int) 67 | 68 | let log_entries_since ~since ~max log = 69 | let {recent_entries ; _} = log in 70 | if recent_entries = IntMap.empty 71 | then ([], 0) 72 | (* TODO questionable, shoudl all cases go to the sub function *) 73 | else 74 | let _, prev, sub = IntMap.split since recent_entries in 75 | let prev_term = 76 | match prev with 77 | | None -> assert (since = 0); 0 78 | | Some {term; _} -> term 79 | in 80 | 81 | let log_entries, _ = 82 | try IntMap.fold (fun _ log_entry (log_entries, sum) -> 83 | let sum' = add max sum log_entry in 84 | if has_reach_max max sum' 85 | then raise (Done (log_entries, sum)) 86 | else (log_entry :: log_entries, sum') 87 | ) sub ([], 0) 88 | with | Done x -> x 89 | in 90 | 91 | (List.rev log_entries, prev_term) 92 | (* let sub, _, _ = IntMap.split (since + max + 1) sub in 93 | (List.map snd (IntMap.bindings sub), prev_term) *) 94 | 95 | (* Enforce that the size of the recent_entries stays within the 96 | * max log size configuration *) 97 | let truncate add_size ({recent_entries; max_log_size; _} as t) = 98 | let {upper_bound; lower_bound} = max_log_size in 99 | match IntMap.min_binding recent_entries with 100 | 
| exception Not_found -> t 101 | (* when empty, no need for size limitation *) 102 | | (lower_index, _) -> 103 | let (upper_index, _) = IntMap.max_binding recent_entries in 104 | let size = upper_index - lower_index + 1 in 105 | if size + add_size > upper_bound 106 | then 107 | let over = size - lower_bound in 108 | let lower_index = lower_index + over + add_size - 1 in 109 | let _, _, recent_entries = IntMap.split lower_index recent_entries in 110 | {t with recent_entries} 111 | else 112 | t 113 | 114 | let add_log_datas current_term datas log = 115 | let log = truncate (List.length datas) log in 116 | 117 | let rec aux term last_log_index recent_entries added_logs = function 118 | | [] -> (recent_entries, (List.rev added_logs)) 119 | | (data, id)::tl -> 120 | let last_log_index = last_log_index + 1 in 121 | 122 | let new_log_entry = { 123 | index = last_log_index; 124 | term; data; id; 125 | } in 126 | 127 | let added_logs = new_log_entry :: added_logs in 128 | 129 | let recent_entries = 130 | IntMap.add last_log_index new_log_entry recent_entries 131 | in 132 | 133 | aux term last_log_index recent_entries added_logs tl 134 | in 135 | 136 | let term = current_term in 137 | let last_log_index = last_log_index log in 138 | let recent_entries = log.recent_entries in 139 | 140 | let recent_entries, added_logs = 141 | aux term last_log_index recent_entries [] datas 142 | in 143 | 144 | let log_diff = { deleted_logs = []; added_logs; } in 145 | 146 | ({log with recent_entries}, log_diff) 147 | 148 | let add_log_entries ~log_entries log = 149 | let log = truncate (List.length log_entries) log in 150 | let rec aux recent_entries = function 151 | | [] -> 152 | {log with recent_entries} 153 | 154 | | hd::tl -> 155 | let recent_entries = IntMap.add hd.index hd recent_entries in 156 | aux recent_entries tl 157 | in 158 | 159 | let log_diff = { 160 | added_logs = log_entries; 161 | deleted_logs = []; 162 | } in 163 | 164 | (aux log.recent_entries log_entries, log_diff) 165 | 166 | let remove_log_since ~prev_log_index ~prev_log_term log = 167 | let {recent_entries; max_log_size = _ } = log in 168 | if prev_log_index > (last_log_index log) 169 | then raise Not_found 170 | else 171 | 172 | let before, e, after = IntMap.split prev_log_index recent_entries in 173 | let recent_entries, deleted_logs_map = 174 | match e with 175 | | None -> 176 | if prev_log_index = 0 177 | then (before, after) 178 | else raise Not_found 179 | | Some ({term; index; _} as log_entry) -> 180 | if term = prev_log_term 181 | then (IntMap.add index log_entry before, after) 182 | else raise Not_found 183 | in 184 | 185 | let deleted_logs = List.map snd @@ IntMap.bindings deleted_logs_map in 186 | 187 | ( 188 | {log with 189 | recent_entries;}, 190 | {deleted_logs; added_logs = []} 191 | ) 192 | 193 | let merge_diff lhs rhs = 194 | match lhs, rhs with 195 | | {added_logs = []; deleted_logs = []}, rhs -> rhs 196 | | lhs, {added_logs = []; deleted_logs = []} -> lhs 197 | 198 | | {added_logs; deleted_logs = []}, 199 | {added_logs = []; deleted_logs} 200 | | {added_logs = []; deleted_logs}, 201 | {added_logs; deleted_logs = []} -> {added_logs; deleted_logs} 202 | | _ -> assert(false) 203 | 204 | module Builder = struct 205 | 206 | type builder = t 207 | 208 | let make max_log_size = empty max_log_size 209 | 210 | let add_log_entry log log_entry = 211 | let log = truncate 1 log in 212 | (* assert(log_entry.index > (last_log_index log)); 213 | *) 214 | { log with 215 | recent_entries = IntMap.add log_entry.index log_entry 
log.recent_entries; 216 | } 217 | 218 | let to_log x = x 219 | 220 | end (* Builder *) 221 | -------------------------------------------------------------------------------- /src/raft_log.mli: -------------------------------------------------------------------------------- 1 | (** Log Data Structure and related logic*) 2 | 3 | (** {2 Types} *) 4 | 5 | (** unit of log *) 6 | type log_entry = { 7 | index : int; 8 | term : int; 9 | data : bytes; 10 | id : string; 11 | } 12 | 13 | (** Log size limitation parameters. 14 | 15 | The log data structure can grow to an inifinite size as new logs 16 | keeps being appended. However for practical reason it is important to limit 17 | the size of the log in memory. [upper_bound] defines the maximum size, 18 | while [lower_bound] is the size the log is truncated to when 19 | reaching its [upper_bound] value. *) 20 | type max_log_size = { 21 | upper_bound : int; 22 | lower_bound : int; 23 | } 24 | 25 | module IntMap : Map.S with type key = int 26 | 27 | (** log data structure *) 28 | type t = { 29 | recent_entries : log_entry IntMap.t; 30 | max_log_size : max_log_size; 31 | } 32 | 33 | (** {2 Creators} *) 34 | 35 | val empty : max_log_size -> t 36 | (** [empty] is an empty log *) 37 | 38 | (** {2 Accessors} *) 39 | 40 | val last_log_index_and_term : t -> (int * int) 41 | (** [last_log_index_and_term state] returns the [(index, term)] of the last log 42 | entry. If the log is empty [(0, 0)] is returned.*) 43 | 44 | val last_log_index: t -> int 45 | (** [last_log_index state] returns the index of the last log entry.*) 46 | 47 | (** Various representation for the size of a subset of the log *) 48 | type size = 49 | | Number of int 50 | | Bytes of int * int (* (value, overhead per entry) *) 51 | 52 | val log_entries_since : since:int -> max:size -> t -> (log_entry list * int) 53 | (** [log_entries_since ~since:index ~max log] returns 54 | [(log_entries, prev_log_term)] in chronological order. If not empty 55 | [List.hd log_entrie] is the log entry with [index = since + 1]. 56 | 57 | [max] size is enforced by the function. 58 | *) 59 | 60 | (** {2 Modifiers} *) 61 | 62 | (** A data structure detailing the modification done to the log by the 63 | modifiers functions *) 64 | type log_diff = { 65 | added_logs : log_entry list; 66 | deleted_logs : log_entry list; 67 | } 68 | 69 | val empty_diff : log_diff 70 | (** [empty_diff] represent the no difference value *) 71 | 72 | 73 | val merge_diff : log_diff -> log_diff -> log_diff 74 | (** [merge_diff lhs rhs] merges together 2 different diff. Right now it only 75 | works if lhs or rhs are not both containing either added logs or removed 76 | logs *) 77 | 78 | val add_log_datas : int -> (bytes * string) list -> t -> (t * log_diff) 79 | (** [add_log_datas current_term datas state] adds [datas] to the log of the 80 | [state]. 81 | 82 | Note that the logs are in chronological order 83 | In other word [List.hd datas] is the earliest entry 84 | and should be appended first to the server logs.*) 85 | 86 | val add_log_entries : log_entries:log_entry list -> t -> (t * log_diff) 87 | (** [add_log_entries log_entries log] appends [log_entries] to the 88 | [log]. 89 | This function assumes [log_entries] are in chronological order. *) 90 | 91 | val remove_log_since : 92 | prev_log_index:int -> 93 | prev_log_term:int -> 94 | t -> 95 | (t * log_diff) 96 | (** [remove_log_since ~prev_log_index ~prev_log_term log] removes all the 97 | entries which are after the log_entry defined by [prev_log_index] and 98 | [prev_log_term]. 
99 | 100 | If [log] does not contain any log_entry defined by [prev_log_index] and 101 | [prev_log_term] then [Not_found] is raised. *) 102 | 103 | (** {2 Utilities} *) 104 | 105 | val pp_log_entry : Format.formatter -> log_entry -> unit 106 | (** [pp_log_entry fmt log_entry] format [log_entry] *) 107 | 108 | (** Helper module to build the log type from saved log entries. 109 | 110 | This module is designed to be used by a RAFT server at start time. *) 111 | module Builder : sig 112 | 113 | type builder 114 | 115 | val make : max_log_size -> builder 116 | 117 | val add_log_entry : builder -> log_entry -> builder 118 | 119 | val to_log : builder -> t 120 | 121 | end (* Builder *) 122 | -------------------------------------------------------------------------------- /src/raft_protocol.ml: -------------------------------------------------------------------------------- 1 | module Types = Raft_types 2 | module Follower = Raft_helper.Follower 3 | module Candidate = Raft_helper.Candidate 4 | module Leader = Raft_helper.Leader 5 | module Configuration = Raft_helper.Configuration 6 | module Log = Raft_log 7 | module Timeout_event = Raft_helper.Timeout_event 8 | module Helper = Raft_helper 9 | 10 | let make_result ?(msgs_to_send = []) ?leader_change ?(deleted_logs = []) 11 | ?(committed_logs = []) ?(added_logs = []) state = { 12 | Types.state; 13 | messages_to_send = msgs_to_send; 14 | leader_change; 15 | committed_logs; 16 | added_logs; 17 | deleted_logs; 18 | } 19 | 20 | module Log_entry_util = struct 21 | 22 | let make_append_entries prev_log_index state = 23 | 24 | let to_send, prev_log_term = 25 | let since = prev_log_index in 26 | let max = state.Types.configuration.Types.max_nb_logs_per_message in 27 | Log.log_entries_since ~since ~max state.Types.log 28 | in 29 | 30 | let request = Types.({ 31 | leader_term = state.current_term; 32 | leader_id = state.server_id; 33 | prev_log_index; 34 | prev_log_term; 35 | log_entries = to_send; 36 | leader_commit = state.commit_index; 37 | }) in 38 | 39 | request 40 | 41 | let compute_append_entries ?(force = false) state followers now = 42 | 43 | let rec aux followers msgs_to_send = function 44 | | [] -> (List.rev followers, msgs_to_send) 45 | 46 | | follower::tl -> 47 | let { 48 | Types.follower_id; 49 | heartbeat_deadline; 50 | outstanding_request; 51 | next_index;_ 52 | } = follower in 53 | 54 | let shoud_send_request = 55 | if force || now >= heartbeat_deadline 56 | then true 57 | (* The heartbeat deadline is past due, the [Leader] must 58 | * sent an [Append_entries] request. *) 59 | else 60 | if outstanding_request 61 | then false 62 | (* In case of an outstanding request there is no point 63 | * in sending a new request to that server. 64 | * Even if the outstanding request was lost and it could be 65 | * beneficial to send a new request, this would happen 66 | * at the next heartbeat. We assume it's more likely that 67 | * the server is down and therefore there is no need to keep 68 | * on sending the same request over and over. *) 69 | else 70 | let prev_index = next_index - 1 in 71 | let last_log_index = Log.last_log_index state.Types.log in 72 | if prev_index = last_log_index 73 | then false 74 | (* The receipient has the most recent log entry and the 75 | * heartbeat deadline has not expired, no need to send a 76 | * new heartbeat. 
*) 77 | else true 78 | (* The recipient is missing recent log entries *) 79 | in 80 | 81 | if shoud_send_request 82 | then 83 | let request = make_append_entries (next_index - 1) state in 84 | let follower = 85 | let outstanding_request = true in 86 | let heartbeat_deadline = 87 | now +. state.Types.configuration.Types.hearbeat_timeout 88 | in 89 | {follower with 90 | Types.outstanding_request; 91 | Types.heartbeat_deadline; 92 | } 93 | in 94 | let followers = follower::followers in 95 | let msgs_to_send = 96 | let msg = (Types.Append_entries_request request, follower_id) in 97 | msg::msgs_to_send 98 | in 99 | aux followers msgs_to_send tl 100 | else 101 | aux (follower::followers) msgs_to_send tl 102 | in 103 | aux [] [] followers 104 | 105 | end (* Log_entry_util *) 106 | 107 | (** {2 Request Vote} *) 108 | 109 | let make_request_vote_request state = 110 | let index, term = Log.last_log_index_and_term state.Types.log in 111 | Types.({ 112 | candidate_term = state.current_term; 113 | candidate_id = state.server_id; 114 | candidate_last_log_index = index; 115 | candidate_last_log_term = term; 116 | }) 117 | 118 | let handle_request_vote_request state request now = 119 | let { 120 | Types.candidate_id; 121 | candidate_term; 122 | candidate_last_log_index;_} = request in 123 | 124 | let make_response state vote_granted = 125 | Types.({ 126 | voter_term = state.current_term; 127 | voter_id = state.server_id ; 128 | vote_granted; 129 | }) 130 | in 131 | 132 | if candidate_term < state.Types.current_term 133 | then 134 | (* This request is coming from a candidate lagging behind ... 135 | * no vote for him. *) 136 | (state, make_response state false) 137 | else 138 | let state = 139 | (* Enforce invariant that if this server is lagging behind 140 | * it must convert to a follower and update to that term. *) 141 | if candidate_term > state.Types.current_term 142 | then Follower.become ~term:candidate_term ~now state 143 | else state 144 | in 145 | 146 | let last_log_index = Log.last_log_index state.Types.log in 147 | 148 | if candidate_last_log_index < last_log_index 149 | then 150 | (* Enforce the safety constraint by denying vote if this server 151 | * last log is more recent than the candidate one.*) 152 | (state, make_response state false) 153 | else 154 | let role = state.Types.role in 155 | match role with 156 | | Types.Follower {Types.voted_for = None; _} -> 157 | (* This server has never voted before, candidate is getting the vote 158 | * 159 | * In accordance to the `Rules for Servers`, the follower must 160 | * reset its election deadline when granting its vote. *) 161 | 162 | let { 163 | Types.configuration = { 164 | Types.election_timeout; _ 165 | }; _} = state in 166 | 167 | let state = {state with 168 | Types.role = Types.Follower { 169 | Types.voted_for = Some candidate_id; 170 | Types.current_leader = None; 171 | Types.election_deadline = now +. election_timeout; 172 | } 173 | } in 174 | (state, make_response state true) 175 | 176 | | Types.Follower {Types.voted_for = Some id; _} when id = candidate_id -> 177 | (* This server has already voted for that candidate... 
reminding him *) 178 | (state, make_response state true) 179 | 180 | | _ -> 181 | (* Server has previously voted for another candidate or 182 | * itself *) 183 | (state, make_response state false) 184 | 185 | let handle_request_vote_response state response now = 186 | let {Types.current_term; role; configuration; _ } = state in 187 | let {Types.voter_term; vote_granted; _ } = response in 188 | 189 | if voter_term > current_term 190 | then 191 | (* Enforce invariant that if this server is lagging behind 192 | * it must convert to a follower and update to the latest term. *) 193 | let state = Follower.become ~term:voter_term ~now state in 194 | (state, []) 195 | 196 | else 197 | match role, vote_granted with 198 | | Types.Candidate ({Types.vote_count; _ } as candidate_state) , true -> 199 | 200 | if Configuration.is_majority configuration (vote_count + 1) 201 | then 202 | (* By reaching a majority of votes, the candidate is now 203 | * the new leader *) 204 | let state = Leader.become ~now state in 205 | 206 | (* As a new leader, the server must send Append entries request 207 | * to the other servers to both establish its leadership and 208 | * start synching its log with the others. *) 209 | begin match state.Types.role with 210 | | Types.Leader followers -> 211 | 212 | let followers, msgs_to_send = 213 | let force = true in 214 | Log_entry_util.compute_append_entries ~force state followers now 215 | in 216 | 217 | let state = Types.{state with role = Leader followers} in 218 | (state, msgs_to_send) 219 | | _ -> assert(false) 220 | end 221 | else 222 | (* Candidate has a new vote but not yet reached the majority *) 223 | let new_state = Types.{state with 224 | role = Candidate (Candidate.increment_vote_count candidate_state); 225 | } in 226 | (new_state, [] (* no message to send *)) 227 | 228 | | Types.Candidate _ , false 229 | (* The vote was denied, the election keeps on going until 230 | * its deadline. *) 231 | 232 | | Types.Follower _ , _ 233 | | Types.Leader _ , _ -> (state, []) 234 | (* If the server is either Follower or Leader, it means that 235 | * it has changed role in between the time it sent the 236 | * [RequestVote] request and this response. 237 | * This response can therefore safely be ignored and the server 238 | * keeps its current state.Types. *) 239 | 240 | (** {2 Append Entries} *) 241 | 242 | let update_state leader_commit receiver_last_log_index log state = 243 | if leader_commit > state.Types.commit_index 244 | then 245 | let commit_index = min leader_commit receiver_last_log_index in 246 | {state with Types.log; commit_index} 247 | else 248 | {state with Types.log} 249 | 250 | let handle_append_entries_request state request now = 251 | let {Types.leader_term; leader_id; _} = request in 252 | 253 | let make_response state result = 254 | Types.({ 255 | receiver_term = state.current_term; 256 | receiver_id = state.server_id; 257 | result; 258 | }) 259 | in 260 | 261 | if leader_term < state.Types.current_term 262 | then 263 | (* This request is coming from a leader lagging behind... *) 264 | (state, make_response state Types.Term_failure, Log.empty_diff) 265 | 266 | else 267 | (* This request is coming from a legetimate leader, 268 | * let's ensure that this server is a follower. *) 269 | let state = 270 | let current_leader = leader_id and term = leader_term in 271 | Follower.become ~current_leader ~term ~now state 272 | in 273 | 274 | (* Next step is to handle the log entries from the leader. 
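       *
       * The case analysis below can be summarized as follows:
       * - [prev_log_index < commit_index]: the request arrived out of
       *   order; reply with the current last log index and change nothing.
       * - [leader_term = receiver_last_log_term]: same-term leader; the
       *   entries are appended when [prev_log_index] matches this server's
       *   last log index, otherwise a response is sent without modifying
       *   the log.
       * - [leader_term > receiver_last_log_term]: a new leader; uncommitted
       *   entries from older terms may be removed before appending.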
*) 275 | let { 276 | Types.prev_log_index; 277 | prev_log_term; 278 | log_entries; 279 | leader_commit; _ } = request in 280 | 281 | let ( 282 | receiver_last_log_index, 283 | receiver_last_log_term 284 | ) = Log.last_log_index_and_term state.Types.log in 285 | 286 | let commit_index = state.Types.commit_index in 287 | 288 | if prev_log_index < commit_index 289 | then 290 | (* The only reason that can happen is if the messages 291 | * are delivered out of order. (Which is completely possible). 292 | * No need to update this follower. *) 293 | let success = Types.Success receiver_last_log_index in 294 | (state, make_response state success, Log.empty_diff) 295 | 296 | else 297 | if leader_term = receiver_last_log_term 298 | then 299 | match compare prev_log_index receiver_last_log_index with 300 | | 0 -> 301 | (* Leader info about the receiver last log index is matching 302 | * perfectly, we can append the logs. *) 303 | let log, log_diff = 304 | Log.add_log_entries ~log_entries state.Types.log 305 | in 306 | let receiver_last_log_index = Log.last_log_index log in 307 | let state = 308 | update_state leader_commit receiver_last_log_index log state 309 | in 310 | 311 | let success = Types.Success receiver_last_log_index in 312 | (state, make_response state success, log_diff) 313 | 314 | | x when x > 0 -> 315 | (* This should really never happen since: 316 | * - No logs belonging to the Leader term can be removed 317 | * - The leader is supposed to keep track of the latest log from 318 | * the receiver within the same term. 319 | *) 320 | let failure = Types.Log_failure receiver_last_log_index in 321 | (state, make_response state failure, Log.empty_diff) 322 | 323 | | _ (* x when x < 0 *) -> 324 | (* This case is possible when messages are received out of order by 325 | * the Follower 326 | * 327 | * Note that even if the prev_log_index is earlier, it's important 328 | * that no log entry is removed from the log if they come from the 329 | * current leader. 330 | * 331 | * The current leader might have sent a commit message back to a 332 | * client believing that the log entry is replicated on this server. 333 | * If we remove the log entry we violate the assumption. *) 334 | let success = Types.Success receiver_last_log_index in 335 | (state, make_response state success, Log.empty_diff) 336 | 337 | else (* leader_term > receiver_last_log_term *) 338 | 339 | if prev_log_index > receiver_last_log_index 340 | then 341 | (* This is likely the case after a new election, the Leader has 342 | * more log entries in its log and assumes that all followers have 343 | * the same number of log entries. *) 344 | let failure = Types.Log_failure receiver_last_log_index in 345 | (state, make_response state failure, Log.empty_diff) 346 | else 347 | (* Because it is a new Leader, this follower can safely remove all 348 | * the logs from previous terms which were not committed. *) 349 | 350 | match Log.remove_log_since ~prev_log_index 351 | ~prev_log_term state.Types.log with 352 | | exception Not_found -> 353 | let failure = Types.Log_failure commit_index in 354 | (state, make_response state failure, Log.empty_diff) 355 | (* This is the case where there is a mismatch between the [Leader] 356 | * and this server and the log entry identified with 357 | * (prev_log_index, prev_log_term) 358 | * could not be found. 359 | * 360 | * In such a case, the safest log entry to synchronize upon is the 361 | * commit_index 362 | * of the follower. 
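           *
           * On the leader side, the [Log_failure commit_index] response is
           * handled by [Leader.decrement_next_index] (see
           * [handle_append_entries_response] below), which lowers the
           * [next_index] kept for this follower so that replication can
           * resume from a point both servers agree on.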
*) 363 | 364 | | log, log_diff -> 365 | let log, log_diff' = Log.add_log_entries ~log_entries log in 366 | let log_diff = Log.merge_diff log_diff log_diff' in 367 | let receiver_last_log_index = Log.last_log_index log in 368 | let state = 369 | update_state leader_commit receiver_last_log_index log state 370 | in 371 | let success = Types.Success receiver_last_log_index in 372 | (state, make_response state success, log_diff) 373 | 374 | let handle_append_entries_response state response now = 375 | let { 376 | Types.receiver_term; 377 | receiver_id = follower_id ; 378 | result} = response in 379 | 380 | if receiver_term > state.Types.current_term 381 | then 382 | (* Enforce invariant that if the server is lagging behind 383 | * it must convert to a follower and update to that term. *) 384 | (Follower.become ~term:receiver_term ~now state , []) 385 | 386 | else 387 | match state.Types.role with 388 | | Types.Follower _ 389 | | Types.Candidate _ -> (state, []) 390 | 391 | | Types.Leader followers -> 392 | 393 | let followers = 394 | Leader.record_response_received ~follower_id followers 395 | in 396 | 397 | begin match result with 398 | | Types.Success follower_last_log_index -> 399 | (* Log entries were successfully inserted by the receiver... 400 | * 401 | * let's update our leader state about that receiver *) 402 | 403 | let configuration = state.Types.configuration in 404 | 405 | let followers , nb_of_replications = 406 | Leader.update_follower_last_log_index 407 | ~follower_id ~index:follower_last_log_index followers 408 | in 409 | 410 | let state = 411 | (* Check if the received log entry from has reached 412 | * a majority of server. 413 | * Note that we need to add `+1` simply to count this 414 | * server (ie leader) which does not update its next/match 415 | * *) 416 | if Configuration.is_majority configuration (nb_of_replications + 1) && 417 | follower_last_log_index > state.Types.commit_index 418 | then {state with Types.commit_index = follower_last_log_index;} 419 | else state 420 | in 421 | 422 | let followers, msgs_to_send = 423 | Log_entry_util.compute_append_entries state followers now 424 | in 425 | 426 | let state = Types.({state with role = Leader followers}) in 427 | 428 | (state, msgs_to_send) 429 | 430 | | Types.Log_failure follower_last_log_index -> 431 | (* The receiver log is not matching this server current belief. 432 | * If a leader this server should decrement the next 433 | * log index and retry the append. *) 434 | let leader_state = 435 | Leader.decrement_next_index 436 | ~follower_last_log_index ~follower_id state followers 437 | in 438 | let leader_state, msgs_to_send = 439 | Log_entry_util.compute_append_entries state leader_state now 440 | in 441 | let state = Types.({state with role = Leader leader_state}) in 442 | (state, msgs_to_send) 443 | 444 | | Types.Term_failure -> 445 | (state, []) 446 | (* The receiver could have detected that this server term was not the 447 | * latest one and sent the [Term_failure] response. 448 | * 449 | * This could typically happen in a network partition: 450 | * 451 | * Old-leader---------X-------New Leader 452 | * \ / 453 | * ---------Follower--------- 454 | * 455 | * In the diagram above this server is the Old leader. 
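           *
           * Once this old leader receives a message carrying the new,
           * higher term (for instance an [Append_entries] response or a
           * [Request_vote] request from the new leader's term), the
           * [receiver_term > current_term] check above makes it step down
           * and become a [Follower] again.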
456 | *) 457 | 458 | end (* match result *) 459 | 460 | let init ?log ?commit_index ?current_term ~configuration ~now ~server_id () = 461 | Follower.make ?log ?commit_index ?current_term 462 | ~configuration ~now ~server_id () 463 | 464 | let handle_message state message now = 465 | let state', msgs_to_send, log_diff = 466 | match message with 467 | | Types.Request_vote_request ({Types.candidate_id; _ } as r) -> 468 | let state, response = handle_request_vote_request state r now in 469 | let msgs = 470 | (Types.Request_vote_response response, candidate_id)::[] 471 | in 472 | (state, msgs, Log.empty_diff) 473 | 474 | | Types.Append_entries_request ({Types.leader_id; _ } as r) -> 475 | let state, response, log_diff = 476 | handle_append_entries_request state r now 477 | in 478 | let msgs = 479 | (Types.Append_entries_response response, leader_id) :: [] 480 | in 481 | (state, msgs, log_diff) 482 | 483 | | Types.Request_vote_response r -> 484 | let state, msgs = handle_request_vote_response state r now in 485 | (state, msgs, Log.empty_diff) 486 | 487 | | Types.Append_entries_response r -> 488 | let state, msgs = handle_append_entries_response state r now in 489 | (state, msgs, Log.empty_diff) 490 | 491 | in 492 | let leader_change = Helper.Diff.leader_change state state' in 493 | let committed_logs = Helper.Diff.committed_logs state state' in 494 | let {Log.added_logs; deleted_logs} = log_diff in 495 | make_result ~msgs_to_send ?leader_change ~added_logs 496 | ~deleted_logs ~committed_logs state' 497 | 498 | (* Iterates over all the other server ids. (ie the ones different 499 | * from the state id). *) 500 | let fold_over_servers f e0 state = 501 | 502 | let { 503 | Types.server_id; 504 | configuration = {Types.nb_of_server; _}; _ 505 | } = state in 506 | 507 | let rec aux acc = function 508 | | -1 -> acc 509 | | id -> 510 | let next = id - 1 in 511 | if id = server_id 512 | then aux acc next 513 | else aux (f acc id) next 514 | in 515 | aux e0 (nb_of_server - 1) 516 | let handle_new_election_timeout state now = 517 | let state' = Candidate.become ~now state in 518 | let msgs_to_send = 519 | fold_over_servers (fun acc server_id -> 520 | let request = make_request_vote_request state' in 521 | (Types.Request_vote_request request, server_id) :: acc 522 | ) [] state' 523 | in 524 | let leader_change = Helper.Diff.leader_change state state' in 525 | let committed_logs = Helper.Diff.committed_logs state state' in 526 | make_result ~msgs_to_send ?leader_change ~committed_logs state' 527 | 528 | let handle_heartbeat_timeout state now = 529 | match state.Types.role with 530 | | Types.Leader leader_state -> 531 | let leader_state, msgs_to_send = 532 | Log_entry_util.compute_append_entries state leader_state now 533 | in 534 | let state = Types.({state with role = Leader leader_state}) in 535 | make_result ~msgs_to_send state 536 | | _ -> 537 | make_result state 538 | 539 | type new_log_response = 540 | | Appended of Types.result 541 | | Forward_to_leader of int 542 | | Delay 543 | 544 | let handle_add_log_entries state datas now = 545 | match state.Types.role with 546 | | Types.Follower {Types.current_leader = None ; _ } 547 | | Types.Candidate _ -> 548 | Delay 549 | (* Server in the middle of an election with no [Leader] 550 | * agreed upon yet *) 551 | 552 | | Types.Follower {Types.current_leader = Some leader_id; _ } -> 553 | Forward_to_leader leader_id 554 | (* The [Leader] should be the one centralizing all the 555 | * new log entries. 
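     *
     * A caller would typically dispatch on [new_log_response] along these
     * lines (a sketch only; [apply_result], [send_to_server] and
     * [retry_later] stand for hypothetical application functions):
     *
     *   match handle_add_log_entries state datas now with
     *   | Appended result      -> apply_result result
     *   | Forward_to_leader id -> send_to_server id datas
     *   | Delay                -> retry_later datas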
*) 556 | 557 | | Types.Leader leader_state -> 558 | 559 | let log, log_diff = 560 | Log.add_log_datas state.Types.current_term datas state.Types.log 561 | in 562 | 563 | let state' = Types.({state with log }) in 564 | 565 | let leader_state, msgs_to_send = 566 | Log_entry_util.compute_append_entries state' leader_state now 567 | in 568 | 569 | let state' = Types.({state' with role = Leader leader_state }) in 570 | let {Log.added_logs; deleted_logs} = log_diff in 571 | Appended (make_result ~msgs_to_send ~added_logs ~deleted_logs state') 572 | 573 | let next_timeout_event state now = Timeout_event.next ~now state 574 | 575 | let committed_entries_since ~since {Types.commit_index; log; _} = 576 | let max = commit_index - since in 577 | fst @@ Log.log_entries_since ~since ~max:(Log.Number max)log 578 | -------------------------------------------------------------------------------- /src/raft_protocol.mli: -------------------------------------------------------------------------------- 1 | (** Protocol Implementation *) 2 | 3 | (** This module implements the RAFT protocol logic in a functional way and is 4 | agnostic of: 5 | {ul 6 | {- {b Transport protocol}: This module simply defines which message 7 | should be sent upon any RAFT protocol event. It is the caller 8 | responsability to handle all the communication between RAFT servers. 9 | Note that the package {b raft-pb} provides message serialization, 10 | based on Protobuf technology.} 11 | {- {b Persistent storage}: The RAFT protocol requires data to be 12 | recorded permanently. This implementation simply notifies of state 13 | change; the caller is responsible to store this information. 14 | Note that the package {b raft-rocks} provides a persistent storage 15 | solution using RocksDB.} 16 | }*) 17 | 18 | (** {2 Protocol Event Implementation} *) 19 | 20 | val init : 21 | ?log:Raft_log.t -> 22 | ?commit_index:int -> 23 | ?current_term:int -> 24 | configuration:Raft_types.configuration -> 25 | now:Raft_types.time -> 26 | server_id:Raft_types.server_id -> 27 | unit -> 28 | Raft_types.state 29 | (** [init ~configuration ~now ~server_id ()] creates an initial server 30 | state. 31 | 32 | The server is initially a [Follower] and its [election_deadline] will be 33 | set according to the [configuration] and the [now] value. *) 34 | 35 | val handle_message : 36 | Raft_types.state -> 37 | Raft_types.message -> 38 | Raft_types.time -> 39 | Raft_types.result 40 | (** [handle_message state message now] handles a new RAFT message received.*) 41 | 42 | val handle_new_election_timeout : 43 | Raft_types.state -> 44 | Raft_types.time -> 45 | Raft_types.result 46 | (** [handle_new_election_timeout state now] handles a new election timeout.*) 47 | 48 | val handle_heartbeat_timeout : 49 | Raft_types.state -> 50 | Raft_types.time -> 51 | Raft_types.result 52 | (** [handle_heartbeat_timeout state now] handles an heartbeat event.*) 53 | 54 | type new_log_response = 55 | | Appended of Raft_types.result 56 | (** The new log can correctly be handled by this server (ie it is 57 | a valid [Leader] and new [Append_entries] request message can be 58 | sent to follower servers.*) 59 | | Forward_to_leader of int 60 | (** If the current server is not a [Leader], the new log entry should 61 | not be handled by this server but rather forwarded to the current 62 | [Leader] which id is returned.*) 63 | | Delay 64 | (** The current state of the system (as this server is aware) does not 65 | seem to be in a configuration that can handle the the log. 
66 | For instance during an election it is possible that this server 67 | is a [Candidate] waiting for more votes. Another scenario would be 68 | that this server is a [Follower] which has not yet received confirmation 69 | from a valid [Leader].*) 70 | 71 | val handle_add_log_entries: 72 | Raft_types.state -> 73 | (bytes * string) list -> 74 | Raft_types.time -> 75 | new_log_response 76 | (** [handle_add_log_entries state data now] processes new log entries [data] 77 | which is a list [(data, id)]. See new_log_response for more 78 | information.*) 79 | 80 | (** {2 Utilities} *) 81 | 82 | val next_timeout_event : 83 | Raft_types.state -> 84 | Raft_types.time -> 85 | Raft_types.timeout_event 86 | (** [next_timeout_event state now] returns the timeout information 87 | that the serve should implement. 88 | 89 | The server application is responsible for managing the main event 90 | loop such as listening for messaging and waking up for timeout events.*) 91 | 92 | val committed_entries_since : 93 | since:int -> 94 | Raft_types.state -> 95 | Raft_log.log_entry list 96 | (** [committed_entries_since ~since state] returns all the committed entries 97 | since the log index [since].*) 98 | -------------------------------------------------------------------------------- /src/raft_types.ml: -------------------------------------------------------------------------------- 1 | module Log = Raft_log 2 | 3 | type time = float 4 | 5 | type duration = float 6 | 7 | type server_id = int 8 | 9 | type configuration = { 10 | nb_of_server : int; 11 | election_timeout : duration; 12 | election_timeout_range : duration; 13 | hearbeat_timeout : duration; 14 | max_nb_logs_per_message : Raft_log.size; 15 | max_log_size : Log.max_log_size; 16 | } 17 | 18 | type request_vote_request = { 19 | candidate_term : int; 20 | candidate_id : server_id; 21 | candidate_last_log_index : int; 22 | candidate_last_log_term : int; 23 | } 24 | 25 | type request_vote_response = { 26 | voter_id : server_id; 27 | voter_term : int; 28 | vote_granted : bool; 29 | } 30 | 31 | type append_entries_request = { 32 | leader_term : int; 33 | leader_id : server_id; 34 | prev_log_index : int; 35 | prev_log_term : int; 36 | log_entries : Log.log_entry list; 37 | leader_commit : int; 38 | } 39 | 40 | type append_entries_response_result = 41 | | Success of int 42 | | Log_failure of int 43 | | Term_failure 44 | 45 | type append_entries_response = { 46 | receiver_id : server_id; 47 | receiver_term : int; 48 | result : append_entries_response_result; 49 | } 50 | 51 | type message = 52 | | Request_vote_request of request_vote_request 53 | | Request_vote_response of request_vote_response 54 | | Append_entries_request of append_entries_request 55 | | Append_entries_response of append_entries_response 56 | 57 | type message_to_send = message * server_id 58 | 59 | type timeout_type = 60 | | New_leader_election 61 | | Heartbeat 62 | 63 | type timeout_event = { 64 | timeout : time; 65 | timeout_type : timeout_type; 66 | } 67 | 68 | type leader_change = 69 | | New_leader of int 70 | | No_leader 71 | 72 | type follower_info = { 73 | follower_id : int; 74 | next_index : int; 75 | match_index : int; 76 | heartbeat_deadline : float; 77 | outstanding_request : bool; 78 | } 79 | 80 | type leader_state = follower_info list 81 | 82 | type candidate_state = { 83 | vote_count : int; 84 | election_deadline : time; 85 | } 86 | 87 | type follower_state = { 88 | voted_for : server_id option; 89 | current_leader : server_id option; 90 | election_deadline : time; 91 | } 92 | 93 | 
type role = 94 | | Leader of leader_state 95 | | Candidate of candidate_state 96 | | Follower of follower_state 97 | 98 | type state = { 99 | server_id : server_id; 100 | current_term : int; 101 | log : Raft_log.t; 102 | commit_index : int; 103 | role : role; 104 | configuration : configuration; 105 | } 106 | 107 | type result = { 108 | state : state; 109 | messages_to_send : message_to_send list; 110 | leader_change : leader_change option; 111 | committed_logs : Raft_log.log_entry list; 112 | added_logs : Raft_log.log_entry list; 113 | deleted_logs : Raft_log.log_entry list; 114 | } 115 | 116 | 117 | let is_follower {role; _} = 118 | match role with 119 | | Follower _ -> true 120 | | _ -> false 121 | 122 | let is_candidate {role; _ } = 123 | match role with 124 | | Candidate _ -> true 125 | | _ -> false 126 | 127 | let is_leader {role; _ } = 128 | match role with 129 | | Leader _ -> true 130 | | _ -> false 131 | 132 | let current_leader {server_id; role; _} = 133 | match role with 134 | | Follower {current_leader; _ }-> current_leader 135 | | Candidate _ -> None 136 | | Leader _ -> Some server_id 137 | -------------------------------------------------------------------------------- /src/raft_types.mli: -------------------------------------------------------------------------------- 1 | (** Protocol Types: State, Message and Events *) 2 | 3 | (** {2 Common types } *) 4 | 5 | type time = float 6 | (** Monotonic time (Absolute value) *) 7 | 8 | type duration = float 9 | (** Duration, must be in the same unit as time*) 10 | 11 | type server_id = int 12 | (** Server id *) 13 | 14 | (** Configuration *) 15 | type configuration = { 16 | nb_of_server : int; 17 | (** Number of servers for the RAFT clusters, servers are then 18 | identified with 0 indexing *) 19 | election_timeout : duration; 20 | (** Average duration time for which a server will wait before starting 21 | a new election when the leader is not sending messages. *) 22 | election_timeout_range : duration; 23 | (** Duration range for the election timeout. The effective election 24 | timeout is randomly chosen between timeout +|- range/2. 25 | This value must be strickly less than [2 *. election_timeout] *) 26 | hearbeat_timeout : duration; 27 | (** Duration between heartbeat sent by the leader. 
[hearbeat_timeout] 28 | must be much less than [election_timeout] *) 29 | max_nb_logs_per_message : Raft_log.size; 30 | (** Limit the number of log entries per append entries message *) 31 | max_log_size : Raft_log.max_log_size; 32 | (** define boundaries for the "in-memory log" size limitation *) 33 | } 34 | 35 | (** {2 Protocol Messages} *) 36 | 37 | type request_vote_request = { 38 | candidate_term : int; 39 | candidate_id : server_id; 40 | candidate_last_log_index : int; 41 | candidate_last_log_term : int; 42 | } 43 | 44 | type request_vote_response = { 45 | voter_id : server_id; 46 | voter_term : int; 47 | vote_granted : bool; 48 | } 49 | 50 | type append_entries_request = { 51 | leader_term : int; 52 | leader_id : server_id; 53 | prev_log_index : int; 54 | prev_log_term : int; 55 | log_entries : Raft_log.log_entry list; 56 | leader_commit : int; 57 | } 58 | 59 | type append_entries_response_result = 60 | | Success of int (** receiver last log index *) 61 | | Log_failure of int (** receiver last log index *) 62 | | Term_failure 63 | 64 | type append_entries_response = { 65 | receiver_id : server_id; 66 | receiver_term : int; 67 | result : append_entries_response_result; 68 | } 69 | 70 | type message = 71 | | Request_vote_request of request_vote_request 72 | | Request_vote_response of request_vote_response 73 | | Append_entries_request of append_entries_request 74 | | Append_entries_response of append_entries_response 75 | 76 | (** RAFT Message to send to the given server *) 77 | type message_to_send = message * server_id 78 | 79 | (** {2 Protocol State} *) 80 | 81 | (** Follower information that a leader keeps track of*) 82 | type follower_info = { 83 | follower_id : server_id; 84 | (** Id of the follower *) 85 | next_index : int; 86 | (** Which [Raft_types.log_entry] should be sent next. *) 87 | match_index : int; 88 | (** The last replicated [Raft_types.log_entry] for this follower *) 89 | heartbeat_deadline : time; 90 | (** The time at which a heartbeat should be sent next *) 91 | outstanding_request : bool; 92 | (** Whether or not the follower has an outstanding request (ie if 93 | no response was received since the last time an append entries 94 | request was sent. *) 95 | } 96 | 97 | (** Leader state *) 98 | type leader_state = follower_info list 99 | 100 | (** Candidate state *) 101 | type candidate_state = { 102 | vote_count : int; 103 | (** The number of positive vote received so far *) 104 | election_deadline : time; 105 | (** The time at whic the election for which the server is a candidate 106 | is ending *) 107 | } 108 | 109 | (** Follower state *) 110 | type follower_state = { 111 | voted_for : server_id option; 112 | (** If [None] then this follower has not voted yet, other it is the id 113 | * of the candidate for which it previously voted *) 114 | current_leader : server_id option; 115 | (** Current leader *) 116 | election_deadline : time; 117 | (** The time at which the next election should take place and this 118 | * follower become a candidate if not leadership was established by 119 | * another server. *) 120 | } 121 | 122 | (** Role of a server in the RAFT protocol *) 123 | type role = 124 | | Leader of leader_state 125 | | Candidate of candidate_state 126 | | Follower of follower_state 127 | 128 | (** Raft server state *) 129 | type state = { 130 | server_id : server_id; 131 | (** Unique Identifier must be between [0] and [nb_of_server - 1]. 
*) 132 | current_term : int; 133 | (** RAFT protocol divide time into terms during which a single 134 | leader can be established. *) 135 | log : Raft_log.t; 136 | (** Set of log entries *) 137 | commit_index : int; 138 | (** The index of the last log entry to be committed. A committed log 139 | is guaranteed by the RAFT protocol to never be removed. *) 140 | role : role; 141 | (** Role of the server. *) 142 | configuration : configuration; 143 | (** Various parameter to configure the RAFT cluster *) 144 | } 145 | 146 | (** {2 API types} *) 147 | 148 | (** The RAFT protocol defines 2 type of Timeout event which should be 149 | triggered if no other protocol event has happened. *) 150 | type timeout_type = 151 | | New_leader_election 152 | (** Timeout until when the server should start a new election (ie 153 | increase term by 1 and become a candidate. *) 154 | | Heartbeat 155 | (** Timeout until when the server which is in a Leader state should 156 | send a heartbeat message to a follower. Heartbeats are used to maintain 157 | leadership and indicate the Leader is still operating as normal *) 158 | 159 | (** Timeout event which defines when (is absolute time) the next timeout 160 | event should be triggered as well as its type *) 161 | type timeout_event = { 162 | timeout : time; 163 | timeout_type : timeout_type; 164 | } 165 | 166 | (** Notification of a change of leadership in the RAFT protocol *) 167 | type leader_change = 168 | | New_leader of server_id 169 | (** A new leader has been elected *) 170 | | No_leader 171 | (** The previous leader is no longer considered a leader. The previous 172 | leader could have been any server including this server. *) 173 | 174 | (** Data returned by each of the protocol event *) 175 | type result = { 176 | state : state; 177 | (** The new state *) 178 | messages_to_send : message_to_send list; 179 | (** Raft messages to be send to the given servers.*) 180 | leader_change : leader_change option; 181 | (** Notification of a change in leaderhsip.*) 182 | committed_logs : Raft_log.log_entry list; 183 | (** Log entries which are now commited.*) 184 | added_logs : Raft_log.log_entry list; 185 | (** Log entries added to the log. Note that this could overlap 186 | with [committed_logs].*) 187 | deleted_logs : Raft_log.log_entry list; 188 | (** Log entries deleted from the log.*) 189 | } 190 | 191 | (** {2 Role functionality} *) 192 | 193 | val is_follower : state -> bool 194 | (** [is_follower state] returns [true] if [state] role is a follower, [false] 195 | otherwise. *) 196 | 197 | val is_candidate : state -> bool 198 | (** [is_candidate state] returns [true] if [state] role is a candidate, [false] 199 | otherwise. *) 200 | 201 | val is_leader : state -> bool 202 | (** [is_leader state] returns [true] if [state] role is a leader, [false] 203 | otherwise. *) 204 | 205 | val current_leader: state -> server_id option 206 | (** [current_leader state] return the current leader for the current term. 207 | If no leader is known then [None] is returned. 
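
    For example (a sketch; [redirect_client] and [ask_client_to_retry] are
    hypothetical application functions, not part of this library):
    {[
      match Raft_types.current_leader state with
      | Some leader_id -> redirect_client leader_id
      | None -> ask_client_to_retry ()
    ]}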
*) 208 | -------------------------------------------------------------------------------- /tests/test.ml: -------------------------------------------------------------------------------- 1 | [@@@ocaml.warning "-45"] 2 | 3 | open Raft_types 4 | open Raft_log 5 | 6 | module Candidate = Raft_helper.Candidate 7 | module Follower = Raft_helper.Follower 8 | module Leader = Raft_helper.Leader 9 | module Timeout_event = Raft_helper.Timeout_event 10 | 11 | module Protocol = Raft_protocol 12 | 13 | let default_configuration = { 14 | nb_of_server = 3; 15 | election_timeout = 0.1; 16 | election_timeout_range = 0.0; 17 | (* To get deterministic number for testing. 18 | *) 19 | hearbeat_timeout = 0.02; 20 | max_nb_logs_per_message = Number 10; 21 | max_log_size = { 22 | upper_bound = 7; 23 | lower_bound = 5; 24 | } 25 | } 26 | 27 | let recent_log_length {log = {recent_entries; _ }; _ } = 28 | IntMap.cardinal recent_entries 29 | 30 | let recent_log_hd {log = {recent_entries; _ }; _ } = 31 | snd @@ IntMap.max_binding recent_entries 32 | 33 | let initial_state ~now server_id = 34 | let configuration = default_configuration in 35 | Protocol.init ~configuration ~now ~server_id () 36 | 37 | let now = 0. 38 | 39 | let msg_for_server msgs id = 40 | match List.find (fun (_, server_id) -> id = server_id) msgs with 41 | | (msg, _) -> msg 42 | | exception Not_found -> assert(false) 43 | 44 | type t = { 45 | server0 : state; 46 | server1 : state; 47 | server2 : state; 48 | } 49 | 50 | let init () = { 51 | server0 = initial_state ~now 0; 52 | server1 = initial_state ~now 1; 53 | server2 = initial_state ~now 2; 54 | } 55 | 56 | (* This part of the test will simulate the successful 57 | * election of server0. Server1 will grant its vote during the election, 58 | * while server2 is kept disconnected from server0 and does not receive 59 | * any message. 60 | * 61 | * At the end of this function, server0 is the leader, server1 is 62 | * a follower of server0 and server2 is still a follower not aware 63 | * than an election occured. *) 64 | let election_1 {server0; server1; server2} now = 65 | 66 | (* All of those severs should have an election timeout randomly 67 | * generated between [election_timeout +/- election_timeout_range/2]. *) 68 | let next_event = Protocol.next_timeout_event server0 now in 69 | assert(next_event.timeout = default_configuration.election_timeout); 70 | assert(next_event.timeout_type = New_leader_election); 71 | 72 | assert(0 = server0.current_term); 73 | assert(0 = server1.current_term); 74 | assert(0 = server2.current_term); 75 | (* The current term initiale value is expected 76 | * to be 0. *) 77 | 78 | (* 79 | * Server0 reach election timeout and kicks off the election 80 | * ------------------------------------------------------------------- 81 | *) 82 | 83 | let { 84 | state = server0; 85 | messages_to_send = msgs; 86 | leader_change; _} = Protocol.handle_new_election_timeout server0 now in 87 | 88 | assert(is_candidate server0); 89 | (* When an election timeout happens the server starts a new election 90 | * and become a [Candidate]. 
*) 91 | 92 | assert(None = leader_change); 93 | (* There was no previous Leader so no leader_change *) 94 | 95 | assert(1 = server0.current_term); 96 | (* Part of the new election process is to increment the [current_term] *) 97 | 98 | assert(2 = List.length msgs); 99 | List.iter (fun (msg, _) -> 100 | match msg with 101 | | Request_vote_request r -> ( 102 | assert(r.candidate_term = 1); 103 | assert(r.candidate_id = 0); 104 | assert(r.candidate_last_log_index = 0); 105 | assert(r.candidate_last_log_term = 0); 106 | ) 107 | | _ -> assert(false) 108 | ) msgs; 109 | (* Upon becoming a candidate the sever0 is expected to send a `RequestVote` 110 | * message to all the other servers. Since we are in a cluster 111 | * of 3, 2 messages should be sent from by the new [Candidate]. *) 112 | 113 | (* 114 | * Send Request_vote_request Server0 -> Server1 115 | * ----------------------------------------------------------------------- 116 | *) 117 | 118 | let now = now +. 0.001 in 119 | 120 | let res = 121 | let msg = msg_for_server msgs 1 in 122 | Protocol.handle_message server1 msg now 123 | in 124 | 125 | let { 126 | state = server1; 127 | messages_to_send = msgs; 128 | leader_change; 129 | committed_logs; 130 | added_logs; 131 | deleted_logs; 132 | } = res in 133 | 134 | assert(1 = server1.current_term); 135 | (* Server0 [Candidate] sent a higher term to server1 which is then 136 | * expected to update its own [current_term] and be a follower 137 | * (which it was already). *) 138 | 139 | assert(None = leader_change); 140 | assert([] = committed_logs); 141 | assert([] = added_logs); 142 | assert([] = deleted_logs); 143 | 144 | begin match server1.role with 145 | | Follower {voted_for; current_leader; election_deadline; } -> begin 146 | (* Server1 should still be a follower. *) 147 | 148 | assert(voted_for = (Some 0)); 149 | (* Because server1 did not previously vote for a candidate it should 150 | * grant its vote to server0 *) 151 | assert(current_leader = None); 152 | (* Granting a vote does not guarantee that server0 will be a [Leader]. 153 | * 154 | * Note that only a valid [Append_entries] request can establish the leadership 155 | * role of the sender. (We will this later). *) 156 | assert(election_deadline = now +. default_configuration.election_timeout); 157 | (* Election deadline should be updated. *) 158 | end 159 | | _ -> assert(false) 160 | end; 161 | 162 | begin match msgs with 163 | | (msg, 0)::[] -> 164 | (* A single response to server 0 is expected from server1. *) 165 | begin match msg with 166 | | Request_vote_response r -> 167 | assert(r.voter_id = 1); 168 | assert(r.voter_term = 1); 169 | assert(r.vote_granted = true); 170 | (* The message confirms the server1 state and decision to grant its vote 171 | * to server0. *) 172 | | _ -> assert(false) 173 | end 174 | | _ -> assert(false) 175 | end; 176 | 177 | (* 178 | * Send Request_vote_response Server1 -> Server0 179 | * -------------------------------------------------------------------------- 180 | *) 181 | 182 | let now = now +. 0.001 in 183 | 184 | let res = 185 | let msg = msg_for_server msgs 0 in 186 | Protocol.handle_message server0 msg now 187 | in 188 | 189 | let { 190 | state = server0; 191 | messages_to_send = msgs; 192 | leader_change; 193 | committed_logs; 194 | added_logs; 195 | deleted_logs; 196 | } = res in 197 | 198 | assert(is_leader server0); 199 | assert(Some (New_leader 0) = leader_change); 200 | (* Because a single vote is enough to reach a majority in a 3-server 201 | * cluster, server0 becomes a [Leader]. 
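   *
   * Worked out: server0 already counts its own vote, so server1's granted
   * vote brings it to 2 votes out of the 3 servers in the cluster, which
   * is a strict majority.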
*) 202 | assert(committed_logs = []); 203 | assert(added_logs = []); 204 | assert([] = deleted_logs); 205 | 206 | assert(1 = server0.current_term); 207 | (* Becoming a [Leader] should not affect the term. (Only a new election).*) 208 | 209 | begin match server0.role with 210 | | Leader followers -> ( 211 | assert(2 = List.length followers); 212 | (* The leader maintain various state for each of the 213 | * other servers. *) 214 | 215 | List.iter (fun follower -> 216 | 217 | assert(follower.next_index = 1); 218 | assert(follower.match_index = 0); 219 | assert(follower.heartbeat_deadline = 220 | now +. default_configuration.hearbeat_timeout); 221 | assert(follower.outstanding_request = true); 222 | 223 | ) followers; 224 | ) 225 | | _ -> assert(false) 226 | end; 227 | 228 | assert(2 = List.length msgs); 229 | (* Upon becoming a Leader a server must immediately 230 | * send an [Append_entries] request to all the other servers 231 | * to establish its leadership. *) 232 | 233 | List.iter (fun (msg, _ ) -> 234 | match msg with 235 | | Append_entries_request r -> 236 | assert(1 = r.leader_term); 237 | assert(0 = r.leader_id); 238 | assert(0 = r.prev_log_index); 239 | assert(0 = r.prev_log_term); 240 | assert([] = r.log_entries); 241 | (* We have not yet added any log to the [Leader] so 242 | * no new entries are sent to the other servers. *) 243 | assert(0 = r.leader_commit); 244 | | _ -> assert(false); 245 | ) msgs; 246 | 247 | (* 248 | * Send Append_entries_request Server0 -> Server1 249 | * ---------------------------------------------------------------------- 250 | *) 251 | 252 | let now = now +. 0.001 in 253 | let res = 254 | let msg = msg_for_server msgs 1 in 255 | Protocol.handle_message server1 msg now 256 | in 257 | 258 | let { 259 | state = server1; 260 | messages_to_send = msgs; 261 | leader_change; 262 | committed_logs; 263 | added_logs; 264 | deleted_logs; 265 | } = res in 266 | 267 | assert(Some (New_leader 0)= leader_change); 268 | assert([] = committed_logs); 269 | assert(added_logs = []); 270 | assert([] = deleted_logs); 271 | 272 | begin match server1.role with 273 | | Follower f -> ( 274 | 275 | assert(f.voted_for = Some 0); 276 | (* [voted_for] is still assigned to server0 since the current term 277 | * has not changed. *) 278 | 279 | assert(f.current_leader = Some 0); 280 | (* [Append_entries] request indicates the leadership role of the sender. 281 | * 282 | * server1 then updates its state to keep track of the current [Leader] (ie 283 | * server0 in our case). *) 284 | 285 | assert(f.election_deadline = now +. default_configuration.election_timeout); 286 | (* Because it just receive a message from the [Leader], the 287 | * [election_deadline] is extended for another [election_timeout] amount 288 | * of time. *) 289 | ) 290 | | _ -> assert(false) 291 | end; 292 | 293 | assert(1 = List.length msgs); 294 | (* The server1 is expected to send a single response back to the 295 | * sender (ie server0). *) 296 | 297 | begin match List.hd msgs with 298 | | (Append_entries_response r, 0) -> ( 299 | assert(r.receiver_id = 1); 300 | assert(r.receiver_term = 1); 301 | assert(r.result = Success 0); 302 | ) 303 | | _ -> assert(false) 304 | end; 305 | 306 | ({server0; server1; server2}, now) 307 | 308 | 309 | (* In this part of the test server2 which was previously disconnected 310 | * during the election_1 is now starting its own election for the same 311 | * term 1. Both server1 and server0 denies their vote since they have both 312 | * already granted their vote in this term. 
313 | * 314 | * At the end of the election server2 is still a candidate while server0 315 | * maintains its leader role and server1 is its follower. *) 316 | let failed_election_1 {server0; server1; server2} now = 317 | 318 | (* 319 | * Since server2 has not received any message from a leader, 320 | * it will start a new election. 321 | * ---------------------------------------------------------------------- 322 | *) 323 | 324 | let { 325 | state = server2; 326 | messages_to_send = request_vote_msgs; 327 | leader_change; 328 | committed_logs; 329 | added_logs; 330 | deleted_logs; } = Protocol.handle_new_election_timeout server2 now 331 | in 332 | 333 | assert(None = leader_change); 334 | assert([] = committed_logs); 335 | assert(added_logs = []); 336 | assert(deleted_logs = []); 337 | 338 | assert(is_candidate server2); 339 | (* Server2 never got an [Append_entries] request which would have 340 | * established server0 leadership. Therefore as far as server2 is 341 | * concerned there were no leaders. *) 342 | assert(1 = server2.current_term); 343 | assert(2 = List.length request_vote_msgs); 344 | 345 | List.iter (fun (msg, _) -> 346 | match msg with 347 | | Request_vote_request r -> ( 348 | assert(r.candidate_term = 1); 349 | assert(r.candidate_id = 2); 350 | assert(r.candidate_last_log_index = 0); 351 | assert(r.candidate_last_log_term = 0); 352 | ) 353 | | _ -> assert(false) 354 | ) request_vote_msgs; 355 | 356 | (* 357 | * Send Request_vote_request Server2 -> Server1 358 | * --------------------------------------------------------------------- 359 | *) 360 | 361 | let now = now +. 0.001 in 362 | let res = 363 | let msg = msg_for_server request_vote_msgs 1 in 364 | Protocol.handle_message server1 msg now 365 | in 366 | 367 | let { 368 | state = server1; 369 | messages_to_send = msgs; 370 | leader_change; 371 | committed_logs; 372 | added_logs; 373 | deleted_logs; 374 | } = res in 375 | 376 | assert(None = leader_change); 377 | assert([] = committed_logs); 378 | assert(added_logs = []); 379 | assert([] = deleted_logs); 380 | 381 | begin match server1.role with 382 | | Follower f -> ( 383 | assert(f.voted_for = Some 0); 384 | assert(f.current_leader = Some 0); 385 | (* server0 state is unaffected by this new [Candidate] for this 386 | * term. *) 387 | ) 388 | | _ -> assert(false); 389 | end; 390 | 391 | begin match msgs with 392 | | (Request_vote_response r, 2)::[] -> ( 393 | assert(r.voter_id = 1); 394 | assert(r.voter_term = 1); 395 | assert(r.vote_granted = false); 396 | (* Server1 has already voted for server0 in this election 397 | * term, so it should deny its vote to server2. *) 398 | ) 399 | | _ -> assert(false) 400 | end; 401 | 402 | (* Send Request_vote_response Server1 -> Server2 403 | * 404 | * (vote not granted) 405 | * --------------------------------------------------------------------- 406 | *) 407 | 408 | let now = now +. 
0.001 in 409 | 410 | let res = 411 | let msg = msg_for_server msgs 2 in 412 | Protocol.handle_message server2 msg now 413 | in 414 | 415 | let { 416 | state = server2; 417 | messages_to_send = msgs; 418 | leader_change; 419 | committed_logs; 420 | added_logs; 421 | deleted_logs; 422 | } = res in 423 | 424 | assert(is_candidate server2); 425 | assert(None = leader_change); 426 | (* Despite the vote not being granted by server1, server2 427 | * should continue to be a [Candidate] until either 428 | * 429 | * - a [Request_vote] response with a granted vote is replied 430 | * by another server 431 | * 432 | * - new election timeout elapsed in this case it will start 433 | * a new election. 434 | * 435 | * - a valid [Leader] sends an [Append_entries] request, in which 436 | * case it will become a [Follower]. 437 | *) 438 | 439 | assert([] = committed_logs); 440 | assert(added_logs = []); 441 | assert([] = deleted_logs); 442 | assert(1 = server2.current_term); 443 | (* No new election should have been started. 444 | *) 445 | 446 | assert([] = msgs); 447 | (* server2 is still a candidate but for the time being it has no 448 | * message to send to any servers. *) 449 | 450 | (* 451 | * Send Request_vote Server2 -> Server0 452 | * ----------------------------------------------------------------------- 453 | *) 454 | 455 | let res = 456 | let msg = msg_for_server request_vote_msgs 0 in 457 | Protocol.handle_message server0 msg now 458 | in 459 | 460 | let { 461 | state = server0; 462 | messages_to_send = msgs; 463 | leader_change; 464 | committed_logs; 465 | added_logs; 466 | deleted_logs; 467 | } = res in 468 | 469 | assert(is_leader server0); 470 | assert(None = leader_change); 471 | (* Server0 is still a [Leader] and should not be affected 472 | * by a Candidate for the same term. 473 | * 474 | * (This would be different if the Candidate was for a later term) *) 475 | assert(1 = List.length msgs); 476 | assert([] = committed_logs); 477 | assert(added_logs = []); 478 | assert([] = deleted_logs); 479 | 480 | begin match List.hd msgs with 481 | | (Request_vote_response r, 2) -> ( 482 | 483 | assert(r.voter_id = 0); 484 | assert(r.voter_term = 1); 485 | assert(r.vote_granted = false); 486 | (* Server0 being the [Leader] for term 1, it should 487 | * not grant its vote. *) 488 | ) 489 | | _ -> assert(false) 490 | end; 491 | 492 | let now = now +. 0.002 in 493 | 494 | (* 495 | * Send Request_vote_response Server0 -> Server2 496 | * 497 | * (vote not granted) 498 | * ----------------------------------------------------------------------- 499 | *) 500 | 501 | let res = 502 | let msg = msg_for_server msgs 2 in 503 | Protocol.handle_message server2 msg now 504 | in 505 | 506 | let { 507 | state = server2; 508 | messages_to_send = msgs; 509 | leader_change; 510 | committed_logs; 511 | added_logs; 512 | deleted_logs; 513 | } = res in 514 | 515 | assert(deleted_logs = []); 516 | assert(is_candidate server2); 517 | (* Yes despite all other server denying their vote, server2 518 | * is still a [Candidate]. It should not take any further 519 | * action until its election timeout has elapsed. *) 520 | 521 | assert(None = leader_change); 522 | assert([] = committed_logs); 523 | assert(added_logs = []); 524 | 525 | assert([] = msgs); 526 | (* No new message from server2 for this elections. All [Request_vote] 527 | * requests have been sent and the unsucessful responses received. 
*) 528 | ({server0; server1; server2}, now) 529 | 530 | (* In this part of the test server0 the leader is sending heartbeat 531 | * messages (ie empty Append_entries_request) to all the followers. 532 | * 533 | * Server2 is no longer disconnected and will receive the msg; the msg 534 | * will establish server0 leadership to server2 which will update its role 535 | * to be a follower of server0. 536 | * 537 | * At the end server0 is the leader and both server1 and server2 are followers 538 | * of server0.*) 539 | let leader_heartbeat_1 {server0; server1; server2} now = 540 | 541 | (* Since the heartbeat timeout is usually much shorter 542 | * than a new election timeout, it's likely that server0 543 | * (ie the current [Leader]) will send heartbeats to the other 544 | * 2 servers. 545 | * ----------------------------------------------------------------------- 546 | *) 547 | 548 | (* First let's make sure that even because the heartbeat deadline 549 | * for any server has not been reached, a heartbeat timeout should not 550 | * trigger new messages. 551 | *) 552 | 553 | let {state = server0; messages_to_send = msgs; _ } = 554 | Protocol.handle_heartbeat_timeout server0 now 555 | in 556 | 557 | assert([] = msgs); 558 | (* Heartbeat messages are only sent to servers which have not recveived a 559 | * message for at least the [hearbeat_timeout] amount of time. 560 | * It's not the case since [hearbeat_timeout] is 0.02. *) 561 | 562 | let now = now +. default_configuration.hearbeat_timeout in 563 | 564 | let {state = server0; messages_to_send = hb_msgs; _ } = 565 | Protocol.handle_heartbeat_timeout server0 now 566 | in 567 | 568 | assert(2 = List.length hb_msgs); 569 | (* Because we added [hearbeat_timeout] to the previous time, we know for 570 | * sure that heartbeats messages are past due for all of the followers. *) 571 | 572 | List.iter (fun (msg, _) -> 573 | match msg with 574 | | Append_entries_request r -> ( 575 | assert(r.leader_term = 1); 576 | assert(r.leader_id = 0); 577 | assert(r.prev_log_index = 0); 578 | assert(r.prev_log_term = 0); 579 | assert(r.log_entries = []); 580 | assert(r.leader_commit = 0); 581 | ) 582 | | _ -> assert(false); 583 | ) hb_msgs; 584 | 585 | (* 586 | * Send Append_entries_request Server0 -> Server1 587 | * 588 | * (server2 becomes a follower) 589 | * -------------------------------------------------------------------- 590 | *) 591 | 592 | let now = now +. 0.001 in 593 | let res = 594 | let msg = msg_for_server hb_msgs 2 in 595 | Protocol.handle_message server2 msg now 596 | in 597 | 598 | let { 599 | state = server2; 600 | messages_to_send = msgs; 601 | leader_change; 602 | committed_logs; 603 | added_logs; 604 | deleted_logs; 605 | } = res in 606 | 607 | assert(is_follower server2); 608 | assert(1 = server2.current_term); 609 | assert(Some (New_leader 0)= leader_change); 610 | assert([] = committed_logs); 611 | assert(added_logs = []); 612 | assert(deleted_logs = []); 613 | (* Receiving an [Append_entries] request with a term at least equal 614 | * or supperior to one [current_term] means that the sender is a valid 615 | * [Leader] for that term and therefore the recipient must become a 616 | * [Follower]. *) 617 | 618 | begin match server2.role with 619 | | Follower fs -> ( 620 | assert(fs.voted_for = Some 2); 621 | (* For that term, since this server was also a candidate it 622 | * did already vote for itself. 
*) 623 | assert(fs.current_leader = Some 0); 624 | (* server2 is now aware that server0 is the [Leader] for 625 | * term 1 *) 626 | assert(fs.election_deadline = 627 | now +. default_configuration.election_timeout); 628 | ) 629 | | _ -> assert(false); 630 | end; 631 | 632 | assert(1 = List.length msgs); 633 | (* Response for the [Append_entries] *) 634 | 635 | begin match msgs with 636 | | ((Append_entries_response r), server_id) :: [] -> ( 637 | assert(server_id = 0); 638 | assert(r.receiver_id = 2); 639 | assert(r.receiver_term = 1); 640 | assert(r.result = Success 0); 641 | ) 642 | | _ -> assert(false) 643 | end; 644 | 645 | (* 646 | * Send Append_entries_response Server2 -> Server0 647 | *----------------------------------------------------------------------- 648 | *) 649 | 650 | let res = 651 | let msg = msg_for_server msgs 0 in 652 | Protocol.handle_message server0 msg now 653 | in 654 | 655 | let { 656 | state = server0; 657 | messages_to_send = msgs; 658 | leader_change; 659 | committed_logs; 660 | added_logs; 661 | deleted_logs; 662 | } = res in 663 | assert(None = leader_change); 664 | assert([] = committed_logs); 665 | assert([] = msgs); 666 | assert(added_logs = []); 667 | assert(deleted_logs = []); 668 | 669 | (* 670 | * Send Append_entries_request Server0 -> Server1 671 | * ----------------------------------------------------------------------- 672 | *) 673 | 674 | let res = 675 | let msg = msg_for_server hb_msgs 1 in 676 | Protocol.handle_message server1 msg now 677 | in 678 | 679 | let { 680 | state = server1; 681 | messages_to_send = msgs; 682 | leader_change; 683 | committed_logs; 684 | added_logs; 685 | deleted_logs; 686 | } = res in 687 | 688 | assert(is_follower server1); 689 | (* No change in the role, server0 is a valid [Leader], 690 | * server1 stays a [Follower]. *) 691 | 692 | assert(None = leader_change); 693 | assert([] = committed_logs); 694 | assert(added_logs = []); 695 | assert(deleted_logs = []); 696 | (* The heartbeat message from server0 did not bring new 697 | * information (ie server1 already new server0 was the 698 | * [Leader].) *) 699 | 700 | assert(1 = List.length msgs); 701 | (* Single [Append_entries_response] expected. *) 702 | 703 | begin match msgs with 704 | | (Append_entries_response r, 0)::[] -> ( 705 | assert(r.receiver_id = 1); 706 | assert(r.receiver_term = 1); 707 | assert(r.result = Success 0); 708 | ) 709 | | _ -> assert(false) 710 | end; 711 | 712 | let now = now +. 0.001 in 713 | 714 | (* 715 | * Send Append_entries_response Server1 -> Server0 716 | * 717 | * -------------------------------------------------------------------------- 718 | *) 719 | 720 | (* Note that it's important for the rest of the test that 721 | * we explicitely handle the response from server1 in server0. 722 | * Each [Leader] is keeping track of whether or not there is an 723 | * [outstanding_request] for each server. 724 | * 725 | * The [Leader] would then avoid sending new [Append_entries] requests 726 | * which already have an outstanding requests. However if a heartbeat timeout 727 | * has been reached, a new [Append_entries] request will be sent no matter 728 | * what. 
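   *
   * This behaviour is implemented by [Log_entry_util.compute_append_entries]
   * in [Raft_protocol]: an outstanding request suppresses a new
   * [Append_entries] request only while the follower's heartbeat deadline
   * has not been reached.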
*) 729 | 730 | let res = 731 | let msg = msg_for_server msgs 0 in 732 | Protocol.handle_message server0 msg now 733 | in 734 | 735 | let { 736 | state = server0; 737 | messages_to_send = msgs; 738 | leader_change; 739 | committed_logs; 740 | added_logs; 741 | deleted_logs; 742 | } = res in 743 | 744 | assert(is_leader server0); 745 | assert(None = leader_change); 746 | assert([] = committed_logs); 747 | assert(added_logs = []); 748 | assert(deleted_logs = []); 749 | assert([] = msgs); 750 | 751 | ({server0; server1; server2}, now) 752 | 753 | (* In this part of the a new log is added to server0 the leader. This log 754 | * triggers 2 Append_entries_request to server1 and server2. Only server1 755 | * successfully receives the msg, replicates the log and send back the 756 | * response to server0. server2 will not receive the msg. 757 | * 758 | * At the end server0 is the leader with a single log entry in its log; its 759 | * commit index is 1 since it was successfully replicated on server1. Server1 760 | * is still a follower with a single log entry in its log and a commit index 761 | * of 0 since it is not yet aware that this first log entry was commited. 762 | * Server2 is still a follower with an empty log since it never received 763 | * the log entry #1. *) 764 | let add_first_log {server0; server1; server2} now = 765 | 766 | (* Let's now add a log entry to the [Leader] which is expected to trigger 767 | * the corresponding [Append_entries] requests to the other servers. *) 768 | let new_log_result = 769 | let data = Bytes.of_string "Message01" in 770 | Protocol.handle_add_log_entries server0 [(data, "01")] now 771 | in 772 | 773 | let server0, data1_msgs = 774 | let open Protocol in 775 | match new_log_result with 776 | | Appended result -> 777 | let { 778 | state; 779 | messages_to_send; 780 | committed_logs; 781 | added_logs;deleted_logs;_} = result in 782 | assert([] = committed_logs); 783 | assert(deleted_logs = []); 784 | begin match added_logs with 785 | | [{id="01"; index = 1; term = 1; _}] -> () 786 | | _ -> assert(false) 787 | end; 788 | (state, messages_to_send) 789 | (* server0 is the [Leader] and is therefore expected to 790 | * handle the new log entry. *) 791 | 792 | | Delay | Forward_to_leader _ -> assert(false) 793 | in 794 | 795 | assert(1 = recent_log_length server0); 796 | (* The new log entry should have been appended to the current empty log.*) 797 | 798 | begin match recent_log_hd server0 with 799 | | {index = 1; term = 1; _ } -> () 800 | (* Log should start as 1 and be set to the current 801 | * term (ie 1.) *) 802 | 803 | | _ -> assert(false) 804 | end; 805 | 806 | assert(2 = List.length data1_msgs); 807 | (* Both [Follower]s have no outstanding request and have also less 808 | * log entries than the [Leader], therefore they 809 | * should get a new [Append_entries] request message with the new 810 | * log *) 811 | 812 | List.iter (fun (msg, _) -> 813 | match msg with 814 | | Append_entries_request r -> ( 815 | assert(r.leader_term = 1); 816 | assert(r.leader_id = 0); 817 | assert(r.prev_log_index = 0); 818 | assert(r.prev_log_term = 0); 819 | assert(1 = List.length r.log_entries); 820 | (* 821 | * Contains the log entry to be synchronized. 822 | *) 823 | begin match r.log_entries with 824 | | {index = 1; term = 1; _} :: [] -> () 825 | | _ -> assert(false) 826 | end; 827 | assert(r.leader_commit = 0); 828 | ) 829 | | _ -> assert(false) 830 | ) data1_msgs; 831 | 832 | let now = now +. 
0.001 in 833 | 834 | (* 835 | * Send Append_entries_request Server0 -> Server1 836 | * ---------------------------------------------------------------------- 837 | *) 838 | 839 | let res = 840 | let msg = msg_for_server data1_msgs 1 in 841 | Protocol.handle_message server1 msg now 842 | in 843 | 844 | let { 845 | state = server1; 846 | messages_to_send = msgs; 847 | leader_change; 848 | committed_logs; 849 | added_logs; 850 | deleted_logs; 851 | } = res in 852 | 853 | assert(is_follower server1); 854 | (* No change of role for server1, [Append_entries] only 855 | * re-inforce that server0 is the [Leader]. *) 856 | 857 | begin match added_logs with 858 | | [{id = "01"; index = 1; term = 1; _}] -> () 859 | | _ -> assert(false) 860 | end; 861 | assert(1 = recent_log_length server1); 862 | (* The [log_entry] with index 1 has been replicated 863 | * on server1. *) 864 | 865 | assert(0 = server1.commit_index); 866 | assert(None = leader_change); 867 | assert(deleted_logs = []); 868 | assert([] = committed_logs); 869 | (* While server1 has successfully replicated the log entry 870 | * with [index = 1], it cannot assume that this latter log 871 | * entry has been committed (ie that it has been replicated on 872 | * a majority of servers). 873 | * 874 | * Therefore the [commit_index] is still 0. 875 | * 876 | * It will be updated upon receiving the next [Append_entries_request] *) 877 | 878 | begin match msgs with 879 | | (Append_entries_response r, 0) :: [] -> ( 880 | assert(r.receiver_id = 1); 881 | assert(r.receiver_term = 1); 882 | assert(r.result = Success 1); 883 | (* server1 has correctly replicated the log entry which has index1. *) 884 | ) 885 | | _ -> assert(false) 886 | end; 887 | 888 | (* 889 | * Send Append_entries_response Server1 -> Server0 890 | * -------------------------------------------------------------------- 891 | *) 892 | 893 | let res = 894 | let msg = msg_for_server msgs 0 in 895 | Protocol.handle_message server0 msg now 896 | in 897 | 898 | let { 899 | state = server0; 900 | messages_to_send = _; 901 | leader_change; 902 | committed_logs; 903 | added_logs; 904 | deleted_logs; 905 | } = res in 906 | 907 | assert(is_leader server0); 908 | assert(1 = server0.commit_index); 909 | assert(None = leader_change); 910 | assert(deleted_logs = []); 911 | assert(added_logs = []); 912 | (* The log with index1 was already added so added_log is empty *) 913 | begin match committed_logs with 914 | | {id = "01"; _ }::[] -> () 915 | (* server1 has replicated the log successfully so this means 916 | * that a majority of servers have done the replication. 917 | * The [Leader] commit_index can now be updated to that latest 918 | * log index (ie 1). *) 919 | | _ -> assert(false) 920 | end; 921 | 922 | ({server0; server1; server2}, now) 923 | 924 | (* In this part of the test a second log entry is added to server0 the leader. 925 | * This log entry is successfully replicated on server1 only. Server2 is 926 | * not receiving the Append_entries_request message. Server1 commit index 927 | * is now [1] since the leader commit index of 1 was sent in the request. 928 | * 929 | * At the end server0 is the leader with 2 log entries and a commit index 930 | * of 2, server1 is a follower with 2 log entries and a commit index of 1, 931 | * server2 is a follower with an empty log. *) 932 | let add_second_log {server0; server1; server2} now = 933 | let now = now +. 
0.001 in 934 | 935 | let new_log_result = 936 | let data = Bytes.of_string "Message02" in 937 | Protocol.handle_add_log_entries server0 [(data,"02")] now 938 | in 939 | 940 | let server0, data2_msg = 941 | let open Protocol in 942 | match new_log_result with 943 | | Delay | Forward_to_leader _ -> assert(false) 944 | | Appended {state; messages_to_send; added_logs; _ } -> begin 945 | begin match added_logs with 946 | | [{id = "02"; index = 2; term = 1; _}] -> () 947 | | _ -> assert(false) 948 | end; 949 | (state, messages_to_send) 950 | end 951 | in 952 | 953 | assert(is_leader server0); 954 | 955 | assert(2 = recent_log_length server0); 956 | (* The second log entry should have been appended to the 957 | * server log. *) 958 | 959 | begin match recent_log_hd server0 with 960 | | {index = 2; term = 1; _ } -> () 961 | (* Make sure the index is incremented by 1. *) 962 | | _ -> assert(false) 963 | end; 964 | 965 | assert(1 = server0.commit_index); 966 | (* The new log entry (with index [2]) should not be committed 967 | * since no request/response interaction has yet been done. *) 968 | 969 | assert(1 = List.length data2_msg); 970 | (* Since server2 has an outstanding request, it should not 971 | * be sent an additional request. Only server1 should receive 972 | * an [Append_entries] request. *) 973 | 974 | begin match List.hd data2_msg with 975 | | (Append_entries_request r, 1) -> ( 976 | 977 | assert(r.leader_term = 1); 978 | assert(r.leader_id = 0); 979 | assert(r.prev_log_index = 1); 980 | (* Server0 [Leader] knows that the server1 has successfully 981 | * replicated the log entry with [index = 1] and therefore 982 | * will send [prev_log_index] with value 1. *) 983 | 984 | assert(r.prev_log_term = 1); 985 | assert(1 = List.length r.log_entries); 986 | (* Only the last [log_entry] should be sent to that follower, 987 | * since the first [log_entry] was already replicated. 988 | *) 989 | assert(r.leader_commit = 1); 990 | ) 991 | | _ -> assert(false) 992 | end; 993 | 994 | let now = now +. 0.001 in 995 | 996 | (* 997 | * Send Append_entries_request Server0 -> Server1 998 | * ------------------------------------------------------------------- 999 | *) 1000 | 1001 | let res = 1002 | let msg = msg_for_server data2_msg 1 in 1003 | Protocol.handle_message server1 msg now 1004 | in 1005 | 1006 | let { 1007 | state = server1; 1008 | messages_to_send = msgs; 1009 | leader_change; 1010 | committed_logs; 1011 | added_logs; 1012 | deleted_logs; 1013 | } = res in 1014 | 1015 | assert(is_follower server1); 1016 | 1017 | assert(2 = recent_log_length server1); 1018 | (* server1 should have correctly replicated the log 1019 | * with [index = 2]. *) 1020 | 1021 | assert(None = leader_change); 1022 | 1023 | assert(1 = server1.commit_index); 1024 | begin match committed_logs with 1025 | | {id = "01"; _ }::[] -> () 1026 | | _ -> assert(false) 1027 | end; 1028 | (* The [Append_entries] request contained the [commit_index] 1029 | * of the [Leader] (1 in this case) and therefore server1 has 1030 | * updated its own. *) 1031 | begin match added_logs with 1032 | | {id = "02"; index = 2; term = 1; _}::[] -> () 1033 | | _ -> assert(false) 1034 | end; 1035 | (* The new log number 2 is now added in this follower .. but not commited 1036 | * as verified previously. *) 1037 | 1038 | assert(deleted_logs = []); 1039 | assert(1 = List.length msgs); 1040 | (* Only a single response to server0 should be 1041 | * sent back. 
*)
1042 | 
1043 | begin match List.hd msgs with 
1044 | | (Append_entries_response r, 0) -> ( 
1045 | assert(r.receiver_id = 1); 
1046 | assert(r.receiver_term = 1); 
1047 | assert(r.result = Success 2); 
1048 | (* server1 notifies the [Leader] about the last log it has 
1049 | * replicated. 
1050 | *) 
1051 | ) 
1052 | | _ -> assert(false) 
1053 | end; 
1054 | 
1055 | (* 
1056 | * Send Append_entries_response Server1 -> Server0 
1057 | * ---------------------------------------------------------------------- 
1058 | *) 
1059 | 
1060 | let now = now +. 0.001 in 
1061 | 
1062 | let res = 
1063 | let msg = msg_for_server msgs 0 in 
1064 | Protocol.handle_message server0 msg now 
1065 | in 
1066 | 
1067 | let { 
1068 | state = server0; 
1069 | messages_to_send = _; 
1070 | leader_change; 
1071 | committed_logs; 
1072 | added_logs; 
1073 | deleted_logs; 
1074 | } = res in 
1075 | 
1076 | assert(is_leader server0); 
1077 | 
1078 | assert(None = leader_change); 
1079 | 
1080 | assert(2 = server0.commit_index); 
1081 | begin match committed_logs with 
1082 | | {id = "02"; _ }::[] -> () 
1083 | | _ -> assert(false) 
1084 | end; 
1085 | (* A single successful replication is enough for a majority. *) 
1086 | assert(added_logs = []); 
1087 | assert(deleted_logs = []); 
1088 | ({server0; server1; server2}, now) 
1089 | 
1090 | (* In this part of the test, the leader reaches its heartbeat timeout 
1091 | * and sends an Append_entries_request to each of its followers. Since 
1092 | * server1 has replicated all the logs, its message contains no log 
1093 | * entries but its commit_index will be set to 2. Server2's msg will 
1094 | * contain the last 2 log entries since it has not replicated them. 
1095 | * 
1096 | * This time server2 receives the messages, replicates the 2 logs and updates 
1097 | * its commit index to [2]. 
1098 | * 
1099 | * At the end of the test, server0 is the leader with 2 log entries and a 
1100 | * commit index of [2]. Server1 and server2 are followers with 2 log entries 
1101 | * and a commit index of [2]. *) 
1102 | let leader_heartbeat_2 {server0; server1; server2} now = 
1103 | 
1104 | let now = now +. default_configuration.hearbeat_timeout in 
1105 | 
1106 | let {state = server0; messages_to_send = msgs; added_logs; _ } = 
1107 | Protocol.handle_heartbeat_timeout server0 now 
1108 | in 
1109 | 
1110 | assert(added_logs = []); 
1111 | assert(2 = List.length msgs); 
1112 | 
1113 | begin match msg_for_server msgs 1 with 
1114 | | Append_entries_request r -> ( 
1115 | assert(r.leader_term = 1); 
1116 | assert(r.leader_id = 0); 
1117 | assert(r.prev_log_term = 1); 
1118 | assert(r.log_entries = []); 
1119 | (* As expected no new log entry should be sent 
1120 | * since they have all been sent previously and [server1] 
1121 | * replied successfully. *) 
1122 | 
1123 | assert(r.prev_log_index = 2); 
1124 | assert(r.leader_commit = 2); 
1125 | ) 
1126 | | _ -> assert(false) 
1127 | end; 
1128 | 
1129 | begin match msg_for_server msgs 2 with 
1130 | | Append_entries_request r -> ( 
1131 | assert(r.leader_term = 1); 
1132 | assert(r.leader_id = 0); 
1133 | assert(r.prev_log_term = 0); 
1134 | 
1135 | assert(List.length r.log_entries = 2); 
1136 | assert(r.prev_log_index = 0); 
1137 | (* This reflects the knowledge of server0 (leader) which has never 
1138 | * received an Append_entries_response from server2 indicating 
1139 | * that logs were replicated. *) 
1140 | assert(r.leader_commit = 2); 
1141 | ) 
1142 | | _ -> assert(false) 
1143 | end; 
1144 | 
1145 | let now = now +. 
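0.0 in

(* Illustrative sketch only (made-up names, not the library's API): why the
 * two heartbeat requests above differ. The leader keeps, per follower, the
 * last index it believes that follower has replicated and only sends the
 * entries strictly after that point (here a list of (index, data) pairs
 * stands in for the real log). *)
let _entries_to_send ~leader_log ~follower_last_index =
  List.filter (fun (index, _data) -> index > follower_last_index) leader_log
in
let _leader_log = [(1, "Message01"); (2, "Message02")] in
assert([] = _entries_to_send ~leader_log:_leader_log ~follower_last_index:2);
assert(2 = List.length
  (_entries_to_send ~leader_log:_leader_log ~follower_last_index:0));

let now = now +. 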
0.001 in 1146 | 1147 | (* 1148 | * Send Append_entries_request Server0 -> Server1 1149 | * --------------------------------------------------------------------- 1150 | *) 1151 | let res = 1152 | let msg = msg_for_server msgs 1 in 1153 | Protocol.handle_message server1 msg now 1154 | in 1155 | 1156 | let { 1157 | state = server1; 1158 | messages_to_send = server1_response; 1159 | leader_change; 1160 | committed_logs; 1161 | added_logs; 1162 | deleted_logs; 1163 | } = res in 1164 | 1165 | assert(is_follower server1); 1166 | assert(None = leader_change); 1167 | 1168 | begin match committed_logs with 1169 | | {id = "02"; _ }::[] -> () 1170 | | _ -> assert(false) 1171 | end; 1172 | assert(2 = server1.commit_index); 1173 | (* server1 is updating its commit index based on latest 1174 | * [leader_commit] value of 2 in the request it received. *) 1175 | 1176 | assert(added_logs = []); 1177 | (* All logs have already been replicated in server 1. *) 1178 | 1179 | assert(deleted_logs = []); 1180 | 1181 | assert(1 = List.length server1_response); 1182 | (* Only a single response is expected. *) 1183 | 1184 | (* 1185 | * Send Append_entries_request Server0 -> Server2 1186 | * --------------------------------------------------------------------- 1187 | *) 1188 | 1189 | let res = 1190 | let msg = msg_for_server msgs 2 in 1191 | Protocol.handle_message server2 msg now 1192 | in 1193 | 1194 | let { 1195 | state = server2; 1196 | messages_to_send = server2_response; 1197 | leader_change; 1198 | committed_logs; 1199 | added_logs; 1200 | deleted_logs; 1201 | } = res in 1202 | 1203 | assert(is_follower server2); 1204 | begin match committed_logs with 1205 | | {id = "01"; _ }::{id = "02"; _}::[] -> () 1206 | | _ -> assert(false) 1207 | end; 1208 | assert(added_logs = committed_logs); 1209 | (* The 2 previous logs were never replicated to the server 2 before 1210 | * this heartbeat message *) 1211 | assert(deleted_logs = []); 1212 | assert(None = leader_change); 1213 | assert(2 = server2.commit_index); 1214 | (* server2 is updating its commit index based on latest 1215 | * [leader_commit] value of 2 in the request it received. *) 1216 | 1217 | assert(2 = recent_log_length server2); 1218 | (* server2 should have caught up with server0 and replicated 1219 | * all the logs in the cache (ie 1) *) 1220 | 1221 | assert(1 = List.length server2_response); 1222 | (* Only a single response is expected. *) 1223 | 1224 | begin match server2_response with 1225 | | (Append_entries_response r, 0)::[] -> ( 1226 | assert(r.receiver_id = 2); 1227 | assert(r.receiver_term = 1); 1228 | assert(r.result = Success 2); 1229 | (* server2 has successfully replicated the 2 log entries *) 1230 | ) 1231 | | _ -> assert(false) 1232 | end; 1233 | 1234 | (* 1235 | * Send Append_entries_response Server2 -> Server0 1236 | * ---------------------------------------------------------------------- 1237 | *) 1238 | 1239 | let res = 1240 | let res = 1241 | let msg = msg_for_server server1_response 0 in 1242 | Protocol.handle_message server0 msg now 1243 | in 1244 | 1245 | let { 1246 | state = server0; 1247 | messages_to_send; 1248 | leader_change; 1249 | committed_logs; 1250 | added_logs; 1251 | deleted_logs; 1252 | } = res in 1253 | 1254 | assert([] = messages_to_send); 1255 | assert(None = leader_change); 1256 | assert([] = committed_logs); 1257 | assert(added_logs = []); 1258 | assert(deleted_logs = []); 1259 | (* Server1 has replicated the 2 logs it has nothing 1260 | * left. 
*) 1261 | 1262 | let msg = msg_for_server server2_response 0 in 1263 | Protocol.handle_message server0 msg now 1264 | in 1265 | 1266 | let { 1267 | state = server0; 1268 | messages_to_send; 1269 | leader_change; 1270 | committed_logs; 1271 | added_logs; 1272 | deleted_logs; 1273 | } = res in 1274 | 1275 | assert(is_leader server0); 1276 | assert(2 = server0.commit_index); 1277 | assert(2 = recent_log_length server0); 1278 | assert(None = leader_change); 1279 | assert([] = committed_logs); 1280 | assert(added_logs = []); 1281 | assert(deleted_logs = []); 1282 | 1283 | (* Both servers have replicated the 2 logs, no outstanding 1284 | * logs to be sent. *) 1285 | assert(0 = List.length messages_to_send); 1286 | 1287 | ({server0; server1; server2}, now) 1288 | 1289 | (* In this part of the test, a 3rd log is added to the leader which 1290 | * successfully replicates it on server1 but the Append_entries_response 1291 | * is not sent back to server0. Therefore the log is not yet committed. 1292 | * Server 2 does not replicate the 3rd log however. 1293 | * 1294 | * At the end server0 is the leader with 3 logs and commit_index set to [2], 1295 | * server1 has 3 logs with commit_index set to [2] and server2 has 2 logs 1296 | * and commit_index set to [2]. *) 1297 | let add_third_log {server0; server1; server2} now = 1298 | 1299 | (* Let's now add a 3rd [log_entry] to the [Leader]. 1300 | * 1301 | * We'll replicate this 3rd entry on [server1] only and 1302 | * then simulate a [Leader] crash. 1303 | * 1304 | * The consequence of a [Leader] crash will be that one of the 1305 | * follower will start a new election. 1306 | * However only [server1] should become a [Leader] since it has replicated 1307 | * more [log_entry]s than [server2]. 1308 | * 1309 | * We will simulate and test the above assumption. *) 1310 | 1311 | let now = now +. 0.002 in 1312 | 1313 | let new_log_result = 1314 | let data = Bytes.of_string "Message03" in 1315 | Protocol.handle_add_log_entries server0 [(data, "03") ] now 1316 | in 1317 | 1318 | let server0, msgs = 1319 | match new_log_result with 1320 | | Protocol.Appended {state; messages_to_send; added_logs; _ } -> 1321 | begin match added_logs with 1322 | | {id = "03"; index = 3; term = 1; _ } :: [] -> () 1323 | | _ -> assert(false) 1324 | end; 1325 | (state, messages_to_send) 1326 | | _ -> assert(false) 1327 | in 1328 | assert(is_leader server0); 1329 | assert(3 = recent_log_length server0); 1330 | (* Correctly added log since server0 is a [Leader]. *) 1331 | 1332 | assert(2 = List.length msgs); 1333 | (* 2 [Append_entries] requests, one for each of the other 1334 | * 2 servers. *) 1335 | 1336 | (* 1337 | * Send Append_entries_request Server0 -> Server1 1338 | * --------------------------------------------------------------------- 1339 | *) 1340 | 1341 | let res = 1342 | let msg = msg_for_server msgs 1 in 1343 | Protocol.handle_message server1 msg now 1344 | in 1345 | 1346 | let { 1347 | state = server1; 1348 | messages_to_send = _; 1349 | leader_change; 1350 | committed_logs; 1351 | added_logs; 1352 | deleted_logs; 1353 | } = res in 1354 | 1355 | assert(is_follower server1); 1356 | assert(3 = recent_log_length server1); 1357 | (* * The 3rd [log_entry] is correctly replicated. *) 1358 | 1359 | assert(None = leader_change); 1360 | assert([] = committed_logs); 1361 | assert(2 = server1.commit_index); 1362 | (* No change since the [commit_index] was still 1363 | * 2 in the [Append_entries] request. 
*) 1364 | begin match added_logs with 1365 | | {id = "03"; index = 3; term =1 ; _ } :: [] -> () 1366 | | _ -> assert(false) 1367 | end; 1368 | 1369 | assert(deleted_logs = []); 1370 | 1371 | (* The response is not send back to server0 *) 1372 | ({server0; server1; server2}, now) 1373 | 1374 | (* In this part of the test, server2 starts a new election, however since 1375 | * it has only 2 log entries and server1 has 3 log entries, server1 will 1376 | * not grant its vote. 1377 | * 1378 | * At the end of the test server2 is a candidate for term [2] with only 1379 | * its own vote granted. Server1 is still a follower with no leader and 1380 | * no vote granted in term [2]. Server0 is still a leader for term [1]. *) 1381 | let failed_election_2 {server0; server1; server2} now = 1382 | 1383 | (* 1384 | * Server2 starts a new election 1385 | * -------------------------------------------------------------------------- 1386 | *) 1387 | 1388 | let now = now +. default_configuration.election_timeout in 1389 | 1390 | let { 1391 | state = server2; 1392 | messages_to_send = msgs; 1393 | leader_change; 1394 | committed_logs; 1395 | added_logs; 1396 | deleted_logs} = Protocol.handle_new_election_timeout server2 now 1397 | in 1398 | 1399 | assert(deleted_logs = []); 1400 | assert(is_candidate server2); 1401 | (* Server2 started a new election. *) 1402 | 1403 | assert(Some No_leader = leader_change); 1404 | (* Becoming a candidate and incrementing the terms 1405 | * means that there are no longer a [Leader]. *) 1406 | assert([] = committed_logs); 1407 | assert(added_logs = []); 1408 | 1409 | assert(2 = server2.current_term); 1410 | (* Each new election increments the term *) 1411 | 1412 | assert(2 = List.length msgs); 1413 | (* 2 [Request_vote] request for each of the other 2 1414 | * servers. *) 1415 | 1416 | List.iter (fun (r, _) -> 1417 | match r with 1418 | | Request_vote_request r -> ( 1419 | assert(r.candidate_term = 2); 1420 | assert(r.candidate_id = 2); 1421 | assert(r.candidate_last_log_index = 2); 1422 | assert(r.candidate_last_log_term = 1); 1423 | ) 1424 | | _ -> assert(false) 1425 | ) msgs; 1426 | 1427 | 1428 | (* 1429 | * Send Request_vote_request Server2 -> Server1 1430 | * ---------------------------------------------------------------------- 1431 | *) 1432 | 1433 | let now = now +. 0.001 in 1434 | 1435 | let res = 1436 | let msg = msg_for_server msgs 1 in 1437 | Protocol.handle_message server1 msg now 1438 | in 1439 | 1440 | let { 1441 | state = server1; 1442 | messages_to_send = msgs; 1443 | leader_change; 1444 | committed_logs; 1445 | added_logs; 1446 | deleted_logs; 1447 | } = res in 1448 | 1449 | assert(is_follower server1); 1450 | assert(2 = server1.current_term); 1451 | (* The sender (ie server2) term was greater than the current 1452 | * term of server1, it should then 1453 | * increments its current term. *) 1454 | 1455 | assert(Some No_leader = leader_change); 1456 | (* The change of term means that there are no current [Leader] 1457 | * for it yet. *) 1458 | assert([] = committed_logs); 1459 | assert(added_logs = []); 1460 | assert(deleted_logs = []); 1461 | 1462 | assert(1 = List.length msgs); 1463 | (* [Request_vote] response to server2. *) 1464 | 1465 | begin match msgs with 1466 | | (Request_vote_response r, 2)::[] -> ( 1467 | assert(r.voter_id = 1); 1468 | assert(r.voter_term = 2); 1469 | assert(r.vote_granted = false); 1470 | (* [server1] has more [log_entry] than [server2] and therefore 1471 | * rejects [server2] candidacy. 
1472 | * 
1473 | * This is to ensure the safety property of the RAFT protocol 
1474 | * so that no committed entries are later invalidated. *) 
1475 | ) 
1476 | | _ -> assert(false) 
1477 | end; 
1478 | ({server0; server1; server2}, now) 
1479 | 
1480 | (* In this part of the test, server1 takes its turn to start a new election 
1481 | * in term [3]. This time since it has more logs than server2, server2 grants its 
1482 | * vote and server1 becomes a leader. 
1483 | * 
1484 | * Upon becoming a leader server1 sends an Append_entries_request to both 
1485 | * server0 and server2, assuming their previous log index matches its own (ie [3]). 
1486 | * Server0 is disconnected and the msg is not sent to it. Server2 receives 
1487 | * the message, however it has not replicated log [3] and therefore sends 
1488 | * back a log failure indicating its previous log index which is [2]. 
1489 | * Server1 will then update its information about server2 and send back 
1490 | * a corrected Append_entries_request, this time with the missing third 
1491 | * entry. Upon receiving the Append_entries_response server1 will mark 
1492 | * the 3rd log entry as committed. 
1493 | * 
1494 | * At the end server0 is still in a leader role but for term 1 and it has 
1495 | * 3 log entries with a commit index of 2. Both server1 and server2 are in 
1496 | * term 3 with server1 the leader and server2 a follower. They both have 3 
1497 | * log entries; server1's commit index is 3 while server2's is 2. *) 
1498 | let election_2 {server0; server1; server2} now = 
1499 | 
1500 | (* 
1501 | * Let's now have server1 start a new election. 
1502 | * ---------------------------------------------------------------------- 
1503 | *) 
1504 | 
1505 | let now = now +. default_configuration.election_timeout in 
1506 | 
1507 | let { 
1508 | state = server1; 
1509 | messages_to_send = msgs; 
1510 | leader_change; 
1511 | committed_logs; 
1512 | added_logs; 
1513 | deleted_logs} = Protocol.handle_new_election_timeout server1 now 
1514 | in 
1515 | 
1516 | assert(None = leader_change); 
1517 | assert([] = committed_logs); 
1518 | assert(added_logs = []); 
1519 | assert(deleted_logs = []); 
1520 | 
1521 | assert(is_candidate server1); 
1522 | assert(3 = server1.current_term); 
1523 | assert(2 = List.length msgs); 
1524 | 
1525 | List.iter (fun (r, _) -> 
1526 | begin match r with 
1527 | | Request_vote_request r -> ( 
1528 | assert(r.candidate_term = 3); 
1529 | assert(r.candidate_id = 1); 
1530 | assert(r.candidate_last_log_index = 3); 
1531 | assert(r.candidate_last_log_term = 1); 
1532 | ) 
1533 | | _ -> assert(false) 
1534 | end 
1535 | ) msgs; 
1536 | 
1537 | (* 
1538 | * Send Request_vote_request server1 -> server2 
1539 | * ----------------------------------------------------------------------- 
1540 | *) 
1541 | 
1542 | let res = 
1543 | let msg = msg_for_server msgs 2 in 
1544 | Protocol.handle_message server2 msg now 
1545 | in 
1546 | 
1547 | let { 
1548 | state = server2; 
1549 | messages_to_send = msgs; 
1550 | leader_change; 
1551 | committed_logs; 
1552 | added_logs; 
1553 | deleted_logs; 
1554 | } = res in 
1555 | 
1556 | assert(is_follower server2); 
1557 | (* server1 [current_term] is greater than the one 
1558 | * in server2 (3 versus 2). 
1559 | * 
1560 | * Therefore server2 becomes a [Follower] and updates 
1561 | * its current_term. 
*) 1562 | 1563 | assert(3 = server2.current_term); 1564 | 1565 | assert(None = leader_change); 1566 | assert(deleted_logs = []); 1567 | assert([] = committed_logs); 1568 | assert(added_logs = []); 1569 | (* Server2 already knew there was no [Leader] since it was a candidate 1570 | * in the previous term and never got elected. *) 1571 | 1572 | assert(1 = List.length msgs); 1573 | 1574 | begin match msgs with 1575 | | (Request_vote_response r, 1)::[] -> ( 1576 | assert(r.voter_id = 2); 1577 | assert(r.voter_term = 3); 1578 | assert(r.vote_granted = true); 1579 | (* Vote is indeed granted since server1 has a greater 1580 | * [last_log_index]. *) 1581 | ) 1582 | | _ -> assert(false) 1583 | end; 1584 | 1585 | let now = now +. 0.001 in 1586 | 1587 | (* 1588 | * Send Request_vote_response Server2 -> Server1 1589 | * --------------------------------------------------------------------- 1590 | *) 1591 | 1592 | let res = 1593 | let msg = msg_for_server msgs 1 in 1594 | Protocol.handle_message server1 msg now 1595 | in 1596 | 1597 | let { 1598 | state = server1; 1599 | messages_to_send = msgs; 1600 | leader_change; 1601 | committed_logs; 1602 | added_logs; 1603 | deleted_logs; 1604 | } = res in 1605 | 1606 | assert(Some (New_leader 1) = leader_change); 1607 | assert([] = committed_logs); 1608 | assert(added_logs = []); 1609 | assert(deleted_logs = []); 1610 | assert(is_leader server1); 1611 | (* One vote is enough to become a [Leader]. *) 1612 | 1613 | assert(3 = server1.current_term); 1614 | (* [current_term] should be the same after becoming 1615 | * a [Leader]. *) 1616 | 1617 | assert(2 = List.length msgs); 1618 | (* Imediately after becoming a [Leader], the server 1619 | * will send [Append_entries] to establish its 1620 | * leadership. *) 1621 | 1622 | List.iter (fun (r, _) -> 1623 | match r with 1624 | | Append_entries_request r -> ( 1625 | assert(r.leader_term = 3); 1626 | assert(r.leader_id = 1); 1627 | assert(r.prev_log_index = 3); 1628 | assert(r.prev_log_term = 1); 1629 | assert(r.log_entries = []); 1630 | (* Initially the [Leader] believes that all other servers 1631 | * have replicated the same [log_entry]s as itself. *) 1632 | assert(r.leader_commit = 2); 1633 | ) 1634 | | _ -> assert(false) 1635 | ) msgs; 1636 | 1637 | (* Send Append_entries_request Server1 -> Server2 1638 | * ----------------------------------------------------------------------- 1639 | *) 1640 | 1641 | let now = now +. 0.001 in 1642 | 1643 | let res = 1644 | let msg = msg_for_server msgs 2 in 1645 | Protocol.handle_message server2 msg now 1646 | in 1647 | 1648 | let { 1649 | state = server2; 1650 | messages_to_send = msgs; 1651 | leader_change; 1652 | committed_logs; 1653 | added_logs; 1654 | deleted_logs; 1655 | } = res in 1656 | 1657 | assert(deleted_logs = []); 1658 | assert(Some (New_leader 1) = leader_change); 1659 | assert([] = committed_logs); 1660 | assert(is_follower server2); 1661 | assert(3 = server2.current_term); 1662 | assert(added_logs = []); 1663 | (* See explanation below as to why the 3rd log entry was not replicated *) 1664 | 1665 | assert(1 = List.length msgs); 1666 | (* Single response to server1. *) 1667 | 1668 | begin match msgs with 1669 | | (Append_entries_response r, 1)::[] -> ( 1670 | assert(r.receiver_id = 2); 1671 | assert(r.receiver_term = 3); 1672 | assert(r.result = Log_failure 2); 1673 | (* server2 did not replicate the 3rd [log_entry] that server1 1674 | * did during [term = 1]. 1675 | * 1676 | * Therefore the previous [Append_entries] request is rejected. 
*) 1677 | ) 1678 | | _ -> assert(false) 1679 | end; 1680 | 1681 | (* 1682 | * Send Append_entries_response Server2 -> Server1 1683 | * ---------------------------------------------------------------------- 1684 | *) 1685 | 1686 | let now = now +. 0.001 in 1687 | 1688 | let res = 1689 | let msg = msg_for_server msgs 1 in 1690 | Protocol.handle_message server1 msg now 1691 | in 1692 | 1693 | let { 1694 | state = server1; 1695 | messages_to_send = msgs; 1696 | leader_change; 1697 | committed_logs; 1698 | added_logs; 1699 | deleted_logs; 1700 | } = res in 1701 | 1702 | assert(deleted_logs = []); 1703 | assert(is_leader server1); 1704 | assert(3 = server1.current_term); 1705 | 1706 | assert(None = leader_change); 1707 | assert([] = committed_logs); 1708 | assert(added_logs = []); 1709 | 1710 | assert(1 = List.length msgs); 1711 | assert(2 = server1.commit_index); 1712 | (* A new request for server2 has been computed which 1713 | * should now contain the 3rd [log_entry] *) 1714 | 1715 | begin match msgs with 1716 | | (Append_entries_request r, 2) :: [] -> ( 1717 | assert(r.leader_term = 3); 1718 | assert(r.leader_id = 1); 1719 | assert(r.prev_log_index = 2); 1720 | assert(r.prev_log_term = 1); 1721 | assert(1 = List.length r.log_entries); 1722 | (* The missing 3rd log entry is now part of the 1723 | * request for server2 to catch up 1724 | *) 1725 | assert(r.leader_commit = 2); 1726 | ) 1727 | | _ -> assert(false) 1728 | end; 1729 | 1730 | (* 1731 | * Send Append_entries_request Server1 -> Server2 1732 | * ---------------------------------------------------------------------- 1733 | *) 1734 | 1735 | let now = now +. 0.001 in 1736 | 1737 | let res = 1738 | let msg = msg_for_server msgs 2 in 1739 | Protocol.handle_message server2 msg now 1740 | in 1741 | 1742 | let { 1743 | state = server2; 1744 | messages_to_send = msgs; 1745 | leader_change; 1746 | committed_logs; 1747 | added_logs; 1748 | deleted_logs; 1749 | } = res in 1750 | 1751 | assert(is_follower server2); 1752 | assert(3 = server2.current_term); 1753 | 1754 | assert(3 = recent_log_length server2); 1755 | (* server2 has correctly replicated the 3rd [log_entry]. *) 1756 | 1757 | assert(deleted_logs = []); 1758 | assert(None = leader_change); 1759 | assert([] = committed_logs); 1760 | assert(2 = server2.commit_index); 1761 | (* The 3rd log while succesfully replicated is not yet 1762 | * committed. 1763 | *) 1764 | begin match added_logs with 1765 | | {id = "03"; index = 3; term = 1; _ }::[] -> () 1766 | | _ -> assert(false); 1767 | end; 1768 | 1769 | assert(1 = List.length msgs); 1770 | (* Single response for server1. *) 1771 | 1772 | begin match msgs with 1773 | | (Append_entries_response r, 1) :: [] -> ( 1774 | assert(r.receiver_id = 2); 1775 | assert(r.receiver_term = 3); 1776 | assert(r.result = Success 3); 1777 | (* Confirmation that the replication of the log has 1778 | * been successful. *) 1779 | ) 1780 | | _ -> assert(false) 1781 | end; 1782 | 1783 | (* Send Append_entries_response Server2 -> Server1 1784 | * ---------------------------------------------------------------------- 1785 | *) 1786 | 1787 | let now = now +. 
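0.0 in

(* Illustrative sketch only (hypothetical helper, not the library's code):
 * the consistency check behind the [Log_failure]/[Success] exchange above.
 * A follower only accepts an Append_entries request if it holds an entry
 * at [prev_log_index] with term [prev_log_term] (index 0 stands for the
 * empty-log base case). Here the log is modeled as (index, term) pairs. *)
let _log_matches ~follower_log ~prev_log_index ~prev_log_term =
  prev_log_index = 0 ||
  List.exists
    (fun (index, term) -> index = prev_log_index && term = prev_log_term)
    follower_log
in
(* server2 only has entries 1 and 2 from term 1: a request anchored at
 * index 3 is rejected, the corrected one anchored at index 2 is accepted. *)
assert(not (_log_matches ~follower_log:[(1, 1); (2, 1)]
              ~prev_log_index:3 ~prev_log_term:1));
assert(_log_matches ~follower_log:[(1, 1); (2, 1)]
         ~prev_log_index:2 ~prev_log_term:1);

let now = now +. 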
0.001 in 1788 | 1789 | let res = 1790 | let msg = msg_for_server msgs 1 in 1791 | Protocol.handle_message server1 msg now 1792 | in 1793 | 1794 | let { 1795 | state = server1; 1796 | messages_to_send = msgs; 1797 | leader_change; 1798 | committed_logs; 1799 | added_logs; 1800 | deleted_logs; 1801 | } = res in 1802 | 1803 | assert(is_leader server1); 1804 | assert(3 = server1.current_term); 1805 | 1806 | assert(deleted_logs = []); 1807 | begin match committed_logs with 1808 | | {id = "03"; _ }::[] -> () 1809 | | _ -> assert(false) 1810 | end; 1811 | assert(None = leader_change); 1812 | assert(3 = server1.commit_index); 1813 | (* 1814 | * The 3rd [log_entry] has been replicated one one other 1815 | * server than the [Leader]; this makes a majority and therefore 1816 | * indicates that [commit_index] can be set to 3. 1817 | *) 1818 | assert(added_logs = []); 1819 | assert([] = msgs); 1820 | ({server0; server1; server2}, now) 1821 | 1822 | (* In this part of the test we simply add 2 logs at the same time to server1 1823 | * the latest leader. Server1 successfully adds, replicate and commits 1824 | * both logs. 1825 | * Server2 replicates the logs but its commit index is not yet updated. 1826 | * Server0 is still disconnected and the leader of term [1]. *) 1827 | let add_4_and_5_logs {server0; server1; server2} now = 1828 | 1829 | let new_log_result = 1830 | let datas = [ 1831 | (Bytes.of_string "Message04", "04"); 1832 | (Bytes.of_string "Message05", "05"); 1833 | ] in 1834 | Protocol.handle_add_log_entries server1 datas now 1835 | in 1836 | 1837 | let server1, data45_msgs = 1838 | let open Protocol in 1839 | match new_log_result with 1840 | | Appended {state; messages_to_send; added_logs; _ } -> 1841 | begin 1842 | assert(2 = List.length added_logs); 1843 | (state, messages_to_send) 1844 | end 1845 | | Delay | Forward_to_leader _ -> assert(false) 1846 | in 1847 | 1848 | assert(5 = recent_log_length server1); 1849 | assert(5 = last_log_index server1.log); 1850 | 1851 | assert(3 = server1.commit_index); 1852 | (* The 2 logs have not been committed. *) 1853 | 1854 | assert(1 = List.length data45_msgs); 1855 | (* Only one message needs to be sent since server0 has still an outstanding 1856 | * request. (ie server1 the leader never received any `append entries 1857 | * response` from that server *) 1858 | 1859 | List.iter (fun (msg, _) -> 1860 | 1861 | match msg with 1862 | | Append_entries_request r -> ( 1863 | let { 1864 | leader_term; 1865 | leader_id; 1866 | prev_log_index; 1867 | prev_log_term; 1868 | log_entries; 1869 | leader_commit; 1870 | } = r in 1871 | assert(3 = leader_term); 1872 | assert(1 = leader_id); 1873 | assert(prev_log_index = 3); 1874 | assert(prev_log_term = 1); 1875 | assert(2 = List.length log_entries); 1876 | assert(3 = leader_commit); 1877 | ) 1878 | | _ -> assert(false); 1879 | ) data45_msgs; 1880 | 1881 | (* 1882 | * Send Append_entries_request Server1 -> Server2 1883 | * ------------------------------------------------------------------ 1884 | *) 1885 | 1886 | let now = now +. 
0.001 in 1887 | 1888 | let res = 1889 | let msg = msg_for_server data45_msgs 2 in 1890 | Protocol.handle_message server2 msg now 1891 | in 1892 | 1893 | let { 1894 | state = server2; 1895 | messages_to_send = msgs; 1896 | leader_change; 1897 | committed_logs; 1898 | added_logs; 1899 | deleted_logs; 1900 | } = res in 1901 | 1902 | assert(deleted_logs = []); 1903 | assert(is_follower server2); 1904 | assert(3 = server2.current_term); 1905 | 1906 | assert(2 = List.length added_logs); 1907 | assert(5 = recent_log_length server2); 1908 | (* The last 2 logs where succesfully replicated 1909 | *) 1910 | begin match recent_log_hd server2 with 1911 | | {data; _ } -> 1912 | assert((Bytes.of_string "Message05") = data); 1913 | (* Let's make sure the order was properly replicated. 1914 | *) 1915 | end; 1916 | 1917 | assert(3 = server2.commit_index); 1918 | (* Replicated from server1, previous one was 2 so we 1919 | * can expect a notification. *) 1920 | 1921 | assert(None = leader_change); 1922 | begin match committed_logs with 1923 | | {id = "03"; _ }::[] -> () 1924 | | _ -> assert(false) 1925 | end; 1926 | 1927 | begin match msgs with 1928 | | (Append_entries_response r, 1) :: [] -> ( 1929 | let { 1930 | receiver_id; 1931 | receiver_term; 1932 | result; 1933 | } = r in 1934 | assert(2 = receiver_id); 1935 | assert(3 = receiver_term); 1936 | assert(Success 5 = result); 1937 | ) 1938 | | _ -> assert(false); 1939 | end; 1940 | 1941 | (* 1942 | * Send Append_entries_response Server2 -> Server1 1943 | * ------------------------------------------------------------------------ 1944 | *) 1945 | 1946 | let now = now +. 0.001 in 1947 | 1948 | let res = 1949 | let msg = msg_for_server msgs 1 in 1950 | Protocol.handle_message server1 msg now 1951 | in 1952 | 1953 | let { 1954 | state = server1; 1955 | messages_to_send = msgs; 1956 | leader_change; 1957 | committed_logs; 1958 | added_logs; 1959 | deleted_logs; 1960 | } = res in 1961 | 1962 | assert(deleted_logs = []); 1963 | assert(5 = server1.commit_index); 1964 | assert([] = msgs); 1965 | assert(2 = List.length committed_logs); 1966 | assert(added_logs = []); 1967 | assert(None = leader_change); 1968 | 1969 | ({server0; server1; server2}, now) 1970 | 1971 | (* In this part of the test we add a log to server0 which is disconnected 1972 | * (ie the network is partitioned) from the other servers. Because server0 1973 | * is still a leader for term [1] it will add the log but will not receive 1974 | * any response from the other server, therefore this 4th log entry will 1975 | * not be commited. 1976 | * 1977 | * At the end, server0 is still a leader for term [1], it has 4 log entries 1978 | * but a commit_index of [2]. Server1 is the leader of term [3] and server2 is 1979 | * a follower of server1. 
*)
1980 | 
1981 | let add_log_to_outdated_leader {server0; server1; server2} now = 
1982 | let new_log_result = 
1983 | let data = Bytes.of_string "NeverCommitted" in 
1984 | Protocol.handle_add_log_entries server0 [(data, "NC")] now 
1985 | in 
1986 | 
1987 | let server0, msgs = 
1988 | let open Protocol in 
1989 | match new_log_result with 
1990 | | Appended result -> 
1991 | let { 
1992 | state; 
1993 | messages_to_send; 
1994 | committed_logs; 
1995 | added_logs;deleted_logs; _} = result in 
1996 | assert([] = committed_logs); 
1997 | assert(deleted_logs = []); 
1998 | assert(1 = List.length added_logs); 
1999 | begin match added_logs with 
2000 | | [{id="NC"; index = 4; term = 1; _}] -> () 
2001 | | _ -> assert(false) 
2002 | end; 
2003 | (state, messages_to_send) 
2004 | (* server0 is the [Leader] and is therefore expected to 
2005 | * handle the new log entry. *) 
2006 | 
2007 | | Delay | Forward_to_leader _ -> assert(false) 
2008 | in 
2009 | 
2010 | assert(2 = List.length msgs); 
2011 | assert(4 = recent_log_length server0); 
2012 | assert(2 = server0.commit_index); 
2013 | 
2014 | List.iter (fun (msg, _) -> 
2015 | match msg with 
2016 | | Append_entries_request r -> ( 
2017 | assert(r.leader_term = 1); 
2018 | assert(r.leader_id = 0); 
2019 | assert(r.prev_log_index = 2); 
2020 | assert(r.prev_log_term = 1); 
2021 | assert(2 = List.length r.log_entries); 
2022 | (* Contains the log entries to be synchronized. *) 
2023 | begin match r.log_entries with 
2024 | | {index = 3; term = 1; _} :: {index = 4; id = "NC";_} :: [] -> () 
2025 | | _ -> assert(false) 
2026 | end; 
2027 | assert(r.leader_commit = 2); 
2028 | ) 
2029 | | _ -> assert(false) 
2030 | ) msgs; 
2031 | 
2032 | ({server0; server1; server2}, now) 
2033 | 
2034 | (* In this part of the test server1, the leader in term [3], sends 
2035 | * heartbeat messages to both server0 and server2. Server0 is no longer 
2036 | * disconnected and receives the message; the synchronization sequence 
2037 | * between the latest leader (server1) and the previous one (server0) 
2038 | * deletes the non-committed log entries in server0. 
2039 | * 
2040 | * At the end server1 is the leader and has 5 log entries and a commit 
2041 | * index of 5. Both server0 and server2 have fully replicated the 
2042 | * 5 log entries and their commit index is 5 as well. *) 
2043 | let leader_heartbeat_3 {server0; server1; server2} now = 
2044 | let now = now +. default_configuration.hearbeat_timeout in 
2045 | 
2046 | let {state = server1; messages_to_send = msgs; added_logs; _ } = 
2047 | Protocol.handle_heartbeat_timeout server1 now 
2048 | in 
2049 | 
2050 | assert(added_logs = []); 
2051 | assert(2 = List.length msgs); 
2052 | 
2053 | begin match msg_for_server msgs 0 with 
2054 | | Append_entries_request r -> ( 
2055 | assert(r.leader_term = 3); 
2056 | assert(r.leader_id = 1); 
2057 | assert(r.prev_log_term = 1); 
2058 | assert(2 = List.length r.log_entries); 
2059 | (* Because no msg was received from server0, server1 still believes 
2060 | * server0's prev_log_index to be 3, the same as 
2061 | * when it was elected a leader. Therefore the message contains 
2062 | * the 2 latest entries. 
*) 2063 | assert(r.prev_log_index = 3); 2064 | assert(r.leader_commit = 5); 2065 | ) 2066 | | _ -> assert(false) 2067 | end; 2068 | 2069 | begin match msg_for_server msgs 2 with 2070 | | Append_entries_request r -> ( 2071 | assert(r.leader_term = 3); 2072 | assert(r.leader_id = 1); 2073 | assert(r.prev_log_term = 3); 2074 | assert(0 = List.length r.log_entries); 2075 | assert(r.prev_log_index = 5); 2076 | (* Server2 has replicated both log_entries 4 & 5 *) 2077 | assert(r.leader_commit = 5); 2078 | ) 2079 | | _ -> assert(false) 2080 | end; 2081 | 2082 | (* 2083 | * Send Append_entries_request Server1 -> Server0 2084 | * --------------------------------------------------------------------- 2085 | *) 2086 | 2087 | let res = 2088 | let msg = msg_for_server msgs 0 in 2089 | Protocol.handle_message server0 msg now 2090 | in 2091 | 2092 | let { 2093 | state = server0; 2094 | messages_to_send = server0_response; 2095 | leader_change; 2096 | committed_logs; 2097 | added_logs; 2098 | deleted_logs; 2099 | } = res in 2100 | 2101 | begin match deleted_logs with 2102 | | {index = 4; term = 1; id = "NC"; _} :: [] -> () 2103 | | _ -> assert(false) 2104 | end; 2105 | (* The non commited log added by server0 in term [1] is deleted since 2106 | * it was never replicated anywher and server1 is now the leader 2107 | * in a later term *) 2108 | 2109 | assert(is_follower server0); 2110 | assert(3 = server0.current_term); 2111 | assert(5 = recent_log_length server0); 2112 | begin match committed_logs with 2113 | | {index = 3; term = 1; _}::{index = 4; term = 3; _}:: 2114 | {index = 5; term = 3; _}::[] -> () 2115 | | _ -> assert(false) 2116 | end; 2117 | assert(5 = server0.commit_index); 2118 | assert(2 = List.length added_logs); 2119 | 2120 | begin match msg_for_server server0_response 1 with 2121 | | Append_entries_response r -> 2122 | assert(r.receiver_id = 0); 2123 | assert(r.receiver_term = 3); 2124 | assert(r.result = Success 5); 2125 | | _ -> assert(false); 2126 | end; 2127 | assert(Some (New_leader 1) = leader_change); 2128 | 2129 | (* 2130 | * Send Append_entries_request Server1 -> Server2 2131 | * ---------------------------------------------------------------------- 2132 | *) 2133 | 2134 | let res = 2135 | let msg = msg_for_server msgs 2 in 2136 | Protocol.handle_message server2 msg now 2137 | in 2138 | 2139 | let { 2140 | state = server2; 2141 | messages_to_send = server2_response; 2142 | leader_change; 2143 | committed_logs; 2144 | added_logs; 2145 | deleted_logs; 2146 | } = res in 2147 | 2148 | assert(deleted_logs = []); 2149 | assert(is_follower server2); 2150 | assert(3 = server2.current_term); 2151 | assert(5 = recent_log_length server2); 2152 | begin match committed_logs with 2153 | | {index = 4; term = 3; _}::{index = 5; term = 3; _}::[] -> () 2154 | | _ -> assert(false) 2155 | end; 2156 | assert(5 = server2.commit_index); 2157 | assert(0 = List.length added_logs); 2158 | 2159 | begin match msg_for_server server2_response 1 with 2160 | | Append_entries_response r -> 2161 | assert(r.receiver_id = 2); 2162 | assert(r.receiver_term = 3); 2163 | assert(r.result = Success 5); 2164 | | _ -> assert(false); 2165 | end; 2166 | assert(None= leader_change); 2167 | 2168 | let now = now +. 
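0.0 in

(* Illustrative sketch only (hypothetical helper, not the library's code):
 * why server0's "NC" entry showed up in [deleted_logs] above. When a
 * follower finds one of its existing entries conflicting with an entry
 * sent by the leader (same index, different term), that entry and every
 * entry after it are discarded before the leader's entries are appended.
 * The log is modeled as (index, term) pairs. *)
let _truncate_conflicts ~follower_log ~incoming =
  let conflicts (index, term) =
    List.exists
      (fun (index', term') -> index' = index && term' <> term)
      incoming
  in
  match List.find_opt conflicts follower_log with
  | None -> follower_log
  | Some (first_conflict_index, _) ->
    List.filter (fun (index, _) -> index < first_conflict_index) follower_log
in
(* server0 held (4, term 1) = "NC"; the new leader sends (4, term 3) and
 * (5, term 3), so the stale index-4 entry is dropped. *)
assert([(3, 1)] = _truncate_conflicts
  ~follower_log:[(3, 1); (4, 1)] ~incoming:[(4, 3); (5, 3)]);

let now = now +. 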
0.001 in 
2169 | 
2170 | (* 
2171 | * Send Append_entries_response Server0 -> Server1 
2172 | * ------------------------------------------------------------------- 
2173 | *) 
2174 | 
2175 | let res = 
2176 | let msg = msg_for_server server0_response 1 in 
2177 | Protocol.handle_message server1 msg now 
2178 | in 
2179 | let { 
2180 | state = server1; 
2181 | messages_to_send = msgs; 
2182 | leader_change; 
2183 | committed_logs; 
2184 | added_logs; 
2185 | deleted_logs; 
2186 | } = res in 
2187 | assert(deleted_logs = []); 
2188 | assert([] = msgs); 
2189 | assert([] = committed_logs); 
2190 | assert([] = added_logs); 
2191 | assert(None = leader_change); 
2192 | 
2193 | (* 
2194 | * Send Append_entries_response Server2 -> Server1 
2195 | * ------------------------------------------------------------------- 
2196 | *) 
2197 | 
2198 | let res = 
2199 | let msg = msg_for_server server2_response 1 in 
2200 | Protocol.handle_message server1 msg now 
2201 | in 
2202 | let { 
2203 | state = server1; 
2204 | messages_to_send = msgs; 
2205 | leader_change; 
2206 | committed_logs; 
2207 | added_logs; 
2208 | deleted_logs; 
2209 | } = res in 
2210 | assert(deleted_logs = []); 
2211 | assert([] = msgs); 
2212 | assert([] = committed_logs); 
2213 | assert([] = added_logs); 
2214 | assert(None = leader_change); 
2215 | 
2216 | ({server0; server1; server2}, now) 
2217 | 
2218 | (* Prior to this test, the log size was 5, but in this test we're adding 
2219 | * 3 new entries, which means the resulting size would be 8. 8 is greater 
2220 | * than the configured max log size upper bound and so the log size 
2221 | * limitation enforcement should kick in. Log entries [1] [2] [3] should 
2222 | * be removed from the Raft_log.recent_entries to make sure 
2223 | * the recent entries size is set to the lower bound (ie [5]). 
2224 | * 
2225 | * Intentionally at the end of this test we return the server state 
2226 | * unmodified; this test solely checks the log size enforcement 
2227 | * and has no side effect. *) 
2228 | let enforce_log_size ({server0; server1; _} as servers) now = 
2229 | let new_log_result = 
2230 | let datas = [ 
2231 | (Bytes.of_string "Message06", "06"); 
2232 | (Bytes.of_string "Message07", "07"); 
2233 | (Bytes.of_string "Message08", "08"); 
2234 | ] in 
2235 | Protocol.handle_add_log_entries server1 datas now 
2236 | in 
2237 | 
2238 | let min_max_index server = 
2239 | ( 
2240 | fst @@ IntMap.min_binding server.log.recent_entries, 
2241 | fst @@ IntMap.max_binding server.log.recent_entries 
2242 | ) 
2243 | in 
2244 | 
2245 | let min_index, max_index = min_max_index server1 in 
2246 | assert(1 = min_index); 
2247 | assert(5 = max_index); 
2248 | 
2249 | let server1, msgs = 
2250 | let open Protocol in 
2251 | match new_log_result with 
2252 | | Appended {state; messages_to_send; added_logs; _ } -> 
2253 | begin 
2254 | assert(3 = List.length added_logs); 
2255 | (state, messages_to_send) 
2256 | end 
2257 | | Delay | Forward_to_leader _ -> assert(false) 
2258 | in 
2259 | 
2260 | let min_index, max_index = min_max_index server1 in 
2261 | assert(4 = min_index); 
2262 | assert(8 = max_index); 
2263 | 
2264 | (* 
2265 | * Send Append_entries_request Server1 -> Server0 
2266 | * ---------------------------------------------- 
2267 | *) 
2268 | 
2269 | let res = 
2270 | let msg = msg_for_server msgs 0 in 
2271 | Protocol.handle_message server0 msg now 
2272 | in 
2273 | 
2274 | let { 
2275 | state = server0; _ 
2276 | } = res in 
2277 | 
2278 | let min_index, max_index = min_max_index server0 in 
2279 | assert(4 = min_index); 
2280 | assert(8 = max_index); 
2281 | 
2282 | (servers, now) 
2283 | 
2284 
| let () = 2285 | 2286 | let servers = init () in 2287 | let servers, now = election_1 servers now in 2288 | let servers, now = failed_election_1 servers now in 2289 | let servers, now = leader_heartbeat_1 servers now in 2290 | let servers, now = add_first_log servers now in 2291 | let servers, now = add_second_log servers now in 2292 | let servers, now = leader_heartbeat_2 servers now in 2293 | let servers, now = add_third_log servers now in 2294 | let servers, now = failed_election_2 servers now in 2295 | let servers, now = election_2 servers now in 2296 | let servers, now = add_4_and_5_logs servers now in 2297 | let servers, now = add_log_to_outdated_leader servers now in 2298 | let servers, now = leader_heartbeat_3 servers now in 2299 | let servers, now = enforce_log_size servers now in 2300 | let _ = servers and _ = now in 2301 | 2302 | () 2303 | --------------------------------------------------------------------------------