├── .gitignore ├── Cargo.toml ├── LICENSE ├── NOTICE ├── README.md ├── src └── raft │ ├── Cargo.toml │ └── src │ ├── candidate.rs │ ├── config.rs │ ├── driver.rs │ ├── follower.rs │ ├── group.rs │ ├── index.rs │ ├── info.rs │ ├── leader.rs │ ├── lib.rs │ ├── log.rs │ ├── message.rs │ ├── node.rs │ ├── observer.rs │ ├── peer.rs │ ├── pos.rs │ ├── raft.rs │ ├── stage.rs │ ├── term.rs │ ├── transition.rs │ └── votes.rs └── tests ├── isolated ├── Cargo.toml ├── src │ ├── builder.rs │ ├── driver.rs │ ├── lib.rs │ └── macros.rs └── tests │ ├── test_election.rs │ ├── test_flow.rs │ ├── test_follower.rs │ ├── test_leader.rs │ ├── test_lifecycle.rs │ └── test_observer.rs ├── simulated ├── Cargo.toml ├── src │ ├── builder.rs │ ├── driver.rs │ ├── io.rs │ ├── lib.rs │ ├── message.rs │ └── node.rs └── tests │ └── test_raft.rs └── validated ├── Cargo.toml └── src └── lib.rs /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | *.log 3 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = [ 3 | "src/raft", 4 | 5 | # Tests 6 | "tests/isolated", 7 | "tests/simulated", 8 | "tests/validated", 9 | ] 10 | 11 | [patch.crates-io] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Mini Raft 2 | Copyright 2022 Carl Lerche 3 | 4 | The Initial Developer of some parts of the library, primarily comments, which 5 | are copied from, derived from, or inspired by raft-rs (github.com/tikv/raft-rs), 6 | is TiKV Project Authors. Copyright 2019 TiKV Project Authors. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Mini Raft 2 | 3 | Work in progress. This project is intended to be a demonstration of structuring 4 | a complex Rust code base and using 5 | [Turmoil](https://github.com/tokio-rs/turmoil) for testing. See 6 | [tikv/raft](https://github.com/tikv/raft-rs) for a mature implementation of Raft 7 | in Rust. 
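8 | 
9 | ## Example
10 | 
11 | A node is driven through the `Driver` trait (`src/raft/src/driver.rs`),
12 | which supplies message transport, log storage, and randomized election
13 | timeouts. As a minimal, hypothetical sketch, timing is tuned through
14 | `Config`; the values below are the library defaults, and the `&str` node
15 | ID is used purely for illustration:
16 | 
17 | ```rust
18 | use mini_raft::Config;
19 | use std::time::Duration;
20 | 
21 | fn main() {
22 |     // Any `Hash + Eq + Clone + Debug` type can serve as a node ID.
23 |     let mut config = Config::new("node-a");
24 | 
25 |     // How often a leader sends heartbeats.
26 |     config.leader_heartbeat_interval = Duration::from_millis(10);
27 | 
28 |     // A follower starts campaigning after a randomized quiet period
29 |     // bounded by these two intervals.
30 |     config.min_election_interval = Duration::from_millis(150);
31 |     config.max_election_interval = Duration::from_millis(400);
32 | }
33 | ```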
--------------------------------------------------------------------------------
/src/raft/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "mini-raft"
3 | version = "0.1.0"
4 | edition = "2021"
5 | 
6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
7 | 
8 | [dependencies]
9 | anyhow = "1.0"
10 | futures-lite = "1"
11 | indexmap = "1.9.1"
12 | serde = { version = "1.0.142", features = ["derive"] }
13 | tokio = { version = "1.19", features = ["time", "test-util"] }
14 | tracing = "0.1.35"
--------------------------------------------------------------------------------
/src/raft/src/candidate.rs:
--------------------------------------------------------------------------------
1 | use crate::*;
2 | 
3 | use std::io;
4 | use tokio::time::Instant;
5 | 
6 | pub(crate) struct Candidate {
7 |     // True if this is a pre-election
8 |     pre_election: bool,
9 | 
10 |     /// How long to wait for votes
11 |     restart_campaign_at: Instant,
12 | 
13 |     /// Tracks votes received from peers.
14 |     votes: Votes,
15 | 
16 |     /// When we last received a message from a known leader.
17 |     last_leader_msg_at: Instant,
18 | }
19 | 
20 | impl Candidate {
21 |     #[tracing::instrument(level = "debug", skip(node))]
22 |     pub(crate) fn transition_from_follower(
23 |         node: &mut Node,
24 |         last_leader_msg_at: Instant,
25 |     ) -> Candidate {
26 |         // Transitioning to pre-candidate only happens when there is no known
27 |         // group leader.
28 |         node.group.leader = None;
29 | 
30 |         Candidate::new(node, true, last_leader_msg_at)
31 |     }
32 | 
33 |     #[tracing::instrument(level = "debug", skip(node))]
34 |     pub(crate) fn transition_from_pre_candidate(
35 |         node: &mut Node,
36 |         last_leader_msg_at: Instant,
37 |     ) -> Candidate {
38 |         assert!(node.group.leader.is_none());
39 | 
40 |         // When transitioning to candidate, the term is incremented.
41 |         node.increment_term();
42 |         node.group
43 |             .transition_to_candidate(&node.config.id, &node.log);
44 | 
45 |         Candidate::new(node, false, last_leader_msg_at)
46 |     }
47 | 
48 |     pub(crate) fn transition_from_timed_out_campaign(
49 |         node: &mut Node,
50 |         last_leader_msg_at: Instant,
51 |     ) -> Candidate {
52 |         // There should **still** be no leader
53 |         assert!(node.group.leader.is_none());
54 | 
55 |         // Now back at the pre-campaign stage
56 |         Candidate::new(node, true, last_leader_msg_at)
57 |     }
58 | 
59 |     fn new(node: &mut Node, pre_election: bool, last_leader_msg_at: Instant) -> Candidate {
60 |         // There should be no known leader
61 |         assert!(node.group.leader.is_none());
62 | 
63 |         // Reset the vote tracker & vote for ourselves (because we are cool).
64 |         let mut votes = Votes::new();
65 | 
66 |         // Vote for ourselves
67 |         votes.record(&node.group, &node.config.id, true);
68 | 
69 |         let last_log_pos = node.log.last_appended();
70 | 
71 |         for voter_id in node.group.voter_ids() {
72 |             // Don't send a message to ourselves
73 |             if node.config.id == *voter_id {
74 |                 continue;
75 |             }
76 | 
77 |             node.driver.dispatch(
78 |                 voter_id.clone(),
79 |                 Message {
80 |                     origin: message::Origin {
81 |                         id: node.config.id.clone(),
82 |                         term: if pre_election {
83 |                             // A pre-vote message uses the *next* term that the node would
84 |                             // use when announcing its candidacy.
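                            // For example, a node currently at term 3 sends its
                            // pre-vote with term 4 while its own `node.term`
                            // stays 3; the term is only actually incremented if
                            // the pre-vote succeeds (see
                            // `transition_from_pre_candidate` above).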
85 | node.term + 1 86 | } else { 87 | node.term 88 | }, 89 | }, 90 | action: if pre_election { 91 | message::Action::PreVote(message::Vote { last_log_pos }) 92 | } else { 93 | message::Action::Vote(message::Vote { last_log_pos }) 94 | }, 95 | }, 96 | ); 97 | } 98 | 99 | Candidate { 100 | pre_election, 101 | restart_campaign_at: node.campaign_at(), 102 | votes, 103 | last_leader_msg_at, 104 | } 105 | } 106 | 107 | pub(crate) async fn receive( 108 | &mut self, 109 | node: &mut Node, 110 | message: Message, 111 | ) -> io::Result>> { 112 | use message::Action::*; 113 | 114 | match message.action { 115 | PreVoteResponse(message::VoteResponse { granted }) => { 116 | // If the pre-vote response is no, then the term is the peer's 117 | // **actual** term. 118 | if !granted && message.origin.term > node.term { 119 | self.transition_to_follower(node, message).await 120 | } else if !self.pre_election { 121 | // Not currently a pre-election, so ignore the messages 122 | Ok(None) 123 | } else if message.origin.term < node.term { 124 | // Outdated message 125 | Ok(None) 126 | } else if granted && message.origin.term != node.term + 1 { 127 | // Outdated message 128 | Ok(None) 129 | } else { 130 | self.votes.record(&node.group, &message.origin.id, granted); 131 | // Counting votes happens in `tick` 132 | Ok(None) 133 | } 134 | } 135 | VoteResponse(message::VoteResponse { granted }) => { 136 | // A vote response with a higher term always results in updating the term. 137 | if message.origin.term > node.term { 138 | self.transition_to_follower(node, message).await 139 | } else if message.origin.term == node.term { 140 | if !self.pre_election { 141 | self.votes.record(&node.group, &message.origin.id, granted); 142 | Ok(None) 143 | } else { 144 | // The message is out of date, ignore it. 145 | Ok(None) 146 | } 147 | } else { 148 | Ok(None) 149 | } 150 | } 151 | PreVote(vote) => { 152 | // We must respond to pre-votes even if the term is lower. It is 153 | // possible to have a peer with a lower term but more log. Not 154 | // responding would result in the group deadlocking. 155 | let last_appended = node.log.last_appended(); 156 | 157 | // For pre-votes, we only vote "yes" if the message term is greater than ours. 158 | let granted = message.origin.term > node.term && vote.last_log_pos >= last_appended; 159 | 160 | // Never update our own term or state as part of a pre-vote 161 | // request. Because we are a candidate, we have not yet issued 162 | // any votes that we need to be held to. The only determining 163 | // factor for whether or not we *may* vote for the peer 164 | // candidate is if the term is greater than our current term and 165 | // the candidate's log is at least as current as ours. 166 | node.driver.dispatch( 167 | message.origin.id.clone(), 168 | Message { 169 | origin: message::Origin { 170 | id: node.config.id.clone(), 171 | // See `Follower` for the reason we are using 172 | // the origin term here. 173 | term: message.origin.term, 174 | }, 175 | action: message::Action::PreVoteResponse(message::VoteResponse { granted }), 176 | }, 177 | ); 178 | 179 | Ok(None) 180 | } 181 | Vote(..) => { 182 | if message.origin.term > node.term { 183 | // We received a vote request with a term greater than ours. 184 | // We must transition back to follower. If the candidate has 185 | // a log at least as up to date as ours, then we can vote 186 | // for the candidate. 187 | self.transition_to_follower(node, message).await 188 | } else { 189 | // As a candidate, we already voted for ourselves this term. 
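                // Respond "no", echoing our current term: if the requester is
                // behind, seeing a response with a higher term sends it back to
                // follower (see the `VoteResponse` handling above).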
190 |                 node.driver.dispatch(
191 |                     message.origin.id.clone(),
192 |                     Message {
193 |                         origin: message::Origin {
194 |                             id: node.config.id.clone(),
195 |                             term: node.term,
196 |                         },
197 |                         action: message::Action::VoteResponse(message::VoteResponse {
198 |                             granted: false,
199 |                         }),
200 |                     },
201 |                 );
202 | 
203 |                 Ok(None)
204 |             }
205 |         }
206 |         AppendEntries(..) => {
207 |             if message.origin.term < node.term {
208 |                 // We have received messages from a leader at a lower term.
209 |                 // It is possible that these messages were simply delayed in
210 |                 // the network, but this could also mean that this node has
211 |                 // advanced its term number during a network partition, and
212 |                 // it is now unable to either win an election or to rejoin
213 |                 // the majority on the old term. If checkQuorum is false,
214 |                 // this will be handled by incrementing term numbers in
215 |                 // response to MsgVote with a higher term, but if
216 |                 // checkQuorum is true we may not advance the term on
217 |                 // MsgVote and must generate other messages to advance the
218 |                 // term. The net result of these two features is to minimize
219 |                 // the disruption caused by nodes that have been removed
220 |                 // from the cluster's configuration: a removed node will
221 |                 // send MsgVotes which will be ignored, but it will not
222 |                 // receive MsgApp or MsgHeartbeat, so it will not create
223 |                 // disruptive term increases, by notifying leader of this
224 |                 // node's activeness. The above comments also true for
225 |                 // Pre-Vote
226 |                 //
227 |                 // When follower gets isolated, it soon starts an election
228 |                 // ending up with a higher term than leader, although it
229 |                 // won't receive enough votes to win the election. When it
230 |                 // regains connectivity, this response with "pb.MsgAppResp"
231 |                 // of higher term would force leader to step down. However,
232 |                 // this disruption is inevitable to free this stuck node
233 |                 // with fresh election. This can be prevented with Pre-Vote
234 |                 // phase.
235 |                 //
236 |                 // (Copied from raft-rs)
237 |                 node.driver.dispatch(
238 |                     message.origin.id.clone(),
239 |                     Message {
240 |                         origin: message::Origin {
241 |                             id: node.config.id.clone(),
242 |                             term: node.term,
243 |                         },
244 |                         action: message::Action::AppendEntriesResponse(
245 |                             message::AppendEntriesResponse::Reject,
246 |                         ),
247 |                     },
248 |                 );
249 |                 Ok(None)
250 |             } else {
251 |                 // Switch back to follower, maybe incrementing our term.
252 |                 self.transition_to_follower(node, message).await
253 |             }
254 |         }
255 |         AppendEntriesResponse(_) => {
256 |             // Does not apply to candidates. The node most likely used to be
257 |             // a leader and is receiving an outdated message. However,
258 |             // **if** the origin term is greater than the node's term, this
259 |             // indicates a peer got partitioned, incremented its term, and
260 |             // is now trying to rejoin the group.
261 |             if message.origin.term > node.term {
262 |                 self.transition_to_follower(node, message).await
263 |             } else {
264 |                 Ok(None)
265 |             }
266 |         }
267 |     }
268 | }
269 | 
270 | pub(crate) async fn tick(&mut self, node: &mut Node) -> io::Result>> {
271 |     use votes::Tally::*;
272 | 
273 |     // First, tally votes.
274 |     Ok(match self.votes.tally(&node.group) {
275 |         Win => {
276 |             if self.pre_election {
277 |                 Candidate::transition_from_pre_candidate(node, self.last_leader_msg_at).into()
278 |             } else {
279 |                 Leader::transition_from_candidate(node).await?.into()
280 |             }
281 |         }
282 |         Lose => {
283 |             // We lost! Oh no... well, transition back to follower.
284 |             Follower::transition_from_lost_election(node, self.last_leader_msg_at).into()
285 |         }
286 |         Pending => {
287 |             // At this point, just wait for the timeout to expire. If it is
288 |             // hit, this means that no further messages were received and the
289 |             // pre-election failed.
290 |             if node.before(self.restart_campaign_at) {
291 |                 return Ok(self.restart_campaign_at.into());
292 |             }
293 | 
294 |             // Transition back to pre-candidate and restart the election
295 |             Candidate::transition_from_timed_out_campaign(node, self.last_leader_msg_at).into()
296 |         }
297 |     })
298 | }
299 | 
300 | // Process a message that transitioned the node back to follower
301 | async fn transition_to_follower(
302 |     &mut self,
303 |     node: &mut Node,
304 |     message: Message,
305 | ) -> io::Result>> {
306 |     // If the term is **not** being incremented and this is **not** a
307 |     // pre-election, then we already voted for ourselves this term and will
308 |     // ignore all other votes.
309 |     let voted_for = if !self.pre_election && message.origin.term == node.term {
310 |         Some(node.config.id.clone())
311 |     } else {
312 |         None
313 |     };
314 | 
315 |     // We received a message that is forcing us back to follower.
316 |     let mut follower = Follower::transition_from_candidate(
317 |         node,
318 |         message.origin.term,
319 |         None,
320 |         voted_for,
321 |         self.last_leader_msg_at,
322 |     );
323 | 
324 |     // Process the message as a follower
325 |     transition!(follower.receive(node, message).await?, follower)
326 | }
327 | }
--------------------------------------------------------------------------------
/src/raft/src/config.rs:
--------------------------------------------------------------------------------
1 | use tokio::time::Duration;
2 | 
3 | /// Configure a Raft group's execution details
4 | #[derive(Clone)]
5 | #[non_exhaustive]
6 | pub struct Config {
7 |     pub id: T,
8 | 
9 |     /// How often the leader sends heartbeats
10 |     pub leader_heartbeat_interval: Duration,
11 | 
12 |     /// Lower bound after which a follower will initiate an election if it has
13 |     /// not heard from the leader.
14 |     pub min_election_interval: Duration,
15 | 
16 |     /// Upper bound after which a follower will initiate an election if it has
17 |     /// not heard from the leader.
18 |     pub max_election_interval: Duration,
19 | 
20 |     /// How many entries can be uncommitted before the leader starts rejecting
21 |     /// proposals.
22 |     pub max_uncommitted_entries: Option,
23 | }
24 | 
25 | impl Config {
26 |     pub fn new(id: T) -> Config {
27 |         Config {
28 |             id,
29 |             leader_heartbeat_interval: ms(10),
30 |             min_election_interval: ms(150),
31 |             max_election_interval: ms(400),
32 |             max_uncommitted_entries: None,
33 |         }
34 |     }
35 | }
36 | 
37 | fn ms(ms: u32) -> Duration {
38 |     Duration::from_millis(ms as _)
39 | }
--------------------------------------------------------------------------------
/src/raft/src/driver.rs:
--------------------------------------------------------------------------------
1 | use crate::*;
2 | 
3 | use std::hash::Hash;
4 | use std::task::{Context, Poll};
5 | use std::{fmt, io};
6 | 
7 | pub trait Driver: fmt::Debug + Sized + 'static {
8 |     type Id: Hash + Eq + PartialEq + Clone + fmt::Debug + 'static;
9 | 
10 |     /// Send a message to a peer
11 |     fn dispatch(&mut self, dst: Self::Id, message: Message);
12 | 
13 |     /// Asynchronously query the term for the entry at the given index.
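    /// Resolves to `Ok(None)` when the log has no entry at `index`; see
    /// `Log::term_for`, which awaits this method via `poll_fn`.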
14 |     fn poll_term_for(
15 |         &mut self,
16 |         cx: &mut Context<'_>,
17 |         index: Index,
18 |     ) -> Poll>>;
19 | 
20 |     /// Asynchronously read a single entry from the log
21 |     fn poll_read_entry(
22 |         &mut self,
23 |         cx: &mut Context<'_>,
24 |         index: Index,
25 |     ) -> Poll>>>;
26 | 
27 |     /// Asynchronously read entries from the log.
28 |     ///
29 |     /// `end` is **inclusive**
30 |     fn poll_read_entries(
31 |         &mut self,
32 |         cx: &mut Context<'_>,
33 |         start: Index,
34 |         end: Index,
35 |         dst: &mut Vec>,
36 |     ) -> Poll>;
37 | 
38 |     /// Append entries to the log.
39 |     fn append_entries(&mut self, entries: &[message::Entry]);
40 | 
41 |     /// Truncate the log at the given index.
42 |     ///
43 |     /// All **earlier** indices are kept. All entries at the given index and
44 |     /// after are discarded.
45 |     fn truncate(&mut self, index: Index);
46 | 
47 |     /// Generate a random campaign wait time, in milliseconds.
48 |     ///
49 |     /// Section 5.2: Raft uses randomized election timeouts to ensure that split
50 |     /// votes are rare and that they are resolved quickly. To prevent split
51 |     /// votes in the first place, election timeouts are chosen randomly from a
52 |     /// fixed interval (e.g., 150–300ms). This spreads out the servers so that
53 |     /// in most cases only a single server will time out; it wins the election
54 |     /// and sends heartbeats before any other servers time out. The same
55 |     /// mechanism is used to handle split votes.
56 |     fn rand_election_timeout(&mut self, lower: u64, upper: u64) -> u64;
57 | }
--------------------------------------------------------------------------------
/src/raft/src/follower.rs:
--------------------------------------------------------------------------------
1 | use crate::*;
2 | 
3 | use std::io;
4 | use tokio::time::Instant;
5 | 
6 | pub(crate) struct Follower {
7 |     observer: Observer,
8 |     /// Whether the node is currently eligible to vote in elections and become
9 |     /// a leader. Note that even when this is `Observer`, the node must still
10 |     /// respond to vote requests as it may have been promoted to be a voter but
11 |     /// has not been informed yet.
12 |     role: Role,
13 | 
14 |     /// The instant at which we **last** heard from the leader. This is used to
15 |     /// determine if we respond to vote requests.
16 |     last_leader_msg_at: Instant,
17 | 
18 |     /// Who did we vote for **this** term
19 |     voted_for: Option,
20 | }
21 | 
22 | /// Whether or not the node is eligible for elections.
23 | #[derive(Debug)]
24 | enum Role {
25 |     /// The node is just an observer and is not eligible for election. Even so,
26 |     /// **if** we receive vote requests, we must still respond.
27 |     Observer,
28 |     /// The node is eligible to be elected a leader.
29 |     Candidate {
30 |         /// When the follower should start campaigning. This instant is pushed
31 |         /// forward as long as the follower receives a signal that the leader is
32 |         /// healthy.
33 |         campaign_at: Instant,
34 |     },
35 | }
36 | 
37 | impl Follower {
38 |     pub(crate) fn new_observer(node: &mut Node) -> Follower {
39 |         Follower {
40 |             observer: Observer::new(),
41 |             role: Role::Observer,
42 |             // Set `last_leader_msg_at` in the past to be able to immediately
43 |             // respond to vote requests.
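            // Backdating by `max_election_interval` guarantees that
            // `node.now >= last_leader_msg_at + min_election_interval`, which is
            // the gate `process_vote_request` uses before granting a vote.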
44 |             last_leader_msg_at: node.now - node.config.max_election_interval,
45 |             voted_for: None,
46 |         }
47 |     }
48 | 
49 |     pub(crate) fn transition_from_observer(node: &mut Node) -> Follower {
50 |         Follower {
51 |             observer: Observer::new(),
52 |             role: Role::Candidate {
53 |                 campaign_at: node.campaign_at(),
54 |             },
55 |             last_leader_msg_at: node.now,
56 |             voted_for: None,
57 |         }
58 |     }
59 | 
60 |     pub(crate) fn transition_from_leader(
61 |         node: &mut Node,
62 |         term: Term,
63 |         leader: Option,
64 |     ) -> Follower {
65 |         node.set_term(term, leader);
66 | 
67 |         // We are being **downgraded** from a leader to a follower because a
68 |         // newer leader sent us a message. Thus, we have received a message from
69 |         // the leader!
70 |         Follower {
71 |             observer: Observer::new(),
72 |             role: Role::Candidate {
73 |                 campaign_at: node.campaign_at(),
74 |             },
75 |             last_leader_msg_at: node.now,
76 |             voted_for: None,
77 |         }
78 |     }
79 | 
80 |     /// The node transitioned from candidate to follower.
81 |     ///
82 |     /// This can happen when the candidate receives a message that forces it
83 |     /// back to the follower stage.
84 |     pub(crate) fn transition_from_candidate(
85 |         node: &mut Node,
86 |         term: Term,
87 |         leader: Option,
88 |         voted_for: Option,
89 |         last_leader_msg_at: Instant,
90 |     ) -> Follower {
91 |         if term > node.term {
92 |             node.set_term(term, leader);
93 |         } else if leader.is_some() {
94 |             assert!(node.group.leader.is_none());
95 |             node.group.leader = leader;
96 |         }
97 | 
98 |         Follower {
99 |             observer: Observer::new(),
100 |             role: Role::Candidate {
101 |                 campaign_at: node.campaign_at(),
102 |             },
103 |             last_leader_msg_at,
104 |             voted_for,
105 |         }
106 |     }
107 | 
108 |     pub(crate) fn transition_from_lost_election(
109 |         node: &mut Node,
110 |         last_leader_msg_at: Instant,
111 |     ) -> Follower {
112 |         Follower {
113 |             observer: Observer::new(),
114 |             role: Role::Candidate {
115 |                 campaign_at: node.campaign_at(),
116 |             },
117 |             last_leader_msg_at,
118 |             voted_for: Some(node.config.id.clone()),
119 |         }
120 |     }
121 | 
122 |     /// The leader lost connectivity with a quorum of voters.
123 |     pub(crate) fn transition_from_leader_lost_quorum(node: &mut Node) -> Follower {
124 |         // We should be transitioning from being a leader, so at this point, we
125 |         // should believe that *we* are the leader.
126 |         assert_eq!(node.group.leader.as_ref(), Some(&node.config.id));
127 | 
128 |         // Since we *just* were the leader, this means we voted for ourselves
129 |         // this term. This should remain; we cannot vote for any other peer in
130 |         // the current term.
131 |         let voted_for = Some(node.config.id.clone());
132 | 
133 |         // We were just a leader, but we stepped down. So, we will set
134 |         // `last_leader_msg_at` in the past to enable ourselves to vote for new
135 |         // leaders as soon as our term is incremented.
136 |         let last_leader_msg_at = node.now - node.config.max_election_interval;
137 | 
138 |         Follower {
139 |             observer: Observer::new(),
140 |             role: Role::Candidate {
141 |                 campaign_at: node.campaign_at(),
142 |             },
143 |             last_leader_msg_at,
144 |             voted_for,
145 |         }
146 |     }
147 | 
148 |     /// Returns true if the node is an observer
149 |     pub(crate) fn is_observer(&self) -> bool {
150 |         matches!(self.role, Role::Observer)
151 |     }
152 | 
153 |     pub(crate) async fn receive(
154 |         &mut self,
155 |         node: &mut Node,
156 |         message: Message,
157 |     ) -> io::Result>> {
158 |         use message::Action::*;
159 | 
160 |         match message.action {
161 |             AppendEntries(append_entries) => {
162 |                 use observer::ReceiveAppendEntries::*;
163 | 
164 |                 // Term mismatch is handled in the observer's
165 |                 // receive_append_entries method.
166 |                 let did_append_config = match self
167 |                     .observer
168 |                     .receive_append_entries(node, &message.origin, append_entries)
169 |                     .await?
170 |                 {
171 |                     Discard => return Ok(None),
172 |                     Appended => false,
173 |                     AppendedConfig => true,
174 |                 };
175 | 
176 |                 // Track that we received a message from the leader
177 |                 self.last_leader_msg_at = node.now;
178 | 
179 |                 // TODO: this should not be called if the message is rejected
180 |                 if let Role::Candidate { campaign_at } = &mut self.role {
181 |                     *campaign_at = node.campaign_at();
182 |                 }
183 | 
184 |                 if did_append_config {
185 |                     self.process_membership_change(node);
186 |                 }
187 | 
188 |                 Ok(None)
189 |             }
190 |             PreVote(message::Vote { last_log_pos }) => {
191 |                 self.process_vote_request(node, &message.origin, last_log_pos, true);
192 |                 Ok(None)
193 |             }
194 |             Vote(message::Vote { last_log_pos }) => {
195 |                 self.process_vote_request(node, &message.origin, last_log_pos, false);
196 |                 Ok(None)
197 |             }
198 |             // Never update a term when receiving a granted pre-vote response
199 |             PreVoteResponse(message::VoteResponse { granted: true }) => Ok(None),
200 |             AppendEntriesResponse(_) | PreVoteResponse(_) | VoteResponse(_) => {
201 |                 if message.origin.term > node.term {
202 |                     self.process_term_inc(node, message.origin.term, None);
203 |                 }
204 |                 // Not expected to receive these messages in this state.
205 |                 Ok(None)
206 |             }
207 |         }
208 |     }
209 | 
210 |     pub(crate) fn tick(&mut self, node: &mut Node) -> Step> {
211 |         match self.role {
212 |             Role::Candidate { campaign_at } => {
213 |                 // Wait until the campaign timeout is reached
214 |                 if node.before(campaign_at) {
215 |                     return campaign_at.into();
216 |                 }
217 | 
218 |                 // At this point, no messages have been received, so maybe
219 |                 // the leader is no more?
220 |                 Candidate::transition_from_follower(node, self.last_leader_msg_at).into()
221 |             }
222 |             Role::Observer => Step::Wait(None),
223 |         }
224 |     }
225 | 
226 |     // We received a (valid) message with a higher term, so let's update
227 |     // ourselves to the next term.
228 |     fn process_term_inc(&mut self, node: &mut Node, term: Term, leader: Option) {
229 |         assert!(term > node.term);
230 | 
231 |         node.set_term(term, leader);
232 | 
233 |         self.voted_for = None;
234 | 
235 |         if let Role::Candidate { campaign_at } = &mut self.role {
236 |             *campaign_at = node.campaign_at();
237 |         }
238 |     }
239 | 
240 |     fn process_vote_request(
241 |         &mut self,
242 |         node: &mut Node,
243 |         origin: &message::Origin,
244 |         last_log_pos: Option,
245 |         pre_vote: bool,
246 |     ) {
247 |         // If the leader is still sending us messages, then we will not
248 |         // participate in any elections.
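        // With the default `min_election_interval` of 150ms, a leader message
        // received at time `t` means votes are withheld until `t + 150ms`;
        // pre-votes are explicitly rejected in that window.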
249 | let next_vote_at = self.last_leader_msg_at + node.config.min_election_interval; 250 | 251 | if origin.term < node.term { 252 | self.respond_vote_no(node, origin, pre_vote); 253 | } else if node.now < next_vote_at { 254 | // As far as we know, the leader is still active. Explicitly reject 255 | // the vote if this is a pre-election, otherwise just ignore the 256 | // message. 257 | if pre_vote { 258 | self.respond_vote_no(node, origin, pre_vote); 259 | } 260 | } else if origin.term > node.term { 261 | if !pre_vote { 262 | self.process_term_inc(node, origin.term, None); 263 | } 264 | 265 | self.respond_vote(node, origin, last_log_pos, pre_vote); 266 | } else if let Some(_) = &node.group.leader { 267 | self.respond_vote_no(node, origin, pre_vote); 268 | } else { 269 | // If we already voted for a different node, then we reject. 270 | // Otherwise, accept since it just means we received the vote 271 | // request twice. 272 | match &self.voted_for { 273 | Some(id) if *id != origin.id => { 274 | self.respond_vote_no(node, origin, pre_vote); 275 | } 276 | _ => { 277 | self.respond_vote(node, origin, last_log_pos, pre_vote); 278 | } 279 | } 280 | } 281 | } 282 | 283 | fn respond_vote( 284 | &mut self, 285 | node: &mut Node, 286 | origin: &message::Origin, 287 | last_log_pos: Option, 288 | pre_vote: bool, 289 | ) { 290 | let our_last_log_pos = node.log.last_appended(); 291 | 292 | if last_log_pos < our_last_log_pos { 293 | // Candidate is not as up-to-date as we are. 294 | self.respond_vote_no(node, origin, pre_vote); 295 | } else { 296 | self.respond_vote_yes(node, origin, pre_vote); 297 | } 298 | } 299 | 300 | fn process_membership_change(&mut self, node: &mut Node) { 301 | let peer = node 302 | .group 303 | .peers 304 | .get(&node.config.id) 305 | .expect("self missing in raft group"); 306 | 307 | if peer.is_voter() { 308 | if self.is_observer() { 309 | *self = Follower::transition_from_observer(node); 310 | } 311 | } else { 312 | if !self.is_observer() { 313 | todo!("unexpected downgrade to observer"); 314 | } 315 | } 316 | } 317 | 318 | /// Vote for the candidate. 319 | /// 320 | /// Respond with the term included in the original message. 321 | fn respond_vote_yes( 322 | &mut self, 323 | node: &mut Node, 324 | origin: &message::Origin, 325 | pre_vote: bool, 326 | ) { 327 | if pre_vote { 328 | assert!(node.term <= origin.term); 329 | } else { 330 | assert!(node.term == origin.term); 331 | assert!(self.voted_for.is_none() || self.voted_for == Some(origin.id.clone())); 332 | 333 | self.voted_for = Some(origin.id.clone()); 334 | } 335 | 336 | let response = message::VoteResponse { granted: true }; 337 | node.driver.dispatch( 338 | origin.id.clone(), 339 | Message { 340 | origin: message::Origin { 341 | id: node.config.id.clone(), 342 | // When responding to pre-vote messages, the origin term is 343 | // set to the original vote request term, not the current 344 | // term. The current node may still be in a past term. 345 | // Because we don't update our own term when receiving a 346 | // pre-vote message, if we include an out-of-date term, then 347 | // the candidate sending the vote request will discard the 348 | // message. If this is a regular vote (not pre), then the 349 | // node's current term is the same as the origin term. See 350 | // the above assertion that checks for this. 
351 | term: origin.term, 352 | }, 353 | action: if pre_vote { 354 | message::Action::PreVoteResponse(response) 355 | } else { 356 | message::Action::VoteResponse(response) 357 | }, 358 | }, 359 | ); 360 | } 361 | 362 | fn respond_vote_no(&self, node: &mut Node, origin: &message::Origin, pre_vote: bool) { 363 | let response = message::VoteResponse { granted: false }; 364 | node.driver.dispatch( 365 | origin.id.clone(), 366 | Message { 367 | origin: message::Origin { 368 | id: node.config.id.clone(), 369 | // A "no" vote always responds with the current node's term (as opposed to a "yes" pre-vote response). 370 | term: node.term, 371 | }, 372 | action: if pre_vote { 373 | message::Action::PreVoteResponse(response) 374 | } else { 375 | message::Action::VoteResponse(response) 376 | }, 377 | }, 378 | ) 379 | } 380 | } 381 | -------------------------------------------------------------------------------- /src/raft/src/group.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | 3 | use indexmap::IndexMap; 4 | use tokio::time::Instant; 5 | 6 | pub(crate) struct Group { 7 | /// Which peer is known to be the leader 8 | pub(crate) leader: Option, 9 | 10 | /// True when the Raft group is in "joint consensus" mode 11 | is_joint_consensus: bool, 12 | 13 | /// Set of all known peer nodes. This includes the leader, voters, and 14 | /// observers. 15 | pub(crate) peers: IndexMap>, 16 | } 17 | 18 | impl Group { 19 | pub(crate) fn new() -> Group { 20 | Group { 21 | leader: None, 22 | is_joint_consensus: false, 23 | peers: IndexMap::new(), 24 | } 25 | } 26 | 27 | /// Called on the first node of a Raft group to bootstrap consensus. 28 | pub(crate) fn bootstrap(&mut self, leader: T::Id, now: Instant) { 29 | let mut peer = Peer::new(leader.clone(), peer::Role::Voter, Index(0), now); 30 | 31 | // Update `next_idx` and `matched` to reflect the `InitRaft` log entry 32 | // that is appended during bootstrap. 
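        // Concretely: the peer starts with `next_idx == Index(0)`; after
        // accounting for the bootstrap entry at index 0 we have
        // `matched == Some(Index(0))` and `next_idx == Index(1)`.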
33 |         peer.matched = Some(peer.next_idx);
34 |         peer.next_idx += 1;
35 | 
36 |         self.peers.insert(leader.clone(), peer);
37 | 
38 |         self.leader = Some(leader);
39 |     }
40 | 
41 |     pub(crate) fn peers_mut<'a>(
42 |         &'a mut self,
43 |         self_id: &'a T::Id,
44 |     ) -> impl Iterator> + 'a {
45 |         self.peers.iter_mut().filter_map(
46 |             move |(id, peer)| {
47 |                 if id == self_id {
48 |                     None
49 |                 } else {
50 |                     Some(peer)
51 |                 }
52 |             },
53 |         )
54 |     }
55 | 
56 |     // The current node is transitioning to the candidate stage
57 |     pub(crate) fn transition_to_candidate(&mut self, id: &T::Id, log: &Log) {
58 |         let last_appended = log.last_appended_index();
59 | 
60 |         for peer in self.peers.values_mut() {
61 |             peer.replication = peer::ReplicationState::Init;
62 | 
63 |             if peer.id == *id {
64 |                 peer.replication = peer::ReplicationState::Replicate;
65 |                 peer.matched = last_appended;
66 |             } else {
67 |                 peer.matched = None;
68 |             }
69 |         }
70 |     }
71 | 
72 |     // The current node is transitioning to the leader stage
73 |     pub(crate) fn transition_to_leader(&mut self, id: &T::Id, log: &Log, now: Instant) {
74 |         let next_idx = log
75 |             .last_appended_index()
76 |             .map(|index| index + 1)
77 |             .unwrap_or_default();
78 | 
79 |         for peer in self.peers.values_mut() {
80 |             peer.last_seen_at = now;
81 | 
82 |             if peer.id == *id {
83 |                 assert!(peer.replication.is_replicating());
84 |             } else if peer.replication.is_init() {
85 |                 assert!(peer.matched.is_none());
86 |                 peer.next_idx = next_idx;
87 |             } else {
88 |                 todo!("handle early syncing with VoteResponse");
89 |             }
90 |         }
91 | 
92 |         self.leader = Some(id.clone());
93 |     }
94 | 
95 |     pub(crate) fn is_valid_config_change(
96 |         &self,
97 |         pos: Pos,
98 |         config_change: &message::ConfigChange,
99 |     ) -> bool {
100 |         use message::ConfigChange::*;
101 | 
102 |         match config_change {
103 |             InitGroup { .. } => {
104 |                 if pos.index != 0 || pos.term != 0 {
105 |                     false
106 |                 } else {
107 |                     true
108 |                 }
109 |             }
110 |             AddNode { id, .. } => {
111 |                 if self.is_joint_consensus() {
112 |                     false
113 |                 } else {
114 |                     // When adding a node, the node must not already be part of the group
115 |                     !self.peers.contains_key(id)
116 |                 }
117 |             }
118 |             UpgradeNode {
119 |                 id,
120 |                 phase: message::Phase::One,
121 |             } => {
122 |                 if self.is_joint_consensus() {
123 |                     false
124 |                 } else {
125 |                     // To be a valid configuration transition, the peer being
126 |                     // upgraded must currently be an observer.
127 |                     match self.peers.get(id) {
128 |                         Some(peer) if peer.is_observer() => true,
129 |                         _ => false,
130 |                     }
131 |                 }
132 |             }
133 |             UpgradeNode {
134 |                 id,
135 |                 phase: message::Phase::Two,
136 |             } => {
137 |                 if !self.is_joint_consensus() {
138 |                     false
139 |                 } else {
140 |                     // There must be an entry for the peer being upgraded and the
141 |                     // peer must be in phase one.
142 |                     match self.peers.get(id) {
143 |                         Some(peer) => matches!(peer.role, peer::Role::VoterIncoming { .. }),
144 |                         None => false,
145 |                     }
146 |                 }
147 |             }
148 |             RemoveNode {
149 |                 id,
150 |                 phase: message::Phase::One,
151 |             } => {
152 |                 if self.is_joint_consensus() {
153 |                     false
154 |                 } else {
155 |                     // There must be an entry for the peer being removed and the
156 |                     // peer must be fully added.
157 |                     match self.peers.get(id) {
158 |                         Some(peer) => peer.is_removable(),
159 |                         None => false,
160 |                     }
161 |                 }
162 |             }
163 |             RemoveNode {
164 |                 id,
165 |                 phase: message::Phase::Two,
166 |             } => {
167 |                 if !self.is_joint_consensus() {
168 |                     false
169 |                 } else {
170 |                     // There must be an entry for the peer being removed and the
171 |                     // peer must be an outgoing voter.
172 |                     match self.peers.get(id) {
173 |                         Some(peer) => matches!(peer.role, peer::Role::VoterOutgoing { .. }),
174 |                         None => false,
175 |                     }
176 |                 }
177 |             }
178 |         }
179 |     }
180 | 
181 |     pub(crate) fn apply_config_change(
182 |         &mut self,
183 |         log: &Log,
184 |         pos: Pos,
185 |         config_change: &message::ConfigChange,
186 |         now: Instant,
187 |     ) -> Result<(), ()> {
188 |         use message::ConfigChange::*;
189 |         use std::cmp;
190 | 
191 |         // The configuration must be valid. This should have been checked at an
192 |         // earlier point; we are just double-checking here. It also is possible
193 |         // that a peer sent an invalid entry. If so, we don't want to apply it.
194 |         if !self.is_valid_config_change(pos, config_change) {
195 |             return Err(());
196 |         }
197 | 
198 |         match config_change {
199 |             InitGroup { id } => {
200 |                 let next_idx = cmp::max(pos.index, log.last_appended_index().unwrap_or_default());
201 | 
202 |                 let prev = self.peers.insert(
203 |                     id.clone(),
204 |                     Peer::new(id.clone(), peer::Role::Voter, next_idx, now),
205 |                 );
206 | 
207 |                 assert!(prev.is_none());
208 |                 assert!(!self.is_joint_consensus());
209 |             }
210 |             AddNode { id, auto_upgrade } => {
211 |                 let next_idx = cmp::max(pos.index, log.last_appended_index().unwrap_or_default());
212 | 
213 |                 let prev = self.peers.insert(
214 |                     id.clone(),
215 |                     Peer::new(
216 |                         id.clone(),
217 |                         peer::Role::Observer {
218 |                             // If auto-upgrade is requested, the node should be
219 |                             // upgraded to follower once it has synchronized the
220 |                             // log.
221 |                             auto_upgrade: if *auto_upgrade { Some(pos) } else { None },
222 |                         },
223 |                         next_idx,
224 |                         now,
225 |                     ),
226 |                 );
227 | 
228 |                 // Once again, make sure there was no previous entry
229 |                 assert!(prev.is_none());
230 | 
231 |                 // And, since we are adding an observer, make sure we are not in
232 |                 // joint consensus mode.
233 |                 assert!(!self.is_joint_consensus());
234 |             }
235 |             UpgradeNode {
236 |                 id,
237 |                 phase: message::Phase::One,
238 |             } => {
239 |                 // Only one peer can be upgraded at a time. This means we cannot
240 |                 // be in joint consensus mode when starting to upgrade a node.
241 |                 assert!(!self.is_joint_consensus());
242 | 
243 |                 self.peers
244 |                     .get_mut(id)
245 |                     .expect("peer missing")
246 |                     .upgrade_to_follower(pos);
247 | 
248 |                 // Track that we are in joint-consensus mode
249 |                 self.set_joint_consensus(true);
250 |             }
251 |             UpgradeNode {
252 |                 id,
253 |                 phase: message::Phase::Two,
254 |             } => {
255 |                 // This step moves the group out of joint consensus
256 |                 assert!(self.is_joint_consensus());
257 | 
258 |                 self.peers
259 |                     .get_mut(id)
260 |                     .expect("peer missing")
261 |                     .upgrade_to_follower_phase_2();
262 | 
263 |                 self.set_joint_consensus(false);
264 |             }
265 |             RemoveNode {
266 |                 id,
267 |                 phase: message::Phase::One,
268 |             } => {
269 |                 // Only one configuration change can be applied at a time. This
270 |                 // means we cannot be in joint consensus when starting to
271 |                 // remove a node.
272 | assert!(!self.is_joint_consensus()); 273 | 274 | let peer = self.peers.get_mut(id).expect("peer missing"); 275 | 276 | if peer.is_voter() { 277 | peer.remove_phase_one(pos); 278 | 279 | // Track that we are in joint-consensus mode 280 | self.set_joint_consensus(true); 281 | 282 | // Number of voters stays the same 283 | } else { 284 | assert!(peer.is_observer()); 285 | 286 | // Observers can just be removed 287 | self.peers.remove(id); 288 | } 289 | } 290 | RemoveNode { 291 | id, 292 | phase: message::Phase::Two, 293 | } => { 294 | // This step moves the group out of joint consensus 295 | assert!(self.is_joint_consensus()); 296 | 297 | // Remove the peer 298 | let peer = self.peers.remove(id).expect("peer missing"); 299 | 300 | // The peer should be an outgoing voter 301 | assert!(matches!(peer.role, peer::Role::VoterOutgoing { .. })); 302 | 303 | self.set_joint_consensus(false); 304 | } 305 | } 306 | 307 | Ok(()) 308 | } 309 | 310 | /// Iterate all voters in the Raft group 311 | pub(crate) fn voters(&self) -> impl Iterator> { 312 | self.peers.values().filter(|peer| peer.is_voter()) 313 | } 314 | 315 | /// Iterate all IDs for voters in the Raft group 316 | pub(crate) fn voter_ids(&self) -> impl Iterator { 317 | self.voters().map(|peer| &peer.id) 318 | } 319 | 320 | /// Iterate all incoming voters in the Raft group. 321 | /// 322 | /// During joint-consensus, these are the peers that are either full voters 323 | /// or in the "incoming" set. 324 | pub(crate) fn incoming_voters(&self) -> impl Iterator> { 325 | self.peers.values().filter(|peer| peer.is_incoming_voter()) 326 | } 327 | 328 | /// During joint-consensus, these are the voter IDs from the "incoming" set. 329 | pub(crate) fn incoming_voter_ids(&self) -> impl Iterator { 330 | self.incoming_voters().map(|peer| &peer.id) 331 | } 332 | 333 | pub(crate) fn outgoing_voters(&self) -> impl Iterator> { 334 | self.peers.values().filter(|peer| peer.is_outgoing_voter()) 335 | } 336 | 337 | /// During joint-consensus, these are the voter IDs from the "outgoing" set. 338 | pub(crate) fn outgoing_voter_ids(&self) -> impl Iterator { 339 | self.outgoing_voters().map(|peer| &peer.id) 340 | } 341 | 342 | pub(crate) fn is_joint_consensus(&self) -> bool { 343 | use peer::Role::*; 344 | 345 | debug_assert_eq!( 346 | self.is_joint_consensus, 347 | self.peers.values().any(|peer| match peer.role { 348 | Observer { .. } | Voter => false, 349 | VoterIncoming { .. } | VoterOutgoing { .. } => true, 350 | }) 351 | ); 352 | 353 | self.is_joint_consensus 354 | } 355 | 356 | fn set_joint_consensus(&mut self, value: bool) { 357 | self.is_joint_consensus = value; 358 | 359 | // Call the accessor to perform a consistency check. 
360 | self.is_joint_consensus(); 361 | } 362 | } 363 | -------------------------------------------------------------------------------- /src/raft/src/index.rs: -------------------------------------------------------------------------------- 1 | use std::{cmp, ops}; 2 | 3 | // Index of a log entry 4 | #[derive(Debug, Default, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, serde::Serialize)] 5 | pub struct Index(pub u64); 6 | 7 | impl Index { 8 | pub(crate) fn checked_sub(self, rhs: u64) -> Option { 9 | self.0.checked_sub(rhs).map(Index) 10 | } 11 | } 12 | 13 | impl cmp::PartialEq for Index { 14 | fn eq(&self, other: &i32) -> bool { 15 | (self.0 as i32).eq(other) 16 | } 17 | } 18 | 19 | impl cmp::PartialEq for Index { 20 | fn eq(&self, other: &usize) -> bool { 21 | self.0.eq(&(*other as _)) 22 | } 23 | } 24 | 25 | impl cmp::PartialOrd for Index { 26 | fn partial_cmp(&self, rhs: &usize) -> Option { 27 | self.0.partial_cmp(&(*rhs as _)) 28 | } 29 | } 30 | 31 | impl ops::Add for Index { 32 | type Output = Index; 33 | 34 | fn add(self, rhs: u64) -> Index { 35 | Index(self.0 + rhs) 36 | } 37 | } 38 | 39 | impl ops::Add for Index { 40 | type Output = Index; 41 | 42 | fn add(self, rhs: usize) -> Index { 43 | let rhs = u64::try_from(rhs).unwrap(); 44 | Index(self.0.checked_add(rhs).unwrap()) 45 | } 46 | } 47 | 48 | impl ops::Add for Index { 49 | type Output = Index; 50 | 51 | fn add(self, rhs: i32) -> Index { 52 | let rhs = u64::try_from(rhs).unwrap(); 53 | Index(self.0.checked_add(rhs).unwrap()) 54 | } 55 | } 56 | 57 | impl ops::Sub for Index { 58 | type Output = Index; 59 | 60 | fn sub(self, rhs: u64) -> Index { 61 | Index(self.0 - rhs) 62 | } 63 | } 64 | 65 | impl ops::Sub for Index { 66 | type Output = Index; 67 | 68 | fn sub(self, rhs: i32) -> Index { 69 | Index(self.0.checked_sub(rhs as _).unwrap()) 70 | } 71 | } 72 | 73 | impl ops::Sub for Index { 74 | type Output = Index; 75 | 76 | fn sub(self, rhs: usize) -> Index { 77 | Index(self.0.checked_sub(rhs as _).unwrap()) 78 | } 79 | } 80 | 81 | impl ops::AddAssign for Index { 82 | fn add_assign(&mut self, rhs: u64) { 83 | self.0 += rhs; 84 | } 85 | } 86 | 87 | impl ops::AddAssign for Index { 88 | fn add_assign(&mut self, rhs: i32) { 89 | let rhs = u64::try_from(rhs).unwrap(); 90 | self.0 = self.0.checked_add(rhs).unwrap(); 91 | } 92 | } 93 | 94 | impl ops::AddAssign for Index { 95 | fn add_assign(&mut self, rhs: usize) { 96 | let rhs = u64::try_from(rhs).unwrap(); 97 | self.0 = self.0.checked_add(rhs).unwrap(); 98 | } 99 | } 100 | 101 | impl ops::SubAssign for Index { 102 | fn sub_assign(&mut self, rhs: u64) { 103 | self.0 -= rhs; 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /src/raft/src/info.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | 3 | use std::ops; 4 | 5 | #[derive(Debug, Clone, serde::Serialize)] 6 | pub struct Info { 7 | /// This node's term 8 | pub term: Term, 9 | 10 | /// This node's specific stage 11 | pub stage: Stage, 12 | 13 | /// Index of most recently committed entry 14 | pub committed: Option, 15 | 16 | /// Last applied log position 17 | pub last_applied: Option, 18 | 19 | /// Raft group as known by the queried node 20 | pub group: Group, 21 | } 22 | 23 | #[derive(Debug, Clone, serde::Serialize)] 24 | pub struct Group { 25 | /// The leader as known by the current node. 
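    /// `None` when no leader has been discovered yet or while an election is
    /// in progress.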
26 | pub leader: Option, 27 | 28 | /// All nodes in the group 29 | pub peers: Vec>, 30 | } 31 | 32 | #[derive(Debug, Clone, PartialEq, serde::Serialize)] 33 | pub struct Peer { 34 | /// Node identifier 35 | pub id: T, 36 | 37 | /// Role the node is playing within the group 38 | pub role: Role, 39 | 40 | /// How much of the peer log has been matched 41 | pub matched: Option, 42 | } 43 | 44 | #[derive(Debug, Clone, Copy, serde::Serialize)] 45 | pub enum Stage { 46 | Follower, 47 | Observer, 48 | PreCandidate, 49 | Candidate, 50 | Leader, 51 | } 52 | 53 | #[derive(Debug, Clone, PartialEq, serde::Serialize)] 54 | pub enum Role { 55 | Observer { 56 | auto_upgrade: Option, 57 | }, 58 | Voter { 59 | transitioning: Option, 60 | }, 61 | } 62 | 63 | #[derive(Debug, Clone, PartialEq, serde::Serialize)] 64 | pub struct Transitioning { 65 | pub phase_two_at: Pos, 66 | pub direction: Direction, 67 | } 68 | 69 | #[derive(Debug, Clone, PartialEq, serde::Serialize)] 70 | pub enum Direction { 71 | Incoming, 72 | Outgoing, 73 | } 74 | 75 | impl Info { 76 | pub(crate) fn from_raft(raft: &Raft) -> Info 77 | where 78 | T: Clone, 79 | D: Driver, 80 | { 81 | Info { 82 | term: raft.node.term, 83 | stage: match &raft.stage { 84 | crate::Stage::Follower(stage) => { 85 | if stage.is_observer() { 86 | Stage::Observer 87 | } else { 88 | Stage::Follower 89 | } 90 | } 91 | crate::Stage::Candidate(..) => Stage::Candidate, 92 | crate::Stage::Leader(..) => Stage::Leader, 93 | }, 94 | committed: raft.node.log.last_committed.map(|pos| pos.index), 95 | last_applied: raft.node.last_applied, 96 | group: Group { 97 | leader: raft.node.group.leader.clone(), 98 | peers: raft 99 | .node 100 | .group 101 | .peers 102 | .values() 103 | .map(|peer| Peer { 104 | id: peer.id.clone(), 105 | matched: peer.matched, 106 | role: match peer.role { 107 | crate::peer::Role::Observer { auto_upgrade } => { 108 | Role::Observer { auto_upgrade } 109 | } 110 | crate::peer::Role::Voter => Role::Voter { 111 | transitioning: None, 112 | }, 113 | crate::peer::Role::VoterIncoming { phase_two_at } => Role::Voter { 114 | transitioning: Some(Transitioning { 115 | phase_two_at, 116 | direction: Direction::Incoming, 117 | }), 118 | }, 119 | crate::peer::Role::VoterOutgoing { phase_two_at } => Role::Voter { 120 | transitioning: Some(Transitioning { 121 | phase_two_at, 122 | direction: Direction::Outgoing, 123 | }), 124 | }, 125 | }, 126 | }) 127 | .collect(), 128 | }, 129 | } 130 | } 131 | } 132 | 133 | impl Stage { 134 | pub fn is_follower(&self) -> bool { 135 | matches!(self, Stage::Follower) 136 | } 137 | 138 | pub fn is_observer(&self) -> bool { 139 | matches!(self, Stage::Observer) 140 | } 141 | 142 | pub fn is_pre_candidate(&self) -> bool { 143 | matches!(self, Stage::PreCandidate) 144 | } 145 | 146 | pub fn is_candidate(&self) -> bool { 147 | matches!(self, Stage::Candidate) 148 | } 149 | 150 | pub fn is_leader(&self) -> bool { 151 | matches!(self, Stage::Leader) 152 | } 153 | } 154 | 155 | impl Group { 156 | pub fn peer_by_id(&self, id: &T) -> Option<&Peer> 157 | where 158 | T: PartialEq, 159 | { 160 | self.peers.iter().find(|peer| peer.id == *id) 161 | } 162 | } 163 | 164 | impl ops::Index for Group { 165 | type Output = Peer; 166 | 167 | fn index(&self, index: T) -> &Peer { 168 | self.peer_by_id(&index).expect("peer missing") 169 | } 170 | } 171 | 172 | impl Peer { 173 | pub fn is_voter(&self) -> bool { 174 | matches!(self.role, Role::Voter { .. 
}) 175 | } 176 | 177 | /// Is a voter that is neither incoming or outgoing 178 | pub fn is_stable_voter(&self) -> bool { 179 | matches!( 180 | self.role, 181 | Role::Voter { 182 | transitioning: None 183 | } 184 | ) 185 | } 186 | 187 | pub fn is_incoming_voter(&self) -> bool { 188 | matches!( 189 | self.role, 190 | Role::Voter { 191 | transitioning: Some(Transitioning { 192 | direction: Direction::Incoming, 193 | .. 194 | }) 195 | } 196 | ) 197 | } 198 | 199 | pub fn is_outgoing_voter(&self) -> bool { 200 | matches!( 201 | self.role, 202 | Role::Voter { 203 | transitioning: Some(Transitioning { 204 | direction: Direction::Outgoing, 205 | .. 206 | }) 207 | } 208 | ) 209 | } 210 | } 211 | -------------------------------------------------------------------------------- /src/raft/src/lib.rs: -------------------------------------------------------------------------------- 1 | #[macro_use] 2 | mod transition; 3 | use transition::Step; 4 | 5 | mod candidate; 6 | use candidate::Candidate; 7 | 8 | mod config; 9 | pub use config::Config; 10 | 11 | mod driver; 12 | pub use driver::Driver; 13 | 14 | mod follower; 15 | use follower::Follower; 16 | 17 | mod group; 18 | use group::Group; 19 | 20 | mod index; 21 | pub use index::Index; 22 | 23 | pub mod info; 24 | pub use info::Info; 25 | 26 | mod leader; 27 | use leader::Leader; 28 | 29 | mod log; 30 | use log::Log; 31 | 32 | pub mod message; 33 | pub use message::Message; 34 | 35 | mod node; 36 | use node::Node; 37 | 38 | mod observer; 39 | use observer::Observer; 40 | 41 | mod peer; 42 | use peer::Peer; 43 | 44 | mod pos; 45 | pub use pos::Pos; 46 | 47 | mod raft; 48 | pub use raft::{ProposeError, Raft, Tick}; 49 | 50 | mod stage; 51 | use stage::Stage; 52 | 53 | mod term; 54 | pub use term::Term; 55 | 56 | mod votes; 57 | use votes::Votes; 58 | 59 | pub use anyhow::{Error, Result}; 60 | -------------------------------------------------------------------------------- /src/raft/src/log.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | 3 | use futures_lite::future::poll_fn; 4 | use std::io; 5 | use std::marker::PhantomData; 6 | 7 | pub(crate) struct Log { 8 | /// Index and term of highest log entry known to be committed. 9 | pub(crate) last_committed: Option, 10 | 11 | /// Index and term of last appended entry. 12 | pub(crate) last_appended: Option, 13 | 14 | _p: PhantomData, 15 | } 16 | 17 | impl Log { 18 | pub(crate) fn new() -> Log { 19 | Log { 20 | last_committed: None, 21 | last_appended: None, 22 | _p: PhantomData, 23 | } 24 | } 25 | 26 | /// Return the entry stored at the specified log position 27 | pub(crate) async fn get( 28 | &self, 29 | driver: &mut T, 30 | index: Index, 31 | ) -> io::Result>> { 32 | poll_fn(|cx| driver.poll_read_entry(cx, index)).await 33 | } 34 | 35 | pub(crate) fn last_appended(&self) -> Option { 36 | self.last_appended 37 | } 38 | 39 | /// Returns the index of the last appended entry 40 | pub(crate) fn last_appended_index(&self) -> Option { 41 | self.last_appended().map(|pos| pos.index) 42 | } 43 | 44 | pub(crate) fn last_committed(&self) -> Option { 45 | self.last_committed 46 | } 47 | 48 | pub(crate) fn last_committed_index(&self) -> Option { 49 | self.last_committed.map(|pos| pos.index) 50 | } 51 | 52 | /// Returns true if the log contains an entry matching the given pos. 53 | pub(crate) async fn contains_pos(&self, driver: &mut T, pos: Pos) -> io::Result { 54 | Ok(self.pos_for(driver, pos.index).await? 
== Some(pos)) 55 | } 56 | 57 | /// Returns the `Pos` (index, term) for the entry at the given index. 58 | pub(crate) async fn pos_for(&self, driver: &mut T, index: Index) -> io::Result> { 59 | let maybe_term = self.term_for(driver, index).await?; 60 | Ok(maybe_term.map(|term| Pos { term, index })) 61 | } 62 | 63 | /// Returns the term for the entry at the given index 64 | pub(crate) async fn term_for(&self, driver: &mut T, index: Index) -> io::Result> { 65 | poll_fn(|cx| driver.poll_term_for(cx, index)).await 66 | } 67 | 68 | /// Append the given entries to the log 69 | pub(crate) fn append_entries(&mut self, driver: &mut T, entries: &[message::Entry]) { 70 | assert!(!entries.is_empty()); 71 | 72 | // Verify entries are sequenced correctly 73 | for entry in entries { 74 | // Make sure terms are monotonically increasing 75 | assert!(entry.pos.term >= self.last_appended.map(|pos| pos.term).unwrap_or_default()); 76 | 77 | // Make sure the indexes are sequential 78 | assert_eq!( 79 | entry.pos.index, 80 | self.last_appended 81 | .map(|pos| pos.index + 1) 82 | .unwrap_or_default() 83 | ); 84 | 85 | self.last_appended = Some(entry.pos); 86 | } 87 | 88 | // Update last_appended 89 | driver.append_entries(entries); 90 | } 91 | 92 | /// Truncate the log at the given index. All **earlier** indices are kept. 93 | /// All after are removed. 94 | pub(crate) async fn truncate(&mut self, driver: &mut T, index: Index) -> io::Result<()> { 95 | debug_assert!( 96 | Some(index) > self.last_committed_index(), 97 | "truncated committed entry" 98 | ); 99 | driver.truncate(index); 100 | 101 | if index == 0 { 102 | self.last_appended = None; 103 | } else { 104 | self.last_appended = self.pos_for(driver, index - 1).await?; 105 | } 106 | 107 | Ok(()) 108 | } 109 | 110 | /// Attempts to commit the given index, returning `true` if successful. 111 | /// 112 | /// This is called once the index replication has reached quorum. 113 | pub(crate) async fn maybe_commit( 114 | &mut self, 115 | driver: &mut T, 116 | index: Index, 117 | current_term: Term, 118 | ) -> io::Result> { 119 | if let Some(last_committed) = self.last_committed { 120 | if index <= last_committed.index { 121 | return Ok(None); 122 | } 123 | } 124 | 125 | match self.pos_for(driver, index).await? { 126 | Some(pos) if pos.term == current_term => { 127 | self.commit(pos); 128 | Ok(Some(pos)) 129 | } 130 | _ => Ok(None), 131 | } 132 | } 133 | 134 | pub(crate) fn commit(&mut self, pos: Pos) { 135 | // The committed index is monotonically increasing and can never go 136 | // down. 137 | if self.last_committed_index() >= Some(pos.index) { 138 | return; 139 | } 140 | 141 | // Only commit an index if we have received the entries. 
142 | if Some(pos.index) > self.last_appended_index() { 143 | return; 144 | } 145 | 146 | self.last_committed = Some(pos); 147 | } 148 | 149 | pub(crate) async fn copy_range_to( 150 | &self, 151 | driver: &mut T, 152 | start: Index, 153 | end: Index, 154 | dst: &mut Vec>, 155 | ) -> io::Result<()> { 156 | debug_assert!(start <= end, "start={:?}; end={:?}", start, end); 157 | debug_assert!( 158 | Some(end) <= self.last_appended_index(), 159 | "copy_range_to out of range; end={:?}; last_appended={:?}", 160 | end, 161 | self.last_appended_index() 162 | ); 163 | 164 | poll_fn(|cx| driver.poll_read_entries(cx, start, end, dst)).await 165 | } 166 | 167 | pub(crate) async fn find_conflict( 168 | &self, 169 | driver: &mut T, 170 | mut index: Index, 171 | term: Term, 172 | ) -> io::Result> { 173 | let last_appended = self.last_appended_index(); 174 | 175 | assert!(Some(index) <= last_appended); 176 | 177 | loop { 178 | match self.pos_for(driver, index).await? { 179 | Some(pos) => { 180 | if pos.term > term { 181 | index -= 1; 182 | } else { 183 | return Ok(Some(pos)); 184 | } 185 | } 186 | None => { 187 | return Ok(None); 188 | } 189 | } 190 | } 191 | } 192 | } 193 | -------------------------------------------------------------------------------- /src/raft/src/message.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | 3 | #[derive(Clone, Debug, PartialEq, serde::Serialize)] 4 | pub struct Message { 5 | /// When and whom sent the message 6 | pub origin: Origin, 7 | 8 | /// Action to perform 9 | pub action: Action, 10 | } 11 | 12 | /// Message sender's context 13 | #[derive(Clone, Debug, PartialEq, serde::Serialize)] 14 | pub struct Origin { 15 | /// Identifier of node that sent the message 16 | pub id: T, 17 | 18 | /// Sender's term when the message was sent 19 | pub term: Term, 20 | } 21 | 22 | #[derive(Clone, Debug, PartialEq, serde::Serialize)] 23 | pub enum Action { 24 | /// Before a node becomes a leader candidate, it must engage in a pre-vote 25 | /// round. This helps ensure liveliness in the face of network partitions. 26 | /// 27 | /// https://decentralizedthoughts.github.io/2020-12-12-raft-liveness-full-omission/ 28 | PreVote(Vote), 29 | PreVoteResponse(VoteResponse), 30 | Vote(Vote), 31 | VoteResponse(VoteResponse), 32 | AppendEntries(AppendEntries), 33 | AppendEntriesResponse(AppendEntriesResponse), 34 | } 35 | 36 | #[derive(Clone, Debug, PartialEq, serde::Serialize)] 37 | pub struct Vote { 38 | /// Position of the candidate's last log entry 39 | pub last_log_pos: Option, 40 | } 41 | 42 | #[derive(Clone, Debug, PartialEq, serde::Serialize)] 43 | pub struct VoteResponse { 44 | pub granted: bool, 45 | } 46 | 47 | #[derive(Clone, Debug, PartialEq, serde::Serialize)] 48 | pub struct AppendEntries { 49 | /// Position of the entry in the log (term, index) immedietly preceeding the 50 | /// entries included in this message. 51 | pub prev_log_pos: Option, 52 | 53 | /// Entries to append 54 | pub entries: Vec>, 55 | 56 | /// Index of highest log entry know to be committed **by the leader** 57 | pub leader_committed_index: Option, 58 | } 59 | 60 | #[derive(Clone, Debug, PartialEq, serde::Serialize)] 61 | pub enum AppendEntriesResponse { 62 | /// Successfully accepted the append 63 | Success { 64 | /// Position of last appended entry 65 | last_log_pos: Pos, 66 | // / Index of the last committed log entry 67 | // committed: Index, 68 | }, 69 | 70 | /// Attempting to append the entries resulted in a conflict due to diverging 71 | /// logs. 
72 | Conflict { 73 | /// The index of the rejected message 74 | rejected: Index, 75 | 76 | /// A hint for the leader indicating where the follower thinks they may 77 | /// have diverged. 78 | hint: Option, 79 | // / Index of the last committed log entry 80 | // committed: Index, 81 | }, 82 | 83 | /// The peer rejected the entries because the leader has fallen out of date. 84 | Reject, 85 | } 86 | 87 | #[derive(Clone, Debug, PartialEq, serde::Serialize)] 88 | pub struct Entry { 89 | /// Position of the entry in the log (term, index) 90 | pub pos: Pos, 91 | 92 | /// Entry value, either arbitrary data or a configuration change. 93 | pub value: Value, 94 | } 95 | 96 | #[derive(Clone, Debug, PartialEq, serde::Serialize)] 97 | pub enum Value { 98 | /// An opaque value 99 | Data(Vec), 100 | 101 | /// A Raft group configuration change 102 | Config(ConfigChange), 103 | 104 | /// Used to puncutate the start of each new term. 105 | NewTerm, 106 | } 107 | 108 | #[derive(Clone, Debug, PartialEq, serde::Serialize)] 109 | pub enum ConfigChange { 110 | /// Initializes the group. This sets the first leader of the group. 111 | InitGroup { 112 | /// Identifier of the first node in the group, which starts as the 113 | /// defacto leader. 114 | id: T, 115 | }, 116 | 117 | /// Add a new node to the Raft group. 118 | /// 119 | /// The node starts as a non-voting observer. 120 | AddNode { 121 | /// Identifier of node being added 122 | id: T, 123 | 124 | /// When true, the node is automatically upgraded to follower once it 125 | /// has synchronized its log. 126 | auto_upgrade: bool, 127 | }, 128 | /// Upgrade a node from observer to follower. 129 | UpgradeNode { 130 | /// Identifier of the node being upgraded 131 | id: T, 132 | 133 | /// Upgrade phase. Changes to the group's voter set requires a 134 | /// two-phased approach. 135 | phase: Phase, 136 | }, 137 | /// Remove a node from the group 138 | RemoveNode { 139 | /// Identifier of the node being removed 140 | id: T, 141 | 142 | /// Removal phase. Changes to the group's voter set requires a 143 | /// two-phased approach. 144 | phase: Phase, 145 | }, 146 | } 147 | 148 | #[derive(Clone, Debug, PartialEq, serde::Serialize)] 149 | pub enum Phase { 150 | One, 151 | Two, 152 | } 153 | 154 | impl Value { 155 | pub fn data(data: impl Into>) -> Value { 156 | Value::Data(data.into()) 157 | } 158 | 159 | pub fn init_group(id: T) -> Value { 160 | Value::Config(message::ConfigChange::InitGroup { id }) 161 | } 162 | 163 | pub fn add_node(id: T) -> Value { 164 | Value::Config(message::ConfigChange::AddNode { 165 | id, 166 | auto_upgrade: false, 167 | }) 168 | } 169 | 170 | pub fn add_node_auto_upgrade(id: T) -> Value { 171 | Value::Config(message::ConfigChange::AddNode { 172 | id, 173 | auto_upgrade: true, 174 | }) 175 | } 176 | 177 | pub fn upgrade_node_phase_one(id: T) -> Value { 178 | Value::Config(message::ConfigChange::UpgradeNode { 179 | id, 180 | phase: Phase::One, 181 | }) 182 | } 183 | 184 | pub fn upgrade_node_phase_two(id: T) -> Value { 185 | Value::Config(message::ConfigChange::UpgradeNode { 186 | id, 187 | phase: Phase::Two, 188 | }) 189 | } 190 | 191 | pub fn remove_node_phase_one(id: T) -> Value { 192 | Value::Config(message::ConfigChange::RemoveNode { 193 | id, 194 | phase: Phase::One, 195 | }) 196 | } 197 | 198 | pub fn remove_node_phase_two(id: T) -> Value { 199 | Value::Config(message::ConfigChange::RemoveNode { 200 | id, 201 | phase: Phase::Two, 202 | }) 203 | } 204 | 205 | /// Proposed by the leader at the start of each term. 
206 | pub fn new_term() -> Value { 207 | Value::NewTerm 208 | } 209 | 210 | /// Returns true if the value is a configuration change 211 | pub fn is_config_change(&self) -> bool { 212 | matches!(self, Value::Config(..)) 213 | } 214 | 215 | /// Returns true if the value is user supplied data 216 | pub fn is_data(&self) -> bool { 217 | matches!(self, Value::Data(..)) 218 | } 219 | 220 | /// Returns true if the value is an internal `NewTerm` entry 221 | pub fn is_new_term(&self) -> bool { 222 | matches!(self, Value::NewTerm) 223 | } 224 | } 225 | 226 | impl AppendEntries { 227 | pub fn prev_log_index(&self) -> Option { 228 | self.prev_log_pos.map(|pos| pos.index) 229 | } 230 | 231 | pub fn to_message(self, id: T, term: Term) -> Message { 232 | Message { 233 | origin: Origin { id, term }, 234 | action: Action::AppendEntries(self), 235 | } 236 | } 237 | } 238 | 239 | impl AppendEntriesResponse { 240 | pub fn to_message(self, id: T, term: Term) -> Message { 241 | Message { 242 | origin: Origin { id, term }, 243 | action: Action::AppendEntriesResponse(self), 244 | } 245 | } 246 | } 247 | 248 | impl Vote { 249 | pub fn to_prevote_message(self, id: T, term: Term) -> Message { 250 | Message { 251 | origin: Origin { id, term }, 252 | action: Action::PreVote(self), 253 | } 254 | } 255 | 256 | pub fn to_message(self, id: T, term: Term) -> Message { 257 | Message { 258 | origin: Origin { id, term }, 259 | action: Action::Vote(self), 260 | } 261 | } 262 | } 263 | 264 | impl VoteResponse { 265 | pub fn to_prevote_message(self, id: T, term: Term) -> Message { 266 | Message { 267 | origin: Origin { id, term }, 268 | action: Action::PreVoteResponse(self), 269 | } 270 | } 271 | 272 | pub fn to_message(self, id: T, term: Term) -> Message { 273 | Message { 274 | origin: Origin { id, term }, 275 | action: Action::VoteResponse(self), 276 | } 277 | } 278 | } 279 | 280 | impl ConfigChange { 281 | pub fn id(&self) -> &T { 282 | use ConfigChange::*; 283 | 284 | match self { 285 | InitGroup { id: leader } => leader, 286 | AddNode { id, .. } => id, 287 | UpgradeNode { id, .. } => id, 288 | RemoveNode { id, .. } => id, 289 | } 290 | } 291 | } 292 | -------------------------------------------------------------------------------- /src/raft/src/node.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | 3 | use std::io; 4 | use tokio::time::{Duration, Instant}; 5 | 6 | /// State and methods common to all nodes in Raft group. 7 | pub(crate) struct Node { 8 | /// Per-node configuration. 9 | pub(crate) config: Config, 10 | 11 | /// Handle to the driver, used to issue outbound requests, persist data to 12 | /// disk, ... 13 | pub(crate) driver: T, 14 | 15 | /// The current election term. 16 | pub(crate) term: Term, 17 | 18 | /// Tracks nodes in the raft group 19 | pub(crate) group: Group, 20 | 21 | /// Log of entries received by this node. 22 | pub(crate) log: Log, 23 | 24 | /// Index of the last log entry applied to the local state machine 25 | pub(crate) last_applied: Option, 26 | 27 | /// Last known time 28 | pub(crate) now: Instant, 29 | } 30 | 31 | /// Result of `append_entries` operation. 32 | #[derive(Debug)] 33 | pub(crate) struct AppendEntries { 34 | /// True if an appended log entry is a group configuration change that 35 | /// applies to the current node. 36 | pub(crate) did_configure_self: bool, 37 | 38 | /// The number of entries appended. This can be lower than the number of 39 | /// provided entries if an an invalid entry is encountered. 
40 | pub(crate) num_appended: usize, 41 | } 42 | 43 | impl Node { 44 | pub(crate) fn new(driver: T, config: Config, now: Instant) -> Node { 45 | Node { 46 | config, 47 | driver, 48 | term: Term::default(), 49 | group: Group::new(), 50 | log: Log::new(), 51 | last_applied: None, 52 | now, 53 | } 54 | } 55 | 56 | /// Returns true if the current node believes itself to be the leader 57 | pub(crate) fn is_leader(&self) -> bool { 58 | Some(&self.config.id) == self.group.leader.as_ref() 59 | } 60 | 61 | /// Returns the position of the next appended entry in the current term. 62 | pub(crate) fn next_pos(&self) -> Pos { 63 | let index = self 64 | .log 65 | .last_appended_index() 66 | .map(|index| index + 1) 67 | .unwrap_or_default(); 68 | 69 | Pos { 70 | term: self.term, 71 | index, 72 | } 73 | } 74 | 75 | /// Returns `true` if the node has reached the `uncommitted_entries` limit. 76 | pub(crate) fn is_max_uncommitted_entries(&self) -> bool { 77 | match self.config.max_uncommitted_entries { 78 | Some(max_uncommitted) => { 79 | let num_appended = self 80 | .log 81 | .last_appended_index() 82 | .map(|index| index.0 + 1) 83 | .unwrap_or_default(); 84 | 85 | let num_committed = self 86 | .log 87 | .last_committed_index() 88 | .map(|index| index.0 + 1) 89 | .unwrap_or_default(); 90 | 91 | let num_uncommitted = num_appended.checked_sub(num_committed).unwrap_or_default(); 92 | 93 | num_uncommitted >= max_uncommitted 94 | } 95 | None => false, 96 | } 97 | } 98 | 99 | /// Returns `true` if any of the entries are configuration changes that 100 | /// apply to the **current** node. 101 | pub(crate) fn append_entries( 102 | &mut self, 103 | mut entries: &[message::Entry], 104 | ) -> AppendEntries { 105 | let mut ret = AppendEntries { 106 | did_configure_self: false, 107 | num_appended: entries.len(), 108 | }; 109 | 110 | // Immediately apply group configuration changes. This happens *before* 111 | // the entry is committed, which means it is possible that the change 112 | // will have to be reversed if the leader crashes before the entry is 113 | // committed. 114 | for (i, entry) in entries.iter().enumerate() { 115 | if let message::Value::Config(config_change) = &entry.value { 116 | if self 117 | .group 118 | .apply_config_change(&self.log, entry.pos, config_change, self.now) 119 | .is_ok() 120 | { 121 | if *config_change.id() == self.config.id { 122 | ret.did_configure_self |= true; 123 | } 124 | } else { 125 | // The configuration change is invalid. 
Let's **not** apply 126 | // this entry 127 | entries = &entries[..i]; 128 | ret.num_appended = i; 129 | 130 | break; 131 | } 132 | } 133 | } 134 | 135 | if !entries.is_empty() { 136 | self.log.append_entries(&mut self.driver, entries); 137 | } 138 | 139 | ret 140 | } 141 | 142 | /// Copies committed entries to the 143 | pub async fn copy_committed_entries_to( 144 | &mut self, 145 | dst: &mut Vec>, 146 | ) -> io::Result<()> { 147 | let start = self 148 | .last_applied 149 | .map(|pos| pos.index + 1) 150 | .unwrap_or_default(); 151 | 152 | let end = match self.log.last_committed { 153 | Some(pos) if pos.index >= start => pos.index, 154 | _ => return Ok(()), 155 | }; 156 | 157 | self.log 158 | .copy_range_to(&mut self.driver, start, end, dst) 159 | .await 160 | } 161 | 162 | pub(crate) fn applied_to(&mut self, pos: Pos) { 163 | let pos = Some(pos); 164 | 165 | assert!(pos <= self.log.last_committed); 166 | 167 | if pos <= self.last_applied { 168 | return; 169 | } 170 | 171 | self.last_applied = pos; 172 | } 173 | 174 | pub(crate) fn increment_term(&mut self) { 175 | self.set_term(self.term + 1, None); 176 | } 177 | 178 | pub(crate) fn set_term(&mut self, term: Term, leader: Option) { 179 | assert!(term > self.term); 180 | 181 | self.group.leader = leader; 182 | self.term = term; 183 | } 184 | 185 | /// Calculates when the next election campaign should start 186 | pub(crate) fn campaign_at(&mut self) -> Instant { 187 | let min = self.config.min_election_interval; 188 | let max = self.config.max_election_interval; 189 | 190 | let delay = Duration::from_millis(self.driver.rand_election_timeout( 191 | min.as_millis().try_into().expect("duration too big"), 192 | max.as_millis().try_into().expect("duration too big"), 193 | )); 194 | 195 | assert!(delay >= self.config.min_election_interval); 196 | assert!(delay <= self.config.max_election_interval); 197 | 198 | self.now + delay 199 | } 200 | 201 | pub(crate) fn before(&self, when: Instant) -> bool { 202 | self.now < when 203 | } 204 | } 205 | -------------------------------------------------------------------------------- /src/raft/src/observer.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | 3 | use std::marker::PhantomData; 4 | use std::{cmp, io}; 5 | 6 | /// All Raft nodes that are not bootstrapping the group start as an observer. 7 | /// This lets them catch up with the current log. 8 | pub(crate) struct Observer { 9 | _p: PhantomData, 10 | } 11 | 12 | /// Returned by `receive_append_entries` 13 | pub(crate) enum ReceiveAppendEntries { 14 | /// The message is outdated or has been discarded for some other reason. 15 | Discard, 16 | 17 | /// The entries have been appended 18 | Appended, 19 | 20 | /// The entries have been appended and include a configuration change that 21 | /// applies to the current node. 22 | AppendedConfig, 23 | } 24 | 25 | impl Observer { 26 | /// Initialize the observer state. 27 | /// 28 | /// The `leader` identifies the Raft group leader. This is where the request 29 | /// to join the group is sent to. 30 | pub(crate) fn new() -> Observer { 31 | Observer { _p: PhantomData } 32 | } 33 | 34 | /// Returns `true` if any entries contain configuration changes that apply 35 | /// to the current node. 
36 | pub(crate) async fn receive_append_entries( 37 | &mut self, 38 | node: &mut Node, 39 | origin: &message::Origin, 40 | append_entries: message::AppendEntries, 41 | ) -> io::Result { 42 | use message::AppendEntriesResponse::*; 43 | use ReceiveAppendEntries::*; 44 | 45 | if origin.term > node.term { 46 | // We need to increment our term. The leader field will be set if we 47 | // accept the message. 48 | node.set_term(origin.term, None); 49 | } else if origin.term < node.term { 50 | // Let the peer know it has fallen out of date. 51 | node.driver.dispatch( 52 | origin.id.clone(), 53 | Reject.to_message(node.config.id.clone(), node.term), 54 | ); 55 | 56 | // The message is discarded 57 | return Ok(Discard); 58 | } 59 | 60 | // First, perform some basic validation of the message. 61 | match append_entries.prev_log_pos { 62 | Some(pos) => { 63 | // The origin term should be greater than the prev_log_pos term. 64 | // Reject messsages for which this isn't true. 65 | if pos.term > origin.term { 66 | return Ok(Discard); 67 | } else if let Some(entry) = append_entries.entries.first() { 68 | if entry.pos <= pos { 69 | return Ok(Discard); 70 | } 71 | } 72 | } 73 | None => { 74 | if let Some(entry) = append_entries.entries.first() { 75 | if entry.pos.index > 0 { 76 | return Ok(Discard); 77 | } 78 | } 79 | } 80 | } 81 | 82 | // Check that the message will not truncate committed entries 83 | if append_entries.prev_log_index() < node.log.last_committed_index() { 84 | return Ok(Discard); 85 | } 86 | 87 | // Check the first entry only as we will apply any that are valid 88 | if let Some(entry) = append_entries.entries.first() { 89 | if let message::Value::Config(config_change) = &entry.value { 90 | if !node.group.is_valid_config_change(entry.pos, config_change) { 91 | return Ok(Discard); 92 | } 93 | } 94 | } 95 | 96 | if let Some(leader) = &node.group.leader { 97 | if origin.id != *leader { 98 | return Ok(Discard); 99 | } 100 | } else { 101 | // The message origin matches the current term, but this node has 102 | // not yet observed a leader. 103 | node.group.leader = Some(origin.id.clone()); 104 | } 105 | 106 | // At this point, the message origin should be the currently known leader. 107 | assert_eq!(Some(&origin.id), node.group.leader.as_ref()); 108 | 109 | // Sanity check: the entry terms should be less than or equal to the 110 | // message term. 111 | if let Some(entry) = append_entries.entries.last() { 112 | if entry.pos.term > origin.term { 113 | // Something is wrong, reject it 114 | return Ok(Discard); 115 | } 116 | } 117 | 118 | // We may already have logged some entries included in this 119 | // AppendEntries message. If this is the case, we will skip appending 120 | // these messages. 121 | let offset; 122 | 123 | // Check that the received entries are sequenced immediately after the 124 | // last entry currently appended to our log. 125 | // if let Some(prev_log_pos) = append_entries.prev_log_pos { 126 | let last_appended = node.log.last_appended(); 127 | 128 | match append_entries.prev_log_pos { 129 | // If the message's `prev_log_pos` matches our last appended entry 130 | // position, then the received entries are sequenced at the end of 131 | // our log. 132 | pos if pos == last_appended => { 133 | offset = 0; 134 | } 135 | // Our log **does not** contain the entry immediately preceeding the 136 | // received entries. This means our log has divered from the 137 | // leader's. We need to re-synchronized before resuming replication. 
138 | Some(prev_log_pos) 139 | if !node 140 | .log 141 | .contains_pos(&mut node.driver, prev_log_pos) 142 | .await? => 143 | { 144 | // Compute a hint for the leader to speed up synchronizing 145 | // replication 146 | let hint = if let Some(last_appended) = last_appended { 147 | let hint = node 148 | .log 149 | .find_conflict( 150 | &mut node.driver, 151 | cmp::min(prev_log_pos.index, last_appended.index), 152 | prev_log_pos.term, 153 | ) 154 | .await?; 155 | 156 | // The `last_committed_index` value cannot be used as we are 157 | // not synchronized with the leader's log. 158 | 159 | assert!(hint.is_some()); 160 | hint 161 | } else { 162 | None 163 | }; 164 | 165 | node.driver.dispatch( 166 | origin.id.clone(), 167 | Conflict { 168 | rejected: prev_log_pos.index, 169 | hint, 170 | } 171 | .to_message(node.config.id.clone(), node.term), 172 | ); 173 | 174 | // Even though we didn't actually append any entries, we still 175 | // accepted the message as from a valid leader. 176 | return Ok(Appended); 177 | } 178 | // While our log contains the entry **immediately** preceeding the 179 | // received entries, it also has entries after that one. This means 180 | // we either received duplicate entries **or** our log has diverged 181 | // from the leader. We need to figure out which one it is. 182 | _ => { 183 | // If the loop completes, then we already have all entries and 184 | // should discard them all. 185 | let mut off = append_entries.entries.len(); 186 | 187 | for (i, entry) in append_entries.entries.iter().enumerate() { 188 | // Lookup the term for the entry at the same index in our log. 189 | let our_term = node.log.term_for(&mut node.driver, entry.pos.index).await?; 190 | 191 | if Some(entry.pos.term) == our_term { 192 | // If the terms match, then we already have the entry in our log. 193 | // 194 | // When debugging, lets check that the entries are actually the same 195 | debug_assert_eq!( 196 | Some(entry), 197 | node.log 198 | .get(&mut node.driver, entry.pos.index) 199 | .await? 200 | .as_ref() 201 | ); 202 | } else { 203 | // We found a conflict 204 | node.log.truncate(&mut node.driver, entry.pos.index).await?; 205 | off = i; 206 | break; 207 | } 208 | } 209 | 210 | offset = off; 211 | } 212 | } 213 | 214 | let mut ret = Appended; 215 | 216 | // Grab the entries that are new to us. 217 | let entries = &append_entries.entries[offset..]; 218 | 219 | // If there are no entries, this is heart beat message. 220 | let last_synced = if !entries.is_empty() { 221 | // Append the entries. This also applies any received Raft group 222 | // configuration changes. Raft group configuration changes are applied 223 | // **before** they are committed. 224 | let append_entries = node.append_entries(entries); 225 | 226 | if append_entries.did_configure_self { 227 | ret = AppendedConfig; 228 | } 229 | 230 | if append_entries.num_appended == 0 { 231 | // Let the peer know it has fallen out of date. 232 | return Ok(Discard); 233 | } 234 | 235 | entries.last().map(|entry| entry.pos) 236 | } else { 237 | append_entries.prev_log_pos 238 | }; 239 | 240 | // Cannot commit past the last log entry that has been synchronized. 241 | let last_committed_index = cmp::min( 242 | last_synced.map(|pos| pos.index), 243 | append_entries.leader_committed_index, 244 | ); 245 | 246 | // Update the committed index to the index specified in the message 247 | if let Some(leader_committed) = last_committed_index { 248 | // If we have not yet received log entries, then there is nothing for us to do. 
249 | if let Some(last_appended) = node.log.last_appended() { 250 | // Because we just ensured that this node's log is synchronized 251 | // with the leader., if `leader_committed_index` is past our 252 | // log's last appended index, it is safe to commit to 253 | // `last_appended`. 254 | if leader_committed >= last_appended.index { 255 | node.log.commit(last_appended); 256 | } else { 257 | // Need to get the term if the message to commit it 258 | let maybe_pos = node.log.pos_for(&mut node.driver, leader_committed).await?; 259 | 260 | if let Some(pos) = maybe_pos { 261 | node.log.commit(pos); 262 | } 263 | } 264 | } 265 | } 266 | 267 | let last_log_pos = cmp::min(last_synced, node.log.last_appended()).unwrap_or_default(); 268 | 269 | // Send ACK 270 | node.driver.dispatch( 271 | origin.id.clone(), 272 | Success { last_log_pos }.to_message(node.config.id.clone(), node.term), 273 | ); 274 | 275 | Ok(ret) 276 | } 277 | } 278 | -------------------------------------------------------------------------------- /src/raft/src/peer.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | 3 | use std::{fmt, io}; 4 | use tokio::time::Instant; 5 | 6 | pub(crate) struct Peer { 7 | /// Node identifier 8 | pub(crate) id: T::Id, 9 | 10 | /// What role the peer is playing in the raft group 11 | pub(crate) role: Role, 12 | 13 | /// Index of the *next* log entry to send to this peer 14 | pub(crate) next_idx: Index, 15 | 16 | /// Highest log entry known to be replicated with this peer. 17 | pub(crate) matched: Option, 18 | 19 | /// `Instant` representing the when we **last** heard from the peer. 20 | /// 21 | /// This is used by the leader to detect if it has become separated from the 22 | /// group. A connected leader should receive `AppendEntriesResponse` 23 | /// messages from all other group members at an interval. If those 24 | /// `AppendEntriesResponse` **stop** arriving, the leader will step down. 25 | pub(crate) last_seen_at: Instant, 26 | 27 | /// Tracks the current replication state with the peer. 28 | pub(crate) replication: ReplicationState, 29 | } 30 | 31 | #[derive(Debug)] 32 | pub(crate) enum Role { 33 | /// The node is only observing the Raft group. It receives new log entries 34 | /// but does not participate in voting. 35 | Observer { 36 | /// When set, the node should automatically be upgraded to follower when 37 | /// it has synchronized the log up to the given point. 38 | auto_upgrade: Option, 39 | }, 40 | 41 | /// The node is a voter. If the Raft group is in "joint consensus" mode, 42 | /// this node exists in both the "incoming" and the "outgoing" 43 | /// configuration. 44 | Voter, 45 | 46 | /// The Raft group is in "joint consensus" mode and the current node is 47 | /// being **added** to the group. 48 | VoterIncoming { 49 | /// Adding a voter is a two-phased approach. The first phase needs to be 50 | /// committed before starting the second phase. The `phase_two_at` is 51 | /// the log position that must be committed before initiating phase two. 52 | phase_two_at: Pos, 53 | }, 54 | 55 | /// The Raft group is in "joint consensus" mode and the curent node is being 56 | /// **removed** from the group. 57 | VoterOutgoing { phase_two_at: Pos }, 58 | } 59 | 60 | #[derive(Debug)] 61 | pub(crate) enum ReplicationState { 62 | /// The peer has just been initialized and is neither probing or 63 | /// replicating. This usually means we know nothing about the peer and will 64 | /// start by sending the last entry. 
65 | Init, 66 | 67 | /// We are not synced with the peer and are in the process of finding the 68 | /// most recent log entry that we have in common. This is done by sending 69 | /// empty `AppendEntry` responses until receiving a successful response. 70 | Probe, 71 | 72 | /// We are currently synced with the peer and are sending log entries. 73 | Replicate, 74 | } 75 | 76 | impl Peer { 77 | pub(crate) fn new(id: T::Id, role: peer::Role, next_idx: Index, now: Instant) -> Peer { 78 | Peer { 79 | id, 80 | role, 81 | next_idx, 82 | matched: None, 83 | last_seen_at: now, 84 | /// Always start by probing as we do not know the current state of 85 | /// the peer's log. 86 | replication: ReplicationState::Init, 87 | } 88 | } 89 | 90 | /// Returns true if the peer can vote. 91 | pub(crate) fn is_voter(&self) -> bool { 92 | matches!( 93 | self.role, 94 | Role::Voter | Role::VoterIncoming { .. } | Role::VoterOutgoing { .. } 95 | ) 96 | } 97 | 98 | /// Returns true if the peer is in the incoming set during joint-consensus. 99 | /// 100 | /// This method will always return `true` if the group is in joint-consensus 101 | /// mode. 102 | pub(crate) fn is_incoming_voter(&self) -> bool { 103 | matches!(self.role, Role::Voter | Role::VoterIncoming { .. }) 104 | } 105 | 106 | pub(crate) fn is_outgoing_voter(&self) -> bool { 107 | matches!(self.role, Role::Voter | Role::VoterOutgoing { .. }) 108 | } 109 | 110 | /// Returns true if the peer is an observer 111 | pub(crate) fn is_observer(&self) -> bool { 112 | matches!(self.role, Role::Observer { .. }) 113 | } 114 | 115 | /// Returns true if the peer can be removed in its current state. 116 | pub(crate) fn is_removable(&self) -> bool { 117 | // A node can be removed if it is a voter **or** it is an observer that 118 | // is not going to auto upgrade. 119 | matches!( 120 | self.role, 121 | Role::Voter | Role::Observer { auto_upgrade: None } 122 | ) 123 | } 124 | 125 | /// Called by the leader when it has synced replication with the peer. 126 | pub(crate) fn synced(&mut self) { 127 | if !self.replication.is_replicating() { 128 | self.replication = ReplicationState::Replicate; 129 | self.next_idx = self.matched.map(|index| index + 1).unwrap_or_default(); 130 | } 131 | } 132 | 133 | /// Send an `AppendEntries` message to the peer 134 | pub(crate) async fn send_append_entries( 135 | &mut self, 136 | driver: &mut T, 137 | log: &Log, 138 | heartbeat: bool, 139 | origin: message::Origin, 140 | ) -> io::Result<()> { 141 | let last_appended = log.last_appended(); 142 | let mut entries = vec![]; 143 | 144 | let prev_log_pos = if let Some(last_appended) = last_appended { 145 | // Log entries are only included when we are synced with the peer 146 | // and we are not sending a heartbeat. 147 | if !self.replication.is_probing() && !heartbeat { 148 | // Copy entries within the range 149 | // Sometimes, `next_idx` is set past last appended 150 | // 151 | // TODO: test this comparison 152 | if self.next_idx <= last_appended.index { 153 | log.copy_range_to(driver, self.next_idx, last_appended.index, &mut entries) 154 | .await?; 155 | } 156 | 157 | if entries.is_empty() { 158 | return Ok(()); 159 | } 160 | } 161 | 162 | // If `next_idx` is zero, then this is the first entry being sent. 163 | // In this case, `prev_log_pos` is `None`. 164 | if self.next_idx == 0 { 165 | None 166 | } else { 167 | log.pos_for(driver, self.next_idx - 1).await? 
168 | } 169 | } else { 170 | if self.replication.is_replicating() && !heartbeat { 171 | // Nothing to send here 172 | return Ok(()); 173 | } 174 | 175 | None 176 | }; 177 | 178 | // Only optimistically update `next_idx` when in the replicating state 179 | if self.replication.is_replicating() { 180 | // Track which index should be sent to the peer next. 181 | self.next_idx += entries.len(); 182 | } 183 | 184 | // Get the last committed index visible to the leader 185 | let leader_committed_index = log.last_committed().map(|pos| pos.index); 186 | 187 | driver.dispatch( 188 | self.id.clone(), 189 | Message { 190 | origin, 191 | action: message::Action::AppendEntries(message::AppendEntries { 192 | prev_log_pos, 193 | entries, 194 | leader_committed_index, 195 | }), 196 | }, 197 | ); 198 | 199 | // If in the "initial" state, we have no idea what the peer's log is 200 | // currently at. We send a message and transition to "probing". This 201 | // only happens when a leader is newly promoted. 202 | if self.replication.is_init() { 203 | self.replication = ReplicationState::Probe; 204 | } 205 | 206 | Ok(()) 207 | } 208 | 209 | pub(crate) async fn receive_append_entries_conflict( 210 | &mut self, 211 | driver: &mut T, 212 | log: &Log, 213 | now: Instant, 214 | rejected: Index, 215 | hint: Option, 216 | origin: message::Origin, 217 | ) -> io::Result<()> { 218 | // Track that we have recently seen the node 219 | self.last_seen_at = now; 220 | 221 | // If the follower has an uncommitted log tail, we would end up 222 | // probing one by one until we hit the common prefix. 223 | // 224 | // For example, if the leader has: 225 | // 226 | // idx 1 2 3 4 5 6 7 8 9 227 | // ----------------- 228 | // term (L) 1 3 3 3 5 5 5 5 5 229 | // term (F) 1 1 1 1 2 2 230 | // 231 | // Then, after sending an append anchored at (idx=9,term=5) we 232 | // would receive a RejectHint of 6 and LogTerm of 2. Without the 233 | // code below, we would try an append at index 6, which would 234 | // fail again. 235 | // 236 | // However, looking only at what the leader knows about its own 237 | // log and the rejection hint, it is clear that a probe at index 238 | // 6, 5, 4, 3, and 2 must fail as well: 239 | // 240 | // For all of these indexes, the leader's log term is larger 241 | // than the rejection's log term. If a probe at one of these 242 | // indexes succeeded, its log term at that index would match the 243 | // leader's, i.e. 3 or 5 in this example. But the follower 244 | // already told the leader that it is still at term 2 at index 245 | // 9, and since the log term only ever goes up (within a log), 246 | // this is a contradiction. 247 | // 248 | // At index 1, however, the leader can draw no such conclusion, 249 | // as its term 1 is not larger than the term 2 from the 250 | // follower's rejection. We thus probe at 1, which will succeed 251 | // in this example. In general, with this approach we probe at 252 | // most once per term found in the leader's log. 253 | // 254 | // There is a similar mechanism on the follower (implemented in 255 | // handleAppendEntries via a call to findConflictByTerm) that is 256 | // useful if the follower has a large divergent uncommitted log 257 | // tail[1], as in this example: 258 | // 259 | // idx 1 2 3 4 5 6 7 8 9 260 | // ----------------- 261 | // term (L) 1 3 3 3 3 3 3 3 7 262 | // term (F) 1 3 3 4 4 5 5 5 6 263 | // 264 | // Naively, the leader would probe at idx=9, receive a rejection 265 | // revealing the log term of 6 at the follower. 
Since the 266 | // leader's term at the previous index is already smaller than 267 | // 6, the leader- side optimization discussed above is 268 | // ineffective. The leader thus probes at index 8 and, naively, 269 | // receives a rejection for the same index and log term 5. 270 | // Again, the leader optimization does not improve over linear 271 | // probing as term 5 is above the leader's term 3 for that and 272 | // many preceding indexes; the leader would have to probe 273 | // linearly until it would finally hit index 3, where the probe 274 | // would succeed. 275 | // 276 | // Instead, we apply a similar optimization on the follower. 277 | // When the follower receives the probe at index 8 (log term 3), 278 | // it concludes that all of the leader's log preceding that 279 | // index has log terms of 3 or below. The largest index in the 280 | // follower's log with a log term of 3 or below is index 3. The 281 | // follower will thus return a rejection for index=3, log term=3 282 | // instead. The leader's next probe will then succeed at that 283 | // index. 284 | // 285 | // [1]: more precisely, if the log terms in the large 286 | // uncommitted tail on the follower are larger than the 287 | // leader's. At first, it may seem unintuitive that a follower 288 | // could even have such a large tail, but it can happen: 289 | // 290 | // 1. Leader appends (but does not commit) entries 2 and 3, 291 | // crashes. 292 | // idx 1 2 3 4 5 6 7 8 9 293 | // ----------------- 294 | // term (L) 1 2 2 [crashes] 295 | // term (F) 1 296 | // term (F) 1 297 | // 298 | // 2. a follower becomes leader and appends entries at term 3. 299 | // ----------------- 300 | // term (x) 1 2 2 [down] 301 | // term (F) 1 3 3 3 3 302 | // term (F) 1 303 | // 304 | // 3. term 3 leader goes down, term 2 leader returns as term 4 305 | // leader. It commits the log & entries at term 4. 306 | // 307 | // ----------------- 308 | // term (L) 1 2 2 2 309 | // term (x) 1 3 3 3 3 [down] 310 | // term (F) 1 311 | // ----------------- 312 | // term (L) 1 2 2 2 4 4 4 313 | // term (F) 1 3 3 3 3 [gets probed] 314 | // term (F) 1 2 2 2 4 4 4 315 | // 316 | // 4. the leader will now probe the returning follower at index 317 | // 7, the rejection points it at the end of the follower's 318 | // log which is at a higher log term than the actually 319 | // committed log. 320 | // 321 | // (comment from raft-rs) 322 | let next_probe = match hint { 323 | Some(pos) if pos.term > 0 => log 324 | .find_conflict(driver, pos.index, pos.term) 325 | .await? 326 | .map(|pos| pos.index + 1) 327 | .unwrap_or_default(), 328 | Some(pos) => pos.index + 1, 329 | None => Index(0), 330 | }; 331 | 332 | if self.decr_next_index(rejected, next_probe) { 333 | self.send_append_entries(driver, log, false, origin).await?; 334 | } 335 | 336 | Ok(()) 337 | } 338 | 339 | /// Upgrade the peer from an observer to a follower. 340 | pub(crate) fn upgrade_to_follower(&mut self, phase_two_at: Pos) { 341 | // This should have already been checked 342 | assert!(self.is_observer()); 343 | 344 | // A node isn't immediately upgraded to a follower. Upgrading is 345 | // a two phased process. 
346 | self.role = peer::Role::VoterIncoming { phase_two_at }; 347 | } 348 | 349 | pub(crate) fn maybe_propose_upgrade_phase_2( 350 | &mut self, 351 | pos: Pos, 352 | ) -> Option> { 353 | use Role::*; 354 | 355 | match self.role { 356 | VoterIncoming { phase_two_at } if phase_two_at <= pos => { 357 | Some(message::Value::upgrade_node_phase_two(self.id.clone())) 358 | } 359 | VoterOutgoing { phase_two_at } if phase_two_at <= pos => { 360 | Some(message::Value::remove_node_phase_two(self.id.clone())) 361 | } 362 | _ => None, 363 | } 364 | } 365 | 366 | pub(crate) fn upgrade_to_follower_phase_2(&mut self) { 367 | assert!(matches!(self.role, Role::VoterIncoming { .. })); 368 | self.role = Role::Voter; 369 | } 370 | 371 | pub(crate) fn remove_phase_one(&mut self, phase_two_at: Pos) { 372 | // This should already have been checked 373 | assert!(matches!(self.role, Role::Voter)); 374 | 375 | // A node isn't immediately removed when it is a voter. 376 | self.role = peer::Role::VoterOutgoing { phase_two_at }; 377 | } 378 | 379 | /// Decrement the index of the next message to send, returning `true` if the 380 | /// index was actually decremented. 381 | fn decr_next_index(&mut self, rejected: Index, hint: Index) -> bool { 382 | use std::cmp; 383 | 384 | if self.replication.is_replicating() { 385 | if Some(rejected) <= self.matched { 386 | // The peer has already matched the rejected index, it must be 387 | // stale. 388 | return false; 389 | } 390 | } else { 391 | // The rejected index is set to the `prev_log_pos` from the 392 | // `AppendEntries` message received by the peer. This is also equal 393 | // to `next_idx - 1`. If `rejected` does not match `next_idx - 1` 394 | // then the message is stale. 395 | if Some(rejected) != self.next_idx.checked_sub(1) { 396 | return false; 397 | } 398 | } 399 | 400 | // TODO: handle out-of-order messages better 401 | self.next_idx = cmp::min(rejected, hint); 402 | 403 | if self.next_idx == 0 { 404 | // We have reached the beginning of the log. Since we cannot probe 405 | // back further, we have synchronized. 
406 | self.replication = ReplicationState::Replicate; 407 | } else if self.replication.is_replicating() { 408 | // We are no longer synchronized with the peer, transition to probing 409 | self.replication = ReplicationState::Probe; 410 | } 411 | 412 | true 413 | } 414 | } 415 | 416 | impl fmt::Debug for Peer { 417 | fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { 418 | fmt.debug_struct("Peer") 419 | .field("id", &self.id) 420 | .field("role", &self.role) 421 | .field("next_idx", &self.next_idx) 422 | .field("matched", &self.matched) 423 | .field("last_seen_at", &self.last_seen_at) 424 | .field("replication", &self.replication) 425 | .finish() 426 | } 427 | } 428 | 429 | impl ReplicationState { 430 | pub(crate) fn is_init(&self) -> bool { 431 | matches!(self, ReplicationState::Init) 432 | } 433 | 434 | pub(crate) fn is_replicating(&self) -> bool { 435 | matches!(self, ReplicationState::Replicate) 436 | } 437 | 438 | pub(crate) fn is_probing(&self) -> bool { 439 | matches!(self, ReplicationState::Probe) 440 | } 441 | } 442 | -------------------------------------------------------------------------------- /src/raft/src/pos.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | 3 | use std::fmt; 4 | 5 | /// Position of an entry in the log 6 | #[derive(Default, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash, serde::Serialize)] 7 | pub struct Pos { 8 | /// Term the sequence identifier is part of 9 | pub term: Term, 10 | 11 | /// Absolute log index. 12 | pub index: Index, 13 | } 14 | 15 | impl fmt::Debug for Pos { 16 | fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { 17 | write!(fmt, "Pos({}:{})", self.index.0, self.term.0) 18 | } 19 | } 20 | 21 | #[test] 22 | fn test_ord() { 23 | let a = Pos { 24 | term: Term(0), 25 | index: Index(1), 26 | }; 27 | 28 | let b = Pos { 29 | term: Term(1), 30 | index: Index(0), 31 | }; 32 | 33 | assert!(a < b); 34 | assert!(b > a); 35 | } 36 | -------------------------------------------------------------------------------- /src/raft/src/raft.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | 3 | use std::io; 4 | use tokio::time::Instant; 5 | 6 | pub struct Raft { 7 | /// State common to all Raft roles 8 | pub(crate) node: Node, 9 | 10 | /// Current role the node plays in the raft group. All nodes start in the 11 | /// follower state. 12 | pub(crate) stage: Stage, 13 | } 14 | 15 | #[derive(Debug)] 16 | #[non_exhaustive] 17 | pub struct Tick { 18 | pub tick_at: Option, 19 | } 20 | 21 | #[derive(Debug)] 22 | #[non_exhaustive] 23 | pub enum ProposeError { 24 | NotLeader(Option), 25 | TooManyUncommitted, 26 | InvalidConfig, 27 | FailedToCommit, 28 | Io(io::Error), 29 | } 30 | 31 | impl Raft { 32 | /// Initialize a new raft group with no peers. 33 | pub fn new_group(driver: T, config: Config, now: Instant) -> Raft { 34 | let mut node = Node::new(driver, config, now); 35 | node.group.bootstrap(node.config.id.clone(), now); 36 | 37 | // Append the very first log entry which initializes the Raft group. 
38 | node.log.append_entries( 39 | &mut node.driver, 40 | &[message::Entry { 41 | pos: Pos { 42 | term: Term(0), 43 | index: Index(0), 44 | }, 45 | value: message::Value::Config(message::ConfigChange::InitGroup { 46 | id: node.config.id.clone(), 47 | }), 48 | }], 49 | ); 50 | 51 | // Commit the entry 52 | node.log.commit(Pos { 53 | term: Term(0), 54 | index: Index(0), 55 | }); 56 | 57 | // If the Raft group is initialized with one node, then the current node 58 | // is the defacto leader. 59 | let stage = Stage::Leader(Leader::new(&node)); 60 | 61 | let raft = Raft { 62 | node, 63 | // If the Raft group is initialized with one node, then the current node 64 | // is the defacto leader. 65 | stage, 66 | }; 67 | 68 | raft 69 | } 70 | 71 | /// Initialize a new raft node as an observer to an existing Raft group 72 | pub fn new_observer(driver: T, config: Config, now: Instant) -> Raft { 73 | let mut node = Node::new(driver, config, now); 74 | 75 | // Set the role as an observer 76 | let stage = Stage::Follower(Follower::new_observer(&mut node)); 77 | 78 | Raft { node, stage } 79 | } 80 | 81 | /// Returns the node identifier for this Raft node 82 | pub fn id(&self) -> T::Id { 83 | self.node.config.id.clone() 84 | } 85 | 86 | /// Returns the node's current term 87 | pub fn term(&self) -> Term { 88 | self.node.term 89 | } 90 | 91 | /// Returns a reference to the driver 92 | pub fn driver(&self) -> &T { 93 | &self.node.driver 94 | } 95 | 96 | /// Returns a mutable reference to the driver 97 | pub fn driver_mut(&mut self) -> &mut T { 98 | &mut self.node.driver 99 | } 100 | 101 | /// Return info about the Raft node 102 | pub fn info(&self) -> Info { 103 | Info::from_raft(self) 104 | } 105 | 106 | /// Return the index of the last committed entry. This entry has been safely 107 | /// replicated to other nodes in the group. 108 | /// 109 | /// The method returns `None` when the log is empty. 110 | pub fn last_committed_index(&self) -> Option { 111 | self.node.log.last_committed().map(|pos| pos.index) 112 | } 113 | 114 | /// Read the value at the specific position 115 | pub async fn get(&mut self, pos: Pos) -> io::Result>> { 116 | match self.get_index(pos.index).await? { 117 | Some(entry) if entry.pos == pos => Ok(Some(entry)), 118 | _ => Ok(None), 119 | } 120 | } 121 | 122 | pub async fn get_index(&mut self, index: Index) -> io::Result>> { 123 | self.node.log.get(&mut self.node.driver, index).await 124 | } 125 | 126 | /// Copies committed entries to the 127 | pub async fn copy_committed_entries_to( 128 | &mut self, 129 | dst: &mut Vec>, 130 | ) -> io::Result<()> { 131 | self.node.copy_committed_entries_to(dst).await 132 | } 133 | 134 | /// Advance the "applied" cursor 135 | pub fn applied_to(&mut self, pos: Pos) { 136 | self.node.applied_to(pos); 137 | } 138 | 139 | /// Propose a new value to append to the log 140 | /// 141 | /// Returns the log position at which the value is being proposed for 142 | /// insertion. The caller may use this position to track when the entry is 143 | /// committed. 
144 | pub async fn propose( 145 | &mut self, 146 | value: message::Value, 147 | ) -> Result> { 148 | if matches!(value, message::Value::NewTerm) { 149 | todo!("deny client proposing NewTerm."); 150 | } 151 | 152 | match &mut self.stage { 153 | Stage::Leader(leader) => { 154 | // If the proposed value is a configuration change, ensure it is valid 155 | if let message::Value::Config(config_change) = &value { 156 | let pos = self.node.next_pos(); 157 | 158 | if !self.node.group.is_valid_config_change(pos, config_change) { 159 | return Err(ProposeError::InvalidConfig); 160 | } 161 | } else if self.node.is_max_uncommitted_entries() { 162 | return Err(ProposeError::TooManyUncommitted); 163 | } 164 | 165 | Ok(leader.propose(&mut self.node, value).await?) 166 | } 167 | _ => { 168 | // Not currently the leader, but if we know who is the leader, 169 | // then we can redirect the client there. 170 | Err(ProposeError::NotLeader(self.node.group.leader.clone())) 171 | } 172 | } 173 | } 174 | 175 | /// Set this Raft node's current `Instant` 176 | pub fn set_now(&mut self, now: Instant) { 177 | assert!(now >= self.node.now); 178 | self.node.now = now; 179 | } 180 | 181 | // Do something w/ the message 182 | pub async fn receive(&mut self, message: Message) -> io::Result<()> { 183 | use Stage::*; 184 | 185 | let maybe_transition = match &mut self.stage { 186 | Candidate(candidate) => candidate.receive(&mut self.node, message).await?, 187 | Follower(follower) => follower.receive(&mut self.node, message).await?, 188 | Leader(leader) => leader.receive(&mut self.node, message).await?, 189 | }; 190 | 191 | if let Some(stage) = maybe_transition { 192 | self.stage = stage; 193 | } 194 | 195 | Ok(()) 196 | } 197 | 198 | pub async fn tick(&mut self) -> io::Result { 199 | use Stage::*; 200 | 201 | loop { 202 | let transition = match &mut self.stage { 203 | Candidate(candidate) => candidate.tick(&mut self.node).await?, 204 | Follower(follower) => follower.tick(&mut self.node), 205 | Leader(leader) => leader.tick(&mut self.node).await?, 206 | }; 207 | 208 | match transition { 209 | Step::Transition(stage) => { 210 | self.stage = stage; 211 | } 212 | Step::Wait(when) => { 213 | return Ok(Tick { tick_at: when }); 214 | } 215 | } 216 | } 217 | } 218 | } 219 | 220 | impl From for ProposeError { 221 | fn from(src: io::Error) -> ProposeError { 222 | ProposeError::Io(src) 223 | } 224 | } 225 | -------------------------------------------------------------------------------- /src/raft/src/stage.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | 3 | pub(crate) enum Stage { 4 | /// The node is following a leader 5 | Follower(Follower), 6 | 7 | /// The node is proposing itself as the new leader. 8 | Candidate(Candidate), 9 | 10 | // The node believes it is the group's leader. 
11 | Leader(Leader), 12 | } 13 | -------------------------------------------------------------------------------- /src/raft/src/term.rs: -------------------------------------------------------------------------------- 1 | use std::{cmp, ops}; 2 | 3 | // A raft term 4 | #[derive(Debug, Default, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, serde::Serialize)] 5 | pub struct Term(pub u64); 6 | 7 | impl ops::Add for Term { 8 | type Output = Term; 9 | 10 | fn add(self, rhs: u64) -> Self::Output { 11 | Term(self.0 + rhs) 12 | } 13 | } 14 | 15 | impl ops::AddAssign for Term { 16 | fn add_assign(&mut self, rhs: u64) { 17 | self.0 += rhs; 18 | } 19 | } 20 | 21 | impl cmp::PartialEq for Term { 22 | fn eq(&self, other: &u64) -> bool { 23 | self.0 == *other 24 | } 25 | } 26 | 27 | impl cmp::PartialOrd for Term { 28 | fn partial_cmp(&self, other: &u64) -> Option { 29 | self.0.partial_cmp(other) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/raft/src/transition.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | 3 | use tokio::time::Instant; 4 | 5 | macro_rules! transition { 6 | ( $a:expr, $b:expr ) => {{ 7 | let ret = $a; 8 | 9 | return Ok(if ret.is_some() { ret } else { $b.into() }); 10 | }}; 11 | } 12 | 13 | /// Either transition the raft node's stage or wait until the specified 14 | /// instant. 15 | pub(crate) enum Step { 16 | Transition(T), 17 | Wait(Option), 18 | } 19 | 20 | impl From> for Step> { 21 | fn from(src: Follower) -> Step> { 22 | Step::Transition(Stage::Follower(src)) 23 | } 24 | } 25 | 26 | impl From> for Step> { 27 | fn from(src: Candidate) -> Step> { 28 | Step::Transition(Stage::Candidate(src)) 29 | } 30 | } 31 | 32 | impl From> for Step> { 33 | fn from(src: Leader) -> Step> { 34 | Step::Transition(Stage::Leader(src)) 35 | } 36 | } 37 | 38 | impl From for Step> { 39 | fn from(src: Instant) -> Step> { 40 | Step::Wait(Some(src)) 41 | } 42 | } 43 | 44 | impl From> for Option> { 45 | fn from(src: Follower) -> Option> { 46 | Some(Stage::Follower(src)) 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/raft/src/votes.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | 3 | use indexmap::IndexMap; 4 | 5 | /// Tracks votes received from peer nodes. 6 | pub(crate) struct Votes { 7 | /// When the group is **not** in joint-consensus mode, votes are tracked in 8 | /// this set. When the group **is** in joint-consensus mode, peers in the 9 | /// "incoming" group are tracked here. 10 | primary: IndexMap, 11 | 12 | /// When the group is **not** in joint-consensus mode, this map is not used. 13 | /// When the group **is** in joint-consensus mode, peers in the **outgoing** 14 | /// group are tracked here. 
15 | secondary: IndexMap<T::Id, bool>, 16 | } 17 | 18 | #[derive(Debug)] 19 | pub(crate) enum Tally { 20 | Win, 21 | Lose, 22 | Pending, 23 | } 24 | 25 | impl<T: Driver> Votes<T> { 26 | pub(crate) fn new() -> Votes<T> { 27 | Votes { 28 | primary: IndexMap::new(), 29 | secondary: IndexMap::new(), 30 | } 31 | } 32 | 33 | pub(crate) fn clear(&mut self) { 34 | self.primary.clear(); 35 | self.secondary.clear(); 36 | } 37 | 38 | pub(crate) fn record(&mut self, group: &Group<T>, id: &T::Id, granted: bool) { 39 | use peer::Role; 40 | 41 | if group.is_joint_consensus() { 42 | let peer = match group.peers.get(id) { 43 | Some(peer) => peer, 44 | // The message is from an unknown peer (from the perspective of 45 | // the current node). 46 | _ => return, 47 | }; 48 | 49 | match peer.role { 50 | Role::Voter => { 51 | // Votes count in both the incoming **and** outgoing group 52 | self.primary.entry(id.clone()).or_insert(granted); 53 | self.secondary.entry(id.clone()).or_insert(granted); 54 | } 55 | Role::VoterIncoming { .. } => { 56 | self.primary.entry(id.clone()).or_insert(granted); 57 | } 58 | Role::VoterOutgoing { .. } => { 59 | self.secondary.entry(id.clone()).or_insert(granted); 60 | } 61 | _ => return, 62 | } 63 | } else { 64 | self.primary.entry(id.clone()).or_insert(granted); 65 | } 66 | } 67 | 68 | pub(crate) fn tally(&self, group: &Group<T>) -> Tally { 69 | if group.is_joint_consensus() { 70 | let incoming = Tally::from_votes::<T, _>(&self.primary, group.incoming_voter_ids()); 71 | let outgoing = Tally::from_votes::<T, _>(&self.secondary, group.outgoing_voter_ids()); 72 | 73 | incoming.join(outgoing) 74 | } else { 75 | Tally::from_votes::<T, _>(&self.primary, group.voter_ids()) 76 | } 77 | } 78 | } 79 | 80 | impl Tally { 81 | fn from_votes<'a, T, I>(votes: &IndexMap<T::Id, bool>, voters: I) -> Tally 82 | where 83 | T: Driver, 84 | I: Iterator<Item = &'a T::Id>, 85 | { 86 | let mut yes = 0; 87 | let mut no = 0; 88 | let mut pending = 0; 89 | 90 | for voter_id in voters { 91 | match votes.get(voter_id) { 92 | Some(true) => yes += 1, 93 | Some(false) => no += 1, 94 | None => pending += 1, 95 | } 96 | } 97 | 98 | let threshold = ((yes + no + pending) / 2) + 1; 99 | 100 | if yes >= threshold { 101 | Tally::Win 102 | } else if yes + pending < threshold { 103 | Tally::Lose 104 | } else { 105 | assert!( 106 | yes + pending >= threshold, 107 | "yes={yes}; no={no}; pending={pending}; threshold={threshold}" 108 | ); 109 | Tally::Pending 110 | } 111 | } 112 | 113 | fn join(self, other: Tally) -> Tally { 114 | use Tally::*; 115 | 116 | match (self, other) { 117 | // If either tally is `Lose`, the candidate lost the election 118 | (Lose, _) | (_, Lose) => Lose, 119 | 120 | // Otherwise, if either side is still `Pending`, the joint tally is pending. 121 | (Pending, _) | (_, Pending) => Pending, 122 | 123 | // If both tallies are `Win` then the join is also `Win`.
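// Illustration: incoming voters {a, b, c}, outgoing voters {a, b, d}, with
// yes votes from a and b only. Each side tallies 2-of-3 (threshold = 3/2 + 1
// = 2), so both are `Win` and the join is `Win`; a single `Lose` on either
// side would sink the whole election.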
124 | (Win, Win) => Win, 125 | } 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /tests/isolated/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "isolated" 3 | version = "0.1.0" 4 | edition = "2021" 5 | publish = false 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | validated = { path = "../validated" } 11 | futures-lite = "1" 12 | mini-raft = { path = "../../src/raft" } 13 | tokio = { version = "1.19", features = ["time"] } 14 | pretty_assertions = "1.2.1" 15 | -------------------------------------------------------------------------------- /tests/isolated/src/builder.rs: -------------------------------------------------------------------------------- 1 | use crate::{ElectionTimeout, Id, Instance, MockDriver, Raft}; 2 | 3 | use mini_raft::*; 4 | use validated::Validated; 5 | 6 | use pretty_assertions::assert_eq; 7 | use tokio::time::Instant; 8 | 9 | pub struct Builder { 10 | driver: MockDriver, 11 | config: Config<&'static str>, 12 | } 13 | 14 | impl Builder { 15 | pub(crate) fn new() -> Builder { 16 | Builder { 17 | driver: MockDriver::new(), 18 | config: Config::new("raft"), 19 | } 20 | } 21 | 22 | pub fn election_timeout(&mut self, election_timeout: ElectionTimeout) -> &mut Self { 23 | self.driver.election_timeout = election_timeout; 24 | self 25 | } 26 | 27 | pub fn max_uncommitted_entries(&mut self, max: u64) -> &mut Self { 28 | self.config.max_uncommitted_entries = Some(max); 29 | self 30 | } 31 | 32 | pub fn build_observer(&mut self) -> Instance { 33 | let now = Instant::now(); 34 | let raft = Raft::new_observer(self.driver.clone(), self.config.clone(), now); 35 | 36 | Instance { 37 | raft: Validated::new(raft), 38 | epoch: now, 39 | now, 40 | tick_at: None, 41 | } 42 | } 43 | 44 | pub fn build_group(&mut self) -> Instance { 45 | let now = Instant::now(); 46 | let raft = Raft::new_group(self.driver.clone(), self.config.clone(), now); 47 | 48 | Instance { 49 | raft: Validated::new(raft), 50 | epoch: now, 51 | now, 52 | tick_at: None, 53 | } 54 | } 55 | 56 | pub fn build_follower(&mut self, leader: Id, followers: &[Id], observers: &[Id]) -> Instance { 57 | let mut entries = vec![]; 58 | let mut next_index = Index(1); 59 | 60 | // Initialize the group with the leader 61 | entries.push(entry!(0:0, init_group(leader))); 62 | 63 | // Add all the other nodes 64 | for peer in ["raft"] 65 | .iter() 66 | .chain(followers.iter()) 67 | .chain(observers.iter()) 68 | { 69 | entries.push(message::Entry { 70 | pos: Pos { 71 | term: Term(0), 72 | index: next_index, 73 | }, 74 | value: val!(add_node(peer)), 75 | }); 76 | 77 | next_index += 1; 78 | } 79 | 80 | // Now, upgrade followers 81 | for peer in ["raft"].iter().chain(followers.iter()) { 82 | entries.push(message::Entry { 83 | pos: Pos { 84 | term: Term(0), 85 | index: next_index, 86 | }, 87 | value: val!(upgrade_node_phase_one(peer)), 88 | }); 89 | 90 | entries.push(message::Entry { 91 | pos: Pos { 92 | term: Term(0), 93 | index: next_index + 1, 94 | }, 95 | value: val!(upgrade_node_phase_two(peer)), 96 | }); 97 | 98 | next_index += 2; 99 | } 100 | 101 | let pos = entries.last().unwrap().pos; 102 | 103 | let mut raft = self.build_observer(); 104 | 105 | // Sync the log 106 | raft.recv_append_entries( 107 | "0", 108 | Term(0), 109 | message::AppendEntries { 110 | prev_log_pos: None, 111 | entries, 112 | leader_committed_index: 
Some(Index(0)), 113 | }, 114 | ); 115 | 116 | raft.assert_sent( 117 | "0", 118 | message::AppendEntriesResponse::Success { last_log_pos: pos } 119 | .to_message("raft", Term(0)), 120 | ); 121 | 122 | raft 123 | } 124 | 125 | pub fn build_leader(&mut self, followers: &[Id], observers: &[Id]) -> Instance { 126 | let mut raft = self.build_group(); 127 | 128 | let mut peers = followers.to_vec(); 129 | peers.extend_from_slice(observers); 130 | 131 | // First add all the nodes as observers 132 | for (i, new) in peers.iter().enumerate() { 133 | let pos = raft.propose(message::Value::add_node(new)).unwrap(); 134 | 135 | for peer in &peers[..i] { 136 | let entries = vec![raft.log().last().unwrap().clone()]; 137 | 138 | // Send the initial `AppendEntries` to replicate the log 139 | raft.assert_sent( 140 | peer, 141 | message::AppendEntries { 142 | prev_log_pos: Some(Pos { 143 | index: entries[0].pos.index - 1, 144 | term: Term(0), 145 | }), 146 | entries, 147 | // Everything is committed because we haven't added any voters yet. 148 | leader_committed_index: Some(pos.index), 149 | } 150 | .to_message("raft", Term(0)), 151 | ); 152 | } 153 | 154 | // The full log is sent to the new node 155 | let entries = raft.log().to_vec(); 156 | let prev_log_pos = entries[entries.len() - 2].pos; 157 | 158 | // Send the initial `AppendEntries` to sync the log 159 | raft.assert_sent( 160 | new, 161 | message::AppendEntries { 162 | prev_log_pos: Some(prev_log_pos), 163 | entries: vec![entries.last().cloned().unwrap()], 164 | // Everything is committed because we haven't added any voters yet. 165 | leader_committed_index: Some(pos.index), 166 | } 167 | .to_message("raft", Term(0)), 168 | ) 169 | .assert_idle(); 170 | 171 | // Receive the response that indicates we nothing is replicated 172 | raft.receive_append_entries_response( 173 | new, 174 | Term(0), 175 | message::AppendEntriesResponse::Conflict { 176 | rejected: prev_log_pos.index, 177 | hint: None, 178 | }, 179 | ); 180 | 181 | // Because `hint: None`, the leader assumes the peer has no log 182 | // entries and starts sending everything. 183 | raft.assert_sent( 184 | new, 185 | message::AppendEntries { 186 | prev_log_pos: None, 187 | entries: entries.clone(), 188 | leader_committed_index: Some(entries.last().unwrap().pos.index), 189 | } 190 | .to_message("raft", Term(0)), 191 | ) 192 | .assert_idle(); 193 | } 194 | 195 | raft.assert_idle(); 196 | 197 | let last_log_pos = raft.log().last().unwrap().pos; 198 | 199 | // ACK from all peers 200 | for peer in &peers { 201 | raft.receive_append_entries_response( 202 | peer, 203 | Term(0), 204 | message::AppendEntriesResponse::Success { last_log_pos }, 205 | ); 206 | } 207 | 208 | raft.assert_idle(); 209 | 210 | // Now we have to upgrade each follower. This is a multi-step process. 
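// Proposing `upgrade_node_phase_one` is enough: once every peer ACKs that
// entry, the leader appends `upgrade_node_phase_two` on its own (asserted
// below) and the follower becomes a stable voter.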
211 | for follower in followers { 212 | raft.propose(message::Value::upgrade_node_phase_one(follower)) 213 | .unwrap(); 214 | let entry = raft.log().last().unwrap().clone(); 215 | let prev_log_pos = Pos { 216 | term: Term(0), 217 | index: entry.pos.index - 1, 218 | }; 219 | 220 | // Message is sent to peer 221 | for peer in &peers { 222 | raft.assert_sent( 223 | peer, 224 | message::AppendEntries { 225 | prev_log_pos: Some(prev_log_pos), 226 | entries: vec![entry.clone()], 227 | leader_committed_index: Some(prev_log_pos.index), 228 | } 229 | .to_message("raft", Term(0)), 230 | ); 231 | } 232 | 233 | raft.assert_idle(); 234 | 235 | // All peers ACK message 236 | for peer in &peers { 237 | raft.receive_append_entries_response( 238 | peer, 239 | Term(0), 240 | message::AppendEntriesResponse::Success { 241 | last_log_pos: entry.pos, 242 | }, 243 | ); 244 | } 245 | 246 | let entry = raft.log().last().unwrap().clone(); 247 | assert_eq!( 248 | entry.value, 249 | message::Value::upgrade_node_phase_two(*follower) 250 | ); 251 | let prev_log_pos = Pos { 252 | term: Term(0), 253 | index: entry.pos.index - 1, 254 | }; 255 | 256 | // Phase 2 257 | for peer in &peers { 258 | raft.assert_sent( 259 | peer, 260 | message::AppendEntries { 261 | prev_log_pos: Some(prev_log_pos), 262 | entries: vec![entry.clone()], 263 | leader_committed_index: Some(prev_log_pos.index), 264 | } 265 | .to_message("raft", Term(0)), 266 | ); 267 | } 268 | 269 | raft.assert_idle(); 270 | 271 | for peer in &peers { 272 | raft.receive_append_entries_response( 273 | peer, 274 | Term(0), 275 | message::AppendEntriesResponse::Success { 276 | last_log_pos: entry.pos, 277 | }, 278 | ); 279 | } 280 | } 281 | 282 | // First tick 283 | raft.tick(); 284 | 285 | assert_leader!(raft); 286 | 287 | raft 288 | } 289 | 290 | pub fn build_promoted_leader( 291 | &mut self, 292 | committed: Index, 293 | entries: &[message::Entry], 294 | ) -> Instance { 295 | let mut raft = self.build_observer(); 296 | let last_log_pos = entries.last().unwrap().pos; 297 | 298 | raft.recv_append_entries( 299 | "0", 300 | last_log_pos.term, 301 | message::AppendEntries { 302 | prev_log_pos: None, 303 | entries: entries.to_vec(), 304 | leader_committed_index: Some(committed), 305 | }, 306 | ); 307 | 308 | raft.assert_sent( 309 | "0", 310 | message::AppendEntriesResponse::Success { last_log_pos } 311 | .to_message("raft", last_log_pos.term), 312 | ); 313 | 314 | // Sleep until the election campaign starts 315 | raft.assert_sleep_for(150); 316 | raft.sleep(); 317 | 318 | let mut pre_vote_responses = vec![]; 319 | 320 | for (outbound, dst) in raft.raft.driver_mut().outbound.drain(..) { 321 | match outbound.action { 322 | message::Action::PreVote(_) => pre_vote_responses.push( 323 | message::VoteResponse { granted: true } 324 | .to_prevote_message(dst, outbound.origin.term), 325 | ), 326 | _ => panic!("unexpected outbound message; {:#?}", outbound), 327 | } 328 | } 329 | 330 | for response in pre_vote_responses.drain(..) { 331 | raft.recv(response); 332 | } 333 | 334 | let mut vote_responses = vec![]; 335 | 336 | for (outbound, dst) in raft.raft.driver_mut().outbound.drain(..) { 337 | match outbound.action { 338 | message::Action::Vote(_) => vote_responses.push( 339 | message::VoteResponse { granted: true }.to_message(dst, outbound.origin.term), 340 | ), 341 | _ => panic!("unexpected outbound message; {:#?}", outbound), 342 | } 343 | } 344 | 345 | for response in vote_responses.drain(..) 
{ 346 | raft.recv(response); 347 | } 348 | 349 | let entries = raft.log().to_vec(); 350 | let new_term = entries[entries.len() - 1].clone(); 351 | 352 | for (outbound, _) in raft.raft.driver_mut().outbound.drain(..) { 353 | assert_eq!( 354 | outbound, 355 | message::AppendEntries { 356 | prev_log_pos: Some(entries[entries.len() - 2].pos), 357 | entries: vec![new_term.clone()], 358 | leader_committed_index: Some(committed), 359 | } 360 | .to_message("raft", new_term.pos.term) 361 | ); 362 | } 363 | 364 | raft.assert_idle(); 365 | 366 | assert_leader!(raft); 367 | assert_term!(raft, last_log_pos.term + 1); 368 | assert_committed!(raft, committed); 369 | 370 | raft 371 | } 372 | } 373 | -------------------------------------------------------------------------------- /tests/isolated/src/driver.rs: -------------------------------------------------------------------------------- 1 | use mini_raft::*; 2 | 3 | use std::io; 4 | use std::task::{Context, Poll}; 5 | 6 | #[derive(Clone, Debug)] 7 | pub struct MockDriver { 8 | /// Log entries 9 | pub(crate) log: Vec<message::Entry<&'static str>>, 10 | 11 | /// List of outbound messages 12 | pub(crate) outbound: Vec<(Message<&'static str>, &'static str)>, 13 | 14 | /// Value returned by `rand_election_timeout` 15 | pub(crate) election_timeout: ElectionTimeout, 16 | } 17 | 18 | #[derive(Clone, Debug)] 19 | pub enum ElectionTimeout { 20 | /// Always use the minimum election timeout 21 | Min, 22 | 23 | /// Always offset the minimum election timeout by a specified duration. 24 | Offset(u64), 25 | 26 | /// Return the following offsets in sequence 27 | Multi(Vec<u64>), 28 | } 29 | 30 | impl mini_raft::Driver for MockDriver { 31 | type Id = &'static str; 32 | 33 | fn dispatch(&mut self, dst: &'static str, message: Message<&'static str>) { 34 | self.outbound.push((message, dst)); 35 | } 36 | 37 | fn poll_term_for( 38 | &mut self, 39 | _cx: &mut Context<'_>, 40 | index: Index, 41 | ) -> Poll<io::Result<Option<Term>>> { 42 | let index: usize = index.0.try_into().unwrap(); 43 | let maybe_term = self.log.get(index).map(|entry| entry.pos.term); 44 | Poll::Ready(Ok(maybe_term)) 45 | } 46 | 47 | fn poll_read_entry( 48 | &mut self, 49 | _cx: &mut Context<'_>, 50 | _index: Index, 51 | ) -> Poll<io::Result<Option<message::Entry<&'static str>>>> { 52 | todo!(); 53 | } 54 | 55 | fn poll_read_entries( 56 | &mut self, 57 | _cx: &mut Context<'_>, 58 | start: Index, 59 | end: Index, 60 | dst: &mut Vec<message::Entry<&'static str>>, 61 | ) -> Poll<io::Result<()>> { 62 | let start = usize::try_from(start.0).unwrap(); 63 | // `end` is inclusive, but exclusive for rust slices.
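// e.g. start=Index(2), end=Index(4) reads log[2..5], copying entries 2, 3 and 4.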
64 | let end = usize::try_from(end.0).unwrap() + 1; 65 | dst.extend_from_slice(&self.log[start..end]); 66 | Poll::Ready(Ok(())) 67 | } 68 | 69 | fn append_entries(&mut self, entries: &[message::Entry<&'static str>]) { 70 | self.log.extend_from_slice(entries); 71 | } 72 | 73 | fn truncate(&mut self, index: Index) { 74 | let index = usize::try_from(index.0).unwrap(); 75 | self.log.truncate(index); 76 | } 77 | 78 | fn rand_election_timeout(&mut self, lower: u64, upper: u64) -> u64 { 79 | match &mut self.election_timeout { 80 | ElectionTimeout::Min => lower, 81 | ElectionTimeout::Offset(offset) => { 82 | assert!(lower + *offset <= upper); 83 | lower + *offset 84 | } 85 | ElectionTimeout::Multi(vals) => { 86 | assert!(!vals.is_empty(), "no more specified election timeouts"); 87 | let offset = vals.remove(0); 88 | assert!(lower + offset <= upper); 89 | lower + offset 90 | } 91 | } 92 | } 93 | } 94 | 95 | impl MockDriver { 96 | pub fn new() -> MockDriver { 97 | MockDriver { 98 | log: vec![], 99 | outbound: vec![], 100 | election_timeout: ElectionTimeout::Min, 101 | } 102 | } 103 | 104 | pub fn set_election_timeout(&mut self, election_timeout: ElectionTimeout) { 105 | self.election_timeout = election_timeout; 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /tests/isolated/src/lib.rs: -------------------------------------------------------------------------------- 1 | #[macro_use] 2 | mod macros; 3 | 4 | mod builder; 5 | pub use builder::Builder; 6 | 7 | mod driver; 8 | pub use driver::{ElectionTimeout, MockDriver}; 9 | 10 | use mini_raft::*; 11 | use validated::Validated; 12 | 13 | use futures_lite::future::block_on; 14 | use pretty_assertions::assert_eq; 15 | use tokio::time::{Duration, Instant}; 16 | 17 | pub struct Instance { 18 | /// Handle to the raft instance 19 | raft: Validated<Raft<MockDriver>>, 20 | 21 | /// Instant the instance was created 22 | epoch: Instant, 23 | 24 | /// Last instant 25 | now: Instant, 26 | 27 | /// Last `tick_at` return value. 28 | tick_at: Option<Instant>, 29 | } 30 | 31 | pub type Id = &'static str; 32 | 33 | impl Instance { 34 | pub fn builder() -> Builder { 35 | Builder::new() 36 | } 37 | 38 | /// Returns a new, unconfigured observer. 39 | pub fn new() -> Instance { 40 | Instance::builder().build_observer() 41 | } 42 | 43 | /// Returns a new leader in an empty group. 44 | pub fn new_group() -> Instance { 45 | Instance::builder().build_group() 46 | } 47 | 48 | /// Create a raft leader initialized with the specified group. 49 | pub fn new_leader(followers: &[&'static str], observers: &[&'static str]) -> Instance { 50 | Self::builder().build_leader(followers, observers) 51 | } 52 | 53 | pub fn propose( 54 | &mut self, 55 | value: message::Value<&'static str>, 56 | ) -> Result<Pos, ProposeError<&'static str>> { 57 | block_on(self.raft.propose(value)) 58 | } 59 | 60 | pub fn propose_ignore_append_entries(&mut self, value: message::Value<&'static str>) { 61 | self.propose(value).unwrap(); 62 | 63 | for (outbound, _) in self.raft.driver_mut().outbound.drain(..) { 64 | match outbound.action { 65 | message::Action::AppendEntries(..)
=> {} 66 | _ => panic!("unexpected message; {:#?}", outbound), 67 | } 68 | } 69 | } 70 | 71 | /// Tick time to the next action point 72 | pub fn sleep(&mut self) { 73 | if let Some(when) = self.tick_at { 74 | self.now = when; 75 | self.raft.set_now(when); 76 | } 77 | 78 | self.tick(); 79 | } 80 | 81 | pub fn sleep_for(&mut self, ms: u64) { 82 | self.now += Duration::from_millis(ms); 83 | self.raft.set_now(self.now); 84 | self.tick(); 85 | } 86 | 87 | pub fn tick(&mut self) { 88 | let Tick { tick_at, .. } = block_on(self.raft.tick()).unwrap(); 89 | self.tick_at = tick_at; 90 | } 91 | 92 | pub fn recv_append_entries( 93 | &mut self, 94 | id: &'static str, 95 | term: Term, 96 | message: message::AppendEntries<&'static str>, 97 | ) { 98 | self.recv(Message { 99 | origin: message::Origin { id, term }, 100 | action: message::Action::AppendEntries(message), 101 | }) 102 | } 103 | 104 | pub fn receive_append_entries_response( 105 | &mut self, 106 | id: &'static str, 107 | term: Term, 108 | append_entries_response: message::AppendEntriesResponse, 109 | ) { 110 | self.recv(append_entries_response.to_message(id, term)) 111 | } 112 | 113 | pub fn receive_pre_vote_request(&mut self, id: &'static str, term: Term, vote: message::Vote) { 114 | self.recv(vote.to_prevote_message(id, term)) 115 | } 116 | 117 | pub fn receive_vote_request(&mut self, id: &'static str, term: Term, vote: message::Vote) { 118 | self.recv(vote.to_message(id, term)) 119 | } 120 | 121 | pub fn receive_pre_vote_response( 122 | &mut self, 123 | id: &'static str, 124 | term: Term, 125 | vote_response: message::VoteResponse, 126 | ) { 127 | self.recv(vote_response.to_prevote_message(id, term)) 128 | } 129 | 130 | pub fn receive_vote_response( 131 | &mut self, 132 | id: &'static str, 133 | term: Term, 134 | vote_response: message::VoteResponse, 135 | ) { 136 | self.recv(vote_response.to_message(id, term)) 137 | } 138 | 139 | pub fn recv(&mut self, message: Message<&'static str>) { 140 | block_on(async { 141 | self.raft.receive(message).await.unwrap(); 142 | let Tick { tick_at, .. } = self.raft.tick().await.unwrap(); 143 | self.tick_at = tick_at; 144 | }); 145 | } 146 | 147 | pub fn log(&self) -> &[message::Entry<&'static str>] { 148 | &self.raft.driver().log[..] 
149 | } 150 | 151 | /// Assert the raft node is sleeping for a set amount of time 152 | #[track_caller] 153 | pub fn assert_sleep_for(&mut self, ms: u64) -> &mut Self { 154 | assert_eq!(Duration::from_millis(ms), self.tick_at.unwrap() - self.now); 155 | self 156 | } 157 | 158 | pub fn info(&self) -> mini_raft::Info<&'static str> { 159 | self.raft.info() 160 | } 161 | 162 | /// Return the node's term 163 | pub fn term(&self) -> Term { 164 | self.info().term 165 | } 166 | 167 | pub fn last_appended(&self) -> Pos { 168 | self.log().last().unwrap().pos 169 | } 170 | 171 | /// When the instance will tick again, relative to the epoch 172 | pub fn tick_at(&self) -> Option<Duration> { 173 | self.tick_at.map(|when| when - self.epoch) 174 | } 175 | 176 | /// Return the current leader from the node's point of view 177 | pub fn leader(&self) -> Option<&'static str> { 178 | self.raft.info().group.leader 179 | } 180 | 181 | /// Return the set of peers known to the node 182 | pub fn peers(&self) -> Vec<Peer<&'static str>> { 183 | self.raft.info().group.peers 184 | } 185 | 186 | #[track_caller] 187 | pub fn assert_sent(&mut self, to: &'static str, message: Message<&'static str>) -> &mut Self { 188 | if self.raft.driver().outbound.is_empty() { 189 | panic!("no pending outbound messages"); 190 | } 191 | 192 | let sent = self.raft.driver_mut().outbound.remove(0); 193 | assert_eq!(sent, (message, to)); 194 | self 195 | } 196 | 197 | #[track_caller] 198 | pub fn assert_idle(&mut self) -> &mut Self { 199 | let outbound = &self.raft.driver().outbound; 200 | 201 | if !outbound.is_empty() { 202 | panic!( 203 | "expected the raft node to be idle, but has pending messages\n{:#?}", 204 | outbound 205 | ); 206 | } 207 | 208 | self 209 | } 210 | 211 | /// Drain all messages in the sent queue without checking them 212 | pub fn drain_sent(&mut self) -> &mut Self { 213 | self.raft.driver_mut().outbound.clear(); 214 | self 215 | } 216 | 217 | /// Assert the peer has replicated the log up to the given index 218 | #[track_caller] 219 | pub fn assert_peer_matched(&mut self, peer: &'static str, expect: Option<Index>) -> &mut Self { 220 | let matched = self 221 | .info() 222 | .group 223 | .peer_by_id(&peer) 224 | .expect("peer missing") 225 | .matched; 226 | assert_eq!(matched, expect); 227 | self 228 | } 229 | } 230 | -------------------------------------------------------------------------------- /tests/isolated/src/macros.rs: -------------------------------------------------------------------------------- 1 | #[macro_export] 2 | macro_rules! pos { 3 | ($index:literal : $term:literal) => { 4 | mini_raft::Pos { 5 | index: Index($index), 6 | term: Term($term), 7 | } 8 | }; 9 | } 10 | 11 | #[macro_export] 12 | macro_rules! val { 13 | ( $data:literal ) => { 14 | mini_raft::message::Value::data(&$data[..]) 15 | }; 16 | ( $f:ident $( ( $($arg:expr),* ) )? ) => { 17 | mini_raft::message::Value::$f( $( $( $arg ),* )? ) 18 | }; 19 | ( $data:literal ) => { 20 | mini_raft::message::Value::data(&$data[..]) 21 | }; 22 | ( $value:expr ) => { 23 | $value 24 | }; 25 | } 26 | 27 | #[macro_export] 28 | macro_rules! entry { 29 | ( $index:literal : $term:literal ) => { 30 | entry!($index : $term, data( stringify!("data" -> $index : $term).as_bytes() )) 31 | }; 32 | ( $index:literal: $term:literal, $( $t:tt )* ) => { 33 | mini_raft::message::Entry { 34 | pos: Pos { 35 | index: Index($index), 36 | term: Term($term), 37 | }, 38 | value: val!( $( $t )* ), 39 | } 40 | }; 41 | } 42 | 43 | #[macro_export] 44 | macro_rules!
assert_observer { 45 | ($i:expr) => {{ 46 | let info = $i.info(); 47 | assert!( 48 | info.stage.is_observer(), 49 | "expected Observer but was {:?}", 50 | info.stage 51 | ); 52 | }}; 53 | } 54 | 55 | #[macro_export] 56 | macro_rules! assert_follower { 57 | ($i:expr) => {{ 58 | let info = $i.info(); 59 | assert!( 60 | info.stage.is_follower(), 61 | "expected Follower but was {:?}", 62 | info.stage 63 | ); 64 | }}; 65 | } 66 | 67 | #[macro_export] 68 | macro_rules! assert_candidate { 69 | ($i:expr) => {{ 70 | let info = $i.info(); 71 | assert!( 72 | info.stage.is_candidate(), 73 | "expected Candidate but was {:?}", 74 | info.stage 75 | ); 76 | }}; 77 | } 78 | 79 | #[macro_export] 80 | macro_rules! assert_leader { 81 | ($i:expr) => {{ 82 | let info = $i.info(); 83 | assert!( 84 | info.stage.is_leader(), 85 | "expected Leader but was {:?}", 86 | info.stage, 87 | ); 88 | }}; 89 | } 90 | 91 | #[macro_export] 92 | macro_rules! assert_term { 93 | ($i:expr, $term:expr) => {{ 94 | let expect = $term; 95 | let actual = $i.info().term; 96 | assert_eq!( 97 | actual, expect, 98 | "expected term {:?}, but was {:?}", 99 | expect, actual 100 | ); 101 | }}; 102 | } 103 | 104 | #[macro_export] 105 | macro_rules! assert_committed { 106 | ($i:expr, $index:expr) => {{ 107 | let expect = $index; 108 | let actual = $i.info().committed; 109 | 110 | assert_eq!( 111 | actual.expect("expected a committed index, but was None"), 112 | expect 113 | ); 114 | }}; 115 | } 116 | 117 | #[macro_export] 118 | macro_rules! assert_none_committed { 119 | ($i:expr) => {{ 120 | assert!($i.info().committed.is_none()); 121 | }}; 122 | } 123 | 124 | #[macro_export] 125 | macro_rules! assert_tick_at { 126 | ($i:expr, $dur:expr) => {{ 127 | let expect = std::time::Duration::from_millis($dur); 128 | let actual = $i.tick_at().unwrap(); 129 | assert_eq!( 130 | actual, expect, 131 | "expected next tick at {:?}, but was {:?}", 132 | expect, actual 133 | ); 134 | }}; 135 | } 136 | 137 | #[macro_export] 138 | macro_rules! 
assert_peers { 139 | ($i:ident, [ $( $peer:expr ),* ]) => {{ 140 | let mut actual: Vec<_> = $i.peers() 141 | .iter() 142 | .map(|peer| peer.id) 143 | .collect(); 144 | actual.sort(); 145 | 146 | let mut expect = vec![ $( $peer ),* ]; 147 | expect.sort(); 148 | 149 | assert_eq!(actual, expect); 150 | }} 151 | } 152 | -------------------------------------------------------------------------------- /tests/isolated/tests/test_flow.rs: -------------------------------------------------------------------------------- 1 | use isolated::*; 2 | use mini_raft::*; 3 | 4 | #[test] 5 | fn leader_sends_empty_heartbeat_even_if_peer_is_not_synced() { 6 | let mut raft = Instance::new_leader(&["0", "1"], &[]); 7 | 8 | raft.propose(message::Value::data("hello")).unwrap(); 9 | 10 | for peer in &["0", "1"] { 11 | raft.assert_sent( 12 | peer, 13 | message::AppendEntries { 14 | prev_log_pos: Some(pos!(6:0)), 15 | entries: vec![entry!(7:0, b"hello")], 16 | leader_committed_index: Some(Index(6)), 17 | } 18 | .to_message("raft", Term(0)), 19 | ); 20 | } 21 | 22 | raft.assert_idle().assert_sleep_for(10).sleep(); 23 | 24 | // Send heartbeats 25 | for peer in &["0", "1"] { 26 | raft.assert_sent( 27 | peer, 28 | message::AppendEntries { 29 | prev_log_pos: Some(pos!(7:0)), 30 | entries: vec![], 31 | leader_committed_index: Some(Index(6)), 32 | } 33 | .to_message("raft", Term(0)), 34 | ); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /tests/isolated/tests/test_lifecycle.rs: -------------------------------------------------------------------------------- 1 | use isolated::*; 2 | use mini_raft::*; 3 | 4 | #[test] 5 | fn node_starts_as_observer() { 6 | let mut raft = Instance::new(); 7 | 8 | assert_observer!(raft); 9 | assert_none_committed!(raft); 10 | assert!(raft.leader().is_none()); 11 | assert!(raft.peers().is_empty()); 12 | 13 | // No outbound messages 14 | raft.assert_idle(); 15 | } 16 | 17 | #[test] 18 | fn node_becomes_follower_one_append() { 19 | let mut raft = Instance::new(); 20 | 21 | let entries = vec![ 22 | message::Entry { 23 | pos: pos!(0:0), 24 | value: message::Value::init_group("0"), 25 | }, 26 | message::Entry { 27 | pos: pos!(1:0), 28 | value: message::Value::add_node("raft"), 29 | }, 30 | message::Entry { 31 | pos: pos!(2:0), 32 | value: message::Value::upgrade_node_phase_one("raft"), 33 | }, 34 | ]; 35 | 36 | raft.recv_append_entries( 37 | "0", 38 | Term(0), 39 | message::AppendEntries { 40 | prev_log_pos: None, 41 | entries: entries.clone(), 42 | leader_committed_index: None, 43 | }, 44 | ); 45 | 46 | assert_follower!(raft); 47 | assert_term!(raft, 0); 48 | assert_none_committed!(raft); 49 | assert_tick_at!(raft, 150); 50 | assert_eq!(entries, raft.log()); 51 | assert_eq!(raft.leader(), Some("0")); 52 | assert_peers!(raft, ["raft", "0"]); 53 | 54 | raft.assert_sent( 55 | "0", 56 | message::AppendEntriesResponse::Success { 57 | last_log_pos: pos!(2:0), 58 | } 59 | .to_message("raft", Term(0)), 60 | ) 61 | .assert_idle(); 62 | } 63 | 64 | #[test] 65 | fn node_becomes_follower_multi_append() { 66 | let mut raft = Instance::new(); 67 | 68 | let entries = vec![ 69 | message::Entry { 70 | pos: pos!(0:0), 71 | value: message::Value::init_group("0"), 72 | }, 73 | message::Entry { 74 | pos: pos!(1:0), 75 | value: message::Value::add_node("raft"), 76 | }, 77 | message::Entry { 78 | pos: pos!(2:0), 79 | value: message::Value::upgrade_node_phase_one("raft"), 80 | }, 81 | ]; 82 | 83 | raft.recv_append_entries( 84 | "0", 85 | Term(0), 86 | 
message::AppendEntries { 87 | prev_log_pos: None, 88 | entries: vec![entries[0].clone()], 89 | leader_committed_index: None, 90 | }, 91 | ); 92 | 93 | assert_observer!(raft); 94 | assert_term!(raft, 0); 95 | assert_none_committed!(raft); 96 | assert_eq!(&entries[..1], raft.log()); 97 | assert_eq!(raft.leader(), Some("0")); 98 | assert_peers!(raft, ["0"]); 99 | 100 | raft.assert_sent( 101 | "0", 102 | message::AppendEntriesResponse::Success { 103 | last_log_pos: pos!(0:0), 104 | } 105 | .to_message("raft", Term(0)), 106 | ) 107 | .assert_idle(); 108 | 109 | raft.recv_append_entries( 110 | "0", 111 | Term(0), 112 | message::AppendEntries { 113 | prev_log_pos: Some(pos!(0:0)), 114 | entries: vec![entries[1].clone()], 115 | leader_committed_index: Some(Index(0)), 116 | }, 117 | ); 118 | 119 | assert_observer!(raft); 120 | assert_term!(raft, 0); 121 | assert_committed!(raft, 0); 122 | assert_eq!(&entries[..2], raft.log()); 123 | assert_eq!(raft.leader(), Some("0")); 124 | assert_peers!(raft, ["raft", "0"]); 125 | 126 | raft.assert_sent( 127 | "0", 128 | message::AppendEntriesResponse::Success { 129 | last_log_pos: pos!(1:0), 130 | } 131 | .to_message("raft", Term(0)), 132 | ) 133 | .assert_idle(); 134 | 135 | raft.recv_append_entries( 136 | "0", 137 | Term(0), 138 | message::AppendEntries { 139 | prev_log_pos: Some(pos!(1:0)), 140 | entries: vec![entries[2].clone()], 141 | leader_committed_index: Some(Index(0)), 142 | }, 143 | ); 144 | 145 | assert_follower!(raft); 146 | assert_term!(raft, 0); 147 | assert_tick_at!(raft, 150); 148 | assert_eq!(entries, raft.log()); 149 | assert_eq!(raft.leader(), Some("0")); 150 | assert_peers!(raft, ["raft", "0"]); 151 | 152 | raft.assert_sent( 153 | "0", 154 | message::AppendEntriesResponse::Success { 155 | last_log_pos: pos!(2:0), 156 | } 157 | .to_message("raft", Term(0)), 158 | ) 159 | .assert_idle(); 160 | } 161 | -------------------------------------------------------------------------------- /tests/isolated/tests/test_observer.rs: -------------------------------------------------------------------------------- 1 | use isolated::*; 2 | use mini_raft::*; 3 | use pretty_assertions::assert_eq; 4 | 5 | /// Send some entries to a new node 6 | #[test] 7 | fn basic_replication() { 8 | let mut raft = Instance::new(); 9 | 10 | let entries = vec![ 11 | entry!(0:0, init_group("0")), 12 | entry!(1:0, b"one"), 13 | entry!(2:0, b"two"), 14 | entry!(3:0, b"three"), 15 | ]; 16 | 17 | raft.recv_append_entries( 18 | "0", 19 | Term(0), 20 | message::AppendEntries { 21 | prev_log_pos: None, 22 | entries: entries[..1].to_vec(), 23 | leader_committed_index: Some(Index(0)), 24 | }, 25 | ); 26 | 27 | assert_term!(raft, 0); 28 | assert_eq!(raft.log(), &entries[..1]); 29 | 30 | raft.assert_sent( 31 | "0", 32 | message::AppendEntriesResponse::Success { 33 | last_log_pos: pos!(0:0), 34 | } 35 | .to_message("raft", Term(0)), 36 | ) 37 | .assert_idle(); 38 | 39 | // Send a heartbeat 40 | raft.recv_append_entries( 41 | "0", 42 | Term(0), 43 | message::AppendEntries { 44 | prev_log_pos: Some(pos!(0:0)), 45 | entries: vec![], 46 | leader_committed_index: Some(Index(0)), 47 | }, 48 | ); 49 | 50 | assert_term!(raft, 0); 51 | assert_eq!(raft.log(), &entries[..1]); 52 | 53 | raft.assert_sent( 54 | "0", 55 | message::AppendEntriesResponse::Success { 56 | last_log_pos: pos!(0:0), 57 | } 58 | .to_message("raft", Term(0)), 59 | ) 60 | .assert_idle(); 61 | 62 | raft.recv_append_entries( 63 | "0", 64 | Term(0), 65 | message::AppendEntries { 66 | prev_log_pos: Some(pos!(0:0)), 67 | entries: 
entries[1..3].to_vec(), 68 | leader_committed_index: Some(Index(0)), 69 | }, 70 | ); 71 | 72 | assert_term!(raft, 0); 73 | assert_eq!(raft.log(), &entries[..3]); 74 | 75 | raft.assert_sent( 76 | "0", 77 | message::AppendEntriesResponse::Success { 78 | last_log_pos: pos!(2:0), 79 | } 80 | .to_message("raft", Term(0)), 81 | ) 82 | .assert_idle(); 83 | 84 | raft.recv_append_entries( 85 | "0", 86 | Term(0), 87 | message::AppendEntries { 88 | prev_log_pos: Some(pos!(2:0)), 89 | entries: entries[3..].to_vec(), 90 | leader_committed_index: Some(Index(2)), 91 | }, 92 | ); 93 | 94 | assert_term!(raft, 0); 95 | assert_eq!(raft.log(), &entries[..]); 96 | 97 | raft.assert_sent( 98 | "0", 99 | message::AppendEntriesResponse::Success { 100 | last_log_pos: pos!(3:0), 101 | } 102 | .to_message("raft", Term(0)), 103 | ) 104 | .assert_idle(); 105 | } 106 | 107 | #[test] 108 | fn ignore_append_entries_response() { 109 | let mut raft = Instance::new(); 110 | 111 | raft.receive_append_entries_response( 112 | "1", 113 | Term(1), 114 | message::AppendEntriesResponse::Success { 115 | last_log_pos: pos!(10:0), 116 | }, 117 | ); 118 | 119 | raft.assert_idle(); 120 | assert_observer!(raft); 121 | assert_term!(raft, 1); 122 | } 123 | 124 | #[test] 125 | fn responds_to_pre_vote() { 126 | let mut raft = Instance::new(); 127 | 128 | raft.receive_pre_vote_request( 129 | "1", 130 | Term(1), 131 | message::Vote { 132 | last_log_pos: Some(pos!(10:0)), 133 | }, 134 | ); 135 | 136 | raft.assert_sent( 137 | "1", 138 | message::VoteResponse { granted: true }.to_prevote_message("raft", Term(1)), 139 | ) 140 | .assert_idle(); 141 | 142 | assert_observer!(raft); 143 | assert_term!(raft, 0); 144 | } 145 | 146 | #[test] 147 | fn ignore_pre_vote_response() { 148 | let mut raft = Instance::new(); 149 | 150 | raft.receive_pre_vote_response("1", Term(1), message::VoteResponse { granted: true }); 151 | 152 | raft.assert_idle(); 153 | assert_observer!(raft); 154 | assert_term!(raft, 0); 155 | } 156 | 157 | #[test] 158 | fn responds_to_vote() { 159 | let mut raft = Instance::new(); 160 | 161 | raft.receive_vote_request( 162 | "1", 163 | Term(1), 164 | message::Vote { 165 | last_log_pos: Some(pos!(10:0)), 166 | }, 167 | ); 168 | 169 | raft.assert_sent( 170 | "1", 171 | message::VoteResponse { granted: true }.to_message("raft", Term(1)), 172 | ) 173 | .assert_idle(); 174 | 175 | assert_observer!(raft); 176 | assert_term!(raft, 1); 177 | } 178 | 179 | #[test] 180 | fn ignore_vote_response() { 181 | let mut raft = Instance::new(); 182 | 183 | raft.receive_vote_response("1", Term(1), message::VoteResponse { granted: true }); 184 | 185 | raft.assert_idle(); 186 | assert_observer!(raft); 187 | assert_term!(raft, 1); 188 | } 189 | -------------------------------------------------------------------------------- /tests/simulated/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "simulated" 3 | version = "0.1.0" 4 | edition = "2021" 5 | publish = false 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | turmoil = { path = "/home/carllerche/Code/turmoil" } 11 | 12 | futures = "0.3" 13 | mini-raft = { path = "../../src/raft" } 14 | validated = { path = "../validated" } 15 | getrandom = "0.2" 16 | rand = { version = "0.8.5", features = ["small_rng"] } 17 | tokio = { version = "1.19", features = ["full"] } 18 | tracing = "0.1" 19 | tracing-subscriber = "0.2" 20 | serde = { version = "1.0.142", features = 
["derive"] } 21 | serde_json = "1.0.83" 22 | -------------------------------------------------------------------------------- /tests/simulated/src/builder.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | 3 | use mini_raft::{Config, Raft}; 4 | use validated::Validated; 5 | 6 | use rand::rngs::SmallRng; 7 | use rand::{RngCore, SeedableRng}; 8 | use std::cell::RefCell; 9 | use std::path::Path; 10 | use std::rc::Rc; 11 | use std::time::Duration; 12 | use tokio::time::Instant; 13 | 14 | pub struct Builder { 15 | /// Number of nodes in the raft group 16 | num_nodes: usize, 17 | 18 | /// Random number generator seed. This is used to reproduce a failing test. 19 | seed: Option<[u8; 32]>, 20 | 21 | /// How long to fuzz each test for, in real time 22 | duration: Duration, 23 | 24 | /// Turmoil builder 25 | turmoil: turmoil::Builder, 26 | } 27 | 28 | impl Builder { 29 | pub(crate) fn new(num_nodes: usize) -> Builder { 30 | super::init_tracing(); 31 | 32 | const DEFAULT_DURATION: u64 = 500; 33 | 34 | let duration = match std::env::var("RAFT_DURATION") { 35 | Ok(val) => match val.parse() { 36 | Ok(val) => val, 37 | Err(_) => DEFAULT_DURATION, 38 | }, 39 | Err(_) => DEFAULT_DURATION, 40 | }; 41 | 42 | let mut turmoil = turmoil::Builder::new(); 43 | turmoil.fail_rate(0.1).repair_rate(0.5); 44 | 45 | Builder { 46 | num_nodes, 47 | seed: None, 48 | duration: Duration::from_millis(duration), 49 | turmoil, 50 | } 51 | } 52 | 53 | pub fn seed(&mut self, seed: [u8; 32]) -> &mut Self { 54 | self.seed = Some(seed); 55 | self 56 | } 57 | 58 | pub fn duration(&mut self, duration: Duration) -> &mut Self { 59 | self.duration = duration; 60 | self 61 | } 62 | 63 | pub fn simulation_duration(&mut self, duration: Duration) -> &mut Self { 64 | self.turmoil.simulation_duration(duration); 65 | self 66 | } 67 | 68 | pub fn tick_duration(&mut self, duration: Duration) -> &mut Self { 69 | self.turmoil.tick_duration(duration); 70 | self 71 | } 72 | 73 | pub fn max_message_latency(&mut self, duration: Duration) -> &mut Self { 74 | self.turmoil.max_message_latency(duration); 75 | self 76 | } 77 | 78 | /// Log events to the specified file 79 | pub fn log(&mut self, path: impl AsRef) -> &mut Self { 80 | self.turmoil.log(path); 81 | self 82 | } 83 | 84 | pub fn fuzz(&self, mut f: F) 85 | where 86 | F: FnMut(&TestGroup), 87 | { 88 | if self.seed.is_some() { 89 | // A seed is set, so there is only one possible run. 
90 | let group = self.new_group(); 91 | f(&group); 92 | } else { 93 | let now = std::time::Instant::now(); 94 | 95 | while now.elapsed() < self.duration { 96 | for _ in 0..50 { 97 | let group = self.new_group(); 98 | f(&group); 99 | } 100 | } 101 | } 102 | } 103 | 104 | pub fn test(&self, f: F) 105 | where 106 | F: std::future::Future, 107 | { 108 | self.new_group().test(f); 109 | } 110 | 111 | fn new_group(&self) -> TestGroup { 112 | let seed = self.seed.unwrap_or_else(|| { 113 | let mut seed = [0; 32]; 114 | getrandom::getrandom(&mut seed[..]).unwrap(); 115 | seed 116 | }); 117 | 118 | let mut nodes = vec![]; 119 | let rng = Rc::new(RefCell::new(SmallRng::from_seed(seed))); 120 | let rand = Box::new(Rand(rng.clone())); 121 | 122 | assert!(self.num_nodes > 0); 123 | 124 | let mut sim = self.turmoil.build_with_rng(rand); 125 | 126 | for i in 0..self.num_nodes { 127 | let host = format!("raft{}", i); 128 | 129 | sim.register(host.clone(), |io| { 130 | let outbound = Driver { 131 | io: io.clone(), 132 | log: vec![], 133 | rng: rng.clone(), 134 | }; 135 | 136 | let mut config = Config::new(io.local_addr()); 137 | config.max_uncommitted_entries = Some(10); 138 | 139 | let raft = if i == 0 { 140 | Rc::new(RefCell::new(Validated::new(Raft::new_group( 141 | outbound, 142 | config, 143 | Instant::now(), 144 | )))) 145 | } else { 146 | Rc::new(RefCell::new(Validated::new(Raft::new_observer( 147 | outbound, 148 | config, 149 | Instant::now(), 150 | )))) 151 | }; 152 | 153 | nodes.push(raft.clone()); 154 | 155 | let mut node = Node::new(raft, host, io); 156 | 157 | async move { node.run().await } 158 | }); 159 | } 160 | 161 | let client = sim.client("test-group"); 162 | 163 | // Don't drop messages from/to the client 164 | for peer in &nodes { 165 | let peer = peer.borrow(); 166 | sim.set_link_fail_rate(client.local_addr(), peer.id(), 0.0); 167 | sim.set_link_max_message_latency( 168 | client.local_addr(), 169 | peer.id(), 170 | Duration::from_millis(0), 171 | ); 172 | } 173 | 174 | TestGroup { 175 | nodes, 176 | sim: RefCell::new(sim), 177 | client, 178 | seed, 179 | } 180 | } 181 | } 182 | 183 | struct Rand(Rc>); 184 | 185 | impl RngCore for Rand { 186 | fn next_u32(&mut self) -> u32 { 187 | self.0.borrow_mut().next_u32() 188 | } 189 | 190 | fn next_u64(&mut self) -> u64 { 191 | self.0.borrow_mut().next_u64() 192 | } 193 | 194 | fn fill_bytes(&mut self, dest: &mut [u8]) { 195 | self.0.borrow_mut().fill_bytes(dest) 196 | } 197 | 198 | fn try_fill_bytes(&mut self, dest: &mut [u8]) -> Result<(), rand::Error> { 199 | self.0.borrow_mut().try_fill_bytes(dest) 200 | } 201 | } 202 | -------------------------------------------------------------------------------- /tests/simulated/src/driver.rs: -------------------------------------------------------------------------------- 1 | use crate::Message; 2 | use mini_raft::{message, Index, Term}; 3 | 4 | use rand::rngs::SmallRng; 5 | use std::cell::RefCell; 6 | use std::fmt; 7 | use std::io; 8 | use std::net::SocketAddr; 9 | use std::rc::Rc; 10 | use std::task::{Context, Poll}; 11 | use turmoil::Io; 12 | 13 | pub(crate) struct Driver { 14 | /// turmoil I/O handle for sending messages 15 | pub(crate) io: Io, 16 | 17 | /// Log entries 18 | pub(crate) log: Vec>, 19 | 20 | /// Random number generator used by `Raft`, but also shared across the test 21 | /// group. 
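/// Sharing one seeded RNG between the simulation and every node keeps a run
/// reproducible for a given seed.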
22 | pub(crate) rng: Rc>, 23 | } 24 | 25 | impl mini_raft::Driver for Driver { 26 | type Id = SocketAddr; 27 | 28 | fn dispatch(&mut self, dst: Self::Id, message: mini_raft::Message) { 29 | self.io.send(dst, Message::Raft(message)); 30 | } 31 | 32 | fn poll_term_for( 33 | &mut self, 34 | _cx: &mut Context<'_>, 35 | index: Index, 36 | ) -> Poll>> { 37 | let index: usize = index.0.try_into().unwrap(); 38 | let maybe_term = self.log.get(index).map(|entry| entry.pos.term); 39 | Poll::Ready(Ok(maybe_term)) 40 | } 41 | 42 | fn poll_read_entry( 43 | &mut self, 44 | _cx: &mut Context<'_>, 45 | index: Index, 46 | ) -> Poll>>> { 47 | let index = usize::try_from(index.0).unwrap(); 48 | let entry = self.log.get(index).cloned(); 49 | Poll::Ready(Ok(entry)) 50 | } 51 | 52 | fn poll_read_entries( 53 | &mut self, 54 | _cx: &mut Context<'_>, 55 | start: Index, 56 | end: Index, 57 | dst: &mut Vec>, 58 | ) -> Poll> { 59 | let start = usize::try_from(start.0).unwrap(); 60 | // `end` is inclusive, but exclusive for rust slices. 61 | let end = usize::try_from(end.0).unwrap() + 1; 62 | dst.extend_from_slice(&self.log[start..end]); 63 | Poll::Ready(Ok(())) 64 | } 65 | 66 | fn append_entries(&mut self, entries: &[message::Entry]) { 67 | self.log.extend_from_slice(entries); 68 | } 69 | 70 | fn truncate(&mut self, index: Index) { 71 | let index = usize::try_from(index.0).unwrap(); 72 | self.log.truncate(index); 73 | } 74 | 75 | fn rand_election_timeout(&mut self, lower: u64, upper: u64) -> u64 { 76 | use rand::Rng; 77 | 78 | self.rng.borrow_mut().gen_range(lower..upper) 79 | } 80 | } 81 | 82 | impl fmt::Debug for Driver { 83 | fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { 84 | write!(fmt, "Driver {{ .. }}") 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /tests/simulated/src/io.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | 3 | /// Wrapper around a turmoil I/O adding happens-before tracking using vector 4 | /// clocks. 5 | #[derive(Clone)] 6 | pub(crate) struct Io { 7 | turmoil: turmoil::Io, 8 | } 9 | 10 | impl Io { 11 | pub(crate) fn new(turmoil: turmoil::Io) -> Io { 12 | Io { 13 | turmoil, 14 | } 15 | } 16 | 17 | pub(crate) fn addr(&self) -> SocketAddr { 18 | self.turmoil.local_addr() 19 | } 20 | 21 | pub(crate) fn send(&self, host: SocketAddr, elapsed: u64, message: Message) { 22 | if let Some(log) = &self.log { 23 | log.send( 24 | host, 25 | self.addr(), 26 | self.vv.borrow().get(self.addr()), 27 | elapsed, 28 | &message, 29 | ); 30 | } 31 | 32 | self.turmoil.send( 33 | host, 34 | message::Envelope { 35 | vv: self.vv.borrow().clone(), 36 | sender: self.turmoil.addr, 37 | message, 38 | }, 39 | ); 40 | } 41 | 42 | pub(crate) async fn recv(&self, elapsed: u64) -> (Message, SocketAddr) { 43 | let ( 44 | message::Envelope { 45 | vv, 46 | message, 47 | sender, 48 | }, 49 | addr, 50 | ) = self.turmoil.recv().await; 51 | self.join_vv(&vv); 52 | 53 | if let Some(log) = &self.log { 54 | log.recv( 55 | self.addr(), 56 | self.vv.borrow().get(self.addr()), 57 | sender, 58 | vv.get(sender), 59 | elapsed, 60 | &message, 61 | ); 62 | } 63 | 64 | (message, addr) 65 | } 66 | 67 | pub(crate) async fn recv_from(&self, host: SocketAddr) -> Message { 68 | let message::Envelope { vv, message, .. 
} = self.turmoil.recv_from(host).await; 69 | self.join_vv(&vv); 70 | message 71 | } 72 | 73 | fn join_vv(&self, vv: &VersionVec) { 74 | let mut our_vv = self.vv.borrow_mut(); 75 | our_vv.join(vv); 76 | our_vv.inc(self.turmoil.addr); 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /tests/simulated/src/lib.rs: -------------------------------------------------------------------------------- 1 | mod builder; 2 | pub use builder::Builder; 3 | 4 | mod driver; 5 | use driver::Driver; 6 | 7 | pub mod message; 8 | pub use message::Message; 9 | 10 | mod node; 11 | use node::Node; 12 | 13 | use validated::Validated; 14 | 15 | use std::cell::RefCell; 16 | use std::future::Future; 17 | use std::net::SocketAddr; 18 | use std::rc::Rc; 19 | use tokio::time::{self, Instant}; 20 | use turmoil::{Sim, ToSocketAddr}; 21 | 22 | pub struct TestGroup { 23 | /// Handle to raft nodes, enabling introspection 24 | nodes: Vec>>>, 25 | 26 | /// Network simulator 27 | sim: RefCell, 28 | 29 | /// Default client 30 | client: turmoil::Io, 31 | 32 | /// Random number generator seed 33 | seed: [u8; 32], 34 | } 35 | 36 | impl TestGroup { 37 | pub fn builder(num_nodes: usize) -> Builder { 38 | Builder::new(num_nodes) 39 | } 40 | 41 | /// Run a test against this group 42 | pub fn test(&self, f: impl Future) { 43 | self.sim.borrow_mut().run_until(&self.client, f); 44 | } 45 | 46 | /// Not **exactly** a request/response, but does a send followed by a 47 | /// recv_from 48 | pub async fn request(&self, host: impl ToSocketAddr, message: Message) -> Message { 49 | let host = self.lookup(host); 50 | self.send(host, message); 51 | self.recv_from(host).await 52 | } 53 | 54 | /// Send a message to the specified host using the default client 55 | pub fn send(&self, host: impl ToSocketAddr, message: Message) { 56 | let host = self.lookup(host); 57 | self.client.send(host, message); 58 | } 59 | 60 | /// Receive a message from any host 61 | pub async fn recv(&self) -> (Message, SocketAddr) { 62 | self.client.recv().await 63 | } 64 | 65 | /// Receive a message from the specified host. 66 | pub async fn recv_from(&self, host: impl ToSocketAddr) -> Message { 67 | let host = self.lookup(host); 68 | self.client.recv_from(host).await 69 | } 70 | 71 | /// Lookup the associated SocketAddr for the given host name 72 | pub fn lookup(&self, host: impl ToSocketAddr) -> SocketAddr { 73 | self.client.lookup(host) 74 | } 75 | 76 | /// Partition a Raft node from others. 77 | /// 78 | /// Clients will still be able to communicate with the node. 79 | pub fn partition(&self, host: &str) { 80 | let addr = self.client.lookup(host); 81 | 82 | for peer in &self.nodes { 83 | let peer = peer.borrow(); 84 | 85 | if peer.id() != addr { 86 | turmoil::partition(addr, peer.id()); 87 | } 88 | } 89 | } 90 | 91 | /// Repair a Raft node's connection. 
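///
/// This is the inverse of `partition`: it restores the links between `host`
/// and every peer in the group.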
92 | pub fn repair(&self, host: &str) { 93 | let addr = self.client.lookup(host); 94 | 95 | for peer in &self.nodes { 96 | let peer = peer.borrow(); 97 | 98 | if peer.id() != addr { 99 | turmoil::repair(addr, peer.id()); 100 | } 101 | } 102 | } 103 | 104 | /// Returns the info for the Raft node with the given host name 105 | pub fn info(&self, host: impl ToSocketAddr) -> mini_raft::Info { 106 | let addr = self.client.lookup(host); 107 | 108 | for node in &self.nodes { 109 | let node = node.borrow(); 110 | 111 | if node.id() == addr { 112 | return node.info(); 113 | } 114 | } 115 | 116 | panic!("no Raft node with host name {}", addr); 117 | } 118 | } 119 | 120 | impl Drop for TestGroup { 121 | fn drop(&mut self) { 122 | if std::thread::panicking() { 123 | println!("RANDOM NUMBER GENERATOR SEED = {:?}", self.seed); 124 | } 125 | } 126 | } 127 | 128 | fn init_tracing() { 129 | use tracing_subscriber::filter::EnvFilter; 130 | use tracing_subscriber::fmt::format::FmtSpan; 131 | 132 | let _ = tracing_subscriber::fmt() 133 | .with_env_filter(EnvFilter::from_default_env()) 134 | .with_test_writer() 135 | .with_span_events(FmtSpan::NEW) 136 | .try_init(); 137 | } 138 | -------------------------------------------------------------------------------- /tests/simulated/src/message.rs: -------------------------------------------------------------------------------- 1 | use mini_raft::*; 2 | 3 | use std::net::SocketAddr; 4 | 5 | #[derive(Clone, Debug, serde::Serialize)] 6 | pub enum Message { 7 | /// The client is proposing a new value to apply to the log 8 | Propose { value: message::Value }, 9 | 10 | /// The result of proposing the new value 11 | ProposeResult { result: ProposeResult }, 12 | 13 | /// Get the log entry at the given position 14 | Read { pos: Option }, 15 | 16 | /// Get the log entry at the given index 17 | ReadIdx { idx: Index }, 18 | 19 | /// Result of the read request 20 | ReadResult { 21 | pos: Pos, 22 | value: message::Value, 23 | }, 24 | 25 | /// Wait until a leader is elected for the given term 26 | WaitLeader { term: Term }, 27 | 28 | /// Result of the WaitLeader request 29 | WaitLeaderResponse { info: Info }, 30 | 31 | /// A Raft-internal message 32 | Raft(message::Message), 33 | } 34 | 35 | #[derive(Debug, Clone, serde::Serialize)] 36 | pub enum ProposeResult { 37 | Ok(Pos), 38 | Failed, 39 | NotLeader(SocketAddr), 40 | } 41 | 42 | #[derive(Debug)] 43 | pub struct ReadResult { 44 | pub pos: Pos, 45 | pub value: message::Value, 46 | } 47 | 48 | impl turmoil::Message for Message { 49 | fn write_json(&self, dst: &mut dyn std::io::Write) { 50 | serde_json::to_writer_pretty(dst, self).unwrap(); 51 | } 52 | } 53 | 54 | impl Message { 55 | /// Return a new message proposing to add a new node to the group 56 | pub fn add_observer(id: SocketAddr) -> Message { 57 | Message::Propose { 58 | value: message::Value::Config(message::ConfigChange::AddNode { 59 | id, 60 | auto_upgrade: false, 61 | }), 62 | } 63 | } 64 | 65 | /// Return a new message proposing to add a new follower node to the group. 66 | /// 67 | /// The node is first added as an observer with the "auto_upgrade" flag set. 68 | /// Once the new node has received the log, it is then automatically 69 | /// upgraded to follower. 
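///
/// In the log this shows up as `upgrade_node_phase_one` followed by
/// `upgrade_node_phase_two` (see `single_leader_add_follower` in test_raft.rs).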
70 | pub fn add_follower(id: SocketAddr) -> Message { 71 | Message::Propose { 72 | value: message::Value::Config(message::ConfigChange::AddNode { 73 | id, 74 | auto_upgrade: true, 75 | }), 76 | } 77 | } 78 | 79 | /// Return a new message proposing to remove a follower 80 | pub fn remove_follower(id: SocketAddr) -> Message { 81 | Message::Propose { 82 | value: message::Value::Config(message::ConfigChange::RemoveNode { 83 | id, 84 | phase: message::Phase::One, 85 | }), 86 | } 87 | } 88 | 89 | /// Return a new message proposing an opaque data value to the Raft group. 90 | pub fn propose_data(data: impl AsRef<[u8]>) -> Message { 91 | Message::Propose { 92 | value: message::Value::Data(data.as_ref().to_vec()), 93 | } 94 | } 95 | 96 | pub fn read(pos: Pos) -> Message { 97 | Message::Read { pos: Some(pos) } 98 | } 99 | 100 | pub fn read_idx(idx: Index) -> Message { 101 | Message::ReadIdx { idx } 102 | } 103 | 104 | pub fn read_latest() -> Message { 105 | Message::Read { pos: None } 106 | } 107 | 108 | pub fn wait_leader(term: Term) -> Message { 109 | Message::WaitLeader { term } 110 | } 111 | 112 | pub fn to_propose_value(self) -> message::Value { 113 | match self { 114 | Message::Propose { value } => value, 115 | _ => panic!("expecting Message::Propose; actual={:?}", self), 116 | } 117 | } 118 | 119 | /// Assume the message is of variant `ProposeResult` 120 | pub fn to_propose_result(self) -> Result> { 121 | match self { 122 | Message::ProposeResult { result } => match result { 123 | ProposeResult::Ok(pos) => Ok(pos), 124 | ProposeResult::Failed => Err(ProposeError::FailedToCommit), 125 | ProposeResult::NotLeader(leader) => Err(ProposeError::NotLeader(Some(leader))), 126 | }, 127 | Message::WaitLeaderResponse { info } => Err(ProposeError::NotLeader(info.group.leader)), 128 | _ => panic!("expected Message::ProposeResult; actual={:?}", self), 129 | } 130 | } 131 | 132 | pub fn to_read_result(self) -> ReadResult { 133 | match self { 134 | Message::ReadResult { pos, value } => ReadResult { pos, value }, 135 | _ => panic!("expected Message::ReadResult; actual={:?}", self), 136 | } 137 | } 138 | 139 | pub fn to_read_value(self) -> message::Value { 140 | self.to_read_result().value 141 | } 142 | 143 | pub fn to_info(self) -> Info { 144 | match self { 145 | Message::WaitLeaderResponse { info } => info, 146 | _ => panic!("expected Message::WaitLeaderResponse; actual={:?}", self), 147 | } 148 | } 149 | } 150 | -------------------------------------------------------------------------------- /tests/simulated/src/node.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | use mini_raft::{Index, Pos, ProposeError, Term}; 3 | use validated::Validated; 4 | 5 | use futures::executor::block_on; 6 | use std::cell::RefCell; 7 | use std::rc::Rc; 8 | use tokio::time::Duration; 9 | use turmoil::Io; 10 | 11 | /// Simulated Raft node 12 | pub(crate) struct Node { 13 | /// Handle to the Raft state machine for this node 14 | raft: RaftRef, 15 | 16 | /// This node's host name 17 | host: String, 18 | 19 | /// Handle for receiving and sending messages 20 | io: Io, 21 | 22 | /// Last applied position 23 | last_applied: Option, 24 | 25 | /// In-flight requests 26 | requests: Vec, 27 | } 28 | 29 | enum Request { 30 | Propose { pos: Pos, sender: SocketAddr }, 31 | Read { pos: Pos, sender: SocketAddr }, 32 | ReadIdx { idx: Index, sender: SocketAddr }, 33 | WaitLeader { term: Term, sender: SocketAddr }, 34 | } 35 | 36 | type RaftRef = Rc>>; 37 | 38 | impl Node { 39 | pub(crate) 
fn new(raft: RaftRef, host: String, io: Io) -> Node { 40 | Node { 41 | raft, 42 | host, 43 | io, 44 | last_applied: None, 45 | requests: vec![], 46 | } 47 | } 48 | 49 | pub(crate) async fn run(&mut self) { 50 | use futures::FutureExt; 51 | 52 | // Tick once to prime the pump 53 | let tick = { 54 | let mut raft = self.raft.borrow_mut(); 55 | raft.set_now(Instant::now()); 56 | block_on(raft.tick()).unwrap() 57 | }; 58 | 59 | let when = tick 60 | .tick_at 61 | .unwrap_or(Instant::now() + Duration::from_secs(10)); 62 | let mut sleep = Box::pin(time::sleep_until(when)); 63 | let mut entries = vec![]; 64 | 65 | loop { 66 | tracing::trace!(?self.host, "Sim::tick;"); 67 | 68 | futures::select_biased! { 69 | (message, sender) = self.io.recv().fuse() => { 70 | self.raft.borrow_mut().set_now(Instant::now()); 71 | self.recv(message, sender); 72 | } 73 | _ = (&mut sleep).fuse() => { 74 | self.raft.borrow_mut().set_now(Instant::now()); 75 | } 76 | } 77 | 78 | // Tick raft 79 | let tick = block_on(self.raft.borrow_mut().tick()).unwrap(); 80 | let when = tick 81 | .tick_at 82 | .unwrap_or(Instant::now() + Duration::from_secs(10)); 83 | sleep.as_mut().reset(when); 84 | 85 | // Find newly comitted entries 86 | entries.clear(); 87 | block_on( 88 | self.raft 89 | .borrow_mut() 90 | .copy_committed_entries_to(&mut entries), 91 | ) 92 | .unwrap(); 93 | 94 | // Track entries as applied 95 | for entry in &entries { 96 | assert!( 97 | Some(entry.pos) > self.last_applied, 98 | "re-applying commits; last-applied={:?}; entry={:?}", 99 | self.last_applied, 100 | entry 101 | ); 102 | self.raft.borrow_mut().applied_to(entry.pos); 103 | self.last_applied = Some(entry.pos); 104 | } 105 | 106 | // Try completing requests 107 | self.process_requests(); 108 | } 109 | } 110 | 111 | fn process_requests(&mut self) { 112 | use Request::*; 113 | 114 | self.requests.retain_mut(|request| match request { 115 | Propose { pos, sender } => { 116 | if Some(*pos) <= self.last_applied { 117 | let maybe_entry = block_on(self.raft.borrow_mut().get(*pos)).unwrap(); 118 | 119 | if let Some(entry) = maybe_entry { 120 | self.io.send( 121 | *sender, 122 | Message::ProposeResult { 123 | result: message::ProposeResult::Ok(entry.pos), 124 | }, 125 | ); 126 | } else { 127 | self.io.send( 128 | *sender, 129 | Message::ProposeResult { 130 | result: message::ProposeResult::Failed, 131 | }, 132 | ); 133 | } 134 | false 135 | } else { 136 | true 137 | } 138 | } 139 | Read { pos, sender } => { 140 | if Some(*pos) <= self.last_applied { 141 | let entry = match block_on(self.raft.borrow_mut().get(*pos)).unwrap() { 142 | Some(entry) => entry, 143 | None => { 144 | panic!( 145 | "expected to read pos {:?}; host={:?}; last_applied={:?}", 146 | pos, self.host, self.last_applied 147 | ); 148 | } 149 | }; 150 | self.io.send( 151 | *sender, 152 | Message::ReadResult { 153 | pos: *pos, 154 | value: entry.value, 155 | }, 156 | ); 157 | false 158 | } else { 159 | true 160 | } 161 | } 162 | ReadIdx { idx, sender } => { 163 | let last_applied_index = self.last_applied.map(|pos| pos.index); 164 | 165 | if Some(*idx) <= last_applied_index { 166 | let entry = block_on(self.raft.borrow_mut().get_index(*idx)) 167 | .unwrap() 168 | .unwrap(); 169 | self.io.send( 170 | *sender, 171 | Message::ReadResult { 172 | pos: entry.pos, 173 | value: entry.value, 174 | }, 175 | ); 176 | false 177 | } else { 178 | true 179 | } 180 | } 181 | WaitLeader { term, sender } => { 182 | let info = self.raft.borrow().info(); 183 | 184 | if info.term >= *term && info.group.leader.is_some() { 185 | 
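// A leader is now known for (at least) the requested term; answer the
// waiting client with the current group info.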
185 | self.io.send(*sender, Message::WaitLeaderResponse { info }); 186 | false 187 | } else { 188 | true 189 | } 190 | } 191 | }); 192 | } 193 | 194 | fn recv(&mut self, msg: Message, sender: SocketAddr) { 195 | tracing::debug!(?self.host, ?msg, " stream.recv()"); 196 | 197 | match msg { 198 | Message::Propose { value } => { 199 | let res = block_on(self.raft.borrow_mut().propose(value)); 200 | match res { 201 | Ok(pos) => { 202 | self.requests.push(Request::Propose { pos, sender }); 203 | } 204 | Err(ProposeError::NotLeader(Some(leader))) => { 205 | self.io.send( 206 | sender, 207 | Message::ProposeResult { 208 | result: message::ProposeResult::NotLeader(leader), 209 | }, 210 | ); 211 | } 212 | Err(ProposeError::NotLeader(None)) => { 213 | let term = self.raft.borrow().term(); 214 | self.requests.push(Request::WaitLeader { 215 | term, 216 | sender, 217 | }); 218 | } 219 | Err(ProposeError::TooManyUncommitted) => { 220 | self.io.send( 221 | sender, 222 | Message::ProposeResult { 223 | result: message::ProposeResult::Failed, 224 | }, 225 | ); 226 | } 227 | Err(e) => todo!("{:?}", e), 228 | } 229 | } 230 | Message::Read { pos: Some(pos) } => { 231 | self.requests.push(Request::Read { pos, sender }); 232 | } 233 | Message::ReadIdx { idx } => { 234 | self.requests.push(Request::ReadIdx { idx, sender }); 235 | } 236 | Message::Raft(message) => { 237 | block_on(self.raft.borrow_mut().receive(message)).unwrap(); 238 | } 239 | Message::WaitLeader { term } => { 240 | self.requests.push(Request::WaitLeader { term, sender }); 241 | } 242 | msg => unimplemented!("this message is only sent to clients; msg={:?}", msg), 243 | } 244 | } 245 | } 246 | -------------------------------------------------------------------------------- /tests/simulated/tests/test_raft.rs: -------------------------------------------------------------------------------- 1 | use mini_raft::{message, ProposeError}; 2 | use simulated::*; 3 | use std::time::Duration; 4 | 5 | #[test] 6 | fn single_node_group() { 7 | TestGroup::builder(1).fuzz(|group| { 8 | group.test(async { 9 | // The first node should *start* as a leader 10 | assert!(group.info("raft0").stage.is_leader()); 11 | 12 | group.send("raft0", Message::propose_data(b"hello world")); 13 | 14 | let message = group.recv_from("raft0").await; 15 | assert_eq!(message.to_propose_result().unwrap().index, 1); 16 | }); 17 | }); 18 | } 19 | 20 | #[test] 21 | fn single_leader_add_observer() { 22 | TestGroup::builder(2) 23 | .simulation_duration(Duration::from_millis(600)) 24 | .seed([ 25 | 196, 161, 95, 121, 61, 10, 141, 232, 156, 23, 115, 29, 133, 247, 37, 129, 48, 255, 49, 26 | 88, 112, 73, 226, 10, 185, 124, 253, 186, 49, 180, 36, 254, 27 | ]) 28 | .fuzz(|group| { 29 | group.test(async { 30 | // The first node should *start* as a leader 31 | assert!(group.info("raft0").stage.is_leader()); 32 | assert!(group.info("raft1").stage.is_observer()); 33 | 34 | let propose = Message::add_observer(group.lookup("raft1")); 35 | 36 | // Add the second node to the group 37 | group.send("raft0", propose.clone()); 38 | 39 | // Wait for the config change to be added to the log 40 | let pos = group.recv_from("raft0").await.to_propose_result().unwrap(); 41 | 42 | // Read from the observer at the same log position 43 | group.send("raft1", Message::read(pos)); 44 | 45 | let message = group.recv_from("raft1").await.to_read_value(); 46 | assert_eq!(message, propose.to_propose_value()); 47 | 48 | // Propose a second value 49 | let value = message::Value::Data(b"hello world".to_vec()); 50 | group.send( 51 |
"raft0", 52 | Message::Propose { 53 | value: value.clone(), 54 | }, 55 | ); 56 | 57 | // Wait for the message to commit 58 | let message = group.recv_from("raft0").await; 59 | 60 | // Read from the observer 61 | group.send("raft1", Message::read(message.to_propose_result().unwrap())); 62 | 63 | let message = group.recv_from("raft1").await.to_read_value(); 64 | assert_eq!(message, value); 65 | }); 66 | }); 67 | } 68 | 69 | #[test] 70 | fn single_leader_add_follower() { 71 | TestGroup::builder(2).fuzz(|group| { 72 | group.test(async { 73 | let raft1 = group.lookup("raft1"); 74 | 75 | // The first node should *start* as a leader 76 | assert!(group.info("raft0").stage.is_leader()); 77 | assert!(group.info("raft1").stage.is_observer()); 78 | 79 | let propose = Message::add_follower(raft1); 80 | 81 | // Add the second node to the group 82 | group.send("raft0", propose.clone()); 83 | 84 | // Wait for the config change to be added to the log 85 | let mut pos = group.recv_from("raft0").await.to_propose_result().unwrap(); 86 | 87 | // Read from the observer at the same log position 88 | group.send("raft1", Message::read(pos)); 89 | 90 | let message = group.recv_from("raft1").await.to_read_value(); 91 | assert_eq!(message, propose.to_propose_value()); 92 | 93 | // Read the second value, which should be the *upgrade* request 94 | pos.index += 1; 95 | group.send("raft1", Message::read(pos)); 96 | 97 | let message = group.recv_from("raft1").await.to_read_value(); 98 | assert_eq!(message, message::Value::upgrade_node_phase_one(raft1)); 99 | 100 | // Read the *third* value, which is the phase-two *upgrade* message 101 | pos.index += 1; 102 | group.send("raft1", Message::read(pos)); 103 | 104 | let message = group.recv_from("raft1").await.to_read_value(); 105 | assert_eq!(message, message::Value::upgrade_node_phase_two(raft1)); 106 | 107 | // Check that the second node has become a voter. 108 | assert!(group.info("raft0").group[raft1].is_stable_voter()); 109 | 110 | // Check that the second node sees itself as a voter 111 | assert!(group.info("raft1").group[raft1].is_stable_voter()); 112 | }); 113 | }); 114 | } 115 | 116 | #[test] 117 | fn single_leader_add_2_followers() { 118 | TestGroup::builder(3).fuzz(|group| { 119 | group.test(async move { 120 | add_two_followers(&group).await; 121 | }); 122 | }); 123 | } 124 | 125 | #[test] 126 | fn add_2_followers_remove_one() { 127 | TestGroup::builder(3).fuzz(|group| { 128 | group.test(async move { 129 | add_two_followers(&group).await; 130 | 131 | // Remove "raft1" 132 | let addr = group.lookup("raft1"); 133 | let mut pos = group 134 | .request("raft0", Message::remove_follower(addr)) 135 | .await 136 | .to_propose_result() 137 | .unwrap(); 138 | 139 | // The next entry should be the second phase in the removal process. 
140 | pos.index += 1; 141 | 142 | let phase2 = group 143 | .request("raft0", Message::read(pos)) 144 | .await 145 | .to_read_value(); 146 | assert_eq!(phase2, message::Value::remove_node_phase_two(addr)); 147 | }); 148 | }); 149 | } 150 | 151 | #[test] 152 | #[ignore] 153 | fn repeated_failure_1() { 154 | // This test assumes that links fail 155 | TestGroup::builder(3) 156 | .max_message_latency(Duration::from_millis(1000)) 157 | .simulation_duration(Duration::from_secs(200)) 158 | .tick_duration(Duration::from_millis(10)) 159 | // .log("repeated_failure.log") 160 | .seed([ 161 | 67, 153, 15, 136, 147, 29, 83, 97, 147, 240, 121, 113, 130, 72, 2, 176, 198, 17, 187, 162 | 27, 100, 133, 166, 82, 208, 74, 63, 51, 141, 13, 70, 52, 163 | ]) 164 | .fuzz(|group| { 165 | group.test(async move { 166 | add_two_followers(&group).await; 167 | 168 | let mut term = group.info("raft0").term; 169 | 170 | // Propose a message every few elections 171 | for _ in 0..3 { 172 | let mut leader = group 173 | .request("raft0", Message::wait_leader(term + 3)) 174 | .await 175 | .to_info() 176 | .group 177 | .leader 178 | .unwrap(); 179 | 180 | term = group.info("raft0").term; 181 | 182 | // Propose some data 183 | let data = format!("data {:?}", term); 184 | 185 | let pos = loop { 186 | let message = group 187 | .request(leader, Message::propose_data(data.as_bytes())) 188 | .await; 189 | 190 | match message.to_propose_result() { 191 | Ok(pos) => break pos, 192 | Err(ProposeError::NotLeader(Some(new_leader))) => { 193 | leader = new_leader; 194 | } 195 | Err(ProposeError::NotLeader(None)) => { 196 | let not_leader_term = group.info(leader).term; 197 | leader = group 198 | .request(leader, Message::wait_leader(not_leader_term)) 199 | .await 200 | .to_info() 201 | .group 202 | .leader 203 | .unwrap(); 204 | } 205 | Err(_) => todo!(), 206 | } 207 | }; 208 | 209 | // Wait until synced on all hosts 210 | for host in ["raft0", "raft1", "raft2"] { 211 | let resp = group.request(host, Message::read(pos)).await; 212 | assert_eq!( 213 | message::Value::data(data.as_bytes()), 214 | resp.to_read_result().value 215 | ); 216 | } 217 | } 218 | }); 219 | }); 220 | } 221 | 222 | #[test] 223 | fn repeated_failure_2() { 224 | // This test assumes that links fail 225 | TestGroup::builder(3) 226 | .duration(Duration::from_secs(100)) 227 | .max_message_latency(Duration::from_millis(750)) 228 | .simulation_duration(Duration::from_secs(50_000)) 229 | .tick_duration(Duration::from_millis(10)) 230 | .fuzz(|group| { 231 | group.test(async move { 232 | add_two_followers(&group).await; 233 | 234 | let mut term = group.info("raft0").term; 235 | 236 | // Propose a message every few elections 237 | for _ in 0..3 { 238 | let mut leader = group 239 | .request("raft0", Message::wait_leader(term + 3)) 240 | .await 241 | .to_info() 242 | .group 243 | .leader 244 | .unwrap(); 245 | 246 | term = group.info("raft0").term; 247 | 248 | // Propose some data 249 | let data = format!("data {:?}", term); 250 | 251 | let pos = loop { 252 | let message = group 253 | .request(leader, Message::propose_data(data.as_bytes())) 254 | .await; 255 | 256 | match message.to_propose_result() { 257 | Ok(pos) => break pos, 258 | Err(ProposeError::NotLeader(Some(new_leader))) => { 259 | leader = new_leader; 260 | } 261 | Err(ProposeError::NotLeader(None)) => { 262 | let not_leader_term = group.info(leader).term; 263 | leader = group 264 | .request(leader, Message::wait_leader(not_leader_term)) 265 | .await 266 | .to_info() 267 | .group 268 | .leader 269 | .unwrap(); 270 | } 271 
| Err(ProposeError::FailedToCommit) => {} 272 | Err(_) => todo!(), 273 | } 274 | }; 275 | 276 | // Wait until synced on all hosts 277 | for host in ["raft0", "raft1", "raft2"] { 278 | let resp = group.request(host, Message::read(pos)).await; 279 | assert_eq!( 280 | message::Value::data(data.as_bytes()), 281 | resp.to_read_result().value 282 | ); 283 | } 284 | } 285 | }); 286 | }); 287 | } 288 | 289 | #[test] 290 | fn leader_disconnect() { 291 | TestGroup::builder(3).fuzz(|group| { 292 | group.test(async move { 293 | add_two_followers(&group).await; 294 | 295 | // Read the latest log position 296 | let last_applied = group.info("raft0").last_applied.unwrap(); 297 | 298 | group.partition("raft0"); 299 | 300 | // Wait for a leader to be elected in the next term 301 | group.send("raft1", Message::wait_leader(last_applied.term + 1)); 302 | 303 | let message = group.recv_from("raft1").await.to_info(); 304 | let leader = message.group.leader.unwrap(); 305 | 306 | // Propose some data 307 | let message = group 308 | .request(leader, Message::propose_data(b"hello world")) 309 | .await; 310 | let pos = message.to_propose_result().unwrap(); 311 | 312 | // The value should be committed 313 | for host in ["raft1", "raft2"] { 314 | let resp = group.request(host, Message::read(pos)).await; 315 | assert_eq!( 316 | message::Value::data(&b"hello world"[..]), 317 | resp.to_read_result().value 318 | ); 319 | } 320 | 321 | // Bring "raft0" back. It should become a follower and catch up 322 | 323 | group.repair("raft0"); 324 | 325 | let resp = group.request("raft0", Message::read(pos)).await; 326 | assert_eq!( 327 | message::Value::data(&b"hello world"[..]), 328 | resp.to_read_result().value 329 | ); 330 | }); 331 | }) 332 | } 333 | 334 | async fn add_two_followers(group: &TestGroup) { 335 | // The first node should *start* as a leader 336 | assert!(group.info("raft0").stage.is_leader()); 337 | 338 | let mut leader = group.lookup("raft0"); 339 | 340 | for host in ["raft1", "raft2"] { 341 | let addr = group.lookup(host); 342 | let propose = Message::add_follower(addr); 343 | 344 | assert!(group.info(host).stage.is_observer()); 345 | 346 | // Wait for the config change to be added to the log 347 | let pos = loop { 348 | group.send(leader, propose.clone()); 349 | match group.recv_from(leader).await.to_propose_result() { 350 | Ok(pos) => break pos, 351 | Err(ProposeError::FailedToCommit) => { 352 | continue; 353 | } 354 | Err(ProposeError::NotLeader(l)) => { 355 | leader = l.unwrap(); 356 | } 357 | Err(e) => panic!("error={:?}", e), 358 | } 359 | }; 360 | 361 | // Read from the observer at the same log position 362 | group.send(host, Message::read(pos)); 363 | 364 | let message = group.recv_from(host).await.to_read_value(); 365 | assert_eq!(message, propose.to_propose_value()); 366 | 367 | // Read the second value, which should be the *upgrade* request 368 | // let index = pos.index + 1; 369 | let message = read_next_config(group, host, pos.index + 1) 370 | .await 371 | .to_read_result(); 372 | 373 | assert_eq!(message.value, message::Value::upgrade_node_phase_one(addr)); 374 | 375 | // Read the *third* value, which is the phase-two *upgrade* message 376 | let message = read_next_config(group, host, message.pos.index + 1) 377 | .await 378 | .to_read_result(); 379 | assert_eq!(message.value, message::Value::upgrade_node_phase_two(addr)); 380 | 381 | // Check that the second node has become a voter. 
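// (Both upgrade phases have been read back at this point, so the node's view of the group membership should be stable.)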
382 | assert!(group.info(host).group[addr].is_stable_voter()); 383 | 384 | let info = group.info(host); 385 | 386 | // Check that the second node sees itself as a voter 387 | assert!(info.group[addr].is_stable_voter()); 388 | 389 | // Check that the second node includes the leader in its group 390 | assert!(info.group[leader].is_stable_voter()); 391 | } 392 | } 393 | 394 | async fn read_next_config(group: &TestGroup, host: &str, mut index: mini_raft::Index) -> Message { 395 | loop { 396 | group.send(host, Message::read_idx(index)); 397 | 398 | let message = group.recv_from(host).await; 399 | 400 | match &message { 401 | Message::ReadResult { 402 | value: message::Value::Config(..), 403 | .. 404 | } => { 405 | return message; 406 | } 407 | _ => { 408 | index += 1; 409 | } 410 | } 411 | } 412 | } 413 | -------------------------------------------------------------------------------- /tests/validated/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "validated" 3 | version = "0.1.0" 4 | edition = "2021" 5 | publish = false 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | mini-raft = { path = "../../src/raft" } 11 | tokio = { version = "1.19", features = ["time"] } -------------------------------------------------------------------------------- /tests/validated/src/lib.rs: -------------------------------------------------------------------------------- 1 | use mini_raft::*; 2 | use std::io; 3 | use tokio::time::Instant; 4 | 5 | /// Wrapper around a `Raft` instance that performs extra validation of the state. 6 | pub struct Validated<T: Driver> { 7 | raft: Raft<T>, 8 | } 9 | 10 | impl<T: Driver> Validated<T> { 11 | pub fn new(raft: Raft<T>) -> Validated<T> { 12 | Validated { raft } 13 | } 14 | 15 | pub fn id(&self) -> T::Id { 16 | self.raft.id() 17 | } 18 | 19 | pub fn set_now(&mut self, now: Instant) { 20 | self.raft.set_now(now); 21 | } 22 | 23 | pub async fn propose( 24 | &mut self, 25 | value: message::Value<T::Id>, 26 | ) -> Result<Pos, ProposeError<T::Id>> { 27 | self.raft.propose(value).await 28 | } 29 | 30 | pub async fn receive(&mut self, message: Message<T::Id>) -> io::Result<()> { 31 | let info = self.info(); 32 | let term = info.term; 33 | let stage = info.stage; 34 | let should_inc_term = self.should_inc_term(stage, term, &message); 35 | 36 | let ret = self.raft.receive(message.clone()).await; 37 | let new_term = self.info().term; 38 | 39 | match should_inc_term { 40 | Some(true) => { 41 | assert_eq!( 42 | new_term, message.origin.term, 43 | "expected term to be updated to message term; stage={:?}; {:#?}", 44 | stage, message 45 | ) 46 | } 47 | Some(false) => { 48 | assert_eq!( 49 | new_term, term, 50 | "expected term to remain the same; stage={:?}; {:#?}", 51 | stage, message 52 | ); 53 | } 54 | None => {} 55 | } 56 | 57 | ret 58 | } 59 | 60 | /// Copies committed entries to the provided buffer 61 | pub async fn copy_committed_entries_to( 62 | &mut self, 63 | dst: &mut Vec<message::Entry<T::Id>>, 64 | ) -> io::Result<()> { 65 | self.raft.copy_committed_entries_to(dst).await 66 | } 67 | 68 | /// Advance the "applied" cursor 69 | pub fn applied_to(&mut self, pos: Pos) { 70 | self.raft.applied_to(pos) 71 | } 72 | 73 | /// Read the value at the specific position 74 | pub async fn get(&mut self, pos: Pos) -> io::Result<Option<message::Entry<T::Id>>> { 75 | self.raft.get(pos).await 76 | } 77 | 78 | pub async fn get_index(&mut self, index: Index) -> io::Result<Option<message::Entry<T::Id>>> { 79 | self.raft.get_index(index).await 80 | } 81 | 82 | /// Returns the node's current term 83 | pub fn term(&self) -> Term {
84 | self.raft.term() 85 | } 86 | 87 | pub async fn tick(&mut self) -> io::Result<Tick> { 88 | self.raft.tick().await 89 | } 90 | 91 | /// Returns a reference to the driver 92 | pub fn driver(&self) -> &T { 93 | self.raft.driver() 94 | } 95 | 96 | /// Returns a mutable reference to the driver 97 | pub fn driver_mut(&mut self) -> &mut T { 98 | self.raft.driver_mut() 99 | } 100 | 101 | pub fn info(&self) -> Info<T::Id> { 102 | self.raft.info() 103 | } 104 | 105 | fn should_inc_term( 106 | &self, 107 | _stage: info::Stage, 108 | term: Term, 109 | message: &Message<T::Id>, 110 | ) -> Option<bool> { 111 | match &message.action { 112 | // Incrementing the term for a vote message is somewhat involved, skip for now. 113 | message::Action::Vote(_) | message::Action::PreVote(_) => None, 114 | message::Action::PreVoteResponse(response) => { 115 | Some(!response.granted && message.origin.term > term) 116 | } 117 | _ if message.origin.term > term => Some(true), 118 | _ => Some(false), 119 | } 120 | } 121 | } 122 | --------------------------------------------------------------------------------