├── .github └── workflows │ └── rust.yml ├── .gitignore ├── .rustfmt.toml ├── .travis.yml ├── Cargo.toml ├── LICENSE ├── README.md ├── build.rs ├── design.md └── src ├── async_ch.rs ├── async_rt.rs ├── conf_change ├── conf_change.rs ├── datadriven_test.rs ├── mod.rs ├── quick_test.rs ├── restore.rs └── testdata │ ├── joint_autoleave.txt │ ├── joint_idempotency.txt │ ├── joint_learners_next.txt │ ├── joint_safety.txt │ ├── simple_idempotency.txt │ ├── simple_promote_demote.txt │ ├── simple_safety.txt │ ├── update.txt │ └── zero.txt ├── lib.rs ├── node.rs ├── nom_data_test └── mod.rs ├── paper_test.rs ├── protocol ├── mod.rs └── raft.proto ├── quorum ├── data_driven_test.rs ├── joint.rs ├── majority.rs ├── majority_vote.txt ├── mod.rs ├── quick_test.rs ├── quorum.rs └── testdata │ ├── joint_commit.txt │ ├── joint_vote.txt │ ├── majority_commit.txt │ └── majority_vote.txt ├── raft.rs ├── raft_flow_control_test.rs ├── raft_log.rs ├── raft_snap_test.rs ├── raft_test.rs ├── raftpb ├── .gitignore ├── gogoproto │ ├── .gitignore │ └── gogo.proto ├── mod.rs └── raft.proto ├── rawnode.rs ├── read_only.rs ├── status.rs ├── storage.rs ├── tests_util.rs ├── tracker ├── inflights.rs ├── mod.rs ├── progress.rs └── state.rs ├── unstable.rs └── util └── mod.rs /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: Rust 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | build: 11 | 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v2 16 | - name: Install latest nightly 17 | uses: actions-rs/toolchain@v1 18 | with: 19 | toolchain: nightly 20 | override: true 21 | 22 | - name: Install Protoc 23 | uses: arduino/setup-protoc@master 24 | 25 | - name: Build 26 | run: cargo build --verbose 27 | - name: Run tests 28 | run: RUST_LOG=debug cargo test --verbose -- --test-threads=1 29 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | /target/ 4 | 5 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 6 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 7 | Cargo.lock 8 | 9 | # These are backup files generated by rustfmt 10 | **/*.rs.bk 11 | -------------------------------------------------------------------------------- /.rustfmt.toml: -------------------------------------------------------------------------------- 1 | reorder_imports = true -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: rust 2 | rust: 3 | - nightly 4 | jobs: 5 | allow_failures: 6 | - rust: nightly 7 | fast_finish: true 8 | install: 9 | - curl -L https://github.com/google/protobuf/releases/download/v3.3.0/protoc-3.3.0-linux-x86_64.zip -o /tmp/protoc.zip 10 | - unzip /tmp/protoc.zip -d $HOME/protoc 11 | env: 12 | - PATH=$HOME/protoc/bin:$PATH 13 | script: 14 | - cargo build --verbose --all 15 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "eraft-rs" 3 | version = "0.1.0" 4 | authors = ["Rg"] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | log = "0.4.14" 11 | protobuf = { version = "2.27.1", features = ["with-bytes"] } 12 | bytes = "1.1.0" 13 | anyhow = "1.0.53" 14 | thiserror = "1.0.30" 15 | getset = "0.1.2" 16 | nom = "7.1.0" 17 | chrono = "0.4.19" 18 | env_logger = "0.9.0" 19 | rand = "0.8.5" 20 | serde = { 
version = "1.0.136", features = ["derive"] } 21 | serde_json = "1.0.79" 22 | tokio = { version = "1.16.1", features = ["full"] } 23 | futures = { version = "0.3.21", default-features = true } 24 | lazy_static = "1.4.0" 25 | async-channel = "1.6.1" 26 | async-io = "1.6.0" 27 | async-trait = "0.1.52" 28 | [dev-dependencies] 29 | maplit = "1.0.2" 30 | env_logger = "0.9.0" 31 | [build-dependencies] 32 | protoc-rust = "2.27.1" 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # eraft-rs 2 | eraft-rs is raft component of etcd-rs [![Build Status](https://travis-ci.org/laohanlinux/eraft-rs.svg?branch=master)](https://travis-ci.org/laohanlinux/eraft-rs) 3 | -------------------------------------------------------------------------------- /build.rs: -------------------------------------------------------------------------------- 1 | use protoc_rust::Customize; 2 | 3 | fn main() { 4 | protoc_rust::Codegen::new() 5 | .out_dir("src/raftpb") 6 | .inputs(&["src/raftpb/raft.proto"]) 7 | .includes(&["src/raftpb"]) 8 | .customize(protoc_rust::Customize { 9 | carllerche_bytes_for_bytes: Some(true), 10 | carllerche_bytes_for_string: Some(true), 11 | ..Default::default() 12 | }) 13 | .run() 14 | .expect("protoc"); 15 | } 16 | -------------------------------------------------------------------------------- /design.md: -------------------------------------------------------------------------------- 1 | ## Progress 2 | 3 | Progress 
represents a follower’s progress in the view of the leader. Leader maintains progresses of all followers, and sends `replication message` to the follower based on its progress. 4 | 5 | `replication message` is a `msgApp` with log entries. 6 | 7 | A progress has two attributes: `match` and `next`. `match` is the index of the highest known matched entry. If leader knows nothing about follower’s replication status, `match` is set to zero. `next` is the index of the first entry that will be replicated to the follower. Leader puts entries from `next` to its latest one in next `replication message`. 8 | 9 | A progress is in one of three states: `probe`, `replicate`, `snapshot`. 10 | 11 | ``` 12 | +--------------------------------------------------------+ 13 | | send snapshot | 14 | | | 15 | +---------+----------+ +----------v---------+ 16 | +---> probe | | snapshot | 17 | | | max inflight = 1 <----------------------------------+ max inflight = 0 | 18 | | +---------+----------+ +--------------------+ 19 | | | 1. snapshot success 20 | | | (next=snapshot.index + 1) 21 | | | 2. snapshot failure 22 | | | (no change) 23 | | | 3. receives msgAppResp(rej=false&&index>lastsnap.index) 24 | | | (match=m.index,next=match+1) 25 | receives msgAppResp(rej=true) 26 | (next=match+1)| | 27 | | | 28 | | | 29 | | | receives msgAppResp(rej=false&&index>match) 30 | | | (match=m.index,next=match+1) 31 | | | 32 | | | 33 | | | 34 | | +---------v----------+ 35 | | | replicate | 36 | +---+ max inflight = n | 37 | +--------------------+ 38 | ``` 39 | 40 | When the progress of a follower is in `probe` state, leader sends at most one `replication message` per heartbeat interval. The leader sends `replication message` slowly, probing the actual progress of the follower. A `msgHeartbeatResp` or a `msgAppResp` with a rejection might trigger the sending of the next `replication message`. 
41 | 42 | When the progress of a follower is in `replicate` state, leader sends `replication message`, then optimistically increases `next` to the latest entry sent. This is an optimized state for quickly replicating log entries to the follower. 43 | 44 | When the progress of a follower is in `snapshot` state, leader stops sending any `replication message`. 45 | 46 | A newly elected leader sets the progresses of all the followers to `probe` state with `match` = 0 and `next` = last index. The leader slowly (at most once per heartbeat) sends `replication message` to the follower and probes its progress. 47 | 48 | A progress changes to `replicate` when the follower replies with a non-rejection `msgAppResp`, which implies that it has matched the index sent. At this point, leader starts to stream log entries to the follower fast. The progress will fall back to `probe` when the follower replies with a rejection `msgAppResp` or the link layer reports the follower is unreachable. We aggressively reset `next` to `match`+1 since if we receive any `msgAppResp` soon, both `match` and `next` will increase directly to the `index` in `msgAppResp`. (We might end up sending some duplicate entries when aggressively resetting `next` too low; see open question.) 49 | 50 | A progress changes from `probe` to `snapshot` when the follower falls very far behind and requires a snapshot. After sending `msgSnap`, the leader waits until the success, failure or abortion of the previous snapshot sent. The progress will go back to `probe` after the sending result is applied. 51 | 52 | ### Flow Control 53 | 54 | 1. Limit the max size of each message sent. Max should be configurable. 55 | This lowers the cost in probing state, as we limit the size per message, and lowers the penalty when `next` is aggressively decreased too low. 56 | 57 | 2. Limit the # of in-flight messages < N when in `replicate` state. N should be configurable. 
Most implementation will have a sending buffer on top of its actual network transport layer (not blocking raft node). We want to make sure raft does not overflow that buffer, which can cause message dropping and triggering a bunch of unnecessary resending repeatedly. 58 | -------------------------------------------------------------------------------- /src/async_ch.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | use std::time::Duration; 3 | use futures::SinkExt; 4 | use async_channel::{bounded, Sender, Receiver, SendError, RecvError, TryRecvError}; 5 | use env_logger::Env; 6 | use futures::task::SpawnExt; 7 | use tokio::select; 8 | use crate::node::SafeResult; 9 | use crate::raftpb::raft::Message; 10 | 11 | #[derive(Clone)] 12 | pub(crate) struct Channel { 13 | rx: Option>, 14 | tx: Option>, 15 | } 16 | 17 | impl Channel { 18 | pub(crate) fn new(n: usize) -> Self { 19 | let (tx, rx) = bounded(n); 20 | Channel { 21 | rx: Some(rx), 22 | tx: Some(tx), 23 | } 24 | } 25 | async fn try_send(&self, msg: T) -> Result<(), SendError> { 26 | if let Some(tx) = &self.tx { 27 | return tx.send(msg).await; 28 | } 29 | Ok(()) 30 | } 31 | 32 | pub(crate) async fn try_recv(&self) -> Result { 33 | if let Some(rx) = &self.rx { 34 | return rx.try_recv(); 35 | } 36 | Err(TryRecvError::Empty) 37 | } 38 | 39 | pub(crate) async fn recv(&self) -> Result { 40 | let rx = self.rx.as_ref().unwrap(); 41 | rx.recv().await 42 | } 43 | 44 | pub(crate) async fn send(&self, msg: T) -> Result<(), SendError> { 45 | let tx = self.tx.as_ref().unwrap(); 46 | tx.send(msg).await 47 | } 48 | 49 | pub(crate) fn tx(&self) -> Sender { 50 | self.tx.as_ref().unwrap().clone() 51 | } 52 | 53 | pub(crate) fn take_tx(&mut self) -> Option> { 54 | self.tx.take() 55 | } 56 | } 57 | 58 | #[derive(Clone)] 59 | pub(crate) struct MsgWithResult { 60 | m: Option, 61 | ch: Option>>, 62 | } 63 | 64 | impl Default for MsgWithResult { 65 | fn default() -> Self { 66 | 
MsgWithResult { 67 | m: None, 68 | ch: None, 69 | } 70 | } 71 | } 72 | 73 | impl MsgWithResult { 74 | pub fn new() -> Self { 75 | MsgWithResult { 76 | m: None, 77 | ch: None, 78 | } 79 | } 80 | 81 | pub fn new_with_msg(msg: Message) -> Self { 82 | MsgWithResult { 83 | m: Some(msg), 84 | ch: None, 85 | } 86 | } 87 | 88 | pub fn new_with_channel(tx: Sender>, msg: Message) -> Self { 89 | MsgWithResult { 90 | m: Some(msg), 91 | ch: Some(tx), 92 | } 93 | } 94 | 95 | pub fn get_msg(&self) -> Option<&Message> { 96 | self.m.as_ref() 97 | } 98 | 99 | pub(crate) async fn notify(&self, msg: SafeResult<()>) { 100 | if let Some(sender) = &self.ch { 101 | sender.send(msg).await; 102 | } 103 | } 104 | 105 | pub(crate) async fn notify_and_close(&mut self, msg: SafeResult<()>) { 106 | if let Some(sender) = self.ch.take() { 107 | sender.send(msg).await; 108 | sender.close(); 109 | } 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /src/async_rt.rs: -------------------------------------------------------------------------------- 1 | use std::future::Future; 2 | use tokio::runtime::Handle; 3 | use tokio::time::error::Elapsed; 4 | use tokio::task; 5 | use tokio::time::{self, Duration}; 6 | 7 | pub(crate) fn wait_timeout(d: Duration, fut: F) -> Result 8 | where F: Future + Send + 'static, F::Output: Send + 'static 9 | { 10 | task::block_in_place(move || { 11 | Handle::current().block_on(async move { 12 | time::timeout(d, fut).await 13 | }) 14 | }) 15 | } 16 | 17 | pub(crate) fn sleep(d: Duration) { 18 | task::block_in_place(move || { 19 | Handle::current().block_on(async move { 20 | time::sleep(d).await 21 | }); 22 | }); 23 | } 24 | 25 | pub(crate) fn wait(fut: F) -> F::Output 26 | where F: Future + Send + 'static, F::Output: Send + 'static { 27 | tokio::task::block_in_place(move || { 28 | Handle::current().block_on(async move { 29 | fut.await 30 | }) 31 | }) 32 | } 33 | 34 | #[test] 35 | fn it_works() {} 
-------------------------------------------------------------------------------- /src/conf_change/datadriven_test.rs: -------------------------------------------------------------------------------- 1 | #[cfg(test)] 2 | mod test { 3 | use crate::conf_change::conf_change::Changer; 4 | use crate::nom_data_test::{execute_test, walk}; 5 | use crate::raftpb::raft::{ConfChange, ConfChangeSingle, ConfChangeType}; 6 | use crate::tracker::progress::ProgressMap; 7 | use crate::tracker::{Config, ProgressTracker}; 8 | use env_logger::init; 9 | use protobuf::ProtobufEnum; 10 | use std::convert::AsMut; 11 | 12 | #[test] 13 | fn t_conf_data_driven() { 14 | // flexi_logger::Logger::with_env().start(); 15 | walk("src/conf_change/testdata", |p| { 16 | let mut tr = ProgressTracker::new(10); 17 | let mut c = Changer { 18 | tracker: tr, 19 | last_index: 0, // incremented in this test with each cmd 20 | }; 21 | execute_test(p, "--------------------------------", |data| -> String { 22 | // The test files use the commands 23 | // - simple: run a simple conf change (i.e. 
no joint consensus), 24 | // - enter-joint: enter a joint config, and 25 | // - leave-joint: leave a joint config 26 | // The first two take a list of config changes, which have the following 27 | // syntax: 28 | // - vn: make a voter, 29 | // - ln: make n a learner, 30 | // - rn: remove n, and 31 | // - un: update n 32 | let mut ccs: Vec = vec![]; 33 | let mut auto_leave = false; 34 | for cmd_arg in data.cmd_args.iter() { 35 | let mut cc = ConfChangeSingle::new(); 36 | match cmd_arg.key.as_str() { 37 | "v" => { 38 | cc.set_field_type(ConfChangeType::ConfChangeAddNode); 39 | } 40 | "l" => cc.set_field_type(ConfChangeType::ConfChangeAddLearnerNode), 41 | "r" => cc.set_field_type(ConfChangeType::ConfChangeRemoveNode), 42 | "u" => cc.set_field_type(ConfChangeType::ConfChangeUpdateNode), 43 | "autoleave" => { 44 | auto_leave = cmd_arg.vals[0].parse().unwrap(); 45 | } 46 | u => panic!("unknown input: {}", u), 47 | } 48 | if cmd_arg.key.as_str() != "autoleave" { 49 | let id = cmd_arg.vals[0].parse().unwrap(); 50 | cc.set_node_id(id); 51 | ccs.push(cc); 52 | } 53 | } 54 | 55 | let mut cfg = Config::default(); 56 | let mut prs = ProgressMap::default(); 57 | match data.cmd.as_str() { 58 | "simple" => match c.simple(&mut ccs) { 59 | Ok((new_cfg, new_prs)) => { 60 | cfg = new_cfg; 61 | prs = new_prs; 62 | } 63 | e => { 64 | c.last_index += 1; 65 | return e.unwrap_err(); 66 | } 67 | }, 68 | "enter-joint" => match c.enter_joint(auto_leave, &mut ccs) { 69 | Ok((new_cfg, new_prs)) => { 70 | cfg = new_cfg; 71 | prs = new_prs; 72 | } 73 | e => { 74 | c.last_index += 1; 75 | return e.unwrap_err(); 76 | } 77 | }, 78 | "leave-joint" => { 79 | info!("ccs {:?}", ccs); 80 | if !ccs.is_empty() { 81 | return "this command takes no input".to_owned(); 82 | } 83 | match c.leave_joint() { 84 | Ok((new_cfg, new_prs)) => { 85 | cfg = new_cfg; 86 | prs = new_prs; 87 | } 88 | e => { 89 | c.last_index += 1; 90 | return e.unwrap_err(); 91 | } 92 | } 93 | } 94 | u => panic!("unknown command: {}", u), 
95 | } 96 | c.tracker.config = cfg; 97 | c.tracker.progress = prs; 98 | c.last_index += 1; 99 | format!("{}\n{}", c.tracker.config, c.tracker.progress) 100 | }) 101 | }); 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /src/conf_change/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::raftpb::raft::{ConfChangeSingle, ConfChangeType}; 2 | 3 | pub mod conf_change; 4 | mod datadriven_test; 5 | mod quick_test; 6 | pub mod restore; 7 | 8 | pub(crate) fn new_conf_change_single(id: u64, typ: ConfChangeType) -> ConfChangeSingle { 9 | let mut ccs = ConfChangeSingle::new(); 10 | ccs.set_node_id(id); 11 | ccs.set_field_type(typ); 12 | ccs 13 | } 14 | -------------------------------------------------------------------------------- /src/conf_change/quick_test.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 The etcd Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #[cfg(test)] 16 | mod tests { 17 | use crate::conf_change::conf_change::Changer; 18 | use crate::raftpb::raft::{ConfChange, ConfChangeSingle, ConfChangeType}; 19 | use crate::tracker::ProgressTracker; 20 | use protobuf::ProtobufEnum; 21 | use rand::Rng; 22 | use std::fmt::Error; 23 | 24 | // uses quick_check to verify that simple and joint config 25 | // changes arrive at the same result. 
26 | #[test] 27 | fn t_conf_change_quick() { 28 | // flexi_logger::Logger::with_env().start(); 29 | let count = 1000; 30 | // log the first couple of runs of give some indication of things working 31 | // as intended. 32 | const info_count: usize = 5; 33 | 34 | for i in 0..count { 35 | let (simple_change, mut ccs) = wrapper().unwrap(); 36 | let mut epoch_cc = ccs.drain(..1).collect::>(); 37 | let mut tr = ProgressTracker::new(10); 38 | let mut c = Changer { 39 | tracker: tr, 40 | last_index: 10, 41 | }; 42 | let ret = c.simple(&mut epoch_cc); 43 | assert!(ret.is_ok()); 44 | c.tracker.config = ret.as_ref().unwrap().0.clone(); 45 | c.tracker.progress = ret.as_ref().unwrap().1.clone(); 46 | let ret = with_joint(&mut c, &mut ccs); 47 | assert!(ret.is_ok()); 48 | assert_eq!(simple_change, c); 49 | } 50 | } 51 | 52 | fn gen_cc( 53 | num: impl Fn() -> usize, 54 | id: impl Fn() -> u64, 55 | typ: impl Fn() -> ConfChangeType, 56 | ) -> Vec { 57 | let mut ccs = Vec::new(); 58 | let n = num(); 59 | for i in 0..n { 60 | let mut cc = ConfChangeSingle::new(); 61 | cc.set_field_type(typ()); 62 | cc.set_node_id(id()); 63 | ccs.push(cc); 64 | } 65 | ccs 66 | } 67 | 68 | fn wrapper() -> Result<(Changer, Vec), String> { 69 | let mut ccs = gen_cc( 70 | || -> usize { 71 | let mut r = rand::thread_rng(); 72 | r.gen_range(1..9) + 1 73 | }, 74 | || -> u64 { 75 | let mut r = rand::thread_rng(); 76 | r.gen_range(1..9) + 1 77 | }, 78 | || -> ConfChangeType { 79 | let mut r = rand::thread_rng(); 80 | let n = ConfChangeType::values().len(); 81 | let em = r.gen_range(0..n); 82 | ConfChangeType::from_i32(em as i32).unwrap() 83 | }, 84 | ); 85 | let mut epoch_cc = ConfChangeSingle::new(); 86 | epoch_cc.set_node_id(1); 87 | epoch_cc.set_field_type(ConfChangeType::ConfChangeAddNode); 88 | ccs.push(epoch_cc); 89 | ccs.reverse(); 90 | 91 | let ccs_copy = ccs.clone(); 92 | 93 | let mut tr = ProgressTracker::new(10); 94 | let mut c = Changer { 95 | tracker: tr, 96 | last_index: 10, 97 | }; 98 | 
with_simple(&mut c, &mut ccs).map(|_| (c, ccs_copy)) 99 | } 100 | 101 | fn with_simple(c: &mut Changer, ccs: &mut [ConfChangeSingle]) -> Result<(), String> { 102 | for cc in ccs.iter() { 103 | let mut ccs = Vec::new(); 104 | ccs.push(cc.clone()); 105 | let (cfg, prs) = c.simple(&mut ccs)?; 106 | c.tracker.config = cfg; 107 | c.tracker.progress = prs; 108 | } 109 | Ok(()) 110 | } 111 | 112 | fn with_joint(c: &mut Changer, ccs: &mut [ConfChangeSingle]) -> Result<(), String> { 113 | let (cfg, prs) = c.enter_joint(false, ccs)?; 114 | // Also do this with auto_leave on, just to check that we'd get the same 115 | // result. 116 | let (mut cfg2a, mut prs2a) = c.enter_joint(true, ccs)?; 117 | cfg2a.auto_leave = false; 118 | assert_eq!(cfg, cfg2a); 119 | assert_eq!(prs, prs2a); 120 | 121 | c.tracker.config = cfg.clone(); 122 | c.tracker.progress = prs.clone(); 123 | let (mut cfg2b, mut prs2b) = c.leave_joint()?; 124 | // Reset back to the main branch with auto_leave = false. 125 | c.tracker.config = cfg.clone(); 126 | c.tracker.progress = prs.clone(); 127 | let (cfg, prs) = c.leave_joint()?; 128 | assert_eq!(cfg, cfg2b); 129 | assert_eq!(prs, prs2b); 130 | 131 | c.tracker.config = cfg.clone(); 132 | c.tracker.progress = prs.clone(); 133 | 134 | Ok(()) 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /src/conf_change/restore.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 The etcd Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | use crate::conf_change::conf_change::Changer; 16 | use crate::conf_change::new_conf_change_single; 17 | use crate::raftpb::raft::ConfChangeType::{ 18 | ConfChangeAddLearnerNode, ConfChangeAddNode, ConfChangeRemoveNode, 19 | }; 20 | use crate::raftpb::raft::{ConfChange, ConfChangeSingle, ConfChangeType, ConfState}; 21 | use crate::tracker::progress::ProgressMap; 22 | use crate::tracker::Config; 23 | 24 | // toConfChangeSingle translates a conf state into 1) a slice of operations creating 25 | // first the config that will become the outgoing one, and then the incoming one, and 26 | // b) another slice that, when applied to the config resulted from 1), respresents the 27 | // ConfState. 28 | fn to_conf_change_single(cs: &ConfState) -> (Vec, Vec) { 29 | // Example to follow along this code: 30 | // voters=(1 2 3) learners=(5) outgoing=(1 2 4 6) learners_next=(4) 31 | // 32 | // This means that before entering the joint config, the configuration 33 | // had voters (1 2 4 6) and perhaps some learners that are already gone. 34 | // The new set of voters is (1 2 3), i.e. (1 2) were kept around, and (4 6) 35 | // are no longer voters; however 4 is poised to become a learner upon leaving 36 | // the joint state. 
37 | // We can't tell whether 5 was a learner before entering the joint config, 38 | // but it doesn't matter (we'll pretend that it wasn't) 39 | // 40 | // The code below will construct 41 | // outgoing = add 1; add 2; add 4; add 6 42 | // incoming = remove 1; remove 2; remove 4; remove 6 43 | // outgoing add 1; add 2; add 3; 44 | // incoming add-learner 5; 45 | // add-learner 4; 46 | // So, when starting with an empty config, after applying 'outgoing' we have 47 | // 48 | // quorum=(1 2 4 6) 49 | // 50 | // From which we enter a joint state via 'incoming' 51 | // quorum=(1 2 3)&&(1 2 4 6) learner=(5) learners_next=(4) 52 | // 53 | // as desired. 54 | 55 | let mut outgoing = Vec::new(); 56 | let mut incoming = Vec::new(); 57 | for id in cs.get_voters_outgoing() { 58 | // If there are outgoing voters, first add them one by one so that the 59 | // (non-joint) config has them all. 60 | outgoing.push(new_conf_change_single( 61 | *id, 62 | ConfChangeType::ConfChangeAddNode, 63 | )); 64 | } 65 | // We're done constructing the outgoing slice, now on to the incoming one 66 | // (which will apply on top of the config created by the outgoing slice). 67 | 68 | // First, we'll remove all of the outgoing voters. 69 | for id in cs.get_voters_outgoing() { 70 | incoming.push(new_conf_change_single( 71 | *id, 72 | ConfChangeType::ConfChangeRemoveNode, 73 | )); 74 | } 75 | // Then we'll add the incoming voters and learners. 76 | for id in cs.get_voters() { 77 | incoming.push(new_conf_change_single( 78 | *id, 79 | ConfChangeType::ConfChangeAddNode, 80 | )); 81 | } 82 | for id in cs.get_learners() { 83 | incoming.push(new_conf_change_single( 84 | *id, 85 | ConfChangeType::ConfChangeAddLearnerNode, 86 | )); 87 | } 88 | // Same for LeanersNext; these are nodes we want to be learners but which 89 | // are currently voters in the outgoing config. 
90 | for id in cs.get_learners_next() { 91 | incoming.push(new_conf_change_single( 92 | *id, 93 | ConfChangeType::ConfChangeAddLearnerNode, 94 | )) 95 | } 96 | (outgoing, incoming) 97 | } 98 | 99 | // pub fn chain(chg ConfChange, ops: impl ) 100 | 101 | /// takes a Changer (which must represent an empty configuration), and 102 | /// runs a sequence of changes enacting the configuration described in the 103 | /// ConfState 104 | /// 105 | /// TODO(tbg) it's silly that this takes a Changer. Unravel this by making sure 106 | /// the Changer only needs a ProgressMap (not a whole Tracker) at which point 107 | /// this can just take last_index and max_inflight directly instead and cook up 108 | /// the results from that alone. 109 | pub fn restore(chg: &mut Changer, cs: &ConfState) -> Result<(Config, ProgressMap), String> { 110 | warn!("execute restore "); 111 | let (outgoing, mut incoming) = to_conf_change_single(cs); 112 | if outgoing.is_empty() { 113 | // No outgoing config, so just apply the incoming changes one by one. 114 | for cc in incoming.iter() { 115 | let cc = &mut vec![cc.clone()]; 116 | let (cfg, progress) = chg.simple(cc)?; 117 | chg.tracker.config = cfg; 118 | chg.tracker.progress = progress; 119 | } 120 | } else { 121 | // The ConfState describes a joint configuration. 122 | // 123 | // First, apply all of the changes of the outgoing config one by one, so 124 | // that it temporarily becomes the incoming active config. For example, 125 | // if the config is (1 2 3)&(2 3 4), this will establish (2 3 4)&(). 126 | for cc in outgoing.iter() { 127 | let cc = &mut vec![cc.clone()]; 128 | let (cfg, progress) = chg.simple(cc)?; 129 | chg.tracker.config = cfg; 130 | chg.tracker.progress = progress; 131 | } 132 | 133 | // Now enter the joint state, which rotates the above additions into the 134 | // outgoing config, and adds the incoming config in. Continuing the 135 | // example above. we'd get (1 2 3)&(2 3 4), i.e. 
the incoming operations 136 | // would be removing 2,3,4 and then adding in 1,2,3 while transitioning 137 | // into a joint state. 138 | let (cfg, progress) = chg.enter_joint(cs.get_auto_leave(), &mut *incoming)?; 139 | chg.tracker.config = cfg; 140 | chg.tracker.progress = progress; 141 | } 142 | 143 | Ok((chg.tracker.config.clone(), chg.tracker.progress.clone())) 144 | } 145 | 146 | #[cfg(test)] 147 | mod tests { 148 | use crate::conf_change::conf_change::Changer; 149 | use crate::conf_change::restore::restore; 150 | use crate::raftpb::raft::ConfState; 151 | use crate::tracker::ProgressTracker; 152 | use protobuf::reflect::ProtobufValue; 153 | use rand::prelude::SliceRandom; 154 | use rand::Rng; 155 | 156 | #[test] 157 | fn t_restore() { 158 | // flexi_logger::Logger::with_env().start(); 159 | let count = 1000; 160 | let f = |cs: &mut ConfState| -> bool { 161 | let mut chg = Changer { 162 | tracker: ProgressTracker::new(10), 163 | last_index: 0, 164 | }; 165 | let (cfg, prs) = { 166 | match restore(&mut chg, cs) { 167 | Ok((cfg, prs)) => (cfg, prs), 168 | Err(e) => { 169 | error!("{}", e); 170 | return false; 171 | } 172 | } 173 | }; 174 | 175 | chg.tracker.config = cfg; 176 | chg.tracker.progress = prs; 177 | 178 | cs.voters.sort(); 179 | cs.learners.sort(); 180 | cs.voters_outgoing.sort(); 181 | cs.learners_next.sort(); 182 | let mut cs2 = chg.tracker.config_state(); 183 | cs2.voters.sort(); 184 | cs2.learners.sort(); 185 | cs2.voters_outgoing.sort(); 186 | cs2.learners_next.sort(); 187 | // NB: cs.Equivalent does the same "sorting" dance internally, but let's 188 | // test it a bit here instead of relying on it. 
189 | if cs.get_auto_leave() == false { 190 | cs.set_auto_leave(false); 191 | } 192 | if cs2.get_auto_leave() == false { 193 | cs2.set_auto_leave(false); 194 | } 195 | if *cs == cs2 { 196 | return true; 197 | } 198 | false 199 | }; 200 | 201 | let new_conf_state = |voters: Option>, 202 | learners: Option>, 203 | voters_outgoing: Option>, 204 | learners_next: Option>, 205 | auto_leave: bool| 206 | -> ConfState { 207 | let mut cs = ConfState::new(); 208 | if voters.is_some() { 209 | cs.set_voters(voters.unwrap()); 210 | } 211 | if learners.is_some() { 212 | cs.set_learners(learners.unwrap()); 213 | } 214 | if voters_outgoing.is_some() { 215 | cs.set_voters_outgoing(voters_outgoing.unwrap()); 216 | } 217 | if learners_next.is_some() { 218 | cs.set_learners_next(learners_next.unwrap()); 219 | } 220 | cs.set_auto_leave(auto_leave); 221 | cs 222 | }; 223 | for mut cs in vec![ 224 | ConfState::new(), 225 | new_conf_state(Some(vec![1, 2, 3]), None, None, None, false), 226 | new_conf_state(Some(vec![1, 2, 3]), Some(vec![4, 5, 6]), None, None, false), 227 | new_conf_state( 228 | Some(vec![1, 2, 3]), 229 | Some(vec![5]), 230 | Some(vec![1, 2, 4, 6]), 231 | Some(vec![4]), 232 | false, 233 | ), 234 | ] 235 | .iter_mut() 236 | { 237 | assert!(f(&mut cs)); 238 | } 239 | 240 | for _ in 0..count { 241 | let mut cs = generate_rnd_conf_change(); 242 | println!("{:?}", cs); 243 | assert!(f(&mut cs)); 244 | } 245 | } 246 | 247 | // Generate create a random (valid) ConfState for use with quickcheck. 248 | fn generate_rnd_conf_change() -> ConfState { 249 | let conv = |sl: &Vec| -> Vec { 250 | // We want IDs but the incoming slice is zero-indexed, so add one to 251 | // each. 252 | let mut out = [0].repeat(sl.len()); 253 | for i in 0..sl.len() { 254 | out[i] = sl[i] + 1; 255 | } 256 | out 257 | }; 258 | 259 | let mut r = rand::thread_rng(); 260 | // NB: never generate the empty ConfState, that one should be unit tested. 
261 | let n_voters = r.gen_range(0..5) + 1; 262 | let n_learners = r.gen_range(0..5); 263 | 264 | // The number of voters that are in the outgoing config but not in the 265 | // incoming one. (We'll additionally retain a random number of the 266 | // incoming voters below). 267 | let n_removed_voters = r.gen_range(0..3); 268 | 269 | // Voters, learners, and removed voters must not overlap. A "removed voter" 270 | // is one that we have in the outgoing config but not the incoming one. 271 | let mut ids = 272 | (1..=2 * (n_voters + n_learners + n_removed_voters) as u64).collect::>(); 273 | ids.shuffle(&mut r); 274 | // println!("ids {:?}, {}", ids, 2 * (n_voters + n_learners + n_removed_voters)); 275 | let mut cs = ConfState::new(); 276 | cs.voters = ids.drain(..n_voters).collect(); 277 | 278 | if n_learners > 0 { 279 | cs.learners = ids.drain(..n_learners).collect::>(); 280 | } 281 | 282 | // Roll the dice on how many of the incoming voters we decide were also 283 | // previously voters. 284 | // 285 | // NB: this code avoids creating non-nil empty slices (here and below). 286 | let n_outgoing_retained_voters = r.gen_range(0..(n_voters + 1)); 287 | if n_outgoing_retained_voters > 0 || n_removed_voters > 0 { 288 | cs.voters_outgoing 289 | .extend_from_slice(&cs.voters[..n_outgoing_retained_voters]); 290 | cs.voters_outgoing 291 | .extend_from_slice(&ids[..n_removed_voters]); 292 | } 293 | 294 | // Only outgoing voters that are not also incoming voters can be in 295 | // learners_next (they represent demotions). 
296 | if n_removed_voters > 0 { 297 | let n_learners = r.gen_range(0..n_removed_voters + 1); 298 | if n_learners > 0 { 299 | cs.learners_next = ids[..n_learners].to_vec(); 300 | } 301 | } 302 | 303 | cs.set_auto_leave(cs.voters_outgoing.len() > 0 && r.gen_range(0..2) == 1); 304 | cs 305 | } 306 | } 307 | -------------------------------------------------------------------------------- /src/conf_change/testdata/joint_autoleave.txt: -------------------------------------------------------------------------------- 1 | # Test the autoleave argument to EnterJoint. It defaults to false in the 2 | # datadriven tests. The flag has no associated semantics in this package, 3 | # it is simply passed through. 4 | cmd: simple 5 | args: v=(1) 6 | voters=(1) 7 | 1: StateProbe match=0 next=0 8 | 9 | -------------------------------- 10 | # Autoleave is reflected in the config. 11 | cmd: enter-joint 12 | args: autoleave=(true) v=(2) v=(3) 13 | voters=(1 2 3)&&(1) autoleave 14 | 1: StateProbe match=0 next=0 15 | 2: StateProbe match=0 next=1 16 | 3: StateProbe match=0 next=1 17 | 18 | -------------------------------- 19 | # Can't enter-joint twice, even if autoleave changes. 20 | cmd: enter-joint 21 | args: autoleave=(false) 22 | config is already joint 23 | 24 | -------------------------------- 25 | cmd: leave-joint 26 | voters=(1 2 3) 27 | 1: StateProbe match=0 next=0 28 | 2: StateProbe match=0 next=1 29 | 3: StateProbe match=0 next=1 -------------------------------------------------------------------------------- /src/conf_change/testdata/joint_idempotency.txt: -------------------------------------------------------------------------------- 1 | # Verify that operations upon entering the joint state are idempotent, i.e. 2 | # removing an absent node is fine, etc. 
3 | 4 | cmd: simple 5 | args: v=(1) 6 | voters=(1) 7 | 1: StateProbe match=0 next=0 8 | 9 | -------------------------------- 10 | cmd: enter-joint 11 | args: r=(1) r=(2) r=(9) v=(2) v=(3) v=(4) v=(2) v=(3) v=(4) l=(2) l=(2) r=(4) r=(4) l=(1) l=(1) 12 | voters=(3)&&(1) learners=(2) learners_next=(1) 13 | 1: StateProbe match=0 next=0 14 | 2: StateProbe match=0 next=1 learner 15 | 3: StateProbe match=0 next=1 16 | 17 | -------------------------------- 18 | cmd: leave-joint 19 | voters=(3) learners=(1 2) 20 | 1: StateProbe match=0 next=0 learner 21 | 2: StateProbe match=0 next=1 learner 22 | 3: StateProbe match=0 next=1 -------------------------------------------------------------------------------- /src/conf_change/testdata/joint_learners_next.txt: -------------------------------------------------------------------------------- 1 | # Verify that when a voter is demoted in a joint config, it will show up in 2 | # learners_next until the joint config is left, and only then will the progress 3 | # turn into that of a learner, without resetting the progress. Note that this 4 | # last fact is verified by `next`, which can tell us which "round" the progress 5 | # was originally created in. 
6 | 7 | cmd: simple 8 | args: v=(1) 9 | voters=(1) 10 | 1: StateProbe match=0 next=0 11 | 12 | -------------------------------- 13 | cmd: enter-joint 14 | args: v=(2) l=(1) 15 | voters=(2)&&(1) learners_next=(1) 16 | 1: StateProbe match=0 next=0 17 | 2: StateProbe match=0 next=1 18 | 19 | -------------------------------- 20 | cmd: leave-joint 21 | voters=(2) learners=(1) 22 | 1: StateProbe match=0 next=0 learner 23 | 2: StateProbe match=0 next=1 -------------------------------------------------------------------------------- /src/conf_change/testdata/joint_safety.txt: -------------------------------------------------------------------------------- 1 | cmd: leave-joint 2 | can't leave a non-joint config 3 | 4 | -------------------------------- 5 | cmd: enter-joint 6 | can't make a zero-voter config joint 7 | 8 | -------------------------------- 9 | cmd: enter-joint 10 | args: v=(1) 11 | can't make a zero-voter config joint 12 | 13 | -------------------------------- 14 | cmd: simple 15 | args: v=(1) 16 | voters=(1) 17 | 1: StateProbe match=0 next=3 18 | 19 | -------------------------------- 20 | cmd: leave-joint 21 | can't leave a non-joint config 22 | 23 | -------------------------------- 24 | # Can enter into joint config. 25 | cmd: enter-joint 26 | voters=(1)&&(1) 27 | 1: StateProbe match=0 next=3 28 | 29 | -------------------------------- 30 | cmd: enter-joint 31 | config is already joint 32 | 33 | -------------------------------- 34 | cmd: leave-joint 35 | voters=(1) 36 | 1: StateProbe match=0 next=3 37 | 38 | -------------------------------- 39 | cmd: leave-joint 40 | can't leave a non-joint config 41 | 42 | -------------------------------- 43 | # Can enter again, this time with some ops. 
44 | cmd: enter-joint 45 | args: r=(1) v=(2) v=(3) l=(4) 46 | voters=(2 3)&&(1) learners=(4) 47 | 1: StateProbe match=0 next=3 48 | 2: StateProbe match=0 next=9 49 | 3: StateProbe match=0 next=9 50 | 4: StateProbe match=0 next=9 learner 51 | 52 | -------------------------------- 53 | cmd: enter-joint 54 | config is already joint 55 | 56 | -------------------------------- 57 | cmd: enter-joint 58 | args: v=(12) 59 | config is already joint 60 | 61 | -------------------------------- 62 | cmd: simple 63 | args: l=(15) 64 | can't apply simple config change in joint config 65 | 66 | -------------------------------- 67 | cmd: leave-joint 68 | voters=(2 3) learners=(4) 69 | 2: StateProbe match=0 next=9 70 | 3: StateProbe match=0 next=9 71 | 4: StateProbe match=0 next=9 learner 72 | 73 | -------------------------------- 74 | cmd: simple 75 | args: l=(9) 76 | voters=(2 3) learners=(4 9) 77 | 2: StateProbe match=0 next=9 78 | 3: StateProbe match=0 next=9 79 | 4: StateProbe match=0 next=9 learner 80 | 9: StateProbe match=0 next=14 learner -------------------------------------------------------------------------------- /src/conf_change/testdata/simple_idempotency.txt: -------------------------------------------------------------------------------- 1 | cmd: simple 2 | args: v=(1) 3 | voters=(1) 4 | 1: StateProbe match=0 next=0 5 | 6 | -------------------------------- 7 | cmd: simple 8 | args: v=(1) 9 | voters=(1) 10 | 1: StateProbe match=0 next=0 11 | 12 | -------------------------------- 13 | cmd: simple 14 | args: v=(2) 15 | voters=(1 2) 16 | 1: StateProbe match=0 next=0 17 | 2: StateProbe match=0 next=2 18 | 19 | -------------------------------- 20 | cmd: simple 21 | args: l=(1) 22 | voters=(2) learners=(1) 23 | 1: StateProbe match=0 next=0 learner 24 | 2: StateProbe match=0 next=2 25 | 26 | -------------------------------- 27 | cmd: simple 28 | args: l=(1) 29 | voters=(2) learners=(1) 30 | 1: StateProbe match=0 next=0 learner 31 | 2: StateProbe match=0 next=2 32 | 33 | 
-------------------------------- 34 | cmd: simple 35 | args: r=(1) 36 | voters=(2) 37 | 2: StateProbe match=0 next=2 38 | 39 | -------------------------------- 40 | cmd: simple 41 | args: r=(1) 42 | voters=(2) 43 | 2: StateProbe match=0 next=2 44 | 45 | -------------------------------- 46 | cmd: simple 47 | args: v=(3) 48 | voters=(2 3) 49 | 2: StateProbe match=0 next=2 50 | 3: StateProbe match=0 next=7 51 | 52 | -------------------------------- 53 | cmd: simple 54 | args: r=(3) 55 | voters=(2) 56 | 2: StateProbe match=0 next=2 57 | 58 | -------------------------------- 59 | cmd: simple 60 | args: r=(3) 61 | voters=(2) 62 | 2: StateProbe match=0 next=2 63 | 64 | -------------------------------- 65 | cmd: simple 66 | args: r=(4) 67 | voters=(2) 68 | 2: StateProbe match=0 next=2 -------------------------------------------------------------------------------- /src/conf_change/testdata/simple_promote_demote.txt: -------------------------------------------------------------------------------- 1 | # Set up three voters for this test. 2 | cmd: simple 3 | args: v=(1) 4 | voters=(1) 5 | 1: StateProbe match=0 next=0 6 | 7 | -------------------------------- 8 | cmd: simple 9 | args: v=(2) 10 | voters=(1 2) 11 | 1: StateProbe match=0 next=0 12 | 2: StateProbe match=0 next=1 13 | 14 | -------------------------------- 15 | cmd: simple 16 | args: v=(3) 17 | voters=(1 2 3) 18 | 1: StateProbe match=0 next=0 19 | 2: StateProbe match=0 next=1 20 | 3: StateProbe match=0 next=2 21 | 22 | -------------------------------- 23 | # Can atomically demote and promote without a hitch. 24 | # This is pointless, but possible. 25 | cmd: simple 26 | args: l=(1) v=(1) 27 | voters=(1 2 3) 28 | 1: StateProbe match=0 next=0 29 | 2: StateProbe match=0 next=1 30 | 3: StateProbe match=0 next=2 31 | 32 | -------------------------------- 33 | # Can demote a voter. 
34 | cmd: simple 35 | args: l=(2) 36 | voters=(1 3) learners=(2) 37 | 1: StateProbe match=0 next=0 38 | 2: StateProbe match=0 next=1 learner 39 | 3: StateProbe match=0 next=2 40 | 41 | -------------------------------- 42 | # Can atomically promote and demote the same voter. 43 | # This is pointless, but possible. 44 | cmd: simple 45 | args: v=(2) l=(2) 46 | voters=(1 3) learners=(2) 47 | 1: StateProbe match=0 next=0 48 | 2: StateProbe match=0 next=1 learner 49 | 3: StateProbe match=0 next=2 50 | 51 | -------------------------------- 52 | # Can promote a voter. 53 | cmd: simple 54 | args: v=(2) 55 | voters=(1 2 3) 56 | 1: StateProbe match=0 next=0 57 | 2: StateProbe match=0 next=1 58 | 3: StateProbe match=0 next=2 59 | -------------------------------------------------------------------------------- /src/conf_change/testdata/simple_safety.txt: -------------------------------------------------------------------------------- 1 | cmd: simple 2 | args: l=(1) 3 | removed all voters 4 | 5 | -------------------------------- 6 | cmd: simple 7 | args: v=(1) 8 | voters=(1) 9 | 1: StateProbe match=0 next=1 10 | 11 | -------------------------------- 12 | cmd: simple 13 | args: v=(2) l=(3) 14 | voters=(1 2) learners=(3) 15 | 1: StateProbe match=0 next=1 16 | 2: StateProbe match=0 next=2 17 | 3: StateProbe match=0 next=2 learner 18 | 19 | -------------------------------- 20 | cmd: simple 21 | args: r=(1) v=(5) 22 | more than one voter changed without entering joint config 23 | 24 | -------------------------------- 25 | cmd: simple 26 | args: r=(1) r=(2) 27 | removed all voters 28 | 29 | -------------------------------- 30 | cmd: simple 31 | args: v=(3) v=(4) 32 | more than one voter changed without entering joint config 33 | 34 | -------------------------------- 35 | cmd: simple 36 | args: l=(1) v=(5) 37 | more than one voter changed without entering joint config 38 | 39 | -------------------------------- 40 | cmd: simple 41 | args: l=(1) l=(2) 42 | removed all voters 43 | 44 | 
-------------------------------- 45 | cmd: simple 46 | args: l=(2) l=(3) l=(4) l=(5) 47 | voters=(1) learners=(2 3 4 5) 48 | 1: StateProbe match=0 next=1 49 | 2: StateProbe match=0 next=2 learner 50 | 3: StateProbe match=0 next=2 learner 51 | 4: StateProbe match=0 next=8 learner 52 | 5: StateProbe match=0 next=8 learner 53 | 54 | -------------------------------- 55 | cmd: simple 56 | args: r=(1) 57 | removed all voters 58 | 59 | -------------------------------- 60 | cmd: simple 61 | args: r=(2) r=(3) r=(4) r=(5) 62 | voters=(1) 63 | 1: StateProbe match=0 next=1 -------------------------------------------------------------------------------- /src/conf_change/testdata/update.txt: -------------------------------------------------------------------------------- 1 | # Nobody cares about ConfChangeUpdateNode, but at least use it once. It is used 2 | # by etcd as a convenient way to pass a blob through their conf change machinery 3 | # that updates information tracked outside of raft. 4 | cmd: simple 5 | args: v=(1) 6 | voters=(1) 7 | 1: StateProbe match=0 next=0 8 | 9 | -------------------------------- 10 | cmd: simple 11 | args: v=(2) u=(1) 12 | voters=(1 2) 13 | 1: StateProbe match=0 next=0 14 | 2: StateProbe match=0 next=1 15 | 16 | -------------------------------- 17 | cmd: simple 18 | args: u=(1) u=(2) u=(3) u=(1) u=(2) u=(3) 19 | voters=(1 2) 20 | 1: StateProbe match=0 next=0 21 | 2: StateProbe match=0 next=1 -------------------------------------------------------------------------------- /src/conf_change/testdata/zero.txt: -------------------------------------------------------------------------------- 1 | # NodeID zero is ignored. 
use std::fs::{read_dir, read_to_string};
use std::path::Path;

/// Calls `f` with the path of every regular file directly inside the
/// directory `path`. Sub-directories are skipped, not descended into.
///
/// Files are visited in sorted path order so that runs are reproducible;
/// the order in which `read_dir` yields entries is platform-dependent.
///
/// # Panics
///
/// Panics if the directory or one of its entries cannot be read, or if a file
/// path is not valid UTF-8.
pub fn walk<F>(path: &str, mut f: F)
where
    F: FnMut(&str),
{
    let mut files: Vec<_> = read_dir(path)
        .expect("test data directory must be readable")
        .map(|entry| entry.expect("directory entry must be readable").path())
        .filter(|p| p.is_file())
        //.filter(|p| p.ends_with("joint_commit.txt"))
        .collect();
    files.sort();

    for file in &files {
        f(file.to_str().expect("test data path must be valid UTF-8"));
    }
}

/// Runs a data-driven test file.
///
/// The file at `path` is cut into sections at every occurrence of `split`.
/// Each section is parsed into a [`TestData`]; once the whole file has been
/// parsed, `f` is called for every section in order and its return value must
/// equal the section's expected output.
///
/// Within a section:
/// * lines starting with `#` and blank lines are ignored,
/// * `title: <text>` and `cmd: <text>` set the title and the command,
/// * `args: k=(v1,v2) k2=(v3) ...` adds space-separated arguments,
/// * a line starting with `output:` is a pure marker and is skipped,
/// * every other line is part of the expected output.
///
/// # Panics
///
/// Panics if the file cannot be read, or if `f` returns something other than
/// the expected output of a section.
pub fn execute_test<P: AsRef<Path>, F>(path: P, split: &str, mut f: F)
where
    F: FnMut(&TestData) -> String,
{
    let txt = read_to_string(path).unwrap();
    // Parse everything up front so that `f` only runs on a fully parsed file.
    let cases: Vec<(&str, TestData)> = txt
        .split(split)
        .map(|section| (section, parse_section(section)))
        .collect();

    for (i, (raw, datum)) in cases.iter().enumerate() {
        println!("t_{}", i);
        println!("{}", raw);
        println!("{:?}", datum);
        assert_eq!(f(datum), datum.output);
    }
}

/// Parses one section of a test file into a `TestData`.
fn parse_section(section: &str) -> TestData {
    let mut data = TestData {
        title: String::new(),
        cmd: String::new(),
        cmd_args: Vec::new(),
        output: String::new(),
    };

    for raw_line in section.lines() {
        // Comments are recognised on the untrimmed line: only a `#` in the
        // very first column starts a comment.
        if raw_line.starts_with('#') {
            continue;
        }
        let line = raw_line.trim_end();
        if line.is_empty() {
            continue;
        }

        if let Some(title) = line.strip_prefix("title: ") {
            data.title = title.to_string();
        } else if let Some(cmd) = line.strip_prefix("cmd: ") {
            data.cmd = cmd.to_string();
        } else if let Some(args) = line.strip_prefix("args: ") {
            // Repeated spaces produce empty pieces; skip them.
            data.cmd_args.extend(
                args.split(' ')
                    .filter(|arg| !arg.is_empty())
                    .map(parse_arg),
            );
        } else if line.starts_with("output:") {
            // Marker line only; the expected output follows on later lines.
        } else {
            data.output.push_str(line);
            data.output.push('\n');
        }
    }

    let kept = data.output.trim_end().len();
    data.output.truncate(kept);
    data
}

/// Parses a single `key=(v1,v2,...)` argument.
///
/// The parentheses are optional and empty values are dropped. An argument
/// without `=` yields a key with no values.
fn parse_arg(arg: &str) -> CmdArg {
    let mut parts = arg.split('=');
    let key = parts.next().unwrap_or("");
    let vals = parts.next().unwrap_or("");
    CmdArg {
        key: key.to_string(),
        vals: vals
            .trim_start_matches('(')
            .trim_end_matches(')')
            .split(',')
            .filter(|v| !v.trim().is_empty())
            .map(|v| v.to_string())
            .collect(),
    }
}

/// One parsed section of a data-driven test file.
#[derive(Debug, PartialEq)]
pub struct TestData {
    /// Text after `title: `, empty if the section has no title line.
    pub title: String,
    /// Text after `cmd: `, empty if the section has no command line.
    pub cmd: String,
    /// Arguments collected from all `args: ` lines, in file order.
    pub cmd_args: Vec<CmdArg>,
    /// Expected output: the remaining lines joined with `\n`, with trailing
    /// whitespace removed.
    pub output: String,
}

/// A single `key=(v1,v2,...)` argument of a test command.
#[derive(Debug, Default, PartialEq)]
pub struct CmdArg {
    /// Text before the `=`.
    pub key: String,
    /// Comma-separated values between the parentheses; may be empty.
    pub vals: Vec<String>,
}
#[cfg(test)]
mod tests {
    use crate::nom_data_test::{execute_test, walk, TestData};
    use crate::quorum::joint::JointConfig;
    use crate::quorum::majority::MajorityConfig;
    use crate::quorum::quick_test::alternative_majority_committed_index;
    use crate::quorum::quorum::{to_string, AckedIndexer, Index, MapAckIndexer};
    use std::collections::{HashMap, HashSet};
    use std::fmt::Write;
    use std::iter::FromIterator;

    /// Parses and executes the test cases in `src/quorum/testdata/*`.
    ///
    /// An entry specifies a command, either `committed` (checks the committed
    /// index) or `vote` (checks a `VoteResult`). The configuration and inputs
    /// are given through the arguments `cfg` and `cfgj` (the majority config
    /// and, optionally, a second majority config joint to the first one),
    /// `idx` (for `committed`) and `votes` (for `vote`).
    ///
    /// The harness also runs extra checks for which the result is known not
    /// to change. For example, interchanging the two halves of a joint quorum
    /// must not influence the result; if it does, this is noted in the output.
    #[test]
    fn t_data_driven() {
        // flexi_logger::Logger::with_env().start();
        walk("src/quorum/testdata", |p| {
            execute_test(p, "--------------------------------", |data| -> String {
                // Two majority configs. The first one is always used (though it
                // may be empty); the second one is used iff `joint` is true.
                let mut joint = false;
                let mut ids = Vec::<u64>::new();
                let mut idsj = Vec::<u64>::new();
                // The committed indexes for the nodes in the config, in the
                // order in which they appear in (ids, idsj), without
                // repetition. An underscore denotes an omission (no information
                // for this voter); this is different from 0. For example,
                //
                // cfg=(1,2) cfgj=(2,3,4) idx=(_,5,_,7) initializes the idx for
                // voter 2 to 5 and that for voter 4 to 7 (and no others).
                //
                // cfgj=zero instructs the harness to treat cfgj as zero instead
                // of not specified (i.e. it triggers a joint quorum test
                // instead of a majority quorum test for cfg only).
                let mut idxs = Vec::<Index>::new();
                // Votes are encoded like idxs so both can share the lookuper:
                // 0 = no vote yet, 1 = voted against, 2 = voted for.
                let mut votes = Vec::<Index>::new();

                // Parse the args.
                for cmd_arg in &data.cmd_args {
                    for val in &cmd_arg.vals {
                        match cmd_arg.key.as_str() {
                            "cfg" => ids.push(val.parse().unwrap()),
                            "cfgj" => {
                                joint = true;
                                if val == &"zero" {
                                    assert_eq!(cmd_arg.vals.len(), 1);
                                } else {
                                    idsj.push(val.parse().unwrap());
                                }
                            }
                            "idx" => {
                                // Placeholders are registered as zeros so that
                                // every later value stays aligned with its
                                // voter; the lookuper drops the zeros again.
                                let mut n: Index = 0;
                                if val != &"_" {
                                    n = val.parse().unwrap();
                                    // Restriction caused by the special-casing
                                    // of `_` as zero.
                                    assert_ne!(n, 0, "cannot use 0 as idx");
                                }
                                idxs.push(n);
                            }
                            "votes" => {
                                if val == &"y" {
                                    votes.push(2);
                                } else if val == &"n" {
                                    votes.push(1);
                                } else if val == &"_" {
                                    votes.push(0);
                                } else {
                                    panic!("unknown vote: {}", val);
                                }
                            }
                            _ => panic!("unknown arg {:?}", cmd_arg),
                        }
                    }
                }

                // Build the two majority configs.
                let c = MajorityConfig {
                    votes: HashSet::from_iter(ids.iter().copied()),
                };
                let cj = MajorityConfig {
                    votes: HashSet::from_iter(idsj.iter().copied()),
                };

                // Helper that returns an AckedIndexer which has the specified
                // indexes mapped to the right IDs.
                let make_lookuper =
                    |idxs: &Vec<Index>, ids: &Vec<u64>, idsj: &Vec<u64>| -> MapAckIndexer {
                        let mut l: HashMap<u64, Index> = HashMap::new();
                        let mut p = 0;
                        for id in ids.iter().chain(idsj.iter()) {
                            if l.contains_key(id) {
                                continue;
                            }
                            if p < idxs.len() {
                                // NB: this creates zero entries for placeholders
                                // that are removed below. Doing it this way
                                // avoids specifying placeholders multiple times
                                // for voters present in both halves of a joint
                                // config.
                                l.insert(*id, idxs[p]);
                                p += 1;
                            }
                        }

                        // Zero entries come from `_` placeholders; "no entry"
                        // is different from "zero entry", so drop them. Tests
                        // cannot specify a zero commit index, so the two
                        // concepts never collide.
                        l.retain(|_, val| *val != 0);
                        l
                    };

                if data.cmd == "vote" {
                    let voters = JointConfig::new2(c.clone(), cj.clone()).ids();
                    assert_eq!(
                        voters.len(),
                        votes.len(),
                        "mismatch input (explicit for _) fro votes {:?}: {:?}",
                        voters,
                        votes
                    );
                }

                let mut buf = String::new();
                match data.cmd.as_str() {
                    "committed" => {
                        let l = make_lookuper(&idxs, &ids, &idsj);
                        // Branch on whether this is a majority or a joint
                        // quorum test case.
                        if !joint {
                            let idx = c.committed_index(&l);
                            buf.push_str(&c.describe(&l));
                            println!("MapAckIndexer {:?}, ack_id:{}", l, idx);
                            // These alternative computations should return the
                            // same result. If not, print to the output.
                            let a_idx = alternative_majority_committed_index(c.clone(), &l);
                            if a_idx != idx {
                                writeln!(buf, "{} <-- via alternative computation", a_idx)
                                    .unwrap();
                            }
                            // Joining a majority with the empty majority should
                            // give the same result.
                            let a_idx =
                                JointConfig::new2(c.clone(), MajorityConfig::new()).committed(&l);
                            if a_idx != idx {
                                writeln!(buf, "{} >-- via zero-joint quorum", a_idx).unwrap();
                            }
                            // Joining a majority with itself should give the
                            // same result.
                            let a_idx = JointConfig::new2(c.clone(), c.clone()).committed(&l);
                            if a_idx != idx {
                                writeln!(buf, "{} >-- via self-joint quorum", a_idx).unwrap();
                            }

                            // Copies the acked indexes of `c`'s voters from `l`,
                            // replacing the entry for `id` with `idx`.
                            let overlay = |c: &MajorityConfig,
                                           l: &dyn AckedIndexer,
                                           id: u64,
                                           idx: Index|
                             -> MapAckIndexer {
                                let mut ll = MapAckIndexer::new();
                                for iid in c.iter() {
                                    if *iid == id {
                                        ll.insert(*iid, idx);
                                    } else if let Some(idx) = l.acked_index(iid) {
                                        ll.insert(*iid, *idx);
                                    }
                                }
                                ll
                            };
                            for id in c.iter() {
                                let iidx = l.acked_index(id).copied().unwrap_or(0);
                                if idx > iidx && iidx > 0 {
                                    // If the committed index was definitely
                                    // above the currently inspected idx, the
                                    // result shouldn't change if we lower it
                                    // further.
                                    let lo = overlay(&c, &l, *id, iidx - 1);
                                    let a_idx = c.committed_index(&lo);
                                    if a_idx != idx {
                                        write!(buf, "{} <-- overlaying {}-->{}", a_idx, id, iidx)
                                            .unwrap();
                                    }

                                    let lo = overlay(&c, &l, *id, 0);
                                    let a_idx = c.committed_index(&lo);
                                    if a_idx != idx {
                                        write!(buf, "{} <-- overlaying {}-->0", a_idx, id)
                                            .unwrap();
                                    }
                                }
                            }
                            buf.push_str(&to_string(idx));
                        } else {
                            let cc = JointConfig::new2(c.clone(), cj.clone());
                            buf.push_str(&cc.describe(&l));
                            let idx = cc.committed(&l);
                            // Interchanging the majorities shouldn't make a
                            // difference. If it does, print. The halves must
                            // actually be swapped here for the check to mean
                            // anything.
                            let a_idx = JointConfig::new2(cj.clone(), c.clone()).committed(&l);
                            if a_idx != idx {
                                writeln!(buf, "{} <-- via symmetry", a_idx).unwrap();
                            }
                            buf.push_str(&to_string(idx));
                        }
                    }
                    "vote" => {
                        let ll = make_lookuper(&votes, &ids, &idsj);
                        println!(
                            "ids: {:?}, idsj: {:?}, votes: {:?}, ll: {:?}",
                            ids, idsj, votes, ll
                        );
                        // NB: 1 == false, 2 == true
                        let l: HashMap<u64, bool> =
                            ll.iter().map(|(id, v)| (*id, *v != 1)).collect();
                        if !joint {
                            // Test a majority quorum.
                            write!(buf, "{:?}", c.vote_result(&l)).unwrap();
                        } else {
                            // Run a joint quorum test case.
                            let r = JointConfig::new2(c.clone(), cj.clone()).vote_result(&l);
                            // Interchanging the majorities shouldn't make a
                            // difference.
                            let ar = JointConfig::new2(cj.clone(), c.clone()).vote_result(&l);
                            assert_eq!(r, ar);
                            write!(buf, "{:?}", r).unwrap();
                        }
                    }
                    _ => {}
                }
                buf
            });
        });
    }
}
14 | 15 | use crate::quorum::majority::MajorityConfig; 16 | use crate::quorum::quorum::VoteResult::{VoteLost, VotePending}; 17 | use crate::quorum::quorum::{AckedIndexer, Index, VoteResult}; 18 | use std::collections::{HashMap, HashSet}; 19 | use std::fmt::{self, Display, Error, Formatter}; 20 | use std::process::id; 21 | 22 | /// JointConfig is a configuration of two groups of (possibly overlapping) 23 | /// majority configurations. Decisions require the support of both majorities. 24 | /// Here Thanks tikv 25 | #[derive(Clone, PartialEq, Debug)] 26 | pub struct JointConfig { 27 | pub(crate) incoming: MajorityConfig, 28 | pub(crate) outgoing: MajorityConfig, 29 | } 30 | 31 | impl JointConfig { 32 | pub fn new() -> Self { 33 | JointConfig { 34 | incoming: MajorityConfig::new(), 35 | outgoing: MajorityConfig::new(), 36 | } 37 | } 38 | pub fn new2(incoming: MajorityConfig, outgoing: MajorityConfig) -> Self { 39 | JointConfig { incoming, outgoing } 40 | } 41 | } 42 | 43 | impl Default for JointConfig { 44 | fn default() -> Self { 45 | JointConfig::new() 46 | } 47 | } 48 | 49 | impl Display for JointConfig { 50 | fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { 51 | if !self.outgoing.is_empty() { 52 | write!(f, "{}&&{}", self.incoming, self.outgoing) 53 | } else { 54 | write!(f, "{}", self.incoming) 55 | } 56 | } 57 | } 58 | 59 | impl JointConfig { 60 | /// IDs returns a newly initialized map representing the set of voters present 61 | /// in the joint configuration. 62 | pub fn ids(&self) -> HashSet { 63 | let mut hash_set = HashSet::new(); 64 | hash_set.extend(self.incoming.iter()); 65 | hash_set.extend(self.outgoing.iter()); 66 | hash_set 67 | } 68 | 69 | /// TODO 70 | /// Describe returns a (multi-line) representation of the commit indexes for the 71 | /// given lookuper. 
72 | pub fn describe(&self, l: &T) -> String { 73 | MajorityConfig::from(self.ids()).describe(l) 74 | } 75 | 76 | /// committed_index returns the largest committed index for the given joint 77 | /// quorum. An index is jointly committed if it is committed in both constituent 78 | /// majorities 79 | pub fn committed(&self, l: &T) -> Index { 80 | let idx0 = self.incoming.committed_index(l); 81 | let idx1 = self.outgoing.committed_index(l); 82 | if idx0 < idx1 { 83 | return idx0; 84 | } 85 | idx1 86 | } 87 | 88 | pub fn vote_result(&self, votes: &HashMap) -> VoteResult { 89 | let r1 = self.incoming.vote_result(votes); 90 | let r2 = self.outgoing.vote_result(votes); 91 | if r1 == r2 { 92 | return r1; 93 | } 94 | if r1 == VoteLost || r2 == VoteLost { 95 | // If either config has lost, loss is the only possible outcome. 96 | return VoteLost; 97 | } 98 | // TODO: Why? 99 | // One side won, the other one is pending, so the whole outcome is 100 | VotePending 101 | } 102 | 103 | /// clears all IDs. 104 | pub fn clear(&mut self) { 105 | self.incoming.clear(); 106 | self.outgoing.clear(); 107 | } 108 | 109 | /// Returns true if (and only if) there is only one voting member 110 | /// (i.e. the leader) in the current configuration. 111 | #[inline] 112 | pub fn is_singleton(&self) -> bool { 113 | self.outgoing.is_empty() && self.incoming.len() == 1 114 | } 115 | 116 | /// Check if an id is a voter. 
117 | #[inline] 118 | pub fn contains(&self, id: u64) -> bool { 119 | self.incoming.contains(&id) || self.outgoing.contains(&id) 120 | } 121 | 122 | #[inline] 123 | pub fn joint(&self) -> bool { 124 | !self.outgoing.is_empty() 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /src/quorum/majority.rs: -------------------------------------------------------------------------------- 1 | use crate::quorum::quorum::{AckedIndexer, Index, VoteResult}; 2 | use std::cmp::Ordering; 3 | use std::collections::hash_set::Iter; 4 | use std::collections::{HashMap, HashSet}; 5 | use std::fmt::{self, Display, Formatter, Write}; 6 | use std::process::id; 7 | 8 | /// MajorityConfig is a set of IDs that uses majority quorums to make decisions. 9 | #[derive(Clone, PartialEq, Debug)] 10 | pub struct MajorityConfig { 11 | pub(crate) votes: HashSet, 12 | } 13 | 14 | impl From> for MajorityConfig { 15 | fn from(h: HashSet) -> Self { 16 | MajorityConfig { votes: h } 17 | } 18 | } 19 | 20 | impl MajorityConfig { 21 | pub fn new() -> Self { 22 | MajorityConfig { 23 | votes: HashSet::new(), 24 | } 25 | } 26 | 27 | /// returns a (multi-line) representation of the commit indexes for the 28 | /// given lookuper. 29 | pub fn describe(&self, l: &T) -> String { 30 | if self.votes.is_empty() { 31 | return "".to_string(); 32 | } 33 | 34 | #[derive(Default, Clone, Copy)] 35 | struct Tup { 36 | id: u64, 37 | idx: Index, 38 | // idx found? 39 | ok: bool, 40 | // length of bar displayed for this up 41 | bar: usize, 42 | } 43 | 44 | // Below, populate .bar so that the i-th largest commit index has bar i (we 45 | // plot this as sort of a progress bar). The actual code is a bit more 46 | // complicated and also makes sure that equal index => equal bar. 
47 | let n = self.votes.len(); 48 | let mut info: Vec = vec![Tup::default()].repeat(n); 49 | for (i, id) in self.iter().enumerate() { 50 | let idx = l.acked_index(id); 51 | info[i].id = *id; 52 | info[i].idx = *idx.or_else(|| Some(&0)).unwrap(); 53 | info[i].ok = idx.is_some(); 54 | } 55 | // sort by index 56 | info.sort_by(|a, b| { 57 | if a.idx == b.idx { 58 | a.id.cmp(&b.id) 59 | } else { 60 | a.idx.cmp(&b.idx) 61 | } 62 | }); 63 | 64 | // Populate .bar. 65 | for i in 0..info.len() { 66 | if i > 0 && info[i - 1].idx < info[i].idx { 67 | info[i].bar = i; 68 | } 69 | } 70 | 71 | // sort by id 72 | info.sort_by(|a, b| a.id.cmp(&b.id)); 73 | 74 | let mut buf = String::new(); 75 | // print 76 | 77 | buf.write_str((" ".repeat(n) + " idx\n").as_str()) 78 | .unwrap(); 79 | 80 | for i in 0..info.len() { 81 | let bar = info[i].bar; 82 | if !info[i].ok { 83 | buf.write_str("?").unwrap(); 84 | buf.write_str(" ".repeat(n).as_str()).unwrap(); 85 | } else { 86 | buf.write_str(&*("x".repeat(bar) + ">" + " ".repeat(n - bar).as_str())) 87 | .unwrap(); 88 | } 89 | buf.write_str(format!(" {:>5} (id={})\n", info[i].idx, info[i].id).as_str()) 90 | .unwrap(); 91 | } 92 | buf 93 | } 94 | 95 | /// commit_index computes the committed index from those supplied via the 96 | /// provide acked_index (for the active config). 97 | pub fn committed_index(&self, l: &T) -> Index { 98 | if self.is_empty() { 99 | // This plays well with joint quorum which, when one of half is the zero 100 | // MajorityConfig, should behave like the other half. 101 | return u64::max_value(); 102 | } 103 | // Use a on-stack slice to collect the committed indexes when n <= 7 104 | // (otherwise we alloc). The alternative is to stash a slice on 105 | // MajorityConfig, but this impairs usability (as is, MajorityConfig is just 106 | // a map, and that's nice). 
The assumption is that running with a 107 | // performance is a lesser concern (additionally the performance 108 | // implication of an allocation here are far from drastic). 109 | // TODO: optimized use stack 110 | let n = self.len(); 111 | let mut srt: Vec = [0].repeat(n); 112 | let mut i = 0; 113 | for id in self.iter() { 114 | if let Some(idx) = l.acked_index(&id) { 115 | srt[i as usize] = *idx; 116 | i += 1; 117 | } 118 | } 119 | 120 | srt.sort_by_key(|key| *key); 121 | let pos = n - (n / 2 + 1); 122 | srt[pos] 123 | } 124 | 125 | /// VoteResult takes a mapping of voters to yes/no (true/false) votes and returns 126 | /// a result indicating whether the vote is pending (i.e. neither a quorum of 127 | /// yes/no has been reached), won (a quorum of yes has been reached), or lost (a 128 | /// quorum of no has been reached). 129 | pub fn vote_result(&self, votes: &HashMap) -> VoteResult { 130 | if self.is_empty() { 131 | // By convention, the elections on an empty config win. This comes in 132 | // handy with joint quorums because it'll make a half-populated joint 133 | // quorum behave like a majority quorum 134 | return VoteResult::VoteWon; 135 | } 136 | let (against, agree, missing) = 137 | self.votes 138 | .iter() 139 | .fold((0, 0, 0), |(mut against, mut agree, mut missing), id| { 140 | if let Some(v) = votes.get(id) { 141 | if *v { 142 | agree += 1 143 | } else { 144 | against += 1 145 | } 146 | } else { 147 | missing += 1; 148 | } 149 | (against, agree, missing) 150 | }); 151 | // vote counts for no and yes, responsibility 152 | let q = self.len() / 2 + 1; 153 | debug!("agree:{}, missing:{}, q:{}", agree, missing, q); 154 | if agree >= q { 155 | return VoteResult::VoteWon; 156 | } 157 | if agree + missing >= q { 158 | return VoteResult::VotePending; 159 | } 160 | VoteResult::VoteLost 161 | } 162 | 163 | #[inline] 164 | pub fn as_slice(&self) -> Vec { 165 | let mut s1: Vec = self.iter().map(|v| *v).collect(); 166 | s1.sort_by_key(|v| *v); 167 | s1 168 | } 169 
| 170 | #[inline] 171 | pub fn len(&self) -> usize { 172 | self.votes.len() 173 | } 174 | 175 | #[inline] 176 | pub(crate) fn get(&self, id: &u64) -> Option<&u64> { 177 | self.votes.get(id) 178 | } 179 | 180 | #[inline] 181 | pub(crate) fn insert(&mut self, id: u64) { 182 | self.votes.insert(id); 183 | } 184 | 185 | #[inline] 186 | pub(crate) fn remove(&mut self, id: &u64) -> bool { 187 | self.votes.remove(id) 188 | } 189 | 190 | #[inline] 191 | pub(crate) fn contains(&self, id: &u64) -> bool { 192 | self.votes.contains(id) 193 | } 194 | 195 | #[inline] 196 | pub fn is_empty(&self) -> bool { 197 | self.votes.is_empty() 198 | } 199 | 200 | #[inline] 201 | pub(crate) fn clear(&mut self) { 202 | self.votes.clear(); 203 | } 204 | 205 | #[inline] 206 | pub(crate) fn extend(&mut self, other: &Self) { 207 | self.votes.extend(other.iter()) 208 | } 209 | 210 | #[inline] 211 | pub fn iter(&self) -> Iter<'_, u64> { 212 | self.votes.iter() 213 | } 214 | } 215 | 216 | impl From<&Vec> for MajorityConfig { 217 | fn from(v: &Vec) -> Self { 218 | let mut config = MajorityConfig { 219 | votes: HashSet::new(), 220 | }; 221 | for item in v.iter() { 222 | config.votes.insert(*item); 223 | } 224 | config 225 | } 226 | } 227 | 228 | impl From> for MajorityConfig { 229 | fn from(v: Vec) -> Self { 230 | let mut config = MajorityConfig { 231 | votes: HashSet::new(), 232 | }; 233 | for item in v.iter() { 234 | config.votes.insert(*item); 235 | } 236 | config 237 | } 238 | } 239 | 240 | impl Display for MajorityConfig { 241 | fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { 242 | let mut votes: Vec = self.votes.iter().map(|v| *v).collect(); 243 | votes.sort(); 244 | let votes: Vec = votes.iter().map(|v| format!("{}", v)).collect(); 245 | let s: String = votes.join(" "); 246 | write!(f, "({})", s) 247 | } 248 | } 249 | 250 | #[cfg(test)] 251 | mod tests { 252 | use crate::quorum::majority::MajorityConfig; 253 | use crate::quorum::quorum::AckedIndexer; 254 | use 
#[cfg(test)]
mod tests {
    use crate::quorum::majority::MajorityConfig;
    use crate::quorum::quorum::AckedIndexer;
    use crate::quorum::quorum::VoteResult::{VoteLost, VotePending, VoteWon};
    use crate::tracker::progress::Progress;
    use crate::tracker::MatchAckIndexer;
    use std::collections::HashMap;

    /// Covers `Display`, the `From<&Vec<u64>>` conversion and `as_slice`.
    #[test]
    fn t_majority() {
        let mut two = MajorityConfig::new();
        two.votes.insert(0);
        two.votes.insert(1);
        assert_eq!("(0 1)", two.to_string());

        let empty = MajorityConfig::new();
        assert_eq!("()", empty.to_string());

        let v = &vec![0, 1, 2];
        let first: MajorityConfig = v.into();
        assert_eq!("(0 1 2)", first.to_string());
        let second: MajorityConfig = v.into();
        assert_eq!("(0 1 2)", second.to_string());

        let mut single = MajorityConfig::new();
        single.votes.insert(0);
        assert_eq!(vec![0], single.as_slice());
    }

    /// Walks a five-voter config through pending, won and lost.
    #[test]
    fn t_majority_vote_result() {
        let mut majority = MajorityConfig::new();
        for id in 0..5 {
            majority.votes.insert(id);
        }

        let mut votes = HashMap::new();
        // Nobody has voted yet.
        assert_eq!(majority.vote_result(&votes), VotePending);

        // One and then two yes votes are below the quorum of three.
        for id in 0..2 {
            votes.insert(id, true);
            assert_eq!(majority.vote_result(&votes), VotePending);
        }

        // The third yes vote reaches the quorum.
        votes.insert(3, true);
        assert_eq!(majority.vote_result(&votes), VoteWon);

        // Three no votes leave a yes quorum out of reach.
        for id in 0..3 {
            votes.insert(id, false);
        }
        assert_eq!(majority.vote_result(&votes), VoteLost);
    }

    /// Checks the committed index of a five-voter config (IDs 0..5) against
    /// tables of `Progress::new` argument pairs.
    #[test]
    fn t_majority_committed_index() {
        let n = 5;
        let mut majority = MajorityConfig::new();
        for id in 0..n {
            majority.votes.insert(id);
        }

        let tests = vec![
            (vec![(3, 3), (4, 4), (5, 5)], 3),
            (vec![(4, 4), (3, 3), (5, 5)], 3),
            (vec![(5, 5), (4, 4), (3, 3)], 3),
            (vec![(3, 3), (4, 4), (5, 5), (4, 4), (3, 3)], 4),
            (vec![(3, 3), (6, 6), (5, 5), (7, 7), (3, 3)], 5),
            (vec![(3, 3), (6, 6), (6, 6), (6, 6), (6, 6)], 6),
        ];
        for (set, w_commit) in tests {
            let indexer = new_match_ack_indexer(set);
            assert_eq!(majority.committed_index(&indexer), w_commit);
        }
    }

    /// Builds a `MatchAckIndexer` whose entries are keyed 0, 1, 2, ... in input
    /// order, each holding `Progress::new(m, n)` for the pair `(m, n)`.
    fn new_match_ack_indexer(v: Vec<(u64, u64)>) -> MatchAckIndexer {
        let mut indexer = MatchAckIndexer::new();
        let mut next_id = 0;
        for (m, n) in v.iter() {
            indexer.insert(next_id, Progress::new(*m, *n));
            next_id += 1;
        }
        indexer
    }
}
#[cfg(test)]
mod tests {
    use crate::quorum::majority::MajorityConfig;
    use crate::quorum::quick_test::alternative_majority_committed_index;
    use crate::quorum::quorum::{AckedIndexer, Index};
    use rand::prelude::*;
    use rand::Rng;
    use std::cmp::Ordering;
    use std::collections::{HashMap, HashSet};

    /// Voter id -> commit index.
    type IdxMap = HashMap<u64, Index>;

    /// Returns a fresh random index map.
    fn new_idx_map() -> IdxMap {
        small_ran_idx_map(0)
    }

    /// Set of voter ids.
    type MemberMap = HashSet<u64>;

    /// Returns the set of ids that have an entry in `idx_map`.
    fn convert_idx_map_to_member_map(idx_map: &IdxMap) -> MemberMap {
        idx_map.keys().copied().collect()
    }

    /// Returns the id set of a fresh random index map.
    fn new_member_map() -> MemberMap {
        small_ran_idx_map(0).keys().copied().collect()
    }

    /// Returns a reasonably sized map of ids to commit indexes.
    ///
    /// The `size` argument is ignored: the size is hard-coded to a small value
    /// on purpose (quick would hard-code 50, which is not useful here).
    fn small_ran_idx_map(size: usize) -> HashMap<u64, Index> {
        let size = 10;
        let mut rng = rand::thread_rng();
        // Number of voters to keep.
        let n: usize = rng.gen_range(0..size);
        let mut ids: Vec<usize> = (1..size).collect();
        ids.shuffle(&mut rng);
        ids.drain(n..);

        // Draw every index before filling the map. When `n` is 0 there are no
        // ids, so the empty range `0..0` is never sampled.
        let idxs: Vec<usize> = ids.iter().map(|_| rng.gen_range(0..n)).collect();

        ids.iter()
            .zip(idxs.iter())
            .map(|(id, idx)| (*id as u64, *idx as Index))
            .collect()
    }

    /// `committed_index` must agree with the alternative implementation on
    /// random inputs.
    #[test]
    fn tt_majority() {
        let count = 5000;
        for _ in 0..count {
            let idx_map = new_idx_map();
            let mut majority = MajorityConfig::new();
            majority.votes = convert_idx_map_to_member_map(&idx_map);
            let idx = majority.committed_index(&idx_map);
            let expect_idx = alternative_majority_committed_index(majority.clone(), &idx_map);
            assert_eq!(idx, expect_idx);
        }
    }
}
(MajorityConfig).CommittedIndex(l). 76 | pub(crate) fn alternative_majority_committed_index( 77 | c: MajorityConfig, 78 | l: &T, 79 | ) -> Index { 80 | if c.is_empty() { 81 | return u64::MAX; 82 | } 83 | let mut id_to_idx = HashMap::new(); 84 | c.votes.iter().for_each(|node| { 85 | if let Some(idx) = l.acked_index(node) { 86 | id_to_idx.insert(node, idx); 87 | } 88 | }); 89 | 90 | // Build a map from index to voters who have acked that or any higher index. 91 | let mut idx_to_votes = HashMap::new(); 92 | id_to_idx.iter().for_each(|(id, idx)| { 93 | idx_to_votes.insert(idx, 0); 94 | }); 95 | 96 | for (_, idx) in id_to_idx.iter() { 97 | for (idy, v) in idx_to_votes.iter_mut() { 98 | if ***idy > **idx { 99 | continue; 100 | } 101 | *v += 1; 102 | } 103 | } 104 | 105 | // Find the maximum index that has achieved quorum. 106 | let q = c.len() / 2 + 1; 107 | let mut max_quorum_index = Index::default(); 108 | for (idx, n) in idx_to_votes.clone() { 109 | if n >= q as u64 && *idx > &max_quorum_index { 110 | max_quorum_index = **idx; 111 | } 112 | } 113 | // println!("---->{:?}, {:?}, quorum: {}, max_quorum_index: {:?}", id_to_idx, idx_to_votes, q, max_quorum_index); 114 | max_quorum_index 115 | } 116 | -------------------------------------------------------------------------------- /src/quorum/quorum.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 The etcd Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
/// Index is a Raft log position.
pub type Index = u64;

/// Renders an index for display: `u64::MAX` is shown as "∞", every other
/// value as its decimal representation.
pub fn to_string(index: Index) -> String {
    match index {
        u64::MAX => "∞".to_string(),
        other => other.to_string(),
    }
}

/// AckedIndexer allows looking up a commit index for a given ID of a voter
/// from a corresponding MajorityConfig.
pub trait AckedIndexer {
    /// Returns the acked index recorded for `voter_id`, or `None` when there
    /// is no entry for that voter.
    fn acked_index(&self, voter_id: &u64) -> Option<&Index>;
}

/// Map-backed `AckedIndexer`: voter ID -> acked index.
pub(crate) type MapAckIndexer = HashMap<u64, Index>;

impl AckedIndexer for MapAckIndexer {
    fn acked_index(&self, voter_id: &u64) -> Option<&Index> {
        self.get(voter_id)
    }
}

/// VoteResult indicates the outcome of a vote.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum VoteResult {
    /// VotePending indicates that the decision of the vote depends on future
    /// votes, i.e. neither "yes" nor "no" has reached quorum yet.
    VotePending,
    /// VoteLost indicates that the quorum has voted "no".
    VoteLost,
    /// VoteWon indicates that the quorum has voted "yes".
    VoteWon,
}
9 | cmd: committed 10 | args: cfg=(1,2,3) cfgj=zero idx=(100,101,99) 11 | idx 12 | x> 100 (id=1) 13 | xx> 101 (id=2) 14 | > 99 (id=3) 15 | 100 -------------------------------------------------------------------------------- /src/quorum/testdata/joint_vote.txt: -------------------------------------------------------------------------------- 1 | # Empty joint config wins all votes. This isn't used in production. Note that 2 | # by specifying cfgj explicitly we tell the test harness to treat the input as 3 | # a joint quorum and not a majority quorum. 4 | cmd: vote 5 | args: cfgj=zero 6 | VoteWon 7 | 8 | -------------------------------- 9 | # More examples with close to trivial configs. 10 | cmd: vote 11 | args: cfg=(1) cfgj=zero votes=(_) 12 | VotePending 13 | 14 | -------------------------------- 15 | cmd: vote 16 | args: cfg=(1) cfgj=zero votes=(y) 17 | VoteWon 18 | 19 | -------------------------------- 20 | cmd: vote 21 | args: cfg=(1) cfgj=zero votes=(n) 22 | VoteLost 23 | 24 | -------------------------------- 25 | cmd: vote 26 | args: cfg=(1) cfgj=(1) votes=(_) 27 | VotePending 28 | 29 | -------------------------------- 30 | cmd: vote 31 | args: cfg=(1) cfgj=(1) votes=(y) 32 | VoteWon 33 | 34 | -------------------------------- 35 | cmd: vote 36 | args: cfg=(1) cfgj=(1) votes=(n) 37 | VoteLost 38 | 39 | -------------------------------- 40 | cmd: vote 41 | args: cfg=(1) cfgj=(2) votes=(_,_) 42 | VotePending 43 | 44 | -------------------------------- 45 | cmd: vote 46 | args: cfg=(1) cfgj=(2) votes=(y,_) 47 | VotePending 48 | 49 | -------------------------------- 50 | cmd: vote 51 | args: cfg=(1) cfgj=(2) votes=(y,y) 52 | VoteWon 53 | 54 | -------------------------------- 55 | cmd: vote 56 | args: cfg=(1) cfgj=(2) votes=(y,n) 57 | VoteLost 58 | 59 | -------------------------------- 60 | cmd: vote 61 | args: cfg=(1) cfgj=(2) votes=(n,_) 62 | VoteLost 63 | 64 | -------------------------------- 65 | cmd: vote 66 | args: cfg=(1) cfgj=(2) votes=(n,n) 67 | VoteLost 68 | 
69 | -------------------------------- 70 | cmd: vote 71 | args: cfg=(1) cfgj=(2) votes=(n,y) 72 | VoteLost 73 | 74 | -------------------------------- 75 | # Two node configs. 76 | cmd: vote 77 | args: cfg=(1,2) cfgj=(3,4) votes=(_,_,_,_) 78 | VotePending 79 | 80 | -------------------------------- 81 | cmd: vote 82 | args: cfg=(1,2) cfgj=(3,4) votes=(y,_,_,_) 83 | VotePending 84 | 85 | -------------------------------- 86 | cmd: vote 87 | args: cfg=(1,2) cfgj=(3,4) votes=(y,y,_,_) 88 | VotePending 89 | 90 | -------------------------------- 91 | cmd: vote 92 | args: cfg=(1,2) cfgj=(3,4) votes=(y,y,n,_) 93 | VoteLost 94 | 95 | -------------------------------- 96 | cmd: vote 97 | args: cfg=(1,2) cfgj=(3,4) votes=(y,y,n,n) 98 | VoteLost 99 | 100 | -------------------------------- 101 | cmd: vote 102 | args: cfg=(1,2) cfgj=(3,4) votes=(y,y,y,n) 103 | VoteLost 104 | 105 | -------------------------------- 106 | cmd: vote 107 | args: cfg=(1,2) cfgj=(3,4) votes=(y,y,y,y) 108 | VoteWon 109 | 110 | -------------------------------- 111 | cmd: vote 112 | args: cfg=(1,2) cfgj=(2,3) votes=(_,_,_) 113 | VotePending 114 | 115 | -------------------------------- 116 | cmd: vote 117 | args: cfg=(1,2) cfgj=(2,3) votes=(_,n,_) 118 | VoteLost 119 | 120 | -------------------------------- 121 | cmd: vote 122 | args: cfg=(1,2) cfgj=(2,3) votes=(y,y,_) 123 | VotePending 124 | 125 | -------------------------------- 126 | cmd: vote 127 | args: cfg=(1,2) cfgj=(2,3) votes=(y,y,n) 128 | VoteLost 129 | 130 | -------------------------------- 131 | cmd: vote 132 | args: cfg=(1,2) cfgj=(2,3) votes=(y,y,y) 133 | VoteWon 134 | 135 | -------------------------------- 136 | cmd: vote 137 | args: cfg=(1,2) cfgj=(1,2) votes=(_,_) 138 | VotePending 139 | 140 | -------------------------------- 141 | cmd: vote 142 | args: cfg=(1,2) cfgj=(1,2) votes=(y,_) 143 | VotePending 144 | 145 | -------------------------------- 146 | cmd: vote 147 | args: cfg=(1,2) cfgj=(1,2) votes=(y,n) 148 | VoteLost 149 | 150 | 
-------------------------------- 151 | cmd: vote 152 | args: cfg=(1,2) cfgj=(1,2) votes=(n,_) 153 | VoteLost 154 | 155 | -------------------------------- 156 | cmd: vote 157 | args: cfg=(1,2) cfgj=(1,2) votes=(n,n) 158 | VoteLost 159 | 160 | -------------------------------- 161 | # Simple example for overlapping three node configs. 162 | cmd: vote 163 | args: cfg=(1,2,3) cfgj=(2,3,4) votes=(_,_,_,_) 164 | VotePending 165 | 166 | -------------------------------- 167 | cmd: vote 168 | args: cfg=(1,2,3) cfgj=(2,3,4) votes=(_,n,_,_) 169 | VotePending 170 | 171 | -------------------------------- 172 | cmd: vote 173 | args: cfg=(1,2,3) cfgj=(2,3,4) votes=(_,n,n,_) 174 | VoteLost 175 | 176 | -------------------------------- 177 | cmd: vote 178 | args: cfg=(1,2,3) cfgj=(2,3,4) votes=(_,y,y,_) 179 | VoteWon 180 | 181 | -------------------------------- 182 | cmd: vote 183 | args: cfg=(1,2,3) cfgj=(2,3,4) votes=(y,y,_,_) 184 | VotePending 185 | 186 | -------------------------------- 187 | cmd: vote 188 | args: cfg=(1,2,3) cfgj=(2,3,4) votes=(y,y,n,_) 189 | VotePending 190 | 191 | -------------------------------- 192 | cmd: vote 193 | args: cfg=(1,2,3) cfgj=(2,3,4) votes=(y,y,n,n) 194 | VoteLost 195 | 196 | -------------------------------- 197 | cmd: vote 198 | args: cfg=(1,2,3) cfgj=(2,3,4) votes=(y,y,n,y) 199 | VoteWon -------------------------------------------------------------------------------- /src/quorum/testdata/majority_commit.txt: -------------------------------------------------------------------------------- 1 | # The empty quorum commits "everything". This is useful for its use in joint 2 | # quorums. 3 | cmd: committed 4 | output: 5 | ∞ 6 | 7 | 8 | -------------------------------- 9 | title: A single voter quorum is not final when no index is known. 10 | cmd: committed 11 | args: cfg=(1) idx=(_) 12 | output: 13 | idx 14 | ? 
0 (id=1) 15 | 0 16 | 17 | 18 | -------------------------------- 19 | title: When an index is known, that's the committed index, and that's final. 20 | cmd: committed 21 | args: cfg=(1) idx=(12) 22 | output: 23 | idx 24 | > 12 (id=1) 25 | 12 26 | 27 | 28 | -------------------------------- 29 | title: With two nodes, start out similarly. 30 | cmd: committed 31 | args: cfg=(1,2) idx=(_,_) 32 | output: 33 | idx 34 | ? 0 (id=1) 35 | ? 0 (id=2) 36 | 0 37 | 38 | 39 | -------------------------------- 40 | # The first committed index becomes known (for n1). Nothing changes in the output 41 | # because idx=12 is not known to be on a quorum (which is both nodes). 42 | cmd: committed 43 | args: cfg=(1,2) idx=(12,_) 44 | output: 45 | idx 46 | x> 12 (id=1) 47 | ? 0 (id=2) 48 | 0 49 | 50 | -------------------------------- 51 | # The second index comes in and finalize the decision. The result will be the 52 | # smaller of the two indexes. 53 | cmd: committed 54 | args: cfg=(1,2) idx=(12,5) 55 | output: 56 | idx 57 | x> 12 (id=1) 58 | > 5 (id=2) 59 | 5 60 | 61 | -------------------------------- 62 | # No surprises for three nodes. 63 | cmd: committed 64 | args: cfg=(1,2,3) idx=(_,_,_) 65 | output: 66 | idx 67 | ? 0 (id=1) 68 | ? 0 (id=2) 69 | ? 0 (id=3) 70 | 0 71 | 72 | -------------------------------- 73 | cmd: committed 74 | args: cfg=(1,2,3) idx=(12,_,_) 75 | output: 76 | idx 77 | xx> 12 (id=1) 78 | ? 0 (id=2) 79 | ? 0 (id=3) 80 | 0 81 | 82 | -------------------------------- 83 | # We see a committed index, but a higher committed index for the last pending 84 | # votes could change (increment) the outcome, so not final yet. 85 | cmd: committed 86 | args: cfg=(1,2,3) idx=(12,5,_) 87 | output: 88 | idx 89 | xx> 12 (id=1) 90 | x> 5 (id=2) 91 | ? 
0 (id=3) 92 | 5 93 | 94 | -------------------------------- 95 | # a) the case in which it does: 96 | cmd: committed 97 | args: cfg=(1,2,3) idx=(12,5,6) 98 | output: 99 | idx 100 | xx> 12 (id=1) 101 | > 5 (id=2) 102 | x> 6 (id=3) 103 | 6 104 | 105 | -------------------------------- 106 | # b) the case in which it does not: 107 | cmd: committed 108 | args: cfg=(1,2,3) idx=(12,5,4) 109 | output: 110 | idx 111 | xx> 12 (id=1) 112 | x> 5 (id=2) 113 | > 4 (id=3) 114 | 5 115 | 116 | -------------------------------- 117 | # c) a different case in which the last index is pending but it has no chance of 118 | # swaying the outcome (because nobody in the current quorum agrees on anything 119 | # higher than the candidate): 120 | cmd: committed 121 | args: cfg=(1,2,3) idx=(5,5,_) 122 | output: 123 | idx 124 | x> 5 (id=1) 125 | > 5 (id=2) 126 | ? 0 (id=3) 127 | 5 128 | 129 | -------------------------------- 130 | # With all committed idx known, the result is final. 131 | cmd: committed 132 | args: cfg=(1,2,3) idx=(100,101,103) 133 | output: 134 | idx 135 | > 100 (id=1) 136 | x> 101 (id=2) 137 | xx> 103 (id=3) 138 | 101 139 | 140 | 141 | -------------------------------- 142 | # Some more complicated examples. Similar to case c) above. The result is 143 | # already final because no index higher than 103 is one short of quorum. 144 | cmd: committed 145 | args: cfg=(1,2,3,4,5) idx=(101,104,103,103,_) 146 | output: 147 | idx 148 | x> 101 (id=1) 149 | xxxx> 104 (id=2) 150 | xx> 103 (id=3) 151 | > 103 (id=4) 152 | ? 0 (id=5) 153 | 103 154 | 155 | -------------------------------- 156 | # A similar case which is not final because another vote for >= 103 would change 157 | # the outcome. 158 | cmd: committed 159 | args: cfg=(1,2,3,4,5) idx=(101,102,103,103,_) 160 | output: 161 | idx 162 | x> 101 (id=1) 163 | xx> 102 (id=2) 164 | xxx> 103 (id=3) 165 | > 103 (id=4) 166 | ? 
0 (id=5) 167 | 102 -------------------------------------------------------------------------------- /src/quorum/testdata/majority_vote.txt: -------------------------------------------------------------------------------- 1 | # The empty config always announces a won vote. 2 | cmd: vote 3 | output: 4 | VoteWon 5 | 6 | -------------------------------- 7 | cmd: vote 8 | args: cfg=(1) votes=(_) 9 | output: 10 | VotePending 11 | 12 | -------------------------------- 13 | cmd: vote 14 | args: cfg=(1) votes=(n) 15 | output: 16 | VoteLost 17 | 18 | -------------------------------- 19 | cmd: vote 20 | args: cfg=(123) votes=(y) 21 | output: 22 | VoteWon 23 | 24 | -------------------------------- 25 | cmd: vote 26 | args: cfg=(4,8) votes=(_,_) 27 | output: 28 | VotePending 29 | 30 | 31 | -------------------------------- 32 | # With two voters, a single rejection loses the vote. 33 | cmd: vote 34 | args: cfg=(4,8) votes=(n,_) 35 | output: 36 | VoteLost 37 | 38 | -------------------------------- 39 | cmd: vote 40 | args: cfg=(4,8) votes=(y,_) 41 | output: 42 | VotePending 43 | 44 | -------------------------------- 45 | cmd: vote 46 | args: cfg=(4,8) votes=(n,y) 47 | output: 48 | VoteLost 49 | 50 | -------------------------------- 51 | cmd: vote 52 | args: cfg=(4,8) votes=(y,y) 53 | output: 54 | VoteWon 55 | 56 | -------------------------------- 57 | cmd: vote 58 | args: cfg=(2,4,7) votes=(_,_,_) 59 | output: 60 | VotePending 61 | 62 | -------------------------------- 63 | cmd: vote 64 | args: cfg=(2,4,7) votes=(n,_,_) 65 | output: 66 | VotePending 67 | 68 | -------------------------------- 69 | cmd: vote 70 | args: cfg=(2,4,7) votes=(y,_,_) 71 | VotePending 72 | 73 | -------------------------------- 74 | cmd: vote 75 | args: cfg=(2,4,7) votes=(n,n,_) 76 | output: 77 | VoteLost 78 | 79 | -------------------------------- 80 | cmd: vote 81 | args: cfg=(2,4,7) votes=(y,n,_) 82 | output: 83 | VotePending 84 | 85 | -------------------------------- 86 | cmd: vote 87 | args: 
cfg=(2,4,7) votes=(y,y,_) 88 | output: 89 | VoteWon 90 | 91 | -------------------------------- 92 | cmd: vote 93 | args: cfg=(2,4,7) votes=(y,y,n) 94 | output: 95 | VoteWon 96 | 97 | -------------------------------- 98 | cmd: vote 99 | args: cfg=(2,4,7) votes=(n,y,n) 100 | output: 101 | VoteLost 102 | 103 | -------------------------------- 104 | # Test some random example with seven nodes (why not). 105 | cmd: vote 106 | args: cfg=(1,2,3,4,5,6,7) votes=(y,y,n,y,_,_,_) 107 | output: 108 | VotePending 109 | 110 | -------------------------------- 111 | cmd: vote 112 | args: cfg=(1,2,3,4,5,6,7) votes=(_,y,y,_,n,y,n) 113 | VotePending 114 | 115 | -------------------------------- 116 | cmd: vote 117 | args: cfg=(1,2,3,4,5,6,7) votes=(y,y,n,y,_,n,y) 118 | VoteWon 119 | 120 | -------------------------------- 121 | cmd: vote 122 | args: cfg=(1,2,3,4,5,6,7) votes=(y,y,_,n,y,n,n) 123 | VotePending 124 | 125 | -------------------------------- 126 | cmd: vote 127 | args: cfg=(1,2,3,4,5,6,7) votes=(y,y,n,y,n,n,n) 128 | VoteLost -------------------------------------------------------------------------------- /src/raft_flow_control_test.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The etcd Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
#[cfg(test)]
mod tests {
    use crate::tests_util::mock::{new_test_raw_node, read_message, MockEntry, MocksEnts};
    use crate::raft::Raft;
    use crate::raftpb::raft::MessageType::{MsgAppResp, MsgHeartbeatResp, MsgProp};
    use crate::raftpb::raft::{Entry, Message};
    use crate::storage::{SafeMemStorage, Storage};
    use bytes::Bytes;
    use protobuf::RepeatedField;
    use crate::tests_util::try_init_log;

    /// Builds a local proposal (`MsgProp`, node 1 -> node 1) carrying a single
    /// "somedata" entry.
    fn proposal() -> Message {
        Message {
            from: 0x1,
            to: 0x1,
            field_type: MsgProp,
            entries: MocksEnts::from("somedata").into(),
            ..Default::default()
        }
    }

    /// Builds a `MsgAppResp` from node 2 to node 1 acknowledging `index`.
    fn app_resp(index: u64) -> Message {
        Message {
            from: 0x2,
            to: 0x1,
            field_type: MsgAppResp,
            index,
            ..Default::default()
        }
    }

    /// Builds a `MsgHeartbeatResp` from node 2 to node 1.
    fn heartbeat_resp() -> Message {
        Message {
            from: 0x2,
            to: 0x1,
            field_type: MsgHeartbeatResp,
            ..Default::default()
        }
    }

    // Ensures:
    // 1. `MsgApp` fills the sending window until it is full.
    // 2. when the window is full, no more `MsgApp` can be sent.
    #[test]
    fn msg_app_flow_control_full() {
        try_init_log();
        let raft = new_test_raw_node(1, vec![1, 2], 5, 1, SafeMemStorage::new());
        let mut wl_raft = raft.wl();
        wl_raft.raft.become_candidate();
        wl_raft.raft.become_leader();

        // Force the progress of node 2 into the replicate state.
        wl_raft.raft.prs.progress.must_get_mut(&0x2).become_replicate();

        // Fill the inflights window: every proposal must yield exactly one message.
        for i in 0..wl_raft.raft.prs.max_inflight {
            assert!(wl_raft.step(proposal()).is_ok());
            let msgs = read_message(&mut wl_raft.raft);
            assert_eq!(msgs.len(), 1, "{}: len(ms) = {}, want: 1", i, msgs.len());
        }

        // ensure 1
        let full = wl_raft.raft.prs.progress.must_get(&0x2).inflights.full();
        assert!(full, "inflights.full = {}, want: {}", full, true);

        // ensure 2: with a full window, further proposals must not emit messages.
        for i in 0..10 {
            assert!(wl_raft.step(proposal()).is_ok());
            let msgs = read_message(&mut wl_raft.raft);
            assert_eq!(msgs.len(), 0, "{}: len(ms) = {}, want: 0", i, msgs.len());
        }
    }

    // Ensures `MsgAppResp` can move the sending window forward correctly:
    // 1. a valid `MsgAppResp.index` moves the window past all smaller or equal indexes.
    // 2. an outdated `MsgAppResp` has no effect on the sliding window.
    //
    // NOTE(review): the loop below only checks that stepping each `MsgAppResp`
    // succeeds; it does not yet assert either of the two window properties above.
    #[test]
    fn msg_app_flow_control_move_forward() {
        try_init_log();
        let raft = new_test_raw_node(1, vec![1, 2], 5, 1, SafeMemStorage::new());
        let mut wl_raft = raft.wl();
        wl_raft.raft.become_candidate();
        wl_raft.raft.become_leader();

        // Force the progress of node 2 into the replicate state.
        wl_raft.raft.prs.progress.must_get_mut(&0x2).become_replicate();

        // Fill the inflights window.
        for i in 0..wl_raft.raft.prs.max_inflight {
            assert!(wl_raft.step(proposal()).is_ok());
            let msgs = read_message(&mut wl_raft.raft);
            assert_eq!(msgs.len(), 1, "{}: len(ms) = {}, want: 1", i, msgs.len());
        }

        // 1 is noop, 2 is the first proposal we just sent,
        // so we start with 2.
        for tt in 2..wl_raft.raft.prs.max_inflight {
            // move the window forward
            assert!(wl_raft.step(app_resp(tt)).is_ok());
        }
    }

    // Ensures a heartbeat response frees one slot if the window is full.
    #[test]
    fn msg_app_flow_control_recv_heartbeat() {
        try_init_log();
        let raft = new_test_raw_node(0x1, vec![0x1, 0x2], 5, 1, SafeMemStorage::new());
        let mut wl_raft = raft.wl();
        wl_raft.raft.become_candidate();
        // Becoming leader appends the first log entry on behalf of leader 0x1.
        wl_raft.raft.become_leader();

        // Force the progress of node 2 into the replicate state.
        wl_raft.raft.prs.progress.must_get_mut(&0x2).become_replicate();

        // Fill the inflights window.
        for _ in 0..wl_raft.raft.prs.max_inflight {
            assert!(wl_raft.step(proposal()).is_ok());
            read_message(&mut wl_raft.raft);
        }

        for tt in 1..5 {
            let full = wl_raft.raft.prs.progress.must_get(&0x2).inflights.full();
            assert!(full, "{}: inflights.full = {}, want {}", tt, full, true);

            // Receive `tt` `MsgHeartbeatResp`s; the window must no longer be full.
            for i in 0..tt {
                assert!(wl_raft.step(heartbeat_resp()).is_ok());
                read_message(&mut wl_raft.raft);
                let full = wl_raft.raft.prs.progress.must_get(&0x2).inflights.full();
                assert_eq!(
                    full, false,
                    "{}.{}: inflights.full = {}, want {}",
                    tt, i, full, false
                );
            }

            // Only one slot was freed: a single proposal fills the window again.
            assert!(wl_raft.step(proposal()).is_ok());
            read_message(&mut wl_raft.raft);
            let full = wl_raft.raft.prs.progress.must_get(&0x2).inflights.full();
            assert!(full, "{}: inflights.full = {}, want {}", tt, full, true);

            // With the window full again, further proposals must not emit messages.
            for i in 0..10 {
                assert!(wl_raft.step(proposal()).is_ok());
                let msgs = read_message(&mut wl_raft.raft);
                assert_eq!(msgs.len(), 0, "{}.{}: ms.len = {}, want 0", tt, i, msgs.len());
            }

            // Clear all pending messages.
            assert!(wl_raft.step(heartbeat_resp()).is_ok());
            read_message(&mut wl_raft.raft);
        }
    }
}
#[cfg(test)]
mod tests {
    use crate::raftpb::raft::MessageType::{MsgAppResp, MsgProp, MsgSnapStatus};
    use crate::raftpb::raft::{ConfState, Message, Snapshot, SnapshotMetadata};
    use crate::storage::SafeMemStorage;
    use crate::tests_util::mock::{
        new_test_core_node, new_test_inner_node, new_test_raw_node, read_message, MocksEnts,
    };
    use crate::tests_util::try_init_log;
    use crate::tracker::state::StateType;
    use env_logger::Env;
    use protobuf::{SingularField, SingularPtrField};

    // A rejected `MsgAppResp` for a follower whose `next` equals the leader's
    // first index must put that follower's progress into a pending snapshot.
    #[test]
    fn sending_snapshot_set_pending_snapshot() {
        try_init_log();
        let mut raft = new_test_inner_node(0x1, vec![1], 10, 1, SafeMemStorage::new());
        raft.restore(&new_testing_snap());

        raft.become_candidate();
        raft.become_leader();

        // Force `next` of node 2 back to the first index so that
        // node 2 needs a snapshot.
        let first_index = raft.raft_log.first_index();
        raft.prs.progress.must_get_mut(&0x2).next = first_index;

        let index = raft.prs.progress.must_get(&0x2).next - 1;
        let rejected_append = Message {
            from: 0x2,
            to: 0x1,
            field_type: MsgAppResp,
            index,
            reject: true,
            ..Default::default()
        };
        assert!(raft.step(rejected_append).is_ok());

        let pending_snapshot = raft.prs.progress.must_get(&0x2).pending_snapshot;
        assert_eq!(
            pending_snapshot, 11,
            "pending_snapshot = {}, want 11",
            pending_snapshot
        );
    }

    // While a snapshot is pending for a follower, proposals must not
    // produce any outgoing message.
    #[test]
    fn pending_snapshot_pause_replication() {
        try_init_log();

        let mut raft = new_test_inner_node(0x1, vec![0x1, 0x2], 10, 1, SafeMemStorage::new());
        raft.restore(&new_testing_snap());

        raft.become_candidate();
        raft.become_leader();

        raft.prs.progress.must_get_mut(&0x2).become_snapshot(11);

        let proposal = Message {
            from: 0x1,
            to: 0x1,
            field_type: MsgProp,
            entries: MocksEnts::from("somedata").into(),
            ..Default::default()
        };
        assert!(raft.step(proposal).is_ok());

        let msgs = read_message(&mut raft);
        assert!(msgs.is_empty(), "len(msgs) = {}, want 0", msgs.len());
    }

    // A rejected `MsgSnapStatus` clears the pending snapshot, leaves `next`
    // untouched and marks the probe as sent.
    #[test]
    fn snapshot_failure() {
        try_init_log();

        let mut raft = new_test_inner_node(0x1, vec![0x1, 0x2], 10, 1, SafeMemStorage::new());
        raft.restore(&new_testing_snap());

        raft.become_candidate();
        raft.become_leader();

        raft.prs.progress.must_get_mut(&0x2).next = 1;
        raft.prs.progress.must_get_mut(&0x2).become_snapshot(11);

        let failed_status = Message {
            from: 0x2,
            to: 0x1,
            field_type: MsgSnapStatus,
            reject: true,
            ..Default::default()
        };
        assert!(raft.step(failed_status).is_ok());

        let pending_snapshot = raft.prs.progress.must_get(&0x2).pending_snapshot;
        assert_eq!(
            pending_snapshot, 0,
            "pending_snapshot = {}, want 0",
            pending_snapshot
        );
        let next = raft.prs.progress.must_get(&0x2).next;
        assert_eq!(next, 1, "next = {}, want 1", next);
        let probe_sent = raft.prs.progress.must_get(&0x2).probe_sent;
        assert!(probe_sent, "probe_sent = {}, want true", probe_sent);
    }

    // A successful `MsgSnapStatus` clears the pending snapshot, advances
    // `next` past the snapshot index and marks the probe as sent.
    #[test]
    fn snapshot_succeed() {
        try_init_log();
        let mut raft = new_test_inner_node(0x1, vec![0x1, 0x2], 10, 1, SafeMemStorage::new());
        raft.restore(&new_testing_snap());

        raft.become_candidate();
        raft.become_leader();

        raft.prs.progress.must_get_mut(&0x2).next = 2;
        raft.prs.progress.must_get_mut(&0x2).become_snapshot(11);

        let ok_status = Message {
            from: 0x2,
            to: 0x1,
            field_type: MsgSnapStatus,
            reject: false,
            ..Default::default()
        };
        assert!(raft.step(ok_status).is_ok());

        let pending_snapshot = raft.prs.progress.must_get(&0x2).pending_snapshot;
        assert_eq!(
            pending_snapshot, 0,
            "pending_snapshot = {}, want 0",
            pending_snapshot
        );
        let next = raft.prs.progress.must_get(&0x2).next;
        assert_eq!(next, 12, "next = {}, want 12", next);
        let probe_sent = raft.prs.progress.must_get(&0x2).probe_sent;
        assert!(probe_sent, "probe_sent = {}, want true", probe_sent);
    }

    #[test]
    fn snapshot_abort() {
        try_init_log();
        let mut raft = new_test_inner_node(0x1, vec![0x1, 0x2], 10, 1, SafeMemStorage::new());
        raft.restore(&new_testing_snap());
        raft.become_candidate();
        raft.become_leader(); // new leader will append a noop log entry
        raft.prs.progress.must_get_mut(&0x2).next = 1;
        raft.prs.progress.must_get_mut(&0x2).become_snapshot(11);

        // A successful MsgAppResp that has a higher/equal index than the
        // pending snapshot should abort the pending snapshot.
        info!("last index {}", raft.raft_log.last_index());
        let append_ack = Message {
            from: 0x2,
            to: 0x1,
            field_type: MsgAppResp,
            index: 11,
            ..Default::default()
        };
        assert!(raft.step(append_ack).is_ok());

        let pending_snapshot = raft.prs.progress.must_get(&0x2).pending_snapshot;
        assert_eq!(
            pending_snapshot, 0,
            "pending_snapshot = {}, want 0",
            pending_snapshot
        );

        // The follower entered StateReplicate and the leader sent an append
        // and optimistically updated the progress (so we see 13 instead of 12).
        // There is something to append because the leader appended an empty entry
        // to the log at index 12 when it assumed leadership.
        let next = raft.prs.progress.must_get(&0x2).next;
        assert_eq!(next, 13, "next = {}, want 13", next);
        let count = raft.prs.progress.must_get(&0x2).inflights.count();
        assert_eq!(count, 1, "expected an inflight message, got {}", count);
    }

    /// Builds the snapshot shared by these tests: index 11, term 11,
    /// voters {1, 2}.
    fn new_testing_snap() -> Snapshot {
        let mut conf_state = ConfState::new();
        conf_state.set_voters(vec![1, 2]);

        let mut snap = Snapshot::new();
        snap.set_metadata(SnapshotMetadata {
            index: 11,
            term: 11,
            conf_state: SingularPtrField::from(Some(conf_state)),
            ..Default::default()
        });
        snap
    }
}
9 | // s.wl().append(raft.raft_log.unstable_entries().to_vec()); 10 | // raft.raft_log.stable_to(raft.raft_log.last_index(), raft.raft_log.last_term()); 11 | // 12 | // let ents = raft.raft_log.next_ents(); 13 | // raft.raft_log.applied_to(raft.raft_log.committed); 14 | // return ents; 15 | // } 16 | // 17 | // fn must_append_entry(raft: &mut Raft, mut ents: Vec) where S: Storage { 18 | // assert!(raft.append_entry(&mut ents), "entry unexpectedly dropped"); 19 | // } 20 | // 21 | // trait StateMachine { 22 | // fn step(&mut self, m: Message) -> Result<(), String>; 23 | // fn read_message(&mut self) -> Vec; 24 | // } 25 | // 26 | // struct NetWork { 27 | // peers: HashMap, 28 | // storage: HashMap, 29 | // dropm: HashMap, 30 | // ignorem: HashMap, 31 | // // `msg_hook` is called for each message sent. It may inspect the 32 | // // message and return true to send it or false to drop it 33 | // msg_hook: Box bool>, 34 | // } 35 | // 36 | // impl NetWork { 37 | // pub fn send(&mut self, msgs: Vec) { 38 | // unimplemented!("unimplemented") 39 | // } 40 | // 41 | // pub fn drop(&mut self, from: u64, to: u64, perc: f64) { 42 | // unimplemented!("unimplemented") 43 | // } 44 | // 45 | // pub fn cut(&mut self, one: u64, other: u64) { 46 | // unimplemented!("unimplemented") 47 | // } 48 | // 49 | // pub fn isolated(&mut self, id: u64) { 50 | // unimplemented!("unimplemented") 51 | // } 52 | // 53 | // pub fn ignore(&mut self, t: MessageType) { 54 | // unimplemented!("unimplemented") 55 | // } 56 | // 57 | // pub fn recover(&mut self) { 58 | // self.dropm.clear(); 59 | // self.ignorem.clear(); 60 | // } 61 | // 62 | // pub fn filter(&mut self, msgs: Vec) -> Vec { 63 | // unimplemented!("unimplemented") 64 | // } 65 | // 66 | // } 67 | // 68 | // #[derive(Debug, Clone)] 69 | // struct ConnEm { 70 | // from: u64, 71 | // to: u64, 72 | // } 73 | // 74 | // #[derive(Debug, Clone)] 75 | // struct BlackHole {} 76 | // 77 | // impl StateMachine for BlackHole { 78 | // fn step(&mut
self, m: Message) -> Result<(), String> { 79 | // Ok(()) 80 | // } 81 | // 82 | // fn read_message(&mut self) -> Vec { 83 | // vec![] 84 | // } 85 | // } -------------------------------------------------------------------------------- /src/raftpb/.gitignore: -------------------------------------------------------------------------------- 1 | raft.rs -------------------------------------------------------------------------------- /src/raftpb/gogoproto/.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Rust template 3 | # Generated by Cargo 4 | # will have compiled files and executables 5 | /target/ 6 | 7 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 8 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 9 | Cargo.lock 10 | 11 | # These are backup files generated by rustfmt 12 | **/*.rs.bk 13 | raft.rs 14 | 15 | -------------------------------------------------------------------------------- /src/raftpb/gogoproto/gogo.proto: -------------------------------------------------------------------------------- 1 | // Protocol Buffers for Go with Gadgets 2 | // 3 | // Copyright (c) 2013, The GoGo Authors. All rights reserved. 4 | // http://github.com/gogo/protobuf 5 | // 6 | // Redistribution and use in source and binary forms, with or without 7 | // modification, are permitted provided that the following conditions are 8 | // met: 9 | // 10 | // * Redistributions of source code must retain the above copyright 11 | // notice, this list of conditions and the following disclaimer. 12 | // * Redistributions in binary form must reproduce the above 13 | // copyright notice, this list of conditions and the following disclaimer 14 | // in the documentation and/or other materials provided with the 15 | // distribution. 
16 | // 17 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | syntax = "proto2"; 30 | package gogoproto; 31 | 32 | import "google/protobuf/descriptor.proto"; 33 | 34 | option java_package = "com.google.protobuf"; 35 | option java_outer_classname = "GoGoProtos"; 36 | option go_package = "github.com/gogo/protobuf/gogoproto"; 37 | 38 | extend google.protobuf.EnumOptions { 39 | optional bool goproto_enum_prefix = 62001; 40 | optional bool goproto_enum_stringer = 62021; 41 | optional bool enum_stringer = 62022; 42 | optional string enum_customname = 62023; 43 | optional bool enumdecl = 62024; 44 | } 45 | 46 | extend google.protobuf.EnumValueOptions { 47 | optional string enumvalue_customname = 66001; 48 | } 49 | 50 | extend google.protobuf.FileOptions { 51 | optional bool goproto_getters_all = 63001; 52 | optional bool goproto_enum_prefix_all = 63002; 53 | optional bool goproto_stringer_all = 63003; 54 | optional bool verbose_equal_all = 63004; 55 | optional bool face_all = 63005; 56 | optional bool gostring_all = 63006; 57 | optional bool populate_all = 63007; 58 | optional bool stringer_all = 63008; 59 | optional bool onlyone_all = 63009; 60 | 61 | optional bool equal_all = 63013; 62 
| optional bool description_all = 63014; 63 | optional bool testgen_all = 63015; 64 | optional bool benchgen_all = 63016; 65 | optional bool marshaler_all = 63017; 66 | optional bool unmarshaler_all = 63018; 67 | optional bool stable_marshaler_all = 63019; 68 | 69 | optional bool sizer_all = 63020; 70 | 71 | optional bool goproto_enum_stringer_all = 63021; 72 | optional bool enum_stringer_all = 63022; 73 | 74 | optional bool unsafe_marshaler_all = 63023; 75 | optional bool unsafe_unmarshaler_all = 63024; 76 | 77 | optional bool goproto_extensions_map_all = 63025; 78 | optional bool goproto_unrecognized_all = 63026; 79 | optional bool gogoproto_import = 63027; 80 | optional bool protosizer_all = 63028; 81 | optional bool compare_all = 63029; 82 | optional bool typedecl_all = 63030; 83 | optional bool enumdecl_all = 63031; 84 | 85 | optional bool goproto_registration = 63032; 86 | optional bool messagename_all = 63033; 87 | 88 | optional bool goproto_sizecache_all = 63034; 89 | optional bool goproto_unkeyed_all = 63035; 90 | } 91 | 92 | extend google.protobuf.MessageOptions { 93 | optional bool goproto_getters = 64001; 94 | optional bool goproto_stringer = 64003; 95 | optional bool verbose_equal = 64004; 96 | optional bool face = 64005; 97 | optional bool gostring = 64006; 98 | optional bool populate = 64007; 99 | optional bool stringer = 67008; 100 | optional bool onlyone = 64009; 101 | 102 | optional bool equal = 64013; 103 | optional bool description = 64014; 104 | optional bool testgen = 64015; 105 | optional bool benchgen = 64016; 106 | optional bool marshaler = 64017; 107 | optional bool unmarshaler = 64018; 108 | optional bool stable_marshaler = 64019; 109 | 110 | optional bool sizer = 64020; 111 | 112 | optional bool unsafe_marshaler = 64023; 113 | optional bool unsafe_unmarshaler = 64024; 114 | 115 | optional bool goproto_extensions_map = 64025; 116 | optional bool goproto_unrecognized = 64026; 117 | 118 | optional bool protosizer = 64028; 119 | optional 
bool compare = 64029; 120 | 121 | optional bool typedecl = 64030; 122 | 123 | optional bool messagename = 64033; 124 | 125 | optional bool goproto_sizecache = 64034; 126 | optional bool goproto_unkeyed = 64035; 127 | } 128 | 129 | extend google.protobuf.FieldOptions { 130 | optional bool nullable = 65001; 131 | optional bool embed = 65002; 132 | optional string customtype = 65003; 133 | optional string customname = 65004; 134 | optional string jsontag = 65005; 135 | optional string moretags = 65006; 136 | optional string casttype = 65007; 137 | optional string castkey = 65008; 138 | optional string castvalue = 65009; 139 | 140 | optional bool stdtime = 65010; 141 | optional bool stdduration = 65011; 142 | optional bool wktpointer = 65012; 143 | 144 | } 145 | -------------------------------------------------------------------------------- /src/raftpb/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::raftpb::raft::ConfChangeTransition::{ 2 | ConfChangeTransitionAuto, ConfChangeTransitionJointExplicit, ConfChangeTransitionJointImplicit, 3 | }; 4 | use crate::raftpb::raft::ConfChangeType::{ 5 | ConfChangeAddLearnerNode, ConfChangeAddNode, ConfChangeRemoveNode, ConfChangeUpdateNode, 6 | }; 7 | use crate::raftpb::raft::EntryType::{EntryConfChange, EntryConfChangeV2}; 8 | use crate::raftpb::raft::{ConfChange, ConfChangeSingle, ConfChangeV2, ConfState, Entry}; 9 | use crate::util::vote_resp_msg_type; 10 | use bytes::{Buf, Bytes}; 11 | use nom::lib::std::borrow::Cow; 12 | use nom::lib::std::fmt::{Display, Formatter}; 13 | use protobuf::{Message, RepeatedField}; 14 | 15 | pub mod raft; 16 | // pub mod gogoproto; 17 | 18 | // returns a nil error if the inputs describe the same configuration. 19 | // On mismatch, returns a descriptive error showing the difference. 
20 | pub fn equivalent(cs1: &ConfState, cs2: &ConfState) -> Result<(), String> { 21 | let orig1 = cs1.clone(); 22 | let orig2 = cs2.clone(); 23 | let mut cs1 = cs1.clone(); 24 | let mut cs2 = cs2.clone(); 25 | cs1.voters.sort(); 26 | cs1.learners.sort(); 27 | cs1.voters_outgoing.sort(); 28 | cs1.learners_next.sort(); 29 | if !cs1.get_auto_leave() { 30 | cs1.set_auto_leave(false); 31 | } 32 | 33 | cs2.voters.sort(); 34 | cs2.learners.sort(); 35 | cs2.voters_outgoing.sort(); 36 | cs2.learners_next.sort(); 37 | if !cs2.get_auto_leave() { 38 | cs2.set_auto_leave(false); 39 | } 40 | 41 | if cs1 != cs2 { 42 | info!("cs1: {:?}\ncs2:{:?}", cs1, cs2); 43 | return Err(format!( 44 | "ConfStates not equivalent after sorting:{:?}\n{:?}\nInputs were:\n{:?}\n{:?}", 45 | cs1, cs2, orig1, orig2 46 | )); 47 | } 48 | 49 | Ok(()) 50 | } 51 | 52 | // ConfChangeI abstracts over ConfChangeV2 and (legacy) ConfChange to allow 53 | // treating them in a unified manner. 54 | pub trait ConfChangeI: Display + protobuf::Message { 55 | fn as_v2(&self) -> ConfChangeV2; 56 | fn as_v1(&self) -> Option<&ConfChange>; 57 | fn to_entry(&self) -> Entry; 58 | } 59 | 60 | impl ConfChangeI for ConfChange { 61 | #[inline] 62 | fn as_v2(&self) -> ConfChangeV2 { 63 | let mut cc2 = ConfChangeV2::new(); 64 | cc2.context = self.context.clone(); 65 | let mut change = ConfChangeSingle::new(); 66 | change.set_field_type(self.get_field_type()); 67 | change.set_node_id(self.get_node_id()); 68 | cc2.set_changes(RepeatedField::from(vec![change])); 69 | cc2 70 | } 71 | 72 | #[inline] 73 | fn as_v1(&self) -> Option<&ConfChange> { 74 | Some(&self) 75 | } 76 | 77 | #[inline] 78 | fn to_entry(&self) -> Entry { 79 | let data = self.write_to_bytes().unwrap(); 80 | let mut entry = Entry::new(); 81 | entry.set_Data(Bytes::from(data)); 82 | entry.set_Type(EntryConfChange); 83 | entry 84 | } 85 | } 86 | 87 | impl Display for ConfChange { 88 | fn fmt(&self, f: &mut Formatter<'_>) -> ::std::fmt::Result { 89 | write!(f, "{}", self) 
90 | } 91 | } 92 | 93 | impl ConfChangeI for ConfChangeV2 { 94 | #[inline] 95 | fn as_v2(&self) -> ConfChangeV2 { 96 | self.clone() 97 | } 98 | 99 | #[inline] 100 | fn as_v1(&self) -> Option<&ConfChange> { 101 | None 102 | } 103 | 104 | #[inline] 105 | fn to_entry(&self) -> Entry { 106 | let data = self.write_to_bytes().unwrap(); 107 | let mut entry = Entry::new(); 108 | entry.set_Data(Bytes::from(data)); 109 | entry.set_Type(EntryConfChangeV2); 110 | entry 111 | } 112 | } 113 | 114 | impl Display for ConfChangeV2 { 115 | fn fmt(&self, f: &mut Formatter<'_>) -> ::std::fmt::Result { 116 | write!(f, "{}", self) 117 | } 118 | } 119 | 120 | pub trait ExtendConfChange { 121 | fn leave_joint(&self) -> bool; 122 | fn enter_joint(&self) -> (bool, bool); 123 | } 124 | 125 | impl ExtendConfChange for ConfChangeV2 { 126 | fn leave_joint(&self) -> bool { 127 | let mut cp = self.clone(); 128 | cp.clear_context(); 129 | let empty = ConfChangeV2::default(); 130 | cp.eq(&empty) 131 | } 132 | // EnterJoint returns two bools. The second bool is true if and only if this 133 | // config change will use Joint Consensus, which is the case if it contains more 134 | // than one change or if the use of Joint Consensus was requested explicitly. 135 | // The first bool can only be true if second one is, and indicates whether the 136 | // Joint State will be left automatically. 137 | fn enter_joint(&self) -> (bool, bool) { 138 | // NB: in theory, more config changes could qualify for the "simple" 139 | // protocol but it depends on the config on top of which the changes apply. 140 | // For example, adding two learners is not OK if both nodes are part of the 141 | // base config (i.e. two voters are turned into learners in the process of 142 | // applying the conf change). In practice, these distinctions should not 143 | // matter, so we keep it simple and use Joint Consensus liberally. 
144 | if self.get_transition() != ConfChangeTransitionAuto || self.changes.len() > 1 { 145 | // Use Joint Consensus. 146 | let mut auto_leave = false; 147 | match self.get_transition() { 148 | ConfChangeTransitionAuto | ConfChangeTransitionJointImplicit => auto_leave = true, 149 | ConfChangeTransitionJointExplicit => {} 150 | } 151 | return (auto_leave, true); 152 | } 153 | (false, false) 154 | } 155 | } 156 | 157 | pub fn cmp_conf_state(a: &ConfState, b: &ConfState) -> bool { 158 | let mut a = a.clone(); 159 | let mut b = b.clone(); 160 | a.voters.sort(); 161 | b.voters.sort(); 162 | a.learners.sort(); 163 | b.learners.sort(); 164 | a.voters_outgoing.sort(); 165 | b.voters_outgoing.sort(); 166 | a.learners_next.sort(); 167 | b.learners_next.sort(); 168 | 169 | a.get_auto_leave() == b.get_auto_leave() 170 | && a.get_voters() == b.get_voters() 171 | && a.get_voters_outgoing() == b.get_voters_outgoing() 172 | && a.get_learners() == b.get_learners() 173 | } 174 | 175 | pub fn cmp_config_change_v2(a: &ConfChangeV2, b: &ConfChangeV2) -> bool { 176 | a.get_transition() == b.get_transition() 177 | && a.get_changes() == b.get_changes() 178 | && a.get_context() == b.get_context() 179 | } 180 | 181 | pub fn entry_to_conf_changei(entry: &Entry) -> Option> { 182 | if entry.get_Type() == EntryConfChange { 183 | let mut cc = ConfChange::default(); 184 | assert!(cc.merge_from_bytes(entry.get_Data()).is_ok()); 185 | return Some(Box::new(cc)); 186 | } else if entry.get_Type() == EntryConfChangeV2 { 187 | let mut cc = ConfChangeV2::default(); 188 | assert!(cc.merge_from_bytes(entry.get_Data()).is_ok()); 189 | return Some(Box::new(cc)); 190 | } 191 | None 192 | } 193 | 194 | // ConfChangesFromString parses a Space-delimited sequence of operations into a 195 | // slice of ConfChangeSingle. The supported operations are: 196 | // - vn: make n a voter, 197 | // - ln: make n a learner, 198 | // - rn: remove n, and 199 | // - un: update n. 
/// Parses whitespace-separated tokens of the form `<op><id>` into changes.
///
/// `<op>` is one of `v`, `l`, `r`, `u` and `<id>` is a decimal node id.
///
/// # Errors
///
/// Returns `Err` naming the offending token when a token is shorter than two
/// characters, starts with an unknown operation, or has an id that is not a
/// valid `u64`.
pub fn conf_changes_from_string(s: &str) -> Result<Vec<ConfChangeSingle>, String> {
    s.split_ascii_whitespace()
        .map(|tok| {
            // Peel off the operation character; the remainder is the node id.
            // (The previous version counted the characters first, which
            // exhausted the iterator and made every later read fail.)
            let mut chars = tok.chars();
            let change_type = match chars.next() {
                Some('v') => ConfChangeAddNode,
                Some('l') => ConfChangeAddLearnerNode,
                Some('r') => ConfChangeRemoveNode,
                Some('u') => ConfChangeUpdateNode,
                _ => return Err(format!("unknown token {}", tok)),
            };
            let id = chars.as_str();
            if id.is_empty() {
                return Err(format!("unknown token {}", tok));
            }
            let node_id: u64 = id
                .parse()
                .map_err(|err| format!("invalid node id in token {}: {}", tok, err))?;

            let mut cc = ConfChangeSingle::new();
            cc.set_field_type(change_type);
            cc.set_node_id(node_id);
            Ok(cc)
        })
        .collect()
}

#[cfg(test)]
mod tests {
    use crate::raftpb::raft::ConfChangeV2;
    use bytes::Bytes;
    use protobuf::Message;

    #[test]
    fn it_works() {
        let mut cc = ConfChangeV2::new();
        cc.set_context(Bytes::from("manual"));
        let data = cc.write_to_bytes().unwrap();
        let mut expect = ConfChangeV2::default();
        expect.merge_from_bytes(data.as_slice()).unwrap();
        assert_eq!(expect.get_context(), "manual".as_bytes());
    }
}
-------------------------------------------------------------------------------- /src/raftpb/raft.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | package raftpb; 3 | 4 | enum EntryType { 5 | EntryNormal = 0; 6 | EntryConfChange = 1; // corresponds to pb.ConfChange 7 | EntryConfChangeV2 = 2; // corresponds to pb.ConfChangeV2 8 | } 9 | 10 | message Entry { 11 | uint64 Term = 2 ; 12 | uint64 Index = 3 ; 13 | EntryType Type = 1 ; 14 | bytes Data = 4; 15 | } 16 | 17 | message SnapshotMetadata { 18 | ConfState conf_state
= 1; 19 | uint64 index = 2; 20 | uint64 term = 3 ; 21 | } 22 | 23 | message Snapshot { 24 | bytes data = 1; 25 | SnapshotMetadata metadata = 2; 26 | } 27 | 28 | enum MessageType { 29 | MsgHup = 0; 30 | MsgBeat = 1; 31 | MsgProp = 2; 32 | MsgApp = 3; 33 | MsgAppResp = 4; 34 | MsgVote = 5; 35 | MsgVoteResp = 6; 36 | MsgSnap = 7; 37 | MsgHeartbeat = 8; 38 | MsgHeartbeatResp = 9; 39 | MsgUnreachable = 10; 40 | MsgSnapStatus = 11; 41 | MsgCheckQuorum = 12; 42 | MsgTransferLeader = 13; 43 | MsgTimeoutNow = 14; 44 | MsgReadIndex = 15; 45 | MsgReadIndexResp = 16; 46 | MsgPreVote = 17; 47 | MsgPreVoteResp = 18; 48 | } 49 | 50 | message Message { 51 | MessageType type = 1 ; 52 | uint64 to = 2 ; 53 | uint64 from = 3 ; 54 | uint64 term = 4 ; 55 | uint64 logTerm = 5 ; 56 | uint64 index = 6 ; 57 | repeated Entry entries = 7 ; 58 | uint64 commit = 8 ; 59 | Snapshot snapshot = 9 ; 60 | bool reject = 10 ; 61 | uint64 rejectHint = 11 ; 62 | bytes context = 12; 63 | } 64 | 65 | message HardState { 66 | uint64 term = 1; 67 | uint64 vote = 2; 68 | uint64 commit = 3; 69 | } 70 | 71 | // ConfChangeTransition specifies the behavior of a configuration change with 72 | // respect to joint consensus. 73 | enum ConfChangeTransition { 74 | // Automatically use the simple protocol if possible, otherwise fall back 75 | // to ConfChangeJointImplicit. Most applications will want to use this. 76 | ConfChangeTransitionAuto = 0; 77 | // Use joint consensus unconditionally, and transition out of them 78 | // automatically (by proposing a zero configuration change). 79 | // 80 | // This option is suitable for applications that want to minimize the time 81 | // spent in the joint configuration and do not store the joint configuration 82 | // in the state machine (outside of InitialState). 83 | ConfChangeTransitionJointImplicit = 1; 84 | // Use joint consensus and remain in the joint configuration until the 85 | // application proposes a no-op configuration change. 
This is suitable for 86 | // applications that want to explicitly control the transitions, for example 87 | // to use a custom payload (via the Context field). 88 | ConfChangeTransitionJointExplicit = 2; 89 | } 90 | 91 | message ConfState { 92 | // The voters in the incoming config. (If the configuration is not joint, 93 | // then the outgoing config is empty). 94 | repeated uint64 voters = 1; 95 | // The learners in the incoming config. 96 | repeated uint64 learners = 2; 97 | // The voters in the outgoing config. 98 | repeated uint64 voters_outgoing = 3; 99 | // The nodes that will become learners when the outgoing config is removed. 100 | // These nodes are necessarily currently in nodes_joint (or they would have 101 | // been added to the incoming config right away). 102 | repeated uint64 learners_next = 4; 103 | // If set, the config is joint and Raft will automatically transition into 104 | // the final config (i.e. remove the outgoing config) when this is safe. 105 | bool auto_leave = 5; 106 | } 107 | 108 | enum ConfChangeType { 109 | ConfChangeAddNode = 0; 110 | ConfChangeRemoveNode = 1; 111 | ConfChangeUpdateNode = 2; 112 | ConfChangeAddLearnerNode = 3; 113 | } 114 | 115 | message ConfChange { 116 | ConfChangeType type = 2; 117 | uint64 node_id = 3 ; 118 | bytes context = 4; 119 | 120 | // NB: this is used only by etcd to thread through a unique identifier. 121 | // Ideally it should really use the Context instead. No counterpart to 122 | // this field exists in ConfChangeV2. 123 | uint64 id = 1; 124 | } 125 | 126 | // ConfChangeSingle is an individual configuration change operation. Multiple 127 | // such operations can be carried out atomically via a ConfChangeV2. 128 | message ConfChangeSingle { 129 | ConfChangeType type = 1; 130 | uint64 node_id = 2 ; 131 | } 132 | 133 | // ConfChangeV2 messages initiate configuration changes. 
They support both the 134 | // simple "one at a time" membership change protocol and full Joint Consensus 135 | // allowing for arbitrary changes in membership. 136 | // 137 | // The supplied context is treated as an opaque payload and can be used to 138 | // attach an action on the state machine to the application of the config change 139 | // proposal. Note that contrary to Joint Consensus as outlined in the Raft 140 | // paper[1], configuration changes become active when they are *applied* to the 141 | // state machine (not when they are appended to the log). 142 | // 143 | // The simple protocol can be used whenever only a single change is made. 144 | // 145 | // Non-simple changes require the use of Joint Consensus, for which two 146 | // configuration changes are run. The first configuration change specifies the 147 | // desired changes and transitions the Raft group into the joint configuration, 148 | // in which quorum requires a majority of both the pre-changes and post-changes 149 | // configuration. Joint Consensus avoids entering fragile intermediate 150 | // configurations that could compromise survivability. For example, without the 151 | // use of Joint Consensus and running across three availability zones with a 152 | // replication factor of three, it is not possible to replace a voter without 153 | // entering an intermediate configuration that does not survive the outage of 154 | // one availability zone. 155 | // 156 | // The provided ConfChangeTransition specifies how (and whether) Joint Consensus 157 | // is used, and assigns the task of leaving the joint configuration either to 158 | // Raft or the application. Leaving the joint configuration is accomplished by 159 | // proposing a ConfChangeV2 with only and optionally the Context field 160 | // populated. 
161 | // 162 | // For details on Raft membership changes, see: 163 | // 164 | // [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf 165 | message ConfChangeV2 { 166 | ConfChangeTransition transition = 1 ; 167 | repeated ConfChangeSingle changes = 2; 168 | bytes context = 3; 169 | } 170 | -------------------------------------------------------------------------------- /src/read_only.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2016 The etcd Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | use crate::raft::ReadOnlyOption; 16 | use crate::raftpb::raft::{Message, MessageType}; 17 | use std::borrow::Cow; 18 | use std::collections::HashMap; 19 | 20 | // ReadState provides state for read only query. 21 | // It's caller's responsibility to call ReadIndex first before getting 22 | // this state from ready. it's also caller's duty to differentiate if this 23 | // state is what it requests through request_ctx, eg. 
given a unique id as
// request_ctx
#[derive(Default, Debug, Clone, Eq, PartialEq)]
pub struct ReadState {
    pub index: u64,
    pub request_ctx: Vec<u8>,
}

/// Bookkeeping for one pending read-only request.
#[derive(Default, Debug, Clone)]
pub(crate) struct ReadIndexStatus {
    /// The original read-only request message.
    pub req: Message,
    /// Commit index recorded when the request was added.
    pub index: u64,
    // NB: this never records 'false', but it's more convenient to use this
    // instead of a plain set of node ids due to the API of quorum.VoteResult. If
    // this becomes performance sensitive enough (doubtful), quorum.VoteResult
    // can change to an API that is closer to that of CommittedIndex.
    pub acks: HashMap<u64, bool>,
}

/// Tracks pending read-only requests, keyed by request context, together with
/// the order in which they arrived.
#[derive(Clone)]
pub struct ReadOnly {
    pub(crate) option: ReadOnlyOption,
    pub(crate) pending_read_index: HashMap<Vec<u8>, ReadIndexStatus>,
    pub(crate) read_index_queue: Vec<Vec<u8>>,
}

impl ReadOnly {
    pub(crate) fn new(option: ReadOnlyOption) -> Self {
        ReadOnly {
            option,
            pending_read_index: Default::default(),
            read_index_queue: vec![],
        }
    }

    // add_request adds a record only request into readonly struct.
    // `index` is the commit index of the raft state machine when it received
    // the read only request.
    // `m` is the original read only request message from the local or remote node.
    //
    // A request whose context is already pending is ignored. Queueing it a
    // second time would leave a context in the queue with no matching map
    // entry once the first one is advanced, and `advance` panics on that.
    //
    // Panics if `m` carries no entries.
    pub(crate) fn add_request(&mut self, index: u64, m: Message) {
        let ctx = m.get_entries()[0].get_Data().to_vec();
        if self.pending_read_index.contains_key(&ctx) {
            return;
        }
        let status = ReadIndexStatus {
            req: m,
            index,
            acks: Default::default(),
        };
        self.pending_read_index.insert(ctx.clone(), status);
        self.read_index_queue.push(ctx);
    }

    // recv_ack notifies the read_only struct that the raft state machine received
    // an acknowledgment of the heartbeat that attached with the read only request
    // context. Returns the acks recorded so far for that context, or `None` when
    // the context is not pending.
    pub(crate) fn recv_ack(&mut self, id: u64, context: Vec<u8>) -> Option<&HashMap<u64, bool>> {
        let status = self.pending_read_index.get_mut(&context)?;
        status.acks.insert(id, true);
        Some(&status.acks)
    }

    // Advances the read only request queue kept by the read_only struct.
    // It dequeues the requests until it finds the read only request that has
    // the same context as the given `m`, and returns the dequeued statuses in
    // queue order. Returns an empty vector (and changes nothing) when no queued
    // request has that context.
    //
    // Panics if a dequeued context has no entry in the pending map.
    pub(crate) fn advance(&mut self, m: Message) -> Vec<ReadIndexStatus> {
        let ctx = m.get_context();
        let end = match self
            .read_index_queue
            .iter()
            .position(|queued| queued.as_slice() == ctx)
        {
            Some(pos) => pos + 1,
            None => return vec![],
        };

        // Borrow the map separately so the closure does not capture `self`
        // while the queue is being drained.
        let pending = &mut self.pending_read_index;
        self.read_index_queue
            .drain(..end)
            .map(|queued| {
                pending
                    .remove(&queued)
                    .expect("cannot find corresponding read state from pending map")
            })
            .collect()
    }

    // last_pending_request returns the context of the last pending read only
    // request in readonly struct
    pub(crate) fn last_pending_request(&self) -> Option<Vec<u8>> {
        self.read_index_queue.last().cloned()
    }
}
-------------------------------------------------------------------------------- /src/status.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The etcd Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use crate::node::SoftState;
use crate::raft::{Raft, StateType};
use crate::raftpb::raft::HardState;
use crate::storage::Storage;
use crate::tracker::progress::ProgressMap;
use crate::tracker::Config;
use std::fmt::{Display, Formatter};

/// Contains information about this Raft peer and its view of the system.
/// The Progress is only populated on the leader.
#[derive(Clone, Debug)]
pub struct Status {
    pub(crate) base_status: BaseStatus,
    pub config: Config,
    pub progress: ProgressMap,
}

impl Display for Status {
    fn fmt(&self, f: &mut Formatter<'_>) -> ::std::fmt::Result {
        write!(f, "{:?}", self)
    }
}

/// Contains basic information about the Raft peer. It does not allocate
#[derive(Clone, Debug)]
pub struct BaseStatus {
    id: u64,
    hard_state: HardState,
    soft_state: SoftState,
    applied: u64,
    lead_transferee: u64,
}

// NOTE(review): the generic parameter list was lost in this copy of the file;
// `S: Storage` is reconstructed from the `Storage` import — confirm against
// the definition of `Raft`.
impl<S: Storage> From<&Raft<S>> for BaseStatus {
    fn from(raft: &Raft<S>) -> Self {
        BaseStatus {
            id: raft.id,
            hard_state: raft.hard_state(),
            soft_state: raft.soft_state(),
            applied: raft.raft_log.applied,
            lead_transferee: raft.lead_transferee,
        }
    }
}

impl<S: Storage> From<&Raft<S>> for Status {
    fn from(raft: &Raft<S>) -> Self {
        let base_status = BaseStatus::from(raft);
        // Progress is only reported by the leader; everyone else gets the default.
        let progress = if base_status.soft_state.raft_state == StateType::Leader {
            raft.prs.progress.clone()
        } else {
            Default::default()
        };
        Status {
            base_status,
            config: raft.prs.config.clone(),
            progress,
        }
    }
}
--------------------------------------------------------------------------------
/src/tests_util.rs:
--------------------------------------------------------------------------------
use env_logger::Env;
use std::io::Write;

/// Installs the test logger (default filter `trace`, overridable through the
/// standard env_logger environment variable).
#[cfg(any(test))]
pub(crate) fn try_init_log() {
    // env_logger::try_init_from_env(Env::new().default_filter_or("info"));
    let env = env_logger::Env::default().filter_or(env_logger::DEFAULT_FILTER_ENV, "trace");
    // The result is deliberately ignored: `try_init` fails once a logger is
    // already installed, and a second call must not be an error.
    let _ = env_logger::Builder::from_env(env)
        .format(|buf, record| {
            writeln!(
                buf,
                "{} {} [{}:{}], {}",
                chrono::Local::now().format("%Y-%m-%d %H:%M:%S"),
                record.level(),
                record.file().unwrap_or(""),
                // A record may carry no line number; do not panic inside the logger.
                record.line().unwrap_or(0),
                &record.args()
            )
        })
        .try_init();
}

24 | #[cfg(any(test))] 25 | pub(crate) mod mock { 26 | use std::collections::HashMap; 27 | use bytes::Bytes; 28 | use protobuf::RepeatedField; 29 | use crate::raft::{Config, NO_LIMIT, Raft, ReadOnlyOption}; 30 | use crate::raft_log::RaftLog; 31 | use crate::raftpb::raft::{Entry, Message, MessageType,
Snapshot}; 32 | use crate::rawnode::{RawCoreNode, SafeRawNode}; 33 | use crate::storage::{SafeMemStorage, Storage}; 34 | 35 | pub fn read_message(raft: &mut Raft) -> Vec { 36 | let msg = raft.msgs.clone(); 37 | raft.msgs.clear(); 38 | msg 39 | } 40 | 41 | pub struct MocksEnts(Entry); 42 | 43 | impl Into for MocksEnts { 44 | fn into(self) -> Entry { 45 | self.0 46 | } 47 | } 48 | 49 | impl Into> for MocksEnts { 50 | fn into(self) -> RepeatedField { 51 | RepeatedField::from_vec(vec![self.0]) 52 | } 53 | } 54 | 55 | impl From<&str> for MocksEnts { 56 | fn from(buf: &str) -> Self { 57 | let v = Vec::from(buf); 58 | let mut entry = Entry::new(); 59 | entry.set_Data(Bytes::from(v)); 60 | MocksEnts(entry) 61 | } 62 | } 63 | 64 | pub struct MockEntry(Entry); 65 | 66 | impl MockEntry { 67 | pub fn set_data(mut self, buf: Vec) -> MockEntry { 68 | self.0.set_Data(Bytes::from(buf)); 69 | self 70 | } 71 | 72 | pub fn set_index(mut self, index: u64) -> MockEntry { 73 | self.0.set_Index(index); 74 | self 75 | } 76 | } 77 | 78 | impl Into for MockEntry { 79 | fn into(self) -> Entry { 80 | self.0 81 | } 82 | } 83 | 84 | impl From> for MockEntry { 85 | fn from(v: Vec) -> Self { 86 | let mut entry = Entry::new(); 87 | entry.set_Data(Bytes::from(v)); 88 | MockEntry(entry) 89 | } 90 | } 91 | 92 | impl From<&str> for MockEntry { 93 | fn from(buf: &str) -> Self { 94 | let data = Vec::from(buf); 95 | let mut entry = Entry::new(); 96 | entry.set_Data(Bytes::from(data)); 97 | MockEntry(entry) 98 | } 99 | } 100 | 101 | pub fn new_entry(index: u64, term: u64) -> Entry { 102 | let mut entry = Entry::new(); 103 | entry.set_Index(index); 104 | entry.set_Term(term); 105 | entry 106 | } 107 | 108 | pub fn new_entry_set(set: Vec<(u64, u64)>) -> Vec { 109 | set.iter() 110 | .map(|(index, term)| new_entry(*index, *term)) 111 | .collect() 112 | } 113 | 114 | pub fn new_entry_set2(set: Vec<(u64, u64, &str)>) -> Vec { 115 | set.iter().map(|(index, term, data)| { 116 | let mut entry = new_entry(*index, 
*term); 117 | let data = Vec::from(*data); 118 | entry.set_Data(Bytes::from(data)); 119 | entry 120 | }).collect() 121 | } 122 | 123 | pub fn new_empty_entry_set() -> Vec { 124 | Vec::new() 125 | } 126 | 127 | pub fn new_snapshot(index: u64, term: u64) -> Snapshot { 128 | let mut snapshot = Snapshot::new(); 129 | snapshot.mut_metadata().set_index(index); 130 | snapshot.mut_metadata().set_term(term); 131 | snapshot 132 | } 133 | 134 | pub fn new_memory() -> SafeMemStorage { 135 | let storage = SafeMemStorage::new(); 136 | storage 137 | } 138 | 139 | pub fn new_log() -> RaftLog { 140 | RaftLog::new(new_memory()) 141 | } 142 | 143 | pub fn new_log_with_storage(storage: T) -> RaftLog { 144 | RaftLog::new(storage) 145 | } 146 | 147 | pub fn new_test_raw_node( 148 | id: u64, 149 | peers: Vec, 150 | election_tick: u64, 151 | heartbeat_tick: u64, 152 | s: SafeMemStorage, 153 | ) -> SafeRawNode { 154 | SafeRawNode::new2(new_test_conf(id, peers, election_tick, heartbeat_tick), s) 155 | } 156 | 157 | pub fn new_test_core_node( 158 | id: u64, 159 | peers: Vec, 160 | election_tick: u64, 161 | heartbeat_tick: u64, 162 | s: SafeMemStorage, 163 | ) -> RawCoreNode { 164 | RawCoreNode::new(new_test_conf(id, peers, election_tick, heartbeat_tick), s) 165 | } 166 | 167 | pub fn new_test_inner_node( 168 | id: u64, 169 | peers: Vec, 170 | election_tick: u64, 171 | heartbeat_tick: u64, 172 | s: SafeMemStorage, 173 | ) -> Raft { 174 | Raft::new(new_test_conf(id, peers, election_tick, heartbeat_tick), s) 175 | } 176 | 177 | pub fn new_test_conf(id: u64, peers: Vec, election_tick: u64, heartbeat_tick: u64) -> Config { 178 | Config { 179 | id, 180 | peers, 181 | learners: vec![], 182 | election_tick, 183 | heartbeat_tick, 184 | applied: 0, 185 | max_size_per_msg: NO_LIMIT, 186 | max_committed_size_per_ready: 0, 187 | max_uncommitted_entries_size: 0, 188 | max_inflight_msgs: 1 << 3, 189 | check_quorum: false, 190 | pre_vote: false, 191 | read_only_option: ReadOnlyOption::ReadOnlySafe, 192 | 
disable_proposal_forwarding: false, 193 | } 194 | } 195 | 196 | 197 | // Returns the appliable entries and updates the applied index 198 | fn next_ents(mut raft: Raft, s: &mut SafeMemStorage) -> Vec { 199 | // transfer all unstable entries to "stable" storage. 200 | s.wl().append(raft.raft_log.unstable_entries().to_vec()); 201 | raft.raft_log.stable_to(raft.raft_log.last_index(), raft.raft_log.last_term()); 202 | 203 | let ents = raft.raft_log.next_ents(); 204 | raft.raft_log.applied_to(raft.raft_log.committed); 205 | return ents; 206 | } 207 | 208 | fn must_append_entry(raft: &mut Raft, mut ents: Vec) where S: Storage { 209 | assert!(raft.append_entry(&mut ents), "entry unexpectedly dropped"); 210 | } 211 | 212 | trait StateMachine { 213 | fn step(&mut self, m: Message) -> Result<(), String>; 214 | fn read_message(&mut self) -> Vec; 215 | } 216 | 217 | struct NetWork { 218 | peers: HashMap, 219 | storage: HashMap, 220 | dropm: HashMap, 221 | ignorem: HashMap, 222 | // `msg_hook` is called for each message sent. 
It may inspect the 223 | // message and return true to send it or false to drop it 224 | msg_hook: Box bool>, 225 | } 226 | 227 | impl NetWork { 228 | pub fn send(&mut self, msgs: Vec) { 229 | unimplemented!("unimplemented") 230 | } 231 | 232 | pub fn drop(&mut self, from: u64, to: u64, perc: f64) { 233 | unimplemented!("unimplemented") 234 | } 235 | 236 | pub fn cut(&mut self, one: u64, other: u64) { 237 | unimplemented!("unimplemented") 238 | } 239 | 240 | pub fn isolated(&mut self, id: u64) { 241 | unimplemented!("unimplemented") 242 | } 243 | 244 | pub fn ignore(&mut self, t: MessageType) { 245 | unimplemented!("unimplemented") 246 | } 247 | 248 | pub fn recover(&mut self) { 249 | self.dropm.clear(); 250 | self.ignorem.clear(); 251 | } 252 | 253 | pub fn filter(&mut self, msgs: Vec) -> Vec { 254 | unimplemented!("unimplemented") 255 | } 256 | } 257 | 258 | #[derive(Debug, Clone)] 259 | struct ConnEm { 260 | from: u64, 261 | to: u64, 262 | } 263 | 264 | #[derive(Debug, Clone)] 265 | struct BlackHole {} 266 | 267 | impl StateMachine for BlackHole { 268 | fn step(&mut self, m: Message) -> Result<(), String> { 269 | Ok(()) 270 | } 271 | 272 | fn read_message(&mut self) -> Vec { 273 | vec![] 274 | } 275 | } 276 | 277 | 278 | pub fn ids_by_size(size: u64) -> Vec { 279 | (1..=size).collect::>() 280 | } 281 | } -------------------------------------------------------------------------------- /src/tracker/inflights.rs: -------------------------------------------------------------------------------- 1 | use nom::lib::std::fmt::{Display, Formatter}; 2 | 3 | // Inflights limits the number of MsgApp (represented by the largest index 4 | // contained within) sent to followers but not yet acknowledged by them. Callers 5 | // use Full() to check whether more messages can be sent, call Add() whenever 6 | // they are sending a new append, and release "quota" via free_le() whenever an 7 | // ack is received.
#[derive(Default, PartialEq, Clone, Debug)]
pub struct Inflights {
    // the starting index in the buffer
    start: usize,
    // number of inflights in the buffer
    count: usize,
    // the size of the buffer
    size: usize,
    // buffer contains the index of the last entry
    // inside one message
    buffer: Vec<u64>,
}

impl Display for Inflights {
    fn fmt(&self, f: &mut Formatter<'_>) -> ::std::fmt::Result {
        write!(
            f,
            "start:{}, count:{}, size:{}, is_full: {}, buffer:{:?}",
            self.start,
            self.count,
            self.size,
            self.full(),
            self.buffer
        )
    }
}

impl Inflights {
    /// Creates an empty window that can hold up to `size` in-flight messages.
    /// The backing buffer is allocated to `size` up front.
    pub fn new(size: u64) -> Self {
        Inflights {
            start: 0,
            count: 0,
            size: size as usize,
            buffer: vec![0; size as usize],
        }
    }

    // Add notifies the Inflights that a new message with the given index is being
    // dispatched. Full() must be called prior to Add() to verify that there is room
    // for one more message, and consecutive calls to Add() must provide a
    // monotonic sequence of indexes.
    //
    // Panics if the window is already full.
    pub fn add(&mut self, inflight: u64) {
        if self.full() {
            panic!("cannot add into a Full inflights");
        }
        // Slot right after the last in-flight entry, wrapping around the ring.
        let mut next = self.start + self.count;
        if next >= self.size {
            next -= self.size;
        }
        if next >= self.buffer.len() {
            self.grow();
        }
        self.buffer[next] = inflight;
        self.count += 1;
        if self.full() {
            info!("[has full {}]", self.count());
        }
    }

    /// Grows the inflight buffer by doubling, up to `inflights.size`. We grow on
    /// demand instead of preallocating to `inflights.size` to handle systems which
    /// have thousands of Raft groups per process.
    pub fn grow(&mut self) {
        let doubled = self.buffer.len() * 2;
        let new_size = if doubled == 0 {
            1
        } else if doubled > self.size {
            self.size
        } else {
            doubled
        };
        // The new slots must exist, not merely be reserved: `add` indexes into
        // them right after growing. Existing entries keep their positions.
        if new_size > self.buffer.len() {
            self.buffer.resize(new_size, 0);
        }
    }

    /// Frees the inflights smaller or equal to the given `to` flight.
    pub fn free_le(&mut self, to: u64) {
        if self.count == 0 || to < self.buffer[self.start] {
            // out of the left side of the window
            return;
        }

        let mut idx = self.start;
        let mut freed = 0;
        while freed < self.count {
            if to < self.buffer[idx] {
                // found the first large inflight
                break;
            }
            // advance around the ring
            idx += 1;
            if idx >= self.size {
                idx -= self.size;
            }
            freed += 1;
        }
        // free `freed` inflights and set new start index
        self.count -= freed;
        self.start = idx;
        if self.count == 0 {
            // inflights is empty, reset the start index so that we don't grow the
            // buffer unnecessarily.
            self.start = 0;
        }
    }

    // FreeFirstOne releases the first inflight. This is a no-op if nothing is inflight.
    pub fn free_first_one(&mut self) {
        // Check for emptiness before touching the buffer: it may hold no
        // slots at all, and indexing it would panic.
        if self.count == 0 {
            return;
        }
        self.free_le(self.buffer[self.start]);
    }

    /// Returns true when no more messages can be added.
    pub fn full(&self) -> bool {
        self.count == self.size
    }

    /// Returns the number of in-flight messages.
    pub fn count(&self) -> usize {
        self.count
    }

    /// Forgets all in-flight messages; the buffer itself is kept.
    pub(crate) fn reset(&mut self) {
        self.count = 0;
        self.start = 0;
    }
}

133 | #[cfg(test)] 134 | mod tests { 135 | use crate::tracker::inflights::Inflights; 136 | 137 | #[test] 138 | fn it_inflights_add() { 139 | let mut inf = Inflights { 140 | start: 0, 141 | count: 0, 142 | size: 10, 143 | buffer: vec![0].repeat(10), 144 | }; 145 | (0..5).for_each(|i| inf.add(i)); 146 | let want_inf = Inflights { 147 | start: 0, 148 | count: 5, 149 | size: 10, 150 | buffer: vec![0, 1, 2, 3, 4, 0, 0, 0, 0, 0], 151 | }; 152 | assert_eq!(inf, want_inf); 153 | 154 | (5..10).for_each(|i| inf.add(i)); 155 | let want_inf = Inflights { 156 | start: 0, 157 | count: 10, 158 | size: 10, 159 | buffer: vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 160 | }; 161 | assert_eq!(inf, want_inf); 162 | 163 | // rotating case 164 | let mut in2 = Inflights { 165 | start: 5, 166 | count: 0, 167 | size: 10, 168 | buffer: vec![0].repeat(10), 169 | }; 170 | (0..5).for_each(|i| in2.add(i)); 171 | let want_inf = Inflights { 172 | start: 5, 173 | count: 5, 174 | size: 10, 175 | buffer: vec![0, 0, 0, 0, 0, 0, 1, 2, 3, 4], 176 | }; 177 | assert_eq!(in2, want_inf); 178 | 179 | (5..10).for_each(|i| in2.add(i)); 180 | let want_inf = Inflights { 181 | start: 5, 182 | count: 10, 183 | size: 10, 184 | buffer: vec![5, 6, 7, 8, 9, 0, 1, 2, 3, 4], 185 | }; 186 | assert_eq!(in2, want_inf); 187 | } 188 | 189 | #[test] 190 | fn it_inflights_free_to() { 191 | let mut inf = Inflights::new(10); 192 | (0..10).for_each(|i| inf.add(i)); 193 | inf.free_le(4); 194 | let want_inf = Inflights { 195 | start: 5, 196 | count: 5, 197 | size: 10, 198 | buffer: vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 199 | }; 200 | assert_eq!(inf, want_inf); 201 | 202 | inf.free_le(4); 203 |
assert_eq!(inf, want_inf); 204 | 205 | inf.free_le(8); 206 | let want_inf = Inflights { 207 | start: 9, 208 | count: 1, 209 | size: 10, 210 | buffer: vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 211 | }; 212 | assert_eq!(inf, want_inf); 213 | 214 | // rotating case 215 | (10..15).for_each(|i| inf.add(i)); 216 | let want_inf = Inflights { 217 | start: 9, 218 | count: 6, 219 | size: 10, 220 | buffer: vec![10, 11, 12, 13, 14, 5, 6, 7, 8, 9], 221 | }; 222 | assert_eq!(inf, want_inf); 223 | 224 | inf.free_le(12); 225 | let want_inf = Inflights { 226 | start: 3, 227 | count: 2, 228 | size: 10, 229 | buffer: vec![10, 11, 12, 13, 14, 5, 6, 7, 8, 9], 230 | }; 231 | assert_eq!(inf, want_inf); 232 | 233 | inf.free_le(14); 234 | let want_inf = Inflights { 235 | start: 0, 236 | count: 0, 237 | size: 10, 238 | buffer: vec![10, 11, 12, 13, 14, 5, 6, 7, 8, 9], 239 | }; 240 | assert_eq!(inf, want_inf); 241 | } 242 | 243 | #[test] 244 | fn it_inflights_free_first_one() { 245 | let mut inf = Inflights::new(10); 246 | (0..10).for_each(|i| inf.add(i)); 247 | inf.free_first_one(); 248 | let want_inf = Inflights { 249 | start: 1, 250 | count: 9, 251 | size: 10, 252 | buffer: vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 253 | }; 254 | assert_eq!(inf, want_inf); 255 | } 256 | } 257 | -------------------------------------------------------------------------------- /src/tracker/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::quorum::joint::JointConfig; 2 | use crate::quorum::majority::MajorityConfig; 3 | use crate::quorum::quorum::VoteResult::VoteWon; 4 | use crate::quorum::quorum::{AckedIndexer, Index, VoteResult}; 5 | use crate::raftpb::raft::ConfState; 6 | use crate::tracker::progress::{Progress, ProgressMap}; 7 | 8 | use std::collections::{HashMap, HashSet}; 9 | use std::fmt::{Display, Formatter}; 10 | use std::iter::Cloned; 11 | 12 | pub mod inflights; 13 | pub mod progress; 14 | pub mod state; 15 | 16 | // Config reflects the configuration tracked 
in a ProgressTracker. 17 | #[derive(Default, Clone, PartialEq, Debug)] 18 | pub struct Config { 19 | pub voters: JointConfig, 20 | // auto_leave is true if the configuration is joint and a transition to the 21 | // incoming configuration should be carried out automatically by Raft when 22 | // this is possible. If false, the configuration will be joint until the 23 | // application initiates the transition manually. 24 | pub auto_leave: bool, 25 | // Learner is a set of Ids corresponding to the learners active in the 26 | // current configuration. 27 | // 28 | // Invariant: Learners and Voters do not intersect, i.e. if a peer is in 29 | // either half of the joint config, it can't be a learner; if it is a 30 | // learner it can't be in either half of the joint config. This invariant 31 | // simplifies the implementation since it allows a peer to have clarity about 32 | // its current role without taking into account joint consensus. 33 | pub learners: HashSet, 34 | // When we turn a voter into a learner during a joint consensus transition, 35 | // we cannot add the learner directly when entering the joint state. This is 36 | // because this would violate the invariant that the intersection of 37 | // voters and learners is empty. For example, assume a Voter is removed and 38 | // immediately re-added as a learner (or in other words, it is demoted): 39 | // 40 | // Initially, the configuration will be 41 | // 42 | // voters: {1, 2, 3} 43 | // learners: {} 44 | // 45 | // and we want to demote 3. Entering the joint configuration, we naively get 46 | // 47 | // voters: {1, 2} & {1, 2, 3} 48 | // learners: {3} 49 | // 50 | // but this violates the invariant (3 is both voter and learner).
Instead, 51 | // we get 52 | // 53 | // voters: {1, 2} & {1, 2, 3} 54 | // learners: {} 55 | // next_learners: {3} 56 | // 57 | // Where 3 is not still purely a voter, but we are remembering the intention 58 | // to make it a learner upon transitioning into the final configuration: 59 | // 60 | // voters: {1, 2} 61 | // learners: {3} 62 | // next_learners: {} 63 | // 64 | // Note that next_learners is not used while adding a learner that is not 65 | // also a voter in the joint config. In this case, the learner is added 66 | // right away when entering the joint configuration, so that it is caught up 67 | // as soon as possible. 68 | pub learners_next: HashSet, 69 | } 70 | 71 | impl Display for Config { 72 | fn fmt(&self, f: &mut Formatter<'_>) -> ::std::fmt::Result { 73 | write!(f, "voters={}", self.voters).unwrap(); 74 | if !self.learners.is_empty() { 75 | write!( 76 | f, 77 | " learners={}", 78 | MajorityConfig { 79 | votes: self.learners.clone() 80 | } 81 | ) 82 | .unwrap(); 83 | } 84 | if !self.learners_next.is_empty() { 85 | write!( 86 | f, 87 | " learners_next={}", 88 | MajorityConfig { 89 | votes: self.learners_next.clone() 90 | } 91 | ) 92 | .unwrap(); 93 | } 94 | if self.auto_leave { 95 | write!(f, " autoleave").unwrap(); 96 | } 97 | Ok(()) 98 | } 99 | } 100 | 101 | /// ProgressTracker tracks the currently active configuration and the information 102 | /// known about the nodes and learners in it. In particular, it tracks the match 103 | /// index for each peer when in turn allows reasoning abound the committed index. 
104 | #[derive(Debug, PartialEq)] 105 | pub struct ProgressTracker { 106 | pub config: Config, 107 | pub progress: ProgressMap, 108 | pub votes: HashMap, 109 | pub max_inflight: u64, 110 | } 111 | 112 | impl Clone for ProgressTracker { 113 | fn clone(&self) -> Self { 114 | let mut to = ProgressTracker::new(self.max_inflight); 115 | to.config = self.config.clone(); 116 | let mut progress_inner = HashMap::new(); 117 | progress_inner.extend( 118 | self.progress 119 | .iter() 120 | .map(|(key, value)| (*key, value.clone())), 121 | ); 122 | to.progress = ProgressMap::new(progress_inner); 123 | to.votes = self.votes.clone(); 124 | to 125 | } 126 | } 127 | 128 | impl ProgressTracker { 129 | pub fn new(max_inflight: u64) -> ProgressTracker { 130 | let mut p = ProgressTracker { 131 | config: Default::default(), 132 | progress: ProgressMap::default(), 133 | votes: Default::default(), 134 | max_inflight, 135 | }; 136 | p 137 | } 138 | 139 | // ConfState returns a ConfState representing the active configuration. 140 | pub fn config_state(&self) -> ConfState { 141 | let mut conf_state = ConfState::new(); 142 | conf_state.set_voters(self.config.voters.incoming.as_slice()); 143 | conf_state.set_voters_outgoing(self.config.voters.outgoing.as_slice()); 144 | conf_state.set_learners( 145 | self.config 146 | .learners 147 | .iter() 148 | .map(|learner| *learner) 149 | .collect(), 150 | ); 151 | conf_state.set_learners_next( 152 | self.config 153 | .learners_next 154 | .iter() 155 | .map(|learner| *learner) 156 | .collect(), 157 | ); 158 | conf_state.set_auto_leave(self.config.auto_leave); 159 | conf_state 160 | } 161 | 162 | // is_singleton returns true if (and only if) there is only one voting number 163 | // (i.e. the leader) in the current configuration. 
164 | pub fn is_singleton(&self) -> bool { 165 | self.config.voters.is_singleton() 166 | } 167 | 168 | // committed returns the largest log index known to be committed based on what 169 | // the voting members of the group have acknowledged. 170 | pub fn committed(&mut self) -> u64 { 171 | self.config 172 | .voters 173 | .committed(&MatchAckIndexer::from(&self.progress)) 174 | } 175 | 176 | // visit invokes the supplied closure for all tracked progresses in stable order. 177 | pub fn visit(&mut self, mut f: F) 178 | where 179 | F: FnMut(u64, &mut Progress), 180 | { 181 | let n = self.progress.len(); 182 | // We need to sort the IDs and don't want to allocate since this is hot code. 183 | // The optimized here mirrors that in `(MajorityConfig).CommittedIndex`, 184 | // see there for details 185 | // TODO optimized 186 | let mut ids: Vec = Vec::new(); 187 | ids.extend(self.progress.keys().into_iter()); 188 | ids.sort_by_key(|k| *k); 189 | for id in ids { 190 | let progress = self.progress.get_mut(&id).unwrap(); 191 | f(id, progress); 192 | } 193 | } 194 | 195 | #[inline] 196 | pub fn visit_nodes(&self) -> Vec { 197 | let mut ids: Vec = self.progress.keys().map(|id| *id).collect::>(); 198 | ids.sort_by_key(|k| *k); 199 | ids 200 | } 201 | 202 | // returns true if the quorum is active from the view of the local 203 | // raft state machine. Otherwise, it returns false. 204 | pub fn quorum_active(&mut self) -> bool { 205 | let mut votes = HashMap::new(); 206 | self.visit(|id, progress| { 207 | if progress.is_learner { 208 | return; 209 | } 210 | votes.insert(id, progress.recent_active); 211 | }); 212 | self.config.voters.vote_result(&votes) == VoteWon 213 | } 214 | 215 | // returns a sorted slice of voters. 
216 | pub fn voter_nodes(&self) -> Vec { 217 | let mut nodes: Vec = self.config.voters.ids().iter().map(|id| *id).collect(); 218 | nodes.sort_by_key(|id| *id); 219 | nodes 220 | } 221 | 222 | // returns a sorted slice of voters 223 | pub fn learner_nodes(&self) -> Vec { 224 | let mut nodes: Vec = self.config.learners.iter().map(|id| *id).collect(); 225 | nodes.sort_by_key(|id| *id); 226 | nodes 227 | } 228 | 229 | // prepares for a new round of vote counting via record_vote. 230 | pub fn reset_votes(&mut self) { 231 | self.votes.clear(); 232 | } 233 | 234 | // records that the node with the given id voted for this Raft 235 | // instance if v == true (and declined it otherwise) 236 | pub fn record_vote(&mut self, id: u64, v: bool) { 237 | self.votes.entry(id).or_insert(v); 238 | } 239 | 240 | // returns the number of granted and rejected votes, and whether the election outcome is known 241 | pub fn tally_votes(&self) -> (usize, usize, VoteResult) { 242 | // Make sure to populate granted/rejected correctly even if the votes slice 243 | // contains members no larger part of the configuration. This doesn't really 244 | // matter in the way the numbers are used (they're information), but might 245 | // as well get it right. 
246 | let mut granted = 0; 247 | let mut rejected = 0; 248 | for (id, progress) in self.progress.iter() { 249 | if progress.is_learner { 250 | continue; 251 | } 252 | match self.votes.get(id) { 253 | Some(v) => { 254 | if *v { 255 | granted += 1; 256 | } else { 257 | rejected += 1; 258 | } 259 | } 260 | None => {} 261 | } 262 | } 263 | let res = self.config.voters.vote_result(&self.votes); 264 | info!("grant: {}, rejected: {}, res: {:?}", granted, rejected, res); 265 | (granted, rejected, res) 266 | } 267 | } 268 | 269 | pub(crate) type MatchAckIndexer = HashMap; 270 | 271 | // implements IndexLookuper 272 | impl AckedIndexer for MatchAckIndexer { 273 | fn acked_index(&self, voter_id: &u64) -> Option<&u64> { 274 | self.get(voter_id).map(|pr| &pr._match) 275 | } 276 | } 277 | 278 | impl From<&ProgressMap> for MatchAckIndexer { 279 | fn from(progress: &ProgressMap) -> Self { 280 | let mut match_ack_indexer: MatchAckIndexer = Default::default(); 281 | match_ack_indexer.clone_from(progress.to_map()); 282 | match_ack_indexer 283 | } 284 | } 285 | -------------------------------------------------------------------------------- /src/tracker/state.rs: -------------------------------------------------------------------------------- 1 | use std::fmt::{self, Display, Error, Formatter}; 2 | 3 | // StateType is the state of a tracked follower. 4 | #[derive(Clone, Debug, PartialEq)] 5 | pub enum StateType { 6 | // StateProbe indicates that a follower whose last index isn't known. Such a 7 | // follower is "probe" (i.e. an append sent periodically) to narrow down 8 | // its last index. In the ideal (and common) case, only one round of probing 9 | // is necessary as the follower will react with a hint. Followers that are 10 | // probed over extend periods of time are often offline. 11 | Probe, 12 | // StateReplicate is the steady in which a follower eagerly receives 13 | // log entries to append to its log. 
14 | Replicate, 15 | // StateSnapshot indicates a follower that needs log entries not avaliable 16 | // from the leader's Raft log. Such a follower needs a full snapshot to 17 | // return a StateReplicate 18 | Snapshot, 19 | } 20 | 21 | impl Default for StateType { 22 | fn default() -> Self { 23 | StateType::Probe 24 | } 25 | } 26 | 27 | impl Display for StateType { 28 | fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { 29 | match self { 30 | StateType::Probe => write!(f, "StateProbe"), 31 | StateType::Replicate => { 32 | write!(f, "StateReplicate") 33 | } 34 | StateType::Snapshot => write!(f, "StateSnapshot"), 35 | } 36 | } 37 | } 38 | 39 | #[cfg(test)] 40 | mod tests { 41 | use crate::tracker::state::StateType; 42 | #[test] 43 | fn it_works() { 44 | assert_eq!(format!("{}", StateType::Probe), "StateProbe"); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/unstable.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The etcd Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | use crate::raftpb::raft::{Entry, Snapshot}; 16 | 17 | // unstable.entries[i] has raft log position i+unstable.offset. 
18 | // Note that unstable.offset may be less than highest log 19 | // position in storage; this means that the next write to storage 20 | // might need to truncate the log before persisting unstable.entries. 21 | #[derive(Default, Clone)] 22 | pub(crate) struct Unstable { 23 | // the incoming unstable snapshot, if any. 24 | pub(crate) snapshot: Option, 25 | // all entries that have not been yet been written to storage. 26 | pub(crate) entries: Vec, 27 | // the first index of `entries` first entry 28 | pub(crate) offset: u64, 29 | } 30 | 31 | impl Unstable { 32 | // Returns the index of the first possible entry in entries if it has a snapshot. 33 | pub(crate) fn maybe_first_index(&self) -> Option { 34 | self.snapshot 35 | .as_ref() 36 | .map(|snapshot| snapshot.get_metadata().get_index() + 1) 37 | } 38 | 39 | // Returns last index if it has at least one unstable entry or snapshot 40 | pub(crate) fn maybe_last_index(&self) -> Option { 41 | if !self.entries.is_empty() { 42 | return Some(self.offset + self.entries.len() as u64 - 1); 43 | } 44 | self.snapshot 45 | .as_ref() 46 | .map(|snapshot| snapshot.get_metadata().get_index()) 47 | } 48 | 49 | // Returns the term of the entry at index i, if there is any. 
50 | pub(crate) fn maybe_term(&self, i: u64) -> Option { 51 | if i < self.offset { 52 | if let Some(snapshot) = self.snapshot.as_ref() { 53 | if snapshot.get_metadata().get_index() == i { 54 | return Some(snapshot.get_metadata().get_term()); 55 | } 56 | } 57 | return None; 58 | } 59 | match self.maybe_last_index() { 60 | Some(index) => { 61 | if i > index { 62 | None 63 | } else { 64 | Some(self.entries[(i - self.offset) as usize].Term) 65 | } 66 | } 67 | None => None, 68 | } 69 | } 70 | 71 | // If self.entries had written to storage then clears these entries by stable_to 72 | pub(crate) fn stable_to(&mut self, i: u64, t: u64) { 73 | if let Some(gt) = self.maybe_term(i) { 74 | // if i < offset, term is matched with the snapshot 75 | // only update the unstable entries if term is matched with 76 | // an unstable entry 77 | if gt == t && i >= self.offset { 78 | let start = i + 1 - self.offset; 79 | // TODO: Optz entries memory 80 | self.entries.drain(..start as usize); 81 | self.offset = i + 1; 82 | } 83 | } 84 | } 85 | 86 | 87 | // As same to stable_to, if self.snapshot had written to storage then reset snapshot 88 | pub(crate) fn stable_snap_to(&mut self, i: u64) { 89 | if let Some(ref snapshot) = self.snapshot { 90 | if snapshot.get_metadata().get_index() == i { 91 | self.snapshot = None; 92 | } 93 | } 94 | } 95 | 96 | pub(crate) fn restore(&mut self, s: Snapshot) { 97 | self.offset = s.get_metadata().get_index() + 1; 98 | self.entries.clear(); 99 | self.snapshot = Some(s); 100 | } 101 | 102 | pub(crate) fn truncate_and_append(&mut self, ents: &[Entry]) { 103 | match ents[0].get_Index() { 104 | after if after == self.offset + self.entries.len() as u64 => { 105 | // after is the next index in the self.entries 106 | // directly append 107 | self.entries.extend_from_slice(ents); 108 | } 109 | after if after <= self.offset => { 110 | info!("replace the unstable entries from index {}", after); 111 | // The log is being truncated to before our current offset 112 | // 
portion, so set the offset and replace the entries 113 | self.offset = after; 114 | self.entries.clear(); 115 | self.entries.extend_from_slice(ents); 116 | } 117 | after => { 118 | // truncate to after and copy to self.entries 119 | // then append 120 | info!("truncate the unstable entries before index {}", after); 121 | self.entries.truncate((after - self.offset) as usize); 122 | self.entries.extend_from_slice(&ents); 123 | } 124 | } 125 | } 126 | 127 | pub(crate) fn slice(&self, lo: u64, hi: u64) -> Vec { 128 | self.must_check_out_of_bounds(lo, hi); 129 | self.entries[(lo - self.offset) as usize..(hi - self.offset) as usize].to_vec() 130 | } 131 | 132 | // self.offset <= lo <= hi <= self.offset + self.entries.len() 133 | fn must_check_out_of_bounds(&self, lo: u64, hi: u64) { 134 | if lo > hi { 135 | panic!("invalid unstable.slice {} > {}", lo, hi); 136 | } 137 | let upper = self.offset + self.entries.len() as u64; 138 | if lo < self.offset || hi > upper { 139 | panic!( 140 | "unstable.slice[{}, {}] out of bound [{}, {}]", 141 | lo, hi, self.offset, upper 142 | ); 143 | } 144 | } 145 | } 146 | 147 | #[cfg(test)] 148 | mod tests { 149 | use crate::raftpb::raft::{Entry, Snapshot}; 150 | use crate::unstable::Unstable; 151 | 152 | #[test] 153 | fn it_works() { 154 | assert_eq!(2 + 2, 4); 155 | } 156 | 157 | #[test] 158 | fn it_unstable_maybe_first_index() { 159 | // (entries, offset, snapshot, w_ok, w_index) 160 | let tests = vec![ 161 | // no snapshot 162 | (vec![new_entry(5, 1)], 0, None, false, 0), 163 | (vec![], 0, None, false, 0), 164 | // has snapshot 165 | (vec![new_entry(5, 1)], 5, Some(new_snapshot(4, 1)), true, 5), 166 | (vec![], 5, Some(new_snapshot(4, 1)), true, 5), 167 | ]; 168 | for (i, (entries, offset, snapshot, w_ok, w_index)) in tests.iter().enumerate() { 169 | let mut u = Unstable { 170 | snapshot: snapshot.clone(), 171 | entries: entries.clone(), 172 | offset: *offset, 173 | }; 174 | match u.maybe_first_index() { 175 | Some(i) => { 176 | 
assert_eq!(i, *w_index); 177 | } 178 | None => assert!(!*w_ok), 179 | } 180 | } 181 | } 182 | 183 | #[test] 184 | fn it_maybe_last_index() { 185 | // (entries, offset, snapshot, w_ok, w_index) 186 | let tests = vec![ 187 | // last in entries 188 | (vec![new_entry(5, 1)], 5, None, true, 5), 189 | (vec![new_entry(5, 1)], 5, Some(new_snapshot(4, 1)), true, 5), 190 | // last in snapshot 191 | (vec![], 5, Some(new_snapshot(4, 1)), true, 4), 192 | // empty unstable 193 | (vec![], 0, None, false, 0), 194 | ]; 195 | for (i, (entries, offset, snapshot, w_ok, w_index)) in tests.iter().enumerate() { 196 | let u = Unstable { 197 | snapshot: snapshot.clone(), 198 | entries: entries.clone(), 199 | offset: *offset, 200 | }; 201 | match u.maybe_last_index() { 202 | Some(i) => { 203 | assert_eq!(i, *w_index); 204 | } 205 | None => assert!(!*w_ok), 206 | } 207 | } 208 | } 209 | 210 | #[test] 211 | fn it_unstable_maybe_term() { 212 | // (entries, offset, snapshot, index, w_ok, w_term) 213 | let tests = vec![ 214 | // term from entries 215 | (vec![new_entry(5, 1)], 5, None, 5, true, 1), 216 | (vec![new_entry(5, 1)], 5, None, 6, false, 0), 217 | ( 218 | vec![new_entry(5, 1)], 219 | 5, 220 | Some(new_snapshot(4, 1)), 221 | 5, 222 | true, 223 | 1, 224 | ), 225 | ( 226 | vec![new_entry(5, 1)], 227 | 5, 228 | Some(new_snapshot(4, 1)), 229 | 6, 230 | false, 231 | 0, 232 | ), 233 | // term from snapshot 234 | ( 235 | vec![new_entry(5, 1)], 236 | 5, 237 | Some(new_snapshot(4, 1)), 238 | 4, 239 | true, 240 | 1, 241 | ), 242 | ( 243 | vec![new_entry(5, 1)], 244 | 5, 245 | Some(new_snapshot(4, 1)), 246 | 3, 247 | false, 248 | 0, 249 | ), 250 | (vec![], 5, Some(new_snapshot(4, 1)), 5, false, 0), 251 | (vec![], 5, Some(new_snapshot(4, 1)), 4, true, 1), 252 | (vec![], 0, None, 5, false, 0), 253 | ]; 254 | for (i, (entries, offset, snapshot, index, w_ok, w_term)) in tests.iter().enumerate() { 255 | let u = Unstable { 256 | snapshot: snapshot.clone(), 257 | entries: entries.clone(), 258 | offset: 
*offset, 259 | }; 260 | match u.maybe_term(*index) { 261 | Some(i) => assert_eq!(i, *w_term), 262 | None => assert!(!*w_ok), 263 | } 264 | } 265 | } 266 | 267 | #[test] 268 | fn it_unstable_restore() { 269 | let mut u = Unstable { 270 | snapshot: Some(new_snapshot(4, 1)), 271 | entries: vec![new_entry(5, 1)], 272 | offset: 5, 273 | }; 274 | let s = new_snapshot(6, 2); 275 | u.restore(s.clone()); 276 | assert_eq!(u.offset, s.get_metadata().get_index() + 1); 277 | assert!(u.entries.is_empty()); 278 | assert_eq!(u.snapshot.unwrap(), s); 279 | } 280 | 281 | #[test] 282 | fn it_unstable_stable_to() { 283 | // (entries, offset, snapshot, index, term, w_offset, w_len) 284 | let tests = vec![ 285 | (vec![], 0, None, 5, 1, 0, 0), 286 | (vec![new_entry(5, 1)], 5, None, 5, 1, 6, 0), // stable to the first entry 287 | (new_batch_entry(vec![(5, 1), (6, 1)]), 5, None, 5, 1, 6, 1), // stable to the first entry 288 | (vec![new_entry(6, 2)], 6, None, 6, 1, 6, 1), // stable to the first entry and term mismatch 289 | (vec![new_entry(5, 1)], 5, None, 4, 1, 5, 1), // stable to old entry 290 | (vec![new_entry(5, 1)], 5, None, 4, 2, 5, 1), // stable to old entry 291 | // with snapshot 292 | ( 293 | vec![new_entry(5, 1)], 294 | 5, 295 | Some(new_snapshot(4, 1)), 296 | 5, 297 | 1, 298 | 6, 299 | 0, 300 | ), // stable to the first entry 301 | ( 302 | new_batch_entry(vec![(5, 1), (6, 1)]), 303 | 5, 304 | Some(new_snapshot(4, 1)), 305 | 5, 306 | 1, 307 | 6, 308 | 1, 309 | ), // stable to the first entry 310 | ( 311 | vec![new_entry(6, 2)], 312 | 6, 313 | Some(new_snapshot(5, 1)), 314 | 6, 315 | 1, 316 | 6, 317 | 1, 318 | ), // stable to the first entry and term mismatch 319 | ( 320 | vec![new_entry(5, 1)], 321 | 5, 322 | Some(new_snapshot(4, 1)), 323 | 4, 324 | 1, 325 | 5, 326 | 1, 327 | ), // stable to snapshot 328 | ( 329 | vec![new_entry(5, 2)], 330 | 5, 331 | Some(new_snapshot(4, 2)), 332 | 4, 333 | 1, 334 | 5, 335 | 1, 336 | ), // stable to old entry 337 | ]; 338 | for (i, (entries, 
offset, snapshot, index, term, w_offset, w_len)) in 339 | tests.iter().enumerate() 340 | { 341 | let mut u = Unstable { 342 | snapshot: snapshot.clone(), 343 | entries: entries.clone(), 344 | offset: *offset, 345 | }; 346 | u.stable_to(*index, *term); 347 | assert_eq!(u.offset, *w_offset); 348 | assert_eq!(u.entries.len(), *w_len); 349 | } 350 | } 351 | 352 | #[test] 353 | fn it_unstable_stable_truncate_and_append() { 354 | // (entries, offset, snapshot, to_append, w_offset, w_entries) 355 | let tests: Vec<(_, _, Option, _, _, _)> = vec![ 356 | // append to the end 357 | ( 358 | vec![new_entry(5, 1)], 359 | 5, 360 | None, 361 | new_batch_entry(vec![(6, 1), (7, 1)]), 362 | 5, 363 | new_batch_entry(vec![(5, 1), (6, 1), (7, 1)]), 364 | ), 365 | // replace the unstable entries 366 | ( 367 | vec![new_entry(5, 1)], 368 | 5, 369 | None, 370 | new_batch_entry(vec![(5, 2), (6, 2)]), 371 | 5, 372 | new_batch_entry(vec![(5, 2), (6, 2)]), 373 | ), 374 | ( 375 | vec![new_entry(5, 1)], 376 | 5, 377 | None, 378 | new_batch_entry(vec![(4, 2), (5, 2), (6, 2)]), 379 | 4, 380 | new_batch_entry(vec![(4, 2), (5, 2), (6, 2)]), 381 | ), 382 | // truncate the existing entries and append 383 | ( 384 | new_batch_entry(vec![(5, 1), (6, 1), (7, 1)]), 385 | 5, 386 | None, 387 | new_batch_entry(vec![(6, 2)]), 388 | 5, 389 | new_batch_entry(vec![(5, 1), (6, 2)]), 390 | ), 391 | ( 392 | new_batch_entry(vec![(5, 1), (6, 1), (7, 1)]), 393 | 5, 394 | None, 395 | new_batch_entry(vec![(7, 2), (8, 2)]), 396 | 5, 397 | new_batch_entry(vec![(5, 1), (6, 1), (7, 2), (8, 2)]), 398 | ), 399 | ]; 400 | 401 | for (entries, offset, snapshot, to_append, w_offset, w_entries) in tests { 402 | let mut u = Unstable { 403 | snapshot, 404 | entries, 405 | offset, 406 | }; 407 | u.truncate_and_append(to_append.as_slice()); 408 | assert_eq!(u.offset, w_offset); 409 | assert_eq!(u.entries, w_entries); 410 | } 411 | } 412 | 413 | fn new_entry(index: u64, term: u64) -> Entry { 414 | let mut entry = Entry::new(); 415 | 
entry.set_Term(term); 416 | entry.set_Index(index); 417 | entry 418 | } 419 | 420 | fn new_batch_entry(batch: Vec<(u64, u64)>) -> Vec { 421 | batch 422 | .iter() 423 | .map(|(index, term)| new_entry(*index, *term)) 424 | .collect() 425 | } 426 | 427 | fn new_snapshot(index: u64, term: u64) -> Snapshot { 428 | let mut snapshot = Snapshot::new(); 429 | snapshot.mut_metadata().set_index(index); 430 | snapshot.mut_metadata().set_term(term); 431 | snapshot 432 | } 433 | } 434 | -------------------------------------------------------------------------------- /src/util/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The etcd Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | use crate::raftpb::raft::{Entry, HardState, MessageType}; 16 | use protobuf::Message; 17 | 18 | pub fn is_local_message(msg_type: MessageType) -> bool { 19 | msg_type == MessageType::MsgHup 20 | || msg_type == MessageType::MsgBeat 21 | || msg_type == MessageType::MsgUnreachable 22 | || msg_type == MessageType::MsgSnapStatus 23 | || msg_type == MessageType::MsgCheckQuorum 24 | } 25 | 26 | // TODO: add more information 27 | pub fn is_response_message(msg_type: MessageType) -> bool { 28 | msg_type == MessageType::MsgAppResp 29 | || msg_type == MessageType::MsgVoteResp 30 | || msg_type == MessageType::MsgHeartbeatResp 31 | || msg_type == MessageType::MsgUnreachable 32 | || msg_type == MessageType::MsgPreVoteResp 33 | } 34 | 35 | // TODO: 36 | pub fn is_hard_state_equal(a: &HardState, b: &HardState) -> bool { 37 | a.get_term() == b.get_term() && a.get_vote() == b.get_vote() || a.get_commit() == b.get_commit() 38 | } 39 | 40 | // [0..max_size] 41 | pub fn limit_size(ents: Vec, max_size: u64) -> Vec { 42 | if ents.is_empty() { 43 | return vec![]; 44 | } 45 | let mut size = ents[0].compute_size() as u64; 46 | let mut limit = 1; 47 | while limit < ents.len() { 48 | size += ents[limit].compute_size() as u64; 49 | if size > max_size { 50 | break; 51 | } 52 | limit += 1; 53 | } 54 | ents[..limit].to_vec() 55 | } 56 | 57 | pub fn vote_resp_msg_type(msgt: MessageType) -> MessageType { 58 | match msgt { 59 | MessageType::MsgVote => MessageType::MsgVoteResp, 60 | MessageType::MsgPreVote => MessageType::MsgPreVoteResp, 61 | _ => panic!("not a vote message: {:?}", msgt), 62 | } 63 | } 64 | --------------------------------------------------------------------------------